Repository: ccfos/nightingale
Branch: main
Commit: ea9c52c808cd
Files: 802
Total size: 9.6 MB
Directory structure:
gitextract_wwt_h4rf/
├── .gitattributes
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── config.yml
│ │ ├── enhancement.md
│ │ └── question.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ └── workflows/
│ ├── issue-translator.yml
│ └── n9e.yml
├── .gitignore
├── .goreleaser.yaml
├── .typos.toml
├── LICENSE
├── Makefile
├── README.md
├── README_zh.md
├── alert/
│ ├── aconf/
│ │ └── conf.go
│ ├── alert.go
│ ├── astats/
│ │ └── stats.go
│ ├── common/
│ │ └── key.go
│ ├── dispatch/
│ │ ├── consume.go
│ │ ├── dispatch.go
│ │ ├── log.go
│ │ ├── notify_channel.go
│ │ └── notify_target.go
│ ├── eval/
│ │ ├── alert_rule.go
│ │ ├── eval.go
│ │ └── eval_test.go
│ ├── mute/
│ │ └── mute.go
│ ├── naming/
│ │ ├── hashring.go
│ │ ├── heartbeat.go
│ │ └── leader.go
│ ├── pipeline/
│ │ ├── engine/
│ │ │ └── engine.go
│ │ ├── pipeline.go
│ │ └── processor/
│ │ ├── aisummary/
│ │ │ ├── ai_summary.go
│ │ │ └── ai_summary_test.go
│ │ ├── callback/
│ │ │ └── callback.go
│ │ ├── common/
│ │ │ └── common.go
│ │ ├── eventdrop/
│ │ │ └── event_drop.go
│ │ ├── eventupdate/
│ │ │ └── event_update.go
│ │ ├── logic/
│ │ │ ├── if.go
│ │ │ └── switch.go
│ │ ├── relabel/
│ │ │ └── relabel.go
│ │ └── utils/
│ │ └── utils.go
│ ├── process/
│ │ ├── alert_cur_event.go
│ │ └── process.go
│ ├── queue/
│ │ └── queue.go
│ ├── record/
│ │ ├── prom_rule.go
│ │ ├── sample.go
│ │ └── scheduler.go
│ ├── router/
│ │ ├── router.go
│ │ ├── router_alert_eval_detail.go
│ │ ├── router_event.go
│ │ ├── router_event_detail.go
│ │ └── router_trace_logs.go
│ └── sender/
│ ├── callback.go
│ ├── dingtalk.go
│ ├── email.go
│ ├── feishu.go
│ ├── feishucard.go
│ ├── global_webhook.go
│ ├── global_webhook_test.go
│ ├── ibex.go
│ ├── lark.go
│ ├── larkcard.go
│ ├── mm.go
│ ├── notify_record_queue.go
│ ├── plugin.go
│ ├── plugin_cmd_unix.go
│ ├── plugin_cmd_windows.go
│ ├── sender.go
│ ├── telegram.go
│ ├── webhook.go
│ ├── webhook_event_queue.go
│ ├── webhook_event_queue_test.go
│ ├── webhook_queue.go
│ └── wecom.go
├── center/
│ ├── cconf/
│ │ ├── conf.go
│ │ ├── event_example.go
│ │ ├── metric.go
│ │ ├── ops.go
│ │ ├── plugin.go
│ │ ├── rsa/
│ │ │ └── rsa_conf.go
│ │ └── sql_tpl.go
│ ├── center.go
│ ├── cstats/
│ │ └── stats.go
│ ├── integration/
│ │ └── init.go
│ ├── metas/
│ │ └── metas.go
│ ├── router/
│ │ ├── router.go
│ │ ├── router_alert_aggr_view.go
│ │ ├── router_alert_cur_event.go
│ │ ├── router_alert_eval_detail.go
│ │ ├── router_alert_his_event.go
│ │ ├── router_alert_rule.go
│ │ ├── router_alert_subscribe.go
│ │ ├── router_board.go
│ │ ├── router_builtin.go
│ │ ├── router_builtin_component.go
│ │ ├── router_builtin_metric_filter.go
│ │ ├── router_builtin_metrics.go
│ │ ├── router_builtin_payload.go
│ │ ├── router_busi_group.go
│ │ ├── router_captcha.go
│ │ ├── router_chart_share.go
│ │ ├── router_config.go
│ │ ├── router_configs.go
│ │ ├── router_crypto.go
│ │ ├── router_dash_annotation.go
│ │ ├── router_dashboard.go
│ │ ├── router_datasource.go
│ │ ├── router_datasource_db.go
│ │ ├── router_embedded.go
│ │ ├── router_es.go
│ │ ├── router_es_index_pattern.go
│ │ ├── router_event_detail.go
│ │ ├── router_event_pipeline.go
│ │ ├── router_funcs.go
│ │ ├── router_heartbeat.go
│ │ ├── router_login.go
│ │ ├── router_message_template.go
│ │ ├── router_metric_desc.go
│ │ ├── router_metric_view.go
│ │ ├── router_mute.go
│ │ ├── router_mw.go
│ │ ├── router_notification_record.go
│ │ ├── router_notify_channel.go
│ │ ├── router_notify_channel_test.go
│ │ ├── router_notify_config.go
│ │ ├── router_notify_rule.go
│ │ ├── router_notify_tpl.go
│ │ ├── router_opensearch.go
│ │ ├── router_proxy.go
│ │ ├── router_query.go
│ │ ├── router_recording_rule.go
│ │ ├── router_role.go
│ │ ├── router_role_operation.go
│ │ ├── router_saved_view.go
│ │ ├── router_self.go
│ │ ├── router_server.go
│ │ ├── router_source_token.go
│ │ ├── router_target.go
│ │ ├── router_task.go
│ │ ├── router_task_tpl.go
│ │ ├── router_tdengine.go
│ │ ├── router_trace_logs.go
│ │ ├── router_user.go
│ │ ├── router_user_group.go
│ │ └── router_user_variable_config.go
│ └── sso/
│ ├── init.go
│ └── sync.go
├── cli/
│ ├── cli.go
│ └── upgrade/
│ ├── config.go
│ ├── readme.md
│ ├── upgrade.go
│ └── upgrade.sql
├── cmd/
│ ├── alert/
│ │ └── main.go
│ ├── center/
│ │ └── main.go
│ ├── cli/
│ │ └── main.go
│ ├── edge/
│ │ ├── edge.go
│ │ └── main.go
│ └── pushgw/
│ └── main.go
├── conf/
│ ├── conf.go
│ └── crypto.go
├── cron/
│ ├── clean_notify_record.go
│ └── clean_pipeline_execution.go
├── datasource/
│ ├── ck/
│ │ └── clickhouse.go
│ ├── commons/
│ │ └── eslike/
│ │ └── eslike.go
│ ├── datasource.go
│ ├── doris/
│ │ └── doris.go
│ ├── es/
│ │ └── es.go
│ ├── mysql/
│ │ └── mysql.go
│ ├── opensearch/
│ │ └── opensearch.go
│ ├── postgresql/
│ │ └── postgresql.go
│ ├── prom/
│ │ └── prom.go
│ ├── tdengine/
│ │ └── tdengine.go
│ └── victorialogs/
│ └── victorialogs.go
├── doc/
│ ├── README.bak.md
│ ├── active-contributors.md
│ ├── committers.md
│ ├── community-governance.md
│ ├── contributors.md
│ ├── end-users.md
│ ├── pmc.md
│ └── server-dash.json
├── docker/
│ ├── .dockerignore
│ ├── Dockerfile.goreleaser
│ ├── Dockerfile.goreleaser.arm64
│ ├── build.sh
│ ├── compose-bridge/
│ │ ├── docker-compose.yaml
│ │ ├── etc-categraf/
│ │ │ ├── config.toml
│ │ │ ├── input.cpu/
│ │ │ │ └── cpu.toml
│ │ │ ├── input.disk/
│ │ │ │ └── disk.toml
│ │ │ ├── input.diskio/
│ │ │ │ └── diskio.toml
│ │ │ ├── input.kernel/
│ │ │ │ └── kernel.toml
│ │ │ ├── input.mem/
│ │ │ │ └── mem.toml
│ │ │ ├── input.mysql/
│ │ │ │ └── mysql.toml
│ │ │ ├── input.net/
│ │ │ │ └── net.toml
│ │ │ ├── input.netstat/
│ │ │ │ └── netstat.toml
│ │ │ ├── input.processes/
│ │ │ │ └── processes.toml
│ │ │ ├── input.prometheus/
│ │ │ │ └── prometheus.toml
│ │ │ ├── input.redis/
│ │ │ │ └── redis.toml
│ │ │ └── input.system/
│ │ │ └── system.toml
│ │ ├── etc-mysql/
│ │ │ └── my.cnf
│ │ └── etc-nightingale/
│ │ ├── config.toml
│ │ ├── metrics.yaml
│ │ └── script/
│ │ ├── notify.bak.py
│ │ ├── notify.py
│ │ ├── notify_feishu.py
│ │ └── rule_converter.py
│ ├── compose-host-network/
│ │ ├── docker-compose.yaml
│ │ ├── etc-categraf/
│ │ │ ├── config.toml
│ │ │ ├── input.cpu/
│ │ │ │ └── cpu.toml
│ │ │ ├── input.disk/
│ │ │ │ └── disk.toml
│ │ │ ├── input.diskio/
│ │ │ │ └── diskio.toml
│ │ │ ├── input.kernel/
│ │ │ │ └── kernel.toml
│ │ │ ├── input.mem/
│ │ │ │ └── mem.toml
│ │ │ ├── input.net/
│ │ │ │ └── net.toml
│ │ │ ├── input.netstat/
│ │ │ │ └── netstat.toml
│ │ │ ├── input.processes/
│ │ │ │ └── processes.toml
│ │ │ └── input.system/
│ │ │ └── system.toml
│ │ ├── etc-mysql/
│ │ │ └── my.cnf
│ │ ├── etc-nightingale/
│ │ │ ├── config.toml
│ │ │ ├── metrics.yaml
│ │ │ └── script/
│ │ │ ├── notify.bak.py
│ │ │ ├── notify.py
│ │ │ ├── notify_feishu.py
│ │ │ └── rule_converter.py
│ │ └── etc-prometheus/
│ │ └── prometheus.yml
│ ├── compose-host-network-metric-log/
│ │ ├── docker-compose.yaml
│ │ ├── etc-categraf/
│ │ │ ├── config.toml
│ │ │ ├── input.cpu/
│ │ │ │ └── cpu.toml
│ │ │ ├── input.disk/
│ │ │ │ └── disk.toml
│ │ │ ├── input.diskio/
│ │ │ │ └── diskio.toml
│ │ │ ├── input.kernel/
│ │ │ │ └── kernel.toml
│ │ │ ├── input.mem/
│ │ │ │ └── mem.toml
│ │ │ ├── input.net/
│ │ │ │ └── net.toml
│ │ │ ├── input.netstat/
│ │ │ │ └── netstat.toml
│ │ │ ├── input.processes/
│ │ │ │ └── processes.toml
│ │ │ ├── input.system/
│ │ │ │ └── system.toml
│ │ │ └── logs.toml
│ │ ├── etc-logstash/
│ │ │ └── logstash.yaml
│ │ ├── etc-mysql/
│ │ │ └── my.cnf
│ │ ├── etc-nightingale/
│ │ │ ├── config.toml
│ │ │ ├── metrics.yaml
│ │ │ └── script/
│ │ │ ├── notify.bak.py
│ │ │ ├── notify.py
│ │ │ ├── notify_feishu.py
│ │ │ └── rule_converter.py
│ │ └── etc-prometheus/
│ │ └── prometheus.yml
│ ├── compose-postgres/
│ │ ├── categraf/
│ │ │ └── conf/
│ │ │ ├── config.toml
│ │ │ ├── input.cpu/
│ │ │ │ └── cpu.toml
│ │ │ ├── input.disk/
│ │ │ │ └── disk.toml
│ │ │ ├── input.diskio/
│ │ │ │ └── diskio.toml
│ │ │ ├── input.docker/
│ │ │ │ └── docker.toml
│ │ │ ├── input.kernel/
│ │ │ │ └── kernel.toml
│ │ │ ├── input.mem/
│ │ │ │ └── mem.toml
│ │ │ ├── input.net/
│ │ │ │ └── net.toml
│ │ │ ├── input.netstat/
│ │ │ │ └── netstat.toml
│ │ │ ├── input.processes/
│ │ │ │ └── processes.toml
│ │ │ ├── input.system/
│ │ │ │ └── system.toml
│ │ │ └── prometheus.toml
│ │ ├── docker-compose.yaml
│ │ ├── initsql_for_postgres/
│ │ │ ├── a-n9e-for-Postgres.sql
│ │ │ └── b-ibex-for-Postgres.sql
│ │ ├── n9eetc_pg/
│ │ │ ├── config.toml
│ │ │ └── metrics.yaml
│ │ └── prometc_vm/
│ │ ├── prometheus.yml
│ │ └── targets.json
│ ├── initsql/
│ │ ├── a-n9e.sql
│ │ └── c-init.sql
│ ├── migratesql/
│ │ └── migrate.sql
│ └── sqlite.sql
├── dscache/
│ ├── cache.go
│ └── sync.go
├── dskit/
│ ├── clickhouse/
│ │ ├── clickhouse.go
│ │ ├── clickhouse_test.go
│ │ └── timeseries.go
│ ├── doris/
│ │ ├── doris.go
│ │ ├── logs.go
│ │ ├── sql_analyzer.go
│ │ ├── sql_analyzer_test.go
│ │ ├── template.md
│ │ └── timeseries.go
│ ├── mysql/
│ │ ├── mysql.go
│ │ ├── mysql_test.go
│ │ ├── timeseries.go
│ │ └── timeseries_test.go
│ ├── pool/
│ │ └── pool.go
│ ├── postgres/
│ │ ├── postgres.go
│ │ └── timeseries.go
│ ├── sqlbase/
│ │ ├── base.go
│ │ ├── timeseries.go
│ │ └── timeseries_test.go
│ ├── tdengine/
│ │ └── tdengine.go
│ ├── types/
│ │ ├── timeseries.go
│ │ └── types.go
│ └── victorialogs/
│ ├── victorialogs.go
│ └── victorialogs_test.go
├── dumper/
│ ├── dumper.go
│ └── sync.go
├── etc/
│ ├── config.toml
│ ├── edge/
│ │ └── edge.toml
│ ├── metrics.yaml
│ └── script/
│ ├── notify.bak.py
│ ├── notify.py
│ ├── notify_feishu.py
│ └── rule_converter.py
├── fe.sh
├── go.mod
├── go.sum
├── integrations/
│ ├── AMD_ROCm_SMI/
│ │ ├── collect/
│ │ │ └── amd_rocm_smi/
│ │ │ └── rocm.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── AliYun/
│ │ ├── collect/
│ │ │ └── aliyun/
│ │ │ └── cloud.toml
│ │ ├── dashboards/
│ │ │ ├── arms-api.json
│ │ │ ├── arms-application.json
│ │ │ ├── arms-db.json
│ │ │ ├── arms-jvm-service.json
│ │ │ ├── arms-machine.json
│ │ │ ├── arms_jvm.json
│ │ │ ├── cdn.json
│ │ │ ├── ecs.json
│ │ │ ├── mongodb.json
│ │ │ ├── mse.json
│ │ │ ├── mysql.json
│ │ │ ├── nat.json
│ │ │ ├── oss.json
│ │ │ ├── polardb_mysql.json
│ │ │ ├── rds.json
│ │ │ ├── rds_new.json
│ │ │ ├── redis.json
│ │ │ ├── redis_cluster.json
│ │ │ ├── redis_new.json
│ │ │ ├── redis_standard.json
│ │ │ ├── slb.json
│ │ │ ├── slb_new.json
│ │ │ └── waf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── AppDynamics/
│ │ ├── collect/
│ │ │ └── appdynamics/
│ │ │ └── app.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── AutoMQ/
│ │ ├── alerts/
│ │ │ └── 常用告警规则.json
│ │ ├── collect/
│ │ │ └── prometheus/
│ │ │ └── 采集OTEL-COLLECTOR的样例.toml
│ │ ├── dashboards/
│ │ │ ├── broker_metrics.json
│ │ │ ├── cluster_overview.json
│ │ │ ├── detailed_metrics.json
│ │ │ ├── group_metrics.json
│ │ │ └── topic_metrics.json
│ │ ├── markdown/
│ │ │ └── overview.md
│ │ └── metrics/
│ │ └── exporter.json
│ ├── Bind/
│ │ ├── collect/
│ │ │ └── bind/
│ │ │ └── bind.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Canal/
│ │ ├── dashboards/
│ │ │ └── canal_by_categraf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Ceph/
│ │ ├── alerts/
│ │ │ └── ceph_by_categraf.json
│ │ ├── dashboards/
│ │ │ └── ceph_by_categraf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── ClickHouse/
│ │ ├── alerts/
│ │ │ ├── clickhouse_by_categraf.json
│ │ │ └── clickhouse_by_exporter.json
│ │ ├── collect/
│ │ │ └── clickhouse/
│ │ │ └── clickhouse.toml
│ │ ├── dashboards/
│ │ │ ├── clickhouse_by_categraf.json
│ │ │ └── clickhouse_by_exporter.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ ├── clickhouse_by_categraf.json
│ │ └── clickhouse_by_exporter.json
│ ├── CloudWatch/
│ │ ├── collect/
│ │ │ └── cloudwatch/
│ │ │ └── cloud.toml
│ │ ├── dashboards/
│ │ │ └── dashboard-by-aws-rds.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Consul/
│ │ ├── collect/
│ │ │ └── consul/
│ │ │ └── consul.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Dns_Query/
│ │ ├── collect/
│ │ │ └── dns_query/
│ │ │ └── dns_query.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Docker/
│ │ ├── collect/
│ │ │ └── docker/
│ │ │ └── docker.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Doris/
│ │ ├── alerts/
│ │ │ └── doris_by_categraf.json
│ │ ├── collect/
│ │ │ └── prometheus/
│ │ │ └── collect_doris_examples.toml
│ │ ├── dashboards/
│ │ │ └── Doris_Overview.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Elasticsearch/
│ │ ├── alerts/
│ │ │ ├── elasticsearch_by_categraf.json
│ │ │ └── elasticsearch_by_exporter.json
│ │ ├── collect/
│ │ │ └── elasticsearch/
│ │ │ └── elasticsearch.toml
│ │ ├── dashboards/
│ │ │ ├── elasticsearch_by_categraf.json
│ │ │ ├── elasticsearch_by_categraf_0.3.102.json
│ │ │ ├── elasticsearch_by_categraf_a.json
│ │ │ ├── elasticsearch_by_categraf_b.json
│ │ │ └── elasticsearch_by_exporter.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ └── categraf-base.json
│ ├── Exec/
│ │ ├── collect/
│ │ │ └── exec/
│ │ │ └── exec.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Filecount/
│ │ ├── collect/
│ │ │ └── filecount/
│ │ │ └── filecount.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Gitlab/
│ │ ├── alerts/
│ │ │ └── gitlab_by_categraf.json
│ │ ├── dashboards/
│ │ │ ├── MachinePerformance.json
│ │ │ ├── NGINXVTS.json
│ │ │ ├── Overview.json
│ │ │ ├── PostgreSQL.json
│ │ │ └── Redis.json
│ │ └── markdown/
│ │ └── README.md
│ ├── GoogleCloud/
│ │ ├── collect/
│ │ │ └── googlecloud/
│ │ │ └── gcp.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── HAProxy/
│ │ ├── collect/
│ │ │ └── haproxy/
│ │ │ └── haproxy.toml
│ │ ├── dashboards/
│ │ │ └── dashboard.json
│ │ └── markdown/
│ │ └── README.md
│ ├── HTTP_Response/
│ │ ├── alerts/
│ │ │ └── http_response_by_categraf.json
│ │ ├── collect/
│ │ │ └── http_response/
│ │ │ └── http_response.toml
│ │ ├── dashboards/
│ │ │ └── http_response_by_categraf.json
│ │ ├── markdown/
│ │ │ └── http.md
│ │ └── metrics/
│ │ └── categraf.json
│ ├── IPMI/
│ │ ├── alerts/
│ │ │ └── alerts.json
│ │ ├── collect/
│ │ │ └── ipmi/
│ │ │ └── conf.toml
│ │ ├── dashboards/
│ │ │ ├── IPMI.json
│ │ │ ├── IPMI_by_categraf.json
│ │ │ └── IPMI_by_prometheus.json
│ │ └── markdown/
│ │ └── README.md
│ ├── IPVS/
│ │ ├── collect/
│ │ │ └── ipvs/
│ │ │ └── ipvs.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Java/
│ │ └── dashboards/
│ │ ├── jmx_by_exporter.json
│ │ ├── jmx_by_kubernetes.json
│ │ └── jvm_by_opentelementry.json
│ ├── Jenkins/
│ │ ├── collect/
│ │ │ └── jenkins/
│ │ │ └── jenkins.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Jolokia_Agent/
│ │ ├── collect/
│ │ │ └── jolokia_agent/
│ │ │ ├── activemq.toml
│ │ │ ├── bitbucket.toml
│ │ │ ├── cassandra.toml
│ │ │ ├── hadoop-hdfs.toml
│ │ │ ├── java.toml
│ │ │ ├── jboss.toml
│ │ │ ├── kafka-connect.toml
│ │ │ ├── kafka.toml
│ │ │ ├── tomcat.toml
│ │ │ ├── weblogic.toml
│ │ │ └── zookeeper.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Kafka/
│ │ ├── alerts/
│ │ │ ├── kafka_by_categraf.json
│ │ │ └── kafka_by_exporter.json
│ │ ├── collect/
│ │ │ └── kafka/
│ │ │ └── kafka.toml
│ │ ├── dashboards/
│ │ │ ├── kafka_by_categraf.json
│ │ │ └── kafka_by_exporter.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ └── categraf-base.json
│ ├── Kubernetes/
│ │ ├── alerts/
│ │ │ ├── apiserver.json
│ │ │ ├── kube-controller-plane.json
│ │ │ ├── kubelet.json
│ │ │ ├── node-exporter.json
│ │ │ ├── prometheus-operator.json
│ │ │ └── prometheus.json
│ │ ├── dashboards/
│ │ │ ├── APIServer.json
│ │ │ ├── ControllerManager.json
│ │ │ ├── DeploymentContainer.json
│ │ │ ├── KubeStateMetrics.json
│ │ │ ├── KubeletMetrics.json
│ │ │ ├── Pod.json
│ │ │ ├── Scheduler.json
│ │ │ └── StatefulsetContainer.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ ├── metrics/
│ │ │ ├── k8s-node.json
│ │ │ └── k8s-pod.json
│ │ └── record-rules/
│ │ ├── kube-controller-plane.json
│ │ └── node-exporter.json
│ ├── Ldap/
│ │ ├── collect/
│ │ │ └── ldap/
│ │ │ └── ldap.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Linux/
│ │ ├── alerts/
│ │ │ ├── CommonAlertingRules-Categraf.json
│ │ │ ├── linux_by_categraf.json
│ │ │ ├── linux_by_exporter.json
│ │ │ ├── linux_by_telegraf.json
│ │ │ └── 常用中文告警规则-采集器Categraf.json
│ │ ├── collect/
│ │ │ ├── arp_packet/
│ │ │ │ └── arp_packet.toml
│ │ │ ├── kernel_vmstat/
│ │ │ │ └── kernel_vmstat.toml
│ │ │ ├── netstat/
│ │ │ │ └── netstat.toml
│ │ │ ├── ntp/
│ │ │ │ └── ntp.toml
│ │ │ └── processes/
│ │ │ └── processes.toml
│ │ ├── dashboards/
│ │ │ ├── categraf-detail.json
│ │ │ ├── categraf-overview.json
│ │ │ ├── categraf-processes.json
│ │ │ ├── categraf-table-ng.json
│ │ │ └── exporter-detail.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ ├── categraf-base.json
│ │ └── exporter-base.json
│ ├── Logstash/
│ │ ├── collect/
│ │ │ └── logstash/
│ │ │ └── logstash.toml
│ │ ├── dashboards/
│ │ │ └── logstash-dash.json
│ │ └── markdown/
│ │ └── README.md
│ ├── MinIO/
│ │ ├── alerts/
│ │ │ └── minio_by_categraf.json
│ │ ├── dashboards/
│ │ │ ├── minio_by_categraf.json
│ │ │ └── new-version.json
│ │ └── markdown/
│ │ └── README.md
│ ├── MongoDB/
│ │ ├── alerts/
│ │ │ └── mongo_by_exporter.json
│ │ ├── collect/
│ │ │ └── mongodb/
│ │ │ └── mongodb.toml
│ │ ├── dashboards/
│ │ │ └── mongo_by_exporter.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Mtail/
│ │ ├── collect/
│ │ │ └── mtail/
│ │ │ └── mtail.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── MySQL/
│ │ ├── alerts/
│ │ │ ├── mysql_by_categraf.json
│ │ │ └── mysql_by_exporter.json
│ │ ├── collect/
│ │ │ └── mysql/
│ │ │ └── mysql.toml
│ │ ├── dashboards/
│ │ │ ├── MySQL-by-address.json
│ │ │ ├── MySQL仪表盘-远端.json
│ │ │ ├── MySQL仪表盘.json
│ │ │ ├── mysql_by_categraf.json
│ │ │ ├── mysql_by_categraf_ident.json
│ │ │ ├── mysql_by_categraf_instance.json
│ │ │ └── mysql_by_exporter.json
│ │ ├── markdown/
│ │ │ ├── README.md
│ │ │ └── mysql.md
│ │ └── metrics/
│ │ └── categraf-base.json
│ ├── N9E/
│ │ ├── dashboards/
│ │ │ ├── n9e_server.json
│ │ │ ├── n9e_v6.json
│ │ │ └── n9e_v8.json
│ │ └── markdown/
│ │ └── README.md
│ ├── NFSClient/
│ │ ├── collect/
│ │ │ └── nfsclient/
│ │ │ └── nfsclient.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── NSQ/
│ │ ├── collect/
│ │ │ └── nsq/
│ │ │ └── nsq.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── NVIDIA/
│ │ ├── collect/
│ │ │ └── nvidia_smi/
│ │ │ └── nvidia_smi.toml
│ │ ├── dashboards/
│ │ │ └── nvidia-gpu-metrics-by-categraf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Net_Response/
│ │ ├── alerts/
│ │ │ └── net_response_by_categraf.json
│ │ ├── collect/
│ │ │ └── net_response/
│ │ │ └── net_response.toml
│ │ ├── dashboards/
│ │ │ ├── dashboard-by-ziv.json
│ │ │ └── net_response_by_categraf.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ └── categraf.json
│ ├── Netstat_Filter/
│ │ ├── collect/
│ │ │ └── netstat_filter/
│ │ │ └── netstat_filter.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Nginx/
│ │ ├── collect/
│ │ │ ├── nginx/
│ │ │ │ └── nginx.toml
│ │ │ └── nginx_upstream_check/
│ │ │ └── nginx_upstream_check.toml
│ │ ├── dashboards/
│ │ │ ├── nginx_stub_status.json
│ │ │ ├── nginx_upstream_check.json
│ │ │ └── nginx_vts.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ └── categraf.json
│ ├── Oracle/
│ │ ├── alerts/
│ │ │ └── oracle_alert.json
│ │ ├── collect/
│ │ │ └── oracle/
│ │ │ └── oracle.toml
│ │ ├── dashboards/
│ │ │ └── oracle_by_categraf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── PHP/
│ │ ├── collect/
│ │ │ └── phpfpm/
│ │ │ └── phpfpm.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Ping/
│ │ ├── alerts/
│ │ │ └── ping_by_categraf.json
│ │ ├── collect/
│ │ │ └── ping/
│ │ │ └── ping.toml
│ │ ├── dashboards/
│ │ │ ├── ping_by_categraf_a.json
│ │ │ └── ping_by_categraf_b.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ └── categraf.json
│ ├── PostgreSQL/
│ │ ├── alerts/
│ │ │ └── postgresql_by_categraf.json
│ │ ├── collect/
│ │ │ └── postgresql/
│ │ │ └── postgresql.toml
│ │ ├── dashboards/
│ │ │ └── postgresql_by_categraf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Procstat/
│ │ ├── alerts/
│ │ │ └── categraf-procstat.json
│ │ ├── collect/
│ │ │ └── procstat/
│ │ │ └── procstat.toml
│ │ ├── dashboards/
│ │ │ └── categraf-procstat.json
│ │ ├── markdown/
│ │ │ └── readme.md
│ │ └── metrics/
│ │ └── categraf.json
│ ├── Prometheus/
│ │ ├── collect/
│ │ │ └── prometheus/
│ │ │ └── prometheus.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── RabbitMQ/
│ │ ├── alerts/
│ │ │ └── alerts.json
│ │ ├── collect/
│ │ │ └── rabbitmq/
│ │ │ └── rabbitmq.toml
│ │ ├── dashboards/
│ │ │ ├── rabbitmq_CN_v3.8_gt.json
│ │ │ ├── rabbitmq_by_categraf.json
│ │ │ ├── rabbitmq_v3.8_gt.json
│ │ │ └── rabbitmq_v3.8_lt.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Redis/
│ │ ├── alerts/
│ │ │ ├── redis_by_categraf.json
│ │ │ └── redis_by_exporter.json
│ │ ├── collect/
│ │ │ ├── redis/
│ │ │ │ └── redis.toml
│ │ │ └── redis_sentinel/
│ │ │ └── redis_sentinel.toml
│ │ ├── dashboards/
│ │ │ ├── FilterByAddress.json
│ │ │ ├── redis_by_categraf.json
│ │ │ └── redis_by_exporter.json
│ │ └── markdown/
│ │ └── README.md
│ ├── SMART/
│ │ ├── collect/
│ │ │ └── smart/
│ │ │ └── smart.toml
│ │ ├── dashboards/
│ │ │ └── smart.json
│ │ └── markdown/
│ │ └── README.md
│ ├── SNMP/
│ │ ├── collect/
│ │ │ └── snmp/
│ │ │ ├── Cisco.toml
│ │ │ ├── snmp.toml
│ │ │ └── snmp.toml.example
│ │ ├── dashboards/
│ │ │ ├── dashboards.json
│ │ │ ├── switch branch.json
│ │ │ └── switch main.json
│ │ └── markdown/
│ │ └── README.md
│ ├── SQLServer/
│ │ ├── collect/
│ │ │ └── sqlserver/
│ │ │ └── sqlserver.toml
│ │ ├── dashboards/
│ │ │ └── sqlserver.json
│ │ └── markdown/
│ │ └── README.md
│ ├── SpringBoot/
│ │ ├── alerts/
│ │ │ └── alerts.json
│ │ ├── dashboards/
│ │ │ ├── JVM(Actuator)withapplicationname.json
│ │ │ └── JVM.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Switch_Legacy/
│ │ ├── collect/
│ │ │ └── switch_legacy/
│ │ │ └── switch_legacy.toml
│ │ ├── dashboards/
│ │ │ └── dashboard.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Systemd/
│ │ ├── collect/
│ │ │ └── systemd/
│ │ │ └── systemd.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── TDEngine/
│ │ ├── dashboards/
│ │ │ └── tasokeeper3.x.json
│ │ └── markdown/
│ │ └── README.md
│ ├── TiDB/
│ │ ├── alerts/
│ │ │ └── tidb-alerts.json
│ │ └── dashboards/
│ │ └── tidb-dashboard.json
│ ├── Tomcat/
│ │ ├── collect/
│ │ │ └── tomcat/
│ │ │ └── tomcat.toml
│ │ ├── dashboards/
│ │ │ └── tomcat_by_categraf.json
│ │ └── markdown/
│ │ └── README.md
│ ├── VictoriaMetrics/
│ │ ├── alerts/
│ │ │ └── alerts.json
│ │ ├── dashboards/
│ │ │ ├── victoriametrics-cluster.json
│ │ │ └── victoriametrics-single.json
│ │ └── markdown/
│ │ └── README.md
│ ├── Whois/
│ │ ├── collect/
│ │ │ └── whois/
│ │ │ └── whois.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── Windows/
│ │ ├── alerts/
│ │ │ ├── windows_by_categraf.json
│ │ │ └── windows_by_exporter.json
│ │ ├── dashboards/
│ │ │ ├── windows_by_categraf.json
│ │ │ └── windows_by_exporter.json
│ │ └── markdown/
│ │ └── README.md
│ ├── XSKYApi/
│ │ ├── collect/
│ │ │ └── xskyapi/
│ │ │ └── xskyapi.toml
│ │ └── markdown/
│ │ └── README.md
│ ├── ZooKeeper/
│ │ ├── alerts/
│ │ │ └── zookeeper_by_exporter.json
│ │ ├── collect/
│ │ │ └── zookeeper/
│ │ │ └── zookeeper.toml
│ │ ├── dashboards/
│ │ │ └── zookeeper_by_exporter.json
│ │ └── markdown/
│ │ └── README.md
│ ├── cAdvisor/
│ │ ├── collect/
│ │ │ └── cadvisor/
│ │ │ └── cadvisor.toml
│ │ ├── dashboards/
│ │ │ └── dashboard.json
│ │ ├── markdown/
│ │ │ └── README.md
│ │ └── metrics/
│ │ └── exporter-base.json
│ └── vSphere/
│ ├── alerts/
│ │ └── alerts.json
│ ├── collect/
│ │ └── vsphere/
│ │ └── vsphere.toml
│ ├── dashboards/
│ │ ├── vmware_by_vsphere-monitor.json
│ │ └── vsphere.json
│ └── markdown/
│ └── README.md
├── memsto/
│ ├── alert_mute_cache.go
│ ├── alert_rule_cache.go
│ ├── alert_subscribe_cache.go
│ ├── busi_group_cache.go
│ ├── config_cache.go
│ ├── config_cval_cache.go
│ ├── datasource_cache.go
│ ├── drop_ident.go
│ ├── es_index_pattern.go
│ ├── event_processor_cache.go
│ ├── host_alert_rule_targets.go
│ ├── memsto.go
│ ├── message_template_cache.go
│ ├── notify_channel_cache.go
│ ├── notify_config.go
│ ├── notify_rule_cache.go
│ ├── recording_rule_cache.go
│ ├── stat.go
│ ├── target_cache.go
│ ├── task_tpl_cache.go
│ ├── user_cache.go
│ ├── user_group_cache.go
│ └── user_token_cache.go
├── models/
│ ├── alert_aggr_view.go
│ ├── alert_cur_event.go
│ ├── alert_his_event.go
│ ├── alert_mute.go
│ ├── alert_rule.go
│ ├── alert_subscribe.go
│ ├── alerting_engine.go
│ ├── anomaly_point.go
│ ├── board.go
│ ├── board_busi.go
│ ├── board_payload.go
│ ├── builtin_cate.go
│ ├── builtin_component.go
│ ├── builtin_metrics.go
│ ├── builtin_metrics_filter.go
│ ├── builtin_payload.go
│ ├── busi_group.go
│ ├── busi_group_member.go
│ ├── chart.go
│ ├── chart_group.go
│ ├── chart_share.go
│ ├── common.go
│ ├── configs.go
│ ├── dash_annotation.go
│ ├── dashboard.go
│ ├── datasource.go
│ ├── embedded_product.go
│ ├── es_index_pattern.go
│ ├── event_pipeline.go
│ ├── event_pipeline_execution.go
│ ├── event_processor.go
│ ├── host_meta.go
│ ├── message_tpl.go
│ ├── metric_view.go
│ ├── migrate/
│ │ ├── migrate.go
│ │ ├── migrate_es_index_pattern.go
│ │ └── migrate_test.go
│ ├── notification_record.go
│ ├── notify_channel.go
│ ├── notify_channel_test.go
│ ├── notify_config.go
│ ├── notify_rule.go
│ ├── notify_tpl.go
│ ├── prom_alert_rule.go
│ ├── prom_alert_rule_test.go
│ ├── recording_rule.go
│ ├── role.go
│ ├── role_operation.go
│ ├── saved_view.go
│ ├── source_token.go
│ ├── sso_config.go
│ ├── target.go
│ ├── target_busi_group.go
│ ├── task_record.go
│ ├── task_tpl.go
│ ├── ts.go
│ ├── user.go
│ ├── user_group.go
│ ├── user_group_member.go
│ ├── user_token.go
│ └── workflow.go
├── pkg/
│ ├── aop/
│ │ ├── log.go
│ │ └── rec.go
│ ├── cas/
│ │ └── cas.go
│ ├── cfg/
│ │ ├── cfg.go
│ │ └── scan.go
│ ├── choice/
│ │ └── choice.go
│ ├── cmdx/
│ │ ├── cmd_notwindows.go
│ │ ├── cmd_windows.go
│ │ └── cmdx.go
│ ├── ctx/
│ │ └── ctx.go
│ ├── dingtalk/
│ │ ├── dingtalk.go
│ │ └── user/
│ │ └── client.go
│ ├── fasttime/
│ │ └── fasttime.go
│ ├── feishu/
│ │ └── feishu.go
│ ├── flashduty/
│ │ ├── post.go
│ │ ├── sync_user.go
│ │ ├── sync_user_group.go
│ │ └── sync_user_test.go
│ ├── ginx/
│ │ ├── auth.go
│ │ ├── bytesconv.go
│ │ ├── errorx.go
│ │ ├── funcs.go
│ │ ├── param.go
│ │ └── render.go
│ ├── hash/
│ │ ├── hash.go
│ │ ├── hash_fnv.go
│ │ └── hash_md5.go
│ ├── httpx/
│ │ └── httpx.go
│ ├── i18nx/
│ │ ├── i18n.go
│ │ └── var.go
│ ├── ibex/
│ │ └── ibex.go
│ ├── ldapx/
│ │ ├── ldapx.go
│ │ └── user_sync.go
│ ├── loggrep/
│ │ └── loggrep.go
│ ├── logx/
│ │ └── logx.go
│ ├── macros/
│ │ └── macros.go
│ ├── oauth2x/
│ │ └── oauth2x.go
│ ├── oidcx/
│ │ └── oidc.go
│ ├── ormx/
│ │ ├── database_init.go
│ │ ├── database_init_test.go
│ │ ├── ormx.go
│ │ └── types.go
│ ├── osx/
│ │ └── osx.go
│ ├── parser/
│ │ ├── calc.go
│ │ └── calc_test.go
│ ├── poster/
│ │ ├── post.go
│ │ └── post_test.go
│ ├── prom/
│ │ ├── client_option.go
│ │ ├── conv.go
│ │ ├── conv_test.go
│ │ ├── reader.go
│ │ └── writer.go
│ ├── promql/
│ │ ├── parser.go
│ │ ├── perser_test.go
│ │ └── promql.go
│ ├── secu/
│ │ ├── aes.go
│ │ └── rsa.go
│ ├── slice/
│ │ └── contains.go
│ ├── strx/
│ │ └── verify.go
│ ├── tlsx/
│ │ ├── common.go
│ │ └── config.go
│ ├── tplx/
│ │ ├── conv.go
│ │ ├── fns.go
│ │ ├── tpl_test.go
│ │ └── tplx.go
│ ├── unit/
│ │ ├── unit_convert.go
│ │ └── unit_convert_test.go
│ └── version/
│ └── version.go
├── prom/
│ ├── client.go
│ ├── option.go
│ └── reader.go
├── pushgw/
│ ├── idents/
│ │ └── idents.go
│ ├── kafka/
│ │ └── producer.go
│ ├── pconf/
│ │ └── conf.go
│ ├── pstat/
│ │ └── pstat.go
│ ├── pushgw.go
│ ├── router/
│ │ ├── fns.go
│ │ ├── router.go
│ │ ├── router_datadog.go
│ │ ├── router_datadog_easyjson.go
│ │ ├── router_heartbeat.go
│ │ ├── router_openfalcon.go
│ │ ├── router_openfalcon_easyjson.go
│ │ ├── router_opentsdb.go
│ │ ├── router_opentsdb_easyjson.go
│ │ ├── router_proxy_remotewrite.go
│ │ ├── router_remotewrite.go
│ │ ├── router_target.go
│ │ └── vars.go
│ └── writer/
│ ├── kafka_writer.go
│ ├── queue.go
│ ├── relabel.go
│ ├── relabel_test.go
│ └── writer.go
└── storage/
├── redis.go
├── redis_test.go
└── storage.go
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
*.css linguist-language=go
*.less linguist-language=go
*.js linguist-language=go
*.tsx linguist-language=go
*.html linguist-language=go
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
- name: Nightingale docs
url: https://n9e.github.io/
about: You may want to read through the documentation before asking questions.
================================================
FILE: .github/ISSUE_TEMPLATE/enhancement.md
================================================
---
name: Enhancement Request
about: Suggest an enhancement to the nightingale project
labels: kind/feature
---
**What would you like to be added**:
**Why is this needed**:
================================================
FILE: .github/ISSUE_TEMPLATE/question.yml
================================================
name: Bug Report & Usage Question
description: Report a bug or ask a question about how to use Nightingale
labels: []
body:
- type: markdown
attributes:
value: |
The more detail you provide in this form, the easier it will be to solve the problem.
提供的信息越详细,问题解决的可能性就越大。另外, 提问之前请先搜索历史 issue (包括 close 的), 以免重复提问。
- type: textarea
id: question
attributes:
label: Question and Steps to reproduce
description: Describe your question and steps to reproduce the bug. 描述问题以及复现步骤
validations:
required: true
- type: textarea
id: logs
attributes:
label: Relevant logs and configurations
description: Relevant logs and configurations. 报错日志([查看方法](https://flashcat.cloud/docs/content/flashcat-monitor/nightingale-v6/faq/how-to-check-logs/))以及各个相关组件的配置信息
render: text
validations:
required: true
- type: textarea
id: system-info
attributes:
label: Version
description: Include the Nightingale version, operating system version, CPU architecture, and other relevant details. 请告知夜莺的版本、操作系统的版本、CPU架构等信息
validations:
required: true
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
**What type of PR is this?**
**What this PR does / why we need it**:
**Which issue(s) this PR fixes**:
Fixes #
**Special notes for your reviewer**:
================================================
FILE: .github/workflows/issue-translator.yml
================================================
name: 'Issue Translator'
on:
issues:
types: [opened]
jobs:
translate:
runs-on: ubuntu-latest
permissions:
issues: write
contents: read
steps:
- name: Translate Issues
uses: usthe/issues-translate-action@v2.7
with:
# Whether to also translate the issue title
IS_MODIFY_TITLE: true
# GitHub Token
BOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Custom note appended to the translation (optional)
# CUSTOM_BOT_NOTE: "Translation by bot"
================================================
FILE: .github/workflows/n9e.yml
================================================
name: Release
on:
push:
tags:
- 'v*'
env:
GO_VERSION: 1.23
jobs:
goreleaser:
runs-on: ubuntu-latest
steps:
- name: Checkout Source Code
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Setup Go Environment
uses: actions/setup-go@v3
with:
go-version: ${{ env.GO_VERSION }}
- uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Run GoReleaser
uses: goreleaser/goreleaser-action@v3
with:
distribution: goreleaser
version: '~> v1'
args: release --rm-dist
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .gitignore
================================================
*.exe
*.exe~
*.dll
*.dylib
*.test
*.out
*.prof
*.log
*.o
*.a
*.so
*.db
*.sw[po]
*.tar.gz
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
_obj
_test
/log*
/bin
/out
/build
/dist
/etc/*.local.yml
/etc/*.local.conf
/etc/rsa/*
/etc/plugins/*.local.yml
/etc/script/rules.yaml
/etc/script/alert-rules.json
/etc/script/record-rules.json
/data*
/tarball
/run
/vendor
/tmp
/pub
/n9e
/docker/pub
/docker/n9e
/docker/compose-bridge/mysqldata
/docker/compose-host-network/mysqldata
/docker/compose-host-network-metric-log/mysqldata
/docker/compose-host-network-metric-log/n9e-logs
/docker/compose-postgres/pgdata
/etc.local*
/front/statik/statik.go
/docker/compose-bridge/etc-nightingale/rsa/
.alerts
.idea
.index
.vscode
.issue
.issue/*
.cursor
.claude
.DS_Store
.cache-loader
.payload
queries.active
/n9e-*
n9e.sql
!/datasource
.env.json
================================================
FILE: .goreleaser.yaml
================================================
before:
hooks:
# You may remove this if you don't use go modules.
- go mod tidy
- go install github.com/rakyll/statik
snapshot:
name_template: '{{ .Tag }}'
checksum:
name_template: 'checksums.txt'
changelog:
skip: true
builds:
- id: build
hooks:
pre:
- cmd: sh -x ./fe.sh
output: true
main: ./cmd/center/
binary: n9e
env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
- arm64
ldflags:
- -s -w
- -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}}
- id: build-cli
main: ./cmd/cli/
binary: n9e-cli
env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
- arm64
ldflags:
- -s -w
- -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}}
- id: build-edge
main: ./cmd/edge/
binary: n9e-edge
env:
- CGO_ENABLED=0
goos:
- linux
goarch:
- amd64
- arm64
ldflags:
- -s -w
- -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}}
archives:
- id: n9e
builds:
- build
- build-cli
- build-edge
format: tar.gz
format_overrides:
- goos: windows
format: zip
name_template: "n9e-v{{ .Version }}-{{ .Os }}-{{ .Arch }}"
wrap_in_directory: false
files:
- docker/*
- etc/*
- integrations/*
- cli/*
- n9e.sql
release:
github:
owner: ccfos
name: nightingale
name_template: "v{{ .Version }}"
dockers:
- image_templates:
- flashcatcloud/nightingale:{{ .Version }}-amd64
goos: linux
goarch: amd64
ids:
- build
dockerfile: docker/Dockerfile.goreleaser
extra_files:
- etc
- integrations
use: buildx
build_flag_templates:
- "--platform=linux/amd64"
- image_templates:
- flashcatcloud/nightingale:{{ .Version }}-arm64v8
goos: linux
goarch: arm64
ids:
- build
dockerfile: docker/Dockerfile.goreleaser.arm64
extra_files:
- etc
- integrations
use: buildx
build_flag_templates:
- "--platform=linux/arm64/v8"
docker_manifests:
- name_template: flashcatcloud/nightingale:{{ .Version }}
image_templates:
- flashcatcloud/nightingale:{{ .Version }}-amd64
- flashcatcloud/nightingale:{{ .Version }}-arm64v8
- name_template: flashcatcloud/nightingale:latest
image_templates:
- flashcatcloud/nightingale:{{ .Version }}-amd64
- flashcatcloud/nightingale:{{ .Version }}-arm64v8
================================================
FILE: .typos.toml
================================================
# Configuration for typos tool
[files]
extend-exclude = [
# Ignore auto-generated easyjson files
"*_easyjson.go",
# Ignore binary files
"*.gz",
"*.tar",
"n9e",
"n9e-*"
]
[default.extend-identifiers]
# Didi is a company name (DiDi), not a typo
Didi = "Didi"
# datas is intentionally used as plural of data (slice variable)
datas = "datas"
# pendings is intentionally used as plural
pendings = "pendings"
pendingsUseByRecover = "pendingsUseByRecover"
pendingsUseByRecoverMap = "pendingsUseByRecoverMap"
# typs is intentionally used as shorthand for types (parameter name)
typs = "typs"
[default.extend-words]
# Some false positives
ba = "ba"
# Specific corrections for ambiguous typos
contigious = "contiguous"
onw = "own"
componet = "component"
Patten = "Pattern"
Requets = "Requests"
Mis = "Miss"
exporer = "exporter"
soruce = "source"
verison = "version"
Configations = "Configurations"
emmited = "emitted"
Utlization = "Utilization"
serie = "series"
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright CCF ODC.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: Makefile
================================================
.PHONY: prebuild build
ROOT:=$(shell pwd -P)
GIT_COMMIT:=$(shell git --work-tree ${ROOT} rev-parse 'HEAD^{commit}')
_GIT_VERSION:=$(shell git --work-tree ${ROOT} describe --tags --abbrev=14 "${GIT_COMMIT}^{commit}" 2>/dev/null)
TAG=$(shell echo "${_GIT_VERSION}" | awk -F"-" '{print $$1}')
RELEASE_VERSION:="$(TAG)-$(GIT_COMMIT)"
all: prebuild build
prebuild:
echo "begin download and embed the front-end file..."
sh fe.sh
echo "front-end file download and embedding completed."
build:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e ./cmd/center/main.go
build-edge:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-edge ./cmd/edge/
build-alert:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-alert ./cmd/alert/main.go
build-pushgw:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-pushgw ./cmd/pushgw/main.go
build-cli:
go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-cli ./cmd/cli/main.go
run:
nohup ./n9e > n9e.log 2>&1 &
run-alert:
nohup ./n9e-alert > n9e-alert.log 2>&1 &
run-pushgw:
nohup ./n9e-pushgw > n9e-pushgw.log 2>&1 &
release:
goreleaser --skip-validate --skip-publish --snapshot
================================================
FILE: README.md
================================================
Open-Source Alerting Expert
[English](./README.md) | [中文](./README_zh.md)
## 🎯 What is Nightingale
Nightingale is an open-source monitoring project that focuses on alerting. Similar to Grafana, Nightingale also connects with various existing data sources. However, while Grafana emphasizes visualization, Nightingale places greater emphasis on the alerting engine, as well as the processing and distribution of alarms.
> 💡 Nightingale has now officially launched the [MCP-Server](https://github.com/n9e/n9e-mcp-server/). This MCP Server enables AI assistants to interact with the Nightingale API using natural language, facilitating alert management, monitoring, and observability tasks.
>
> The Nightingale project was initially developed and open-sourced by DiDi.inc. On May 11, 2022, it was donated to the Open Source Development Committee of the China Computer Federation (CCF ODTC).

## 💡 How Nightingale Works
Many users have already collected metrics and log data. In this case, you can connect your storage repositories (such as VictoriaMetrics, ElasticSearch, etc.) as data sources in Nightingale. This allows you to configure alerting rules and notification rules within Nightingale, enabling the generation and distribution of alarms.

Nightingale itself does not provide monitoring data collection capabilities. We recommend using [Categraf](https://github.com/flashcatcloud/categraf) as the collector, which integrates seamlessly with Nightingale.
[Categraf](https://github.com/flashcatcloud/categraf) can collect monitoring data from operating systems, network devices, various middleware, and databases. It pushes this data to Nightingale via the `Prometheus Remote Write` protocol. Nightingale then stores the monitoring data in a time-series database (such as Prometheus, VictoriaMetrics, etc.) and provides alerting and visualization capabilities.
For certain edge data centers with poor network connectivity to the central Nightingale server, we offer a distributed deployment mode for the alerting engine. In this mode, even if the network is disconnected, the alerting functionality remains unaffected.

> In the above diagram, Data Center A has a good network with the central data center, so it uses the Nightingale process in the central data center as the alerting engine. Data Center B has a poor network with the central data center, so it deploys `n9e-edge` as the alerting engine to handle alerting for its own data sources.
## 🔕 Alert Noise Reduction, Escalation, and Collaboration
Nightingale focuses on being an alerting engine, responsible for generating alarms and flexibly distributing them based on rules. It supports 20 built-in notification media (such as phone calls, SMS, email, DingTalk, and Slack).
If you have more advanced requirements, such as:
- Want to consolidate events from multiple monitoring systems into one platform for unified noise reduction, response handling, and data analysis.
- Want to support personnel scheduling, practice on-call culture, and support alert escalation (to avoid missing alerts) and collaborative handling.
Then Nightingale is not suitable. It is recommended that you choose on-call products such as PagerDuty and FlashDuty. These products are simple and easy to use.
## 🗨️ Communication Channels
- **Report Bugs:** It is highly recommended to submit issues via the [Nightingale GitHub Issue tracker](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&projects=&template=bug_report.yml).
- **Documentation:** For more information, we recommend thoroughly browsing the [Nightingale Documentation Site](https://n9e.github.io/).
## 🔑 Key Features

- Nightingale supports alerting rules, mute rules, subscription rules, and notification rules. It natively supports 20 types of notification media and allows customization of message templates.
- It supports event pipelines that process alarms step by step, facilitating automated integration with in-house systems. For example, a pipeline can append metadata to alarms or relabel events.
- It introduces the concept of business groups and a permission system to manage various rules in a categorized manner.
- Many databases and middleware come with built-in alert rules that can be directly imported and used. It also supports direct import of Prometheus alerting rules.
- It supports alert self-healing: after an alarm is generated, a script is triggered automatically to execute predefined logic, such as cleaning up disk space or capturing the current system state.

- Nightingale archives historical alarms and supports multi-dimensional query and statistics.
- It supports flexible aggregation grouping, allowing a clear view of the distribution of alarms across the company.

- Nightingale has built-in metric descriptions, dashboards, and alerting rules for common operating systems, middleware, and databases, which are contributed by the community with varying quality.
- It directly receives data via multiple protocols such as Remote Write, OpenTSDB, Datadog, and Falcon, so it can integrate with various agents.
- It supports data sources like Prometheus, ElasticSearch, Loki, ClickHouse, MySQL, Postgres, allowing alerting based on data from these sources.
- Nightingale can be easily embedded into internal enterprise systems (e.g. Grafana, CMDB), and even supports configuring menu visibility for these embedded systems.

- Nightingale supports dashboard functionality, including common chart types, and comes with pre-built dashboards. The image above is a screenshot of one of these dashboards.
- If you are already accustomed to Grafana, it is recommended to continue using Grafana for visualization, as Grafana has deeper expertise in this area.
- For machine-related monitoring data collected by Categraf, it is advisable to use Nightingale's built-in dashboards for viewing. This is because Categraf's metric naming follows Telegraf's convention, which differs from that of Node Exporter.
- Due to Nightingale's concept of business groups (where machines can belong to different groups), there may be scenarios where you only want to view machines within the current business group on the dashboard. Thus, Nightingale's dashboards can be linked with business groups for interactive filtering.
## 🌟 Stargazers over time
[](https://star-history.com/#ccfos/nightingale&Date)
## 🔥 Users

## 🤝 Community Co-Building
- ❇️ Please read the [Nightingale Open Source Project and Community Governance Draft](./doc/community-governance.md). We sincerely welcome every user, developer, company, and organization to use Nightingale, actively report bugs, submit feature requests, share best practices, and help build a professional and active open-source community.
- ❤️ Nightingale Contributors
## 📜 License
- [Apache License V2.0](https://github.com/ccfos/nightingale/blob/main/LICENSE)
================================================
FILE: README_zh.md
================================================
开源监控告警管理专家
[English](./README.md) | [中文](./README_zh.md)
## 夜莺是什么
夜莺 Nightingale 是一款开源云原生监控告警工具,是中国计算机学会接受捐赠并托管的第一个开源项目,在 GitHub 上有超过 12000 颗星,广受关注和使用。夜莺的统一告警引擎,可以对接 Prometheus、Elasticsearch、ClickHouse、Loki、MySQL 等多种数据源,提供全面的告警判定、丰富的事件处理和灵活的告警分发及通知能力。
夜莺侧重于监控告警,类似于 Grafana 的数据源集成方式,夜莺也是对接多种既有的数据源,不过 Grafana 侧重于可视化,夜莺则是侧重于告警引擎、告警事件的处理和分发。
> - 💡夜莺正式推出了 [MCP-Server](https://github.com/n9e/n9e-mcp-server/),此 MCP Server 允许 AI 助手通过自然语言与夜莺 API 交互,实现告警管理、监控和可观测性任务。
> - 夜莺监控项目,最初由滴滴开发和开源,并于 2022 年 5 月 11 日,捐赠予中国计算机学会开源发展技术委员会(CCF ODTC),为 CCF ODTC 成立后接受捐赠的第一个开源项目。

## 夜莺的工作逻辑
很多用户已经自行采集了指标、日志数据,此时就把存储库(VictoriaMetrics、ElasticSearch等)作为数据源接入夜莺,即可在夜莺里配置告警规则、通知规则,完成告警事件的生成和派发。

夜莺项目本身不提供监控数据采集能力。推荐您使用 [Categraf](https://github.com/flashcatcloud/categraf) 作为采集器,可以和夜莺丝滑对接。
[Categraf](https://github.com/flashcatcloud/categraf) 可以采集操作系统、网络设备、各类中间件、数据库的监控数据,通过 Remote Write 协议推送给夜莺,夜莺把监控数据转存到时序库(如 Prometheus、VictoriaMetrics 等),并提供告警和可视化能力。
对于个别边缘机房,如果和中心夜莺服务端网络链路不好,希望提升告警可用性,夜莺也提供边缘机房告警引擎下沉部署模式,这个模式下,即便边缘和中心端网络割裂,告警功能也不受影响。

> 上图中,机房A和中心机房的网络链路很好,所以直接由中心端的夜莺进程做告警引擎,机房B和中心机房的网络链路不好,所以在机房B部署了 `n9e-edge` 做告警引擎,对机房B的数据源做告警判定。
## 告警降噪、升级、协同
夜莺的侧重点是做告警引擎,即负责产生告警事件,并根据规则做灵活派发,内置支持 20 种通知媒介(电话、短信、邮件、钉钉、飞书、企微、Slack 等)。
如果您有更高级的需求,比如:
- 想要把公司的多套监控系统产生的事件聚拢到一个平台,统一做收敛降噪、响应处理、数据分析
- 想要支持人员的排班,践行 On-call 文化,想要支持告警认领、升级(避免遗漏)、协同处理
那夜莺是不合适的,推荐您选用 [FlashDuty](https://flashcat.cloud/product/flashcat-duty/) 这样的 On-call 产品,产品简单易用,也有免费套餐。
## 相关资料 & 交流渠道
- 📚 [夜莺介绍PPT](https://mp.weixin.qq.com/s/Mkwx_46xrltSq8NLqAIYow) 对您了解夜莺各项关键特性会有帮助(PPT链接在文末)
- 👉 [文档中心](https://flashcat.cloud/docs/) 为了更快的访问速度,站点托管在 [FlashcatCloud](https://flashcat.cloud)
- ❤️ [报告 Bug](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=&projects=&template=question.yml) 写清楚问题描述、复现步骤、截图等信息,更容易得到答案
- 💡 前后端代码分离,前端代码仓库:[https://github.com/n9e/fe](https://github.com/n9e/fe)
- 🎯 关注[这个公众号](https://gitlink.org.cn/UlricQin)了解更多夜莺动态和知识
- 🌟 加我微信:`picobyte`(我已关闭好友验证)拉入微信群,备注:`夜莺互助群`,如果已经把夜莺上到生产环境,可联系我拉入资深监控用户群
## 关键特性简介

- 夜莺支持告警规则、屏蔽规则、订阅规则、通知规则,内置支持 20 种通知媒介,支持消息模板自定义
- 支持事件管道,对告警事件做 Pipeline 处理,方便和自有系统做自动化整合,比如给告警事件附加一些元信息,对事件做 relabel
- 支持业务组概念,引入权限体系,分门别类管理各类规则
- 很多数据库、中间件内置了告警规则,可以直接导入使用,也可以直接导入 Prometheus 的告警规则
- 支持告警自愈,即告警之后自动触发一个脚本执行一些预定义的逻辑,比如清理一下磁盘、抓一下现场等

- 夜莺存档了历史告警事件,支持多维度的查询和统计
- 支持灵活的聚合分组,一目了然看到公司的告警事件分布情况

- 夜莺内置常用操作系统、中间件、数据库的指标说明、仪表盘、告警规则,不过都是社区贡献的,整体也是参差不齐
- 夜莺直接接收 Remote Write、OpenTSDB、Datadog、Falcon 等多种协议的数据,故而可以和各类 Agent 对接
- 夜莺支持 Prometheus、ElasticSearch、Loki、TDEngine 等多种数据源,可以对其中的数据做告警
- 夜莺可以很方便内嵌企业内部系统,比如 Grafana、CMDB 等,甚至可以配置这些内嵌系统的菜单可见性

- 夜莺支持仪表盘功能,支持常见的图表类型,也内置了一些仪表盘,上图是其中一个仪表盘的截图。
- 如果你已经习惯了 Grafana,建议仍然使用 Grafana 看图。Grafana 在看图方面道行更深。
- 机器相关的监控数据,如果是 Categraf 采集的,建议使用夜莺自带的仪表盘查看,因为 Categraf 的指标命名 Follow 的是 Telegraf 的命名方式,和 Node Exporter 不同
- 因为夜莺有个业务组的概念,机器可以归属不同的业务组,有时在仪表盘里只想查看当前所属业务组的机器,所以夜莺的仪表盘可以和业务组联动
## 广受关注
[](https://star-history.com/#ccfos/nightingale&Date)
## 感谢众多企业的信赖

## 社区共建
- ❇️ 请阅读浏览[夜莺开源项目和社区治理架构草案](./doc/community-governance.md),真诚欢迎每一位用户、开发者、公司以及组织,使用夜莺监控、积极反馈 Bug、提交功能需求、分享最佳实践,共建专业、活跃的夜莺开源社区。
- ❤️ 夜莺贡献者
## License
- [Apache License V2.0](https://github.com/ccfos/nightingale/blob/main/LICENSE)
================================================
FILE: alert/aconf/conf.go
================================================
package aconf
import (
"path"
)
// Alert groups the settings of the alert module: an on/off switch, an engine
// delay, the heartbeat settings and the alerting (notification) settings.
// PreCheck fills in defaults for the zero-valued fields it knows about.
type Alert struct {
// NOTE(review): presumably switches the alert engine off; the consumer of
// this flag is not visible in this file.
Disable bool
// PreCheck sets this to 30 when it is 0. The unit is not shown here —
// presumably seconds, TODO confirm.
EngineDelay int64
Heartbeat HeartbeatConfig
Alerting Alerting
}
// SMTPConfig holds the parameters of an SMTP connection (server address,
// credentials and sender address).
// NOTE(review): nothing in this file references the type; confirm where it is
// consumed before changing any field.
type SMTPConfig struct {
Host string
Port int
User string
Pass string
From string
// NOTE(review): presumably disables TLS certificate verification — confirm
// against the mail sender.
InsecureSkipVerify bool
// NOTE(review): meaning not visible here; presumably a batch size for
// outgoing mails — TODO confirm.
Batch int
}
// HeartbeatConfig carries the heartbeat settings of an alert engine instance.
type HeartbeatConfig struct {
IP string
// PreCheck sets this to 1000 when it is 0. The unit is not shown here —
// presumably milliseconds, TODO confirm.
Interval int64
Endpoint string
// NOTE(review): presumably the name of the engine cluster this instance
// belongs to — confirm against the naming package.
EngineName string
}
// Alerting holds the settings used when alert events are turned into
// notifications.
type Alerting struct {
// NOTE(review): unit and scope of this timeout are not visible in this
// file — TODO confirm.
Timeout int64
// Directory with notification templates. PreCheck defaults it to
// "<configDir>/template" when empty.
TemplatesDir string
// PreCheck sets this to 10 when it is 0.
NotifyConcurrency int
WebhookBatchSend bool
GlobalWebhook GlobalWebhook
}
// GlobalWebhook describes a single statically configured webhook endpoint:
// its URL, optional basic-auth credentials, extra headers and TLS behaviour.
type GlobalWebhook struct {
Enable bool
Url string
BasicAuthUser string
BasicAuthPass string
// NOTE(review): unit not visible here — TODO confirm against the sender.
Timeout int
// NOTE(review): the layout of this slice (e.g. alternating key/value
// entries) is not visible in this file — verify against the sender before
// relying on it.
Headers []string
// NOTE(review): presumably skips TLS certificate verification — confirm.
SkipVerify bool
}
// CallPlugin holds the settings of a notification plugin: whether it is
// enabled, where the plugin lives and the name of its caller.
// NOTE(review): the exact semantics of PluginPath and Caller are not visible
// in this file — confirm against the plugin sender.
type CallPlugin struct {
Enable bool
PluginPath string
Caller string
}
// RedisPub holds the settings for publishing to Redis channels.
// NOTE(review): how ChannelPrefix and ChannelKey are combined into a channel
// name is not visible in this file — TODO confirm.
type RedisPub struct {
Enable bool
ChannelPrefix string
ChannelKey string
}
// PreCheck fills in defaults for every setting that was left at its zero
// value. Values that are already set are never touched.
//
// Defaults applied:
//   - Alerting.TemplatesDir:      "template" joined onto configDir
//   - Alerting.NotifyConcurrency: 10
//   - Heartbeat.Interval:         1000
//   - EngineDelay:                30
func (a *Alert) PreCheck(configDir string) {
	const (
		templatesSubDir          = "template"
		defaultNotifyConcurrency = 10
		defaultHeartbeatInterval = 1000
		defaultEngineDelay       = 30
	)

	alerting := &a.Alerting
	if len(alerting.TemplatesDir) == 0 {
		alerting.TemplatesDir = path.Join(configDir, templatesSubDir)
	}
	if alerting.NotifyConcurrency == 0 {
		alerting.NotifyConcurrency = defaultNotifyConcurrency
	}

	if heartbeat := &a.Heartbeat; heartbeat.Interval == 0 {
		heartbeat.Interval = defaultHeartbeatInterval
	}

	if a.EngineDelay == 0 {
		a.EngineDelay = defaultEngineDelay
	}
}
================================================
FILE: alert/alert.go
================================================
package alert
import (
"context"
"fmt"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/dispatch"
"github.com/ccfos/nightingale/v6/alert/eval"
"github.com/ccfos/nightingale/v6/alert/naming"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/alert/queue"
"github.com/ccfos/nightingale/v6/alert/record"
"github.com/ccfos/nightingale/v6/alert/router"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/storage"
"github.com/flashcatcloud/ibex/src/cmd/ibex"
)
// Initialize wires up a standalone alert service from the configuration found
// in configDir and returns a cleanup function for the caller to invoke on
// shutdown.
//
// Steps, in order:
//  1. load the configuration (cryptoKey is passed through to conf.InitConfig);
//  2. initialise logging and the Redis client;
//  3. build the in-memory caches and the Prometheus client map;
//  4. start the alert engine via Start;
//  5. build the gin engine, register the alert and dumper routes (and the ibex
//     server when config.Ibex.Enable is set) and hand it to httpx.Init.
//
// A non-nil error is returned when loading the configuration, initialising the
// logger or creating the Redis client fails; the returned cleanup function is
// nil in that case. The configuration error wraps its cause, so callers can
// inspect it with errors.Is / errors.As.
func Initialize(configDir string, cryptoKey string) (func(), error) {
	config, err := conf.InitConfig(configDir, cryptoKey)
	if err != nil {
		// %w (not %v) keeps the underlying error reachable through Unwrap.
		return nil, fmt.Errorf("failed to init config: %w", err)
	}

	logxClean, err := logx.Init(config.Log)
	if err != nil {
		return nil, err
	}

	// The local variable deliberately shadows the ctx package from here on.
	ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi)

	var redis storage.Redis
	redis, err = storage.NewRedis(config.Redis)
	if err != nil {
		// NOTE(review): logxClean is not invoked on this path — confirm that
		// callers terminate the process on error, otherwise the logger set up
		// above is never released.
		return nil, err
	}

	syncStats := memsto.NewSyncStats()
	alertStats := astats.NewSyncStats()

	// In-memory caches handed to the engine, the dispatcher and the router.
	configCache := memsto.NewConfigCache(ctx, syncStats, nil, "")
	targetCache := memsto.NewTargetCache(ctx, syncStats, redis)
	busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
	alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
	alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
	notifyConfigCache := memsto.NewNotifyConfigCache(ctx, configCache)
	dsCache := memsto.NewDatasourceCache(ctx, syncStats)
	userCache := memsto.NewUserCache(ctx, syncStats)
	userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
	taskTplsCache := memsto.NewTaskTplCache(ctx)
	configCvalCache := memsto.NewCvalCache(ctx, syncStats)
	notifyRuleCache := memsto.NewNotifyRuleCache(ctx, syncStats)
	notifyChannelCache := memsto.NewNotifyChannelCache(ctx, syncStats)
	messageTemplateCache := memsto.NewMessageTemplateCache(ctx, syncStats)

	promClients := prom.NewPromClient(ctx)
	dispatch.InitRegisterQueryFunc(promClients)

	externalProcessors := process.NewExternalProcessors()

	macros.RegisterMacro(macros.MacroInVain)
	dscache.Init(ctx, false)

	Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, taskTplsCache, dsCache, ctx, promClients, userCache, userGroupCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, configCvalCache)

	r := httpx.GinEngine(config.Global.RunMode, config.HTTP,
		configCvalCache.PrintBodyPaths, configCvalCache.PrintAccessLog)
	rt := router.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors, config.Log.Dir)

	if config.Ibex.Enable {
		ibex.ServerStart(false, nil, redis, config.HTTP.APIForService.BasicAuth, config.Alert.Heartbeat, &config.CenterApi, r, nil, config.Ibex, config.HTTP.Port)
	}

	rt.Config(r)
	dumper.ConfigRouter(r)

	httpClean := httpx.Init(config.HTTP, r)

	// Cleanup runs in the same order as before: logger first, then HTTP.
	return func() {
		logxClean()
		httpClean()
	}, nil
}
// Start wires up the alert engine from the caches and clients supplied by the
// caller: it creates the caches that are only needed here (alert subscribes,
// recording rules, targets of alert rules, event processors), constructs the
// recording-rule and alert-rule schedulers, builds the dispatcher and the
// consumers, and launches the background goroutines listed at the end.
//
// Note that the parameter named ctx shadows the imported ctx package inside
// this function.
func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, alertStats *astats.Stats, externalProcessors *process.ExternalProcessorsType, targetCache *memsto.TargetCacheType, busiGroupCache *memsto.BusiGroupCacheType,
alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, taskTplsCache *memsto.TaskTplCache, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context,
promClients *prom.PromClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType, notifyRuleCache *memsto.NotifyRuleCacheType, notifyChannelCache *memsto.NotifyChannelCacheType, messageTemplateCache *memsto.MessageTemplateCacheType, configCvalCache *memsto.CvalCache) {
// Caches that only the alert engine itself needs.
alertSubscribeCache := memsto.NewAlertSubscribeCache(ctx, syncStats)
recordingRuleCache := memsto.NewRecordingRuleCache(ctx, syncStats)
targetsOfAlertRulesCache := memsto.NewTargetOfAlertRuleCache(ctx, alertc.Heartbeat.EngineName, syncStats)
// Notify config, notify channel and message template initialisation run in
// their own goroutines, so Start does not wait for them.
go models.InitNotifyConfig(ctx, alertc.Alerting.TemplatesDir)
go models.InitNotifyChannel(ctx)
go models.InitMessageTemplate(ctx)
// From here on the local variable naming shadows the naming package.
naming := naming.NewNaming(ctx, alertc.Heartbeat, alertStats)
writers := writer.NewWriters(pushgwc)
// The scheduler constructors are called for their side effects; their
// return values are not kept.
record.NewScheduler(alertc, recordingRuleCache, promClients, writers, alertStats, datasourceCache)
eval.NewScheduler(alertc, externalProcessors, alertRuleCache, targetCache, targetsOfAlertRulesCache,
busiGroupCache, alertMuteCache, datasourceCache, promClients, naming, ctx, alertStats)
eventProcessorCache := memsto.NewEventProcessorCache(ctx, syncStats)
sender.InitStaticGlobalWebhook(alertc.Alerting.GlobalWebhook)
dp := dispatch.NewDispatch(alertRuleCache, userCache, userGroupCache, alertSubscribeCache, targetCache, notifyConfigCache, taskTplsCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, eventProcessorCache, configCvalCache, alertc.Alerting, ctx, alertStats)
consumer := dispatch.NewConsumer(alertc.Alerting, ctx, dp, promClients, alertMuteCache)
notifyRecordConsumer := sender.NewNotifyRecordConsumer(ctx)
// Background workers: template reloading, event consumption, notify-record
// consumption, queue-size reporting and e-mail sender initialisation.
go dp.ReloadTpls()
go consumer.LoopConsume()
go notifyRecordConsumer.LoopConsume()
go queue.ReportQueueSize(alertStats)
go sender.ReportNotifyRecordQueueSize(alertStats)
go sender.InitEmailSender(ctx, notifyConfigCache)
}
================================================
FILE: alert/astats/stats.go
================================================
package astats
import (
"github.com/prometheus/client_golang/prometheus"
)
// namespace and subsystem are used as the Namespace and Subsystem of every
// metric created by NewSyncStats.
const (
namespace = "n9e"
subsystem = "alert"
)
// Stats groups the Prometheus collectors that NewSyncStats creates and
// registers for the alert module. The metric names and label sets quoted
// below are the ones NewSyncStats assigns.
type Stats struct {
// "alert_notify_total" and "alert_notify_error_total", labelled by channel.
AlertNotifyTotal *prometheus.CounterVec
AlertNotifyErrorTotal *prometheus.CounterVec
// "alerts_total", labelled by cluster, type and busi_group.
CounterAlertsTotal *prometheus.CounterVec
// "alert_queue_size": size of the in-memory alert event queue.
GaugeAlertQueueSize prometheus.Gauge
// "rule_eval_total", no labels.
CounterRuleEval *prometheus.CounterVec
// "query_data_error_total" (datasource) and "query_data_total" (datasource, rule_id).
CounterQueryDataErrorTotal *prometheus.CounterVec
CounterQueryDataTotal *prometheus.CounterVec
// "var_filling_query_total", labelled by rule_id, datasource_id, ref and typ.
CounterVarFillingQuery *prometheus.CounterVec
// "record_eval_total" and "record_eval_error_total", labelled by datasource.
CounterRecordEval *prometheus.CounterVec
CounterRecordEvalErrorTotal *prometheus.CounterVec
// "mute_total", labelled by group, rule_id, mute_rule_id and datasource_id.
CounterMuteTotal *prometheus.CounterVec
// "rule_eval_error_total", labelled by datasource, stage, busi_group and rule_id.
CounterRuleEvalErrorTotal *prometheus.CounterVec
// "heartbeat_error_count", no labels.
CounterHeartbeatErrorTotal *prometheus.CounterVec
// "sub_event_total", labelled by group.
CounterSubEventTotal *prometheus.CounterVec
// "eval_query_series_count", labelled by rule_id, datasource_id and ref.
GaugeQuerySeriesCount *prometheus.GaugeVec
// "rule_eval_duration_ms", labelled by rule_id and datasource_id.
GaugeRuleEvalDuration *prometheus.GaugeVec
// "notify_record_queue_size": size of the notify record queue.
GaugeNotifyRecordQueueSize prometheus.Gauge
}
// NewSyncStats creates every collector of the alert module under the
// package-level namespace/subsystem, registers them all with the default
// Prometheus registry and returns them bundled in a Stats.
//
// prometheus.MustRegister panics when a collector cannot be registered (for
// example when the same metric is registered twice), so this function is
// meant to be called once per process.
func NewSyncStats() *Stats {
	// Local constructors keep the shared Namespace/Subsystem in one place.
	newCounterVec := func(name, help string, labels []string) *prometheus.CounterVec {
		return prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}, labels)
	}
	newGaugeVec := func(name, help string, labels []string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}, labels)
	}
	newGauge := func(name, help string) prometheus.Gauge {
		return prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		})
	}

	stats := &Stats{
		CounterRuleEval: newCounterVec("rule_eval_total", "Number of rule eval.", []string{}),
		CounterRuleEvalErrorTotal: newCounterVec("rule_eval_error_total", "Number of rule eval error.",
			[]string{"datasource", "stage", "busi_group", "rule_id"}),
		CounterQueryDataErrorTotal: newCounterVec("query_data_error_total", "Number of rule eval query data error.",
			[]string{"datasource"}),
		CounterQueryDataTotal: newCounterVec("query_data_total", "Number of rule eval query data.",
			[]string{"datasource", "rule_id"}),
		CounterRecordEval: newCounterVec("record_eval_total", "Number of record eval.",
			[]string{"datasource"}),
		CounterRecordEvalErrorTotal: newCounterVec("record_eval_error_total", "Number of record eval error.",
			[]string{"datasource"}),
		AlertNotifyTotal: newCounterVec("alert_notify_total", "Number of send msg.",
			[]string{"channel"}),
		// The help text used to be a copy of the success counter's.
		AlertNotifyErrorTotal: newCounterVec("alert_notify_error_total", "Number of send msg error.",
			[]string{"channel"}),
		// Total number of alert events produced.
		CounterAlertsTotal: newCounterVec("alerts_total", "Total number alert events.",
			[]string{"cluster", "type", "busi_group"}),
		// Length of the in-memory alert event queue.
		GaugeAlertQueueSize: newGauge("alert_queue_size", "The size of alert queue."),
		CounterMuteTotal: newCounterVec("mute_total", "Number of mute.",
			[]string{"group", "rule_id", "mute_rule_id", "datasource_id"}),
		CounterSubEventTotal: newCounterVec("sub_event_total", "Number of sub event.",
			[]string{"group"}),
		CounterHeartbeatErrorTotal: newCounterVec("heartbeat_error_count", "Number of heartbeat error.", []string{}),
		GaugeQuerySeriesCount: newGaugeVec("eval_query_series_count", "Number of series retrieved from data source after query.",
			[]string{"rule_id", "datasource_id", "ref"}),
		// Length of the notify record queue.
		GaugeNotifyRecordQueueSize: newGauge("notify_record_queue_size", "The size of notify record queue."),
		GaugeRuleEvalDuration: newGaugeVec("rule_eval_duration_ms", "Duration of rule eval in milliseconds.",
			[]string{"rule_id", "datasource_id"}),
		CounterVarFillingQuery: newCounterVec("var_filling_query_total", "Number of var filling query.",
			[]string{"rule_id", "datasource_id", "ref", "typ"}),
	}

	prometheus.MustRegister(
		stats.CounterAlertsTotal,
		stats.GaugeAlertQueueSize,
		stats.AlertNotifyTotal,
		stats.AlertNotifyErrorTotal,
		stats.CounterRuleEval,
		stats.CounterQueryDataTotal,
		stats.CounterQueryDataErrorTotal,
		stats.CounterRecordEval,
		stats.CounterRecordEvalErrorTotal,
		stats.CounterMuteTotal,
		stats.CounterRuleEvalErrorTotal,
		stats.CounterHeartbeatErrorTotal,
		stats.CounterSubEventTotal,
		stats.GaugeQuerySeriesCount,
		stats.GaugeRuleEvalDuration,
		stats.GaugeNotifyRecordQueueSize,
		stats.CounterVarFillingQuery,
	)

	return stats
}
================================================
FILE: alert/common/key.go
================================================
package common
import (
"encoding/json"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// RuleKey returns the key "alert-<datasourceId>-<id>" identifying a rule on a
// given datasource.
func RuleKey(datasourceId, id int64) string {
	return "alert-" + fmt.Sprint(datasourceId) + "-" + fmt.Sprint(id)
}
// MatchTags reports whether eventTagsMap satisfies every filter in itags.
//
// A filter whose Key is "target_group" is evaluated by targetGroupMatch
// against the "target" entry of the map; every other filter is evaluated by
// matchTag against the entry stored under the filter's own key. A missing
// entry or a single failing filter makes the whole match fail. An empty
// filter list matches.
func MatchTags(eventTagsMap map[string]string, itags []models.TagFilter) bool {
	for _, f := range itags {
		// Pick the map entry to read and the matcher to apply.
		lookupKey, match := f.Key, matchTag
		if f.Key == "target_group" {
			lookupKey, match = "target", targetGroupMatch
		}

		v, present := eventTagsMap[lookupKey]
		if !present || !match(v, f) {
			return false
		}
	}
	return true
}
// MatchGroupsName reports whether groupName satisfies every filter in
// groupFilter, as judged by matchTag. An empty filter list matches.
func MatchGroupsName(groupName string, groupFilter []models.TagFilter) bool {
	for i := 0; i < len(groupFilter); i++ {
		if matchTag(groupName, groupFilter[i]) {
			continue
		}
		return false
	}
	return true
}
// matchTag evaluates a single filter against value.
//
// "==" / "!=" compare the whitespace-trimmed value with the trimmed "%v"
// rendering of filter.Value; "in" / "not in" test membership of value in
// filter.Vset; "=~" / "!~" apply filter.Regexp. Any other Func yields false.
func matchTag(value string, filter models.TagFilter) bool {
	switch op := filter.Func; op {
	case "==", "!=":
		same := strings.TrimSpace(fmt.Sprintf("%v", filter.Value)) == strings.TrimSpace(value)
		return same == (op == "==")
	case "in", "not in":
		_, member := filter.Vset[value]
		return member == (op == "in")
	case "=~", "!~":
		return filter.Regexp.MatchString(value) == (op == "=~")
	default:
		// unexpected func
		return false
	}
}
// targetGroupMatch implements the special matching rules for the
// "target_group" filter key.
//
// value is expected to be a JSON object. For "in" / "not in" its "group_ids"
// array is compared with the ids listed in filter.Value; for "=~" / "!~" its
// "group_names" array is tested against filter.Regexp. Malformed input (value
// that is not a JSON object, a missing or wrongly typed array, a non-numeric
// id) and unsupported Func values yield false instead of panicking.
func targetGroupMatch(value string, filter models.TagFilter) bool {
	var valueMap map[string]interface{}
	if err := json.Unmarshal([]byte(value), &valueMap); err != nil {
		return false
	}

	switch filter.Func {
	case "in", "not in":
		// The filter ids are expected to be float64, the type encoding/json
		// produces for numbers decoded into interface{}.
		filterValueIds, ok := filter.Value.([]interface{})
		if !ok {
			return false
		}
		wanted := make(map[float64]struct{}, len(filterValueIds))
		for _, raw := range filterValueIds {
			id, isNumber := raw.(float64)
			if !isNumber {
				return false
			}
			wanted[id] = struct{}{}
		}

		groupIds, ok := valueMap["group_ids"].([]interface{})
		if !ok {
			return false
		}

		// "in" matches as soon as one of the target's group ids is listed in
		// the filter; "not in" is the negation.
		found := false
		for _, raw := range groupIds {
			gid, isNumber := raw.(float64)
			if !isNumber {
				return false
			}
			if _, found = wanted[gid]; found {
				break
			}
		}
		if filter.Func == "in" {
			return found
		}
		return !found

	case "=~", "!~":
		groupNames, ok := valueMap["group_names"].([]interface{})
		if !ok {
			return false
		}

		// One matching group name is enough to count as matched.
		matched := false
		for _, gname := range groupNames {
			if filter.Regexp.MatchString(fmt.Sprintf("%v", gname)) {
				matched = true
				break
			}
		}
		if filter.Func == "=~" {
			return matched
		}
		// "!~": true only when no group name matches.
		return !matched

	default:
		return false
	}
}
================================================
FILE: alert/dispatch/consume.go
================================================
package dispatch
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/queue"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
promsdk "github.com/ccfos/nightingale/v6/pkg/prom"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/prom"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/concurrent/semaphore"
"github.com/toolkits/pkg/logger"
)
// Consumer drains the alert event queue: LoopConsume pops events and
// consumeOne renders, persists and hands each of them to the dispatcher.
type Consumer struct {
// alerting supplies NotifyConcurrency, the size of the semaphore used by LoopConsume.
alerting aconf.Alerting
// ctx is passed to event persistence and provides the context for recovery queries.
ctx *ctx.Context
// dispatch receives each consumed event and owns the Astats counters updated here.
dispatch *Dispatch
// promClients provides the per-datasource client used for recovery_promql queries.
promClients *prom.PromClientMap
// alertMuteCache is stored by NewConsumer; it is not read in the methods visible in this file section.
alertMuteCache *memsto.AlertMuteCacheType
}
// EventMuteHookFunc is the signature of a hook that inspects an event and
// returns a bool.
type EventMuteHookFunc func(event *models.AlertCurEvent) bool
// EventMuteHook is the package-level hook; the default implementation always
// returns false.
// NOTE(review): presumably a true result means "mute this event" — the call
// sites are not visible here, confirm before relying on it.
var EventMuteHook EventMuteHookFunc = func(event *models.AlertCurEvent) bool { return false }
// InitRegisterQueryFunc registers with tplx a query function that runs a
// PromQL instant query against the client of the given datasource.
//
// The registered function returns nil when no client exists for the
// datasource. A query error is logged and whatever value the client returned
// is passed through unchanged, so callers see the same result as before.
func InitRegisterQueryFunc(promClients *prom.PromClientMap) {
	tplx.RegisterQueryFunc(func(datasourceID int64, promql string) model.Value {
		if promClients.IsNil(datasourceID) {
			return nil
		}

		readerClient := promClients.GetCli(datasourceID)
		// Warnings are intentionally dropped; only hard errors are reported.
		value, _, err := readerClient.Query(context.Background(), promql, time.Now())
		if err != nil {
			logger.Warningf("template query datasource_%d promql:%s, error:%v", datasourceID, promql, err)
		}
		return value
	})
}
// NewConsumer returns a Consumer holding the given configuration, context,
// dispatcher, Prometheus clients and mute cache.
func NewConsumer(alerting aconf.Alerting, ctx *ctx.Context, dispatch *Dispatch, promClients *prom.PromClientMap, alertMuteCache *memsto.AlertMuteCacheType) *Consumer {
	c := new(Consumer)
	c.alerting = alerting
	c.ctx = ctx
	c.dispatch = dispatch
	c.promClients = promClients
	c.alertMuteCache = alertMuteCache
	return c
}
// LoopConsume runs forever: it pops up to 100 events at a time from
// queue.EventQueue and hands them to consume, sleeping 100ms whenever the
// queue yields nothing. Concurrency is bounded by a semaphore sized from
// alerting.NotifyConcurrency.
func (e *Consumer) LoopConsume() {
	const idleWait = 100 * time.Millisecond

	limiter := semaphore.NewSemaphore(e.alerting.NotifyConcurrency)
	for {
		batch := queue.EventQueue.PopBackBy(100)
		if len(batch) > 0 {
			e.consume(batch, limiter)
			continue
		}
		time.Sleep(idleWait)
	}
}
// consume processes every non-nil entry of events in its own goroutine. A
// semaphore slot is acquired before each goroutine starts and released when
// it finishes, so acquisition blocks this loop once the limit is reached.
func (e *Consumer) consume(events []interface{}, sema *semaphore.Semaphore) {
	for _, item := range events {
		if item == nil {
			continue
		}

		// Entries are expected to be *models.AlertCurEvent.
		alertEvent := item.(*models.AlertCurEvent)

		sema.Acquire()
		go func() {
			defer sema.Release()
			e.consumeOne(alertEvent)
		}()
	}
}
// consumeOne handles a single event: it logs and counts it, renders the rule
// name, annotations and rule note through event.ParseRule, resolves the
// recovery value for recovered events, persists the event and finally passes
// it to the dispatcher.
//
// A rendering failure does not abort processing; the failing field is
// replaced by an error message instead.
func (e *Consumer) consumeOne(event *models.AlertCurEvent) {
	LogEvent(event, "consume")

	eventType := "alert"
	if event.IsRecovered {
		eventType = "recovery"
	}
	e.dispatch.Astats.CounterAlertsTotal.WithLabelValues(event.Cluster, eventType, event.GroupName).Inc()

	if err := event.ParseRule("rule_name"); err != nil {
		logger.Warningf("alert_eval_%d datasource_%d failed to parse rule name: %v", event.RuleId, event.DatasourceId, err)
		event.RuleName = fmt.Sprintf("failed to parse rule name: %v", err)
	}

	if err := event.ParseRule("annotations"); err != nil {
		logger.Warningf("alert_eval_%d datasource_%d failed to parse annotations: %v", event.RuleId, event.DatasourceId, err)
		event.Annotations = fmt.Sprintf("failed to parse annotations: %v", err)
		// Writing to a nil map panics, which would take the whole process
		// down from this goroutine; make sure the map exists first.
		if event.AnnotationsJSON == nil {
			event.AnnotationsJSON = make(map[string]string)
		}
		event.AnnotationsJSON["error"] = event.Annotations
	}

	// Must run after the annotations are rendered: it reads recovery_promql
	// from event.AnnotationsJSON.
	e.queryRecoveryVal(event)

	if err := event.ParseRule("rule_note"); err != nil {
		logger.Warningf("alert_eval_%d datasource_%d failed to parse rule note: %v", event.RuleId, event.DatasourceId, err)
		event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err)
	}

	e.persist(event)
	e.dispatch.HandleEventNotify(event, false)
}
// persist stores the event when its Status is 0. On the center node it calls
// models.EventPersist directly; elsewhere it converts the event with DB2FE,
// posts it to /v1/n9e/event-persist and assigns the returned id to event.Id.
// A failure is logged and counted under the "persist_event" stage.
func (e *Consumer) persist(event *models.AlertCurEvent) {
	if event.Status != 0 {
		return
	}

	var err error
	if e.ctx.IsCenter {
		err = models.EventPersist(e.ctx, event)
	} else {
		event.DB2FE()
		event.Id, err = poster.PostByUrlsWithResp[int64](e.ctx, "/v1/n9e/event-persist", event)
	}
	if err == nil {
		return
	}

	logger.Errorf("event:%s persist err:%v", event.Hash, err)
	e.dispatch.Astats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", event.DatasourceId), "persist_event", event.GroupName, fmt.Sprintf("%v", event.RuleId)).Inc()
}
// queryRecoveryVal runs the "recovery_promql" annotation of a recovered event
// and records the outcome in the event's annotations: "recovery_value" holds
// the value of the first returned point, "recovery_promql_error" describes a
// failed or empty query. event.Annotations is then re-serialised from
// event.AnnotationsJSON.
//
// Nothing happens for non-recovered events, when the annotation is absent or
// blank, or when no client exists for the event's datasource.
func (e *Consumer) queryRecoveryVal(event *models.AlertCurEvent) {
	if !event.IsRecovered {
		return
	}

	promql, ok := event.AnnotationsJSON["recovery_promql"]
	if !ok {
		return
	}
	if promql = strings.TrimSpace(promql); promql == "" {
		logger.Warningf("alert_eval_%d datasource_%d promql is blank", event.RuleId, event.DatasourceId)
		return
	}

	if e.promClients.IsNil(event.DatasourceId) {
		logger.Warningf("alert_eval_%d datasource_%d error reader client is nil", event.RuleId, event.DatasourceId)
		return
	}

	// syncAnnotations re-serialises AnnotationsJSON into Annotations; if that
	// fails the map is replaced by one carrying only the error.
	syncAnnotations := func() {
		encoded, marshalErr := json.Marshal(event.AnnotationsJSON)
		if marshalErr != nil {
			event.AnnotationsJSON = map[string]string{
				"error": fmt.Sprintf("failed to parse annotations: %v", marshalErr),
			}
			return
		}
		event.Annotations = string(encoded)
	}

	var warnings promsdk.Warnings
	value, warnings, err := e.promClients.GetCli(event.DatasourceId).Query(e.ctx.Ctx, promql, time.Now())
	if err != nil {
		logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", event.RuleId, event.DatasourceId, promql, err)
		event.AnnotationsJSON["recovery_promql_error"] = fmt.Sprintf("promql:%s error:%v", promql, err)
		syncAnnotations()
		return
	}
	if len(warnings) > 0 {
		logger.Errorf("alert_eval_%d datasource_%d promql:%s, warnings:%v", event.RuleId, event.DatasourceId, promql, warnings)
	}

	if points := models.ConvertAnomalyPoints(value); len(points) > 0 {
		event.AnnotationsJSON["recovery_value"] = fmt.Sprintf("%v", points[0].Value)
	} else {
		logger.Warningf("alert_eval_%d datasource_%d promql:%s, result is empty", event.RuleId, event.DatasourceId, promql)
		event.AnnotationsJSON["recovery_promql_error"] = fmt.Sprintf("promql:%s error:%s", promql, "result is empty")
	}
	syncAnnotations()
}
================================================
FILE: alert/dispatch/dispatch.go
================================================
package dispatch
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"html/template"
"net/url"
"strconv"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/pipeline"
"github.com/ccfos/nightingale/v6/alert/pipeline/engine"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// ShouldSkipNotify decides whether the notification for an event under a
// given notify rule id is skipped. It is a package-level function variable;
// init sets it to shouldSkipNotify.
var ShouldSkipNotify func(*ctx.Context, *models.AlertCurEvent, int64) bool
// SendByNotifyRule sends events through a notify rule's channel and
// template. It is a package-level function variable; init sets it to
// SendNotifyRuleMessage.
var SendByNotifyRule func(*ctx.Context, *memsto.UserCacheType, *memsto.UserGroupCacheType, *memsto.NotifyChannelCacheType, *memsto.CvalCache,
[]*models.AlertCurEvent, int64, *models.NotifyConfig, *models.NotifyChannelConfig, *models.MessageTemplate)
// EventProcessorCache is assigned by NewDispatch from its
// eventProcessorCache argument.
var EventProcessorCache *memsto.EventProcessorCacheType
// init installs the default implementations of the two function variables
// above.
func init() {
ShouldSkipNotify = shouldSkipNotify
SendByNotifyRule = SendNotifyRuleMessage
}
// Dispatch holds the caches, configuration and sender tables used to route
// alert events to notification channels.
type Dispatch struct {
// Caches injected by NewDispatch.
alertRuleCache *memsto.AlertRuleCacheType
userCache *memsto.UserCacheType
userGroupCache *memsto.UserGroupCacheType
alertSubscribeCache *memsto.AlertSubscribeCacheType
targetCache *memsto.TargetCacheType
notifyConfigCache *memsto.NotifyConfigCacheType
taskTplsCache *memsto.TaskTplCache
configCvalCache *memsto.CvalCache
notifyRuleCache *memsto.NotifyRuleCacheType
notifyChannelCache *memsto.NotifyChannelCacheType
messageTemplateCache *memsto.MessageTemplateCacheType
eventProcessorCache *memsto.EventProcessorCacheType
alerting aconf.Alerting
// Senders, CallBacks and tpls are rebuilt and swapped in by reloadTpls
// while holding RwLock for writing. CallBacks is keyed by domain.
Senders map[string]sender.Sender
CallBacks map[string]sender.CallBacker
tpls map[string]*template.Template
// ExtraSenders are merged into Senders, keyed by channel name, on every
// reload; an entry overrides the built-in sender with the same key.
ExtraSenders map[string]sender.Sender
// BeforeSenderHook defaults to a function that always returns true.
BeforeSenderHook func(*models.AlertCurEvent) bool
ctx *ctx.Context
Astats *astats.Stats
// RwLock guards the sender/callback/template tables above.
RwLock sync.RWMutex
}
// NewDispatch builds a Dispatch from the given caches and configuration.
//
// Senders, tpls and ExtraSenders start as empty maps and BeforeSenderHook as
// a function that always returns true. As side effects it calls
// pipeline.Init, stores eventProcessorCache in the package-level
// EventProcessorCache and installs sender.NotifyRecord as the notify-record
// callback of notifyChannelCache.
func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType,
	alertSubscribeCache *memsto.AlertSubscribeCacheType, targetCache *memsto.TargetCacheType, notifyConfigCache *memsto.NotifyConfigCacheType,
	taskTplsCache *memsto.TaskTplCache, notifyRuleCache *memsto.NotifyRuleCacheType, notifyChannelCache *memsto.NotifyChannelCacheType,
	messageTemplateCache *memsto.MessageTemplateCacheType, eventProcessorCache *memsto.EventProcessorCacheType, configCvalCache *memsto.CvalCache, alerting aconf.Alerting, c *ctx.Context, astats *astats.Stats) *Dispatch {
	d := &Dispatch{
		ctx:      c,
		Astats:   astats,
		alerting: alerting,

		// injected caches
		alertRuleCache:       alertRuleCache,
		alertSubscribeCache:  alertSubscribeCache,
		configCvalCache:      configCvalCache,
		eventProcessorCache:  eventProcessorCache,
		messageTemplateCache: messageTemplateCache,
		notifyChannelCache:   notifyChannelCache,
		notifyConfigCache:    notifyConfigCache,
		notifyRuleCache:      notifyRuleCache,
		targetCache:          targetCache,
		taskTplsCache:        taskTplsCache,
		userCache:            userCache,
		userGroupCache:       userGroupCache,

		// tables filled in later
		Senders:      make(map[string]sender.Sender),
		ExtraSenders: make(map[string]sender.Sender),
		tpls:         make(map[string]*template.Template),

		BeforeSenderHook: func(*models.AlertCurEvent) bool { return true },
	}

	pipeline.Init()
	EventProcessorCache = eventProcessorCache

	// Install the callback used to record notifications.
	notifyChannelCache.SetNotifyRecordFunc(sender.NotifyRecord)

	return d
}
// ReloadTpls reloads the templates once immediately and then again every 9
// seconds, logging failures. The loop never ends, so despite the error
// return type this method does not return.
func (e *Dispatch) ReloadTpls() error {
	if err := e.reloadTpls(); err != nil {
		logger.Errorf("failed to reload tpls: %v", err)
	}

	const interval = 9 * time.Second
	for {
		time.Sleep(interval)

		err := e.reloadTpls()
		if err == nil {
			continue
		}
		logger.Warning("failed to reload tpls:", err)
	}
}
// reloadTpls loads the templates via models.ListTpls, builds a fresh sender
// per built-in channel and a fresh callbacker per domain, overlays
// ExtraSenders on the senders, and swaps the three tables into the Dispatch
// under the write lock. It returns the ListTpls error, if any, leaving the
// current tables untouched.
func (e *Dispatch) reloadTpls() error {
	loaded, err := models.ListTpls(e.ctx)
	if err != nil {
		return err
	}

	// channel -> Sender; only the e-mail sender needs the SMTP settings.
	builtin := make(map[string]sender.Sender)
	builtin[models.Email] = sender.NewSender(models.Email, loaded, e.notifyConfigCache.GetSMTP())
	builtin[models.Dingtalk] = sender.NewSender(models.Dingtalk, loaded)
	builtin[models.Wecom] = sender.NewSender(models.Wecom, loaded)
	builtin[models.Feishu] = sender.NewSender(models.Feishu, loaded)
	builtin[models.Mm] = sender.NewSender(models.Mm, loaded)
	builtin[models.Telegram] = sender.NewSender(models.Telegram, loaded)
	builtin[models.FeishuCard] = sender.NewSender(models.FeishuCard, loaded)
	builtin[models.Lark] = sender.NewSender(models.Lark, loaded)
	builtin[models.LarkCard] = sender.NewSender(models.LarkCard, loaded)

	// domain -> CallBacker
	byDomain := make(map[string]sender.CallBacker)
	byDomain[models.DingtalkDomain] = sender.NewCallBacker(models.DingtalkDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.WecomDomain] = sender.NewCallBacker(models.WecomDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.FeishuDomain] = sender.NewCallBacker(models.FeishuDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.TelegramDomain] = sender.NewCallBacker(models.TelegramDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.FeishuCardDomain] = sender.NewCallBacker(models.FeishuCardDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.IbexDomain] = sender.NewCallBacker(models.IbexDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.LarkDomain] = sender.NewCallBacker(models.LarkDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.DefaultDomain] = sender.NewCallBacker(models.DefaultDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)
	byDomain[models.LarkCardDomain] = sender.NewCallBacker(models.LarkCardDomain, e.targetCache, e.userCache, e.taskTplsCache, loaded)

	// Extra senders override built-in ones registered under the same name.
	e.RwLock.RLock()
	for name, extra := range e.ExtraSenders {
		builtin[name] = extra
	}
	e.RwLock.RUnlock()

	e.RwLock.Lock()
	e.tpls = loaded
	e.Senders = builtin
	e.CallBacks = byDomain
	e.RwLock.Unlock()

	return nil
}
func (e *Dispatch) HandleEventWithNotifyRule(eventOrigin *models.AlertCurEvent) {
if len(eventOrigin.NotifyRuleIds) > 0 {
for _, notifyRuleId := range eventOrigin.NotifyRuleIds {
// 深拷贝新的 event,避免并发修改 event 冲突
eventCopy := eventOrigin.DeepCopy()
logger.Infof("notify rule ids: %v, event: %s", notifyRuleId, eventCopy.Hash)
notifyRule := e.notifyRuleCache.Get(notifyRuleId)
if notifyRule == nil {
continue
}
if !notifyRule.Enable {
continue
}
eventCopy.NotifyRuleId = notifyRuleId
eventCopy.NotifyRuleName = notifyRule.Name
eventCopy = HandleEventPipeline(notifyRule.PipelineConfigs, eventOrigin, eventCopy, e.eventProcessorCache, e.ctx, notifyRuleId, "notify_rule")
if eventCopy == nil {
continue
}
if ShouldSkipNotify(e.ctx, eventCopy, notifyRuleId) {
logger.Infof("notify_id: %d, event:%s, should skip notify", notifyRuleId, eventCopy.Hash)
continue
}
// notify
for i := range notifyRule.NotifyConfigs {
err := NotifyRuleMatchCheck(¬ifyRule.NotifyConfigs[i], eventCopy)
if err != nil {
logger.Errorf("notify_id: %d, event:%s, channel_id:%d, template_id: %d, notify_config:%+v, err:%v", notifyRuleId, eventCopy.Hash, notifyRule.NotifyConfigs[i].ChannelID, notifyRule.NotifyConfigs[i].TemplateID, notifyRule.NotifyConfigs[i], err)
continue
}
notifyChannel := e.notifyChannelCache.Get(notifyRule.NotifyConfigs[i].ChannelID)
messageTemplate := e.messageTemplateCache.Get(notifyRule.NotifyConfigs[i].TemplateID)
if notifyChannel == nil {
sender.NotifyRecord(e.ctx, []*models.AlertCurEvent{eventCopy}, notifyRuleId, fmt.Sprintf("notify_channel_id:%d", notifyRule.NotifyConfigs[i].ChannelID), "", "", errors.New("notify_channel not found"))
logger.Warningf("notify_id: %d, event:%s, channel_id:%d, template_id: %d, notify_channel not found", notifyRuleId, eventCopy.Hash, notifyRule.NotifyConfigs[i].ChannelID, notifyRule.NotifyConfigs[i].TemplateID)
continue
}
if notifyChannel.RequestType != "flashduty" && notifyChannel.RequestType != "pagerduty" && messageTemplate == nil {
logger.Warningf("notify_id: %d, channel_name: %v, event:%s, template_id: %d, message_template not found", notifyRuleId, notifyChannel.Ident, eventCopy.Hash, notifyRule.NotifyConfigs[i].TemplateID)
sender.NotifyRecord(e.ctx, []*models.AlertCurEvent{eventCopy}, notifyRuleId, notifyChannel.Name, "", "", errors.New("message_template not found"))
continue
}
go SendByNotifyRule(e.ctx, e.userCache, e.userGroupCache, e.notifyChannelCache, e.configCvalCache, []*models.AlertCurEvent{eventCopy}, notifyRuleId, ¬ifyRule.NotifyConfigs[i], notifyChannel, messageTemplate)
}
}
}
}
// shouldSkipNotify is the default ShouldSkipNotify implementation. It
// returns true when the event is nil (it was dropped by a processor) or when
// it is a recovery event whose NotifyRecovered is 0. The ctx and
// notifyRuleId arguments are not used.
func shouldSkipNotify(ctx *ctx.Context, event *models.AlertCurEvent, notifyRuleId int64) bool {
	switch {
	case event == nil:
		// dropped by a processor: nothing left to notify
		return true
	case event.IsRecovered && event.NotifyRecovered == 0:
		// recovery notifications are turned off for this event
		return true
	default:
		return false
	}
}
// HandleEventPipeline runs event through every enabled pipeline in
// pipelineConfigs, in order, and returns the resulting event.
//
// Pipelines that are missing from eventProcessorCache, not applicable to the
// event, or fail to execute are skipped. When a pipeline yields a nil event
// the event counts as dropped: nil is returned, and for from == "notify_rule"
// the drop is written to the notify record against eventOrigin. After the
// last pipeline the event's FE2DB and FillTagsMap are called. id and from
// identify the caller in logs and in the trigger context.
func HandleEventPipeline(pipelineConfigs []models.PipelineConfig, eventOrigin, event *models.AlertCurEvent, eventProcessorCache *memsto.EventProcessorCacheType, ctx *ctx.Context, id int64, from string) *models.AlertCurEvent {
	wf := engine.NewWorkflowEngine(ctx)
	triggeredBy := from + "_" + strconv.FormatInt(id, 10)

	for i := range pipelineConfigs {
		if !pipelineConfigs[i].Enable {
			continue
		}
		pipelineID := pipelineConfigs[i].PipelineId

		ep := eventProcessorCache.Get(pipelineID)
		switch {
		case ep == nil:
			logger.Warningf("processor_by_%s_id:%d pipeline_id:%d, event pipeline not found, event: %s", from, id, pipelineID, event.Hash)
			continue
		case !PipelineApplicable(ep, event):
			logger.Debugf("processor_by_%s_id:%d pipeline_id:%d, event pipeline not applicable, event: %s", from, id, pipelineID, event.Hash)
			continue
		}

		// The workflow engine executes both linear and workflow pipelines.
		out, result, err := wf.Execute(ep, event, &models.WorkflowTriggerContext{
			Mode:      models.TriggerModeEvent,
			TriggerBy: triggeredBy,
		})
		if err != nil {
			logger.Errorf("processor_by_%s_id:%d pipeline_id:%d, pipeline execute error: %v", from, id, pipelineID, err)
			continue
		}

		if out == nil {
			logger.Infof("processor_by_%s_id:%d pipeline_id:%d, event dropped, event: %s", from, id, pipelineID, eventOrigin.Hash)
			if from == "notify_rule" {
				sender.NotifyRecord(ctx, []*models.AlertCurEvent{eventOrigin}, id, "", "", result.Message, fmt.Errorf("processor_by_%s_id:%d pipeline_id:%d, drop by pipeline", from, id, pipelineID))
			}
			return nil
		}

		event = out
		logger.Infof("processor_by_%s_id:%d pipeline_id:%d, pipeline executed, status:%s, message:%s", from, id, pipelineID, result.Status, result.Message)
	}

	event.FE2DB()
	event.FillTagsMap()
	return event
}
// PipelineApplicable reports whether pipeline should run for event.
//
// A nil pipeline, or one with FilterEnable off, always applies. Otherwise the
// event must satisfy both the label filters (checked against event.TagsMap)
// and the attribute filters (checked against event.JsonTagsAndValue()); an
// empty filter list counts as satisfied. A filter list that fails to parse
// makes the pipeline not applicable.
func PipelineApplicable(pipeline *models.EventPipeline, event *models.AlertCurEvent) bool {
	if pipeline == nil || !pipeline.FilterEnable {
		return true
	}

	labelsOK := true
	if n := len(pipeline.LabelFilters); n > 0 {
		// Work on a copy so the cached pipeline's filters are never written to.
		labelFilters := append(make([]models.TagFilter, 0, n), pipeline.LabelFilters...)
		for i := range labelFilters {
			if labelFilters[i].Func == "" {
				labelFilters[i].Func = labelFilters[i].Op
			}
		}

		parsed, err := models.ParseTagFilter(labelFilters)
		if err != nil {
			logger.Errorf("pipeline applicable failed to parse tag filter: %v event:%s pipeline:%+v", err, event.Hash, pipeline)
			return false
		}
		labelsOK = common.MatchTags(event.TagsMap, parsed)
	}

	attrsOK := true
	if n := len(pipeline.AttrFilters); n > 0 {
		// Same copy-before-parse precaution as for the label filters.
		attrFilters := append(make([]models.TagFilter, 0, n), pipeline.AttrFilters...)

		parsed, err := models.ParseTagFilter(attrFilters)
		if err != nil {
			logger.Errorf("pipeline applicable failed to parse tag filter: %v event:%s pipeline:%+v err:%v", parsed, event.Hash, pipeline, err)
			return false
		}
		attrsOK = common.MatchTags(event.JsonTagsAndValue(), parsed)
	}

	return labelsOK && attrsOK
}
// NotifyRuleMatchCheck verifies that event passes every filter of
// notifyConfig and returns nil if it does, or an error naming the first
// filter that rejected it. The filters are checked in this order:
//
//  1. time ranges: the event's trigger time (formatted "15:04", with its
//     weekday) must fall into at least one range; no ranges means always;
//  2. severities: the event's severity must be listed;
//  3. label keys: matched against event.TagsMap;
//  4. attributes: matched against event.JsonTagsAndValue().
func NotifyRuleMatchCheck(notifyConfig *models.NotifyConfig, event *models.AlertCurEvent) error {
	firedAt := time.Unix(event.TriggerTime, 0)
	clock := firedAt.Format("15:04")
	weekday := int(firedAt.Weekday())

	timeMatch := len(notifyConfig.TimeRanges) == 0
	for _, tr := range notifyConfig.TimeRanges {
		if timeMatch {
			break
		}

		start, end := tr.Start, tr.End
		inWindow := true // start == end places no restriction on the time of day
		switch {
		case start < end && end == "23:59":
			// e.g. 02:00-23:59: the end is treated as inclusive
			inWindow = clock >= start
		case start < end:
			// e.g. 02:00-04:00: half-open interval [start, end)
			inWindow = clock >= start && clock < end
		case start > end:
			// e.g. 21:00-09:00: the window wraps around midnight
			inWindow = clock >= start || clock < end
		}
		if !inWindow {
			continue
		}

		for _, day := range tr.Week {
			if day == weekday {
				timeMatch = true
				break
			}
		}
	}
	if !timeMatch {
		return fmt.Errorf("event time not match time filter")
	}

	severityMatch := false
	for _, severity := range notifyConfig.Severities {
		if severity == event.Severity {
			severityMatch = true
			break
		}
	}
	if !severityMatch {
		return fmt.Errorf("event severity not match severity filter")
	}

	tagMatch := true
	if n := len(notifyConfig.LabelKeys); n > 0 {
		// Work on a copy so the cached notify config's filters are never written to.
		labelKeys := append(make([]models.TagFilter, 0, n), notifyConfig.LabelKeys...)
		for i := range labelKeys {
			if labelKeys[i].Func == "" {
				labelKeys[i].Func = labelKeys[i].Op
			}
		}

		parsed, err := models.ParseTagFilter(labelKeys)
		if err != nil {
			logger.Errorf("notify send failed to parse tag filter: %v event:%s notify_config:%+v", err, event.Hash, notifyConfig)
			return fmt.Errorf("failed to parse tag filter: %v", err)
		}
		tagMatch = common.MatchTags(event.TagsMap, parsed)
	}
	if !tagMatch {
		return fmt.Errorf("event tag not match tag filter")
	}

	attributesMatch := true
	if n := len(notifyConfig.Attributes); n > 0 {
		// Same copy-before-parse precaution as for the label keys.
		attributes := append(make([]models.TagFilter, 0, n), notifyConfig.Attributes...)

		parsed, err := models.ParseTagFilter(attributes)
		if err != nil {
			logger.Errorf("notify send failed to parse tag filter: %v event:%s notify_config:%+v err:%v", parsed, event.Hash, notifyConfig, err)
			return fmt.Errorf("failed to parse tag filter: %v", err)
		}
		attributesMatch = common.MatchTags(event.JsonTagsAndValue(), parsed)
	}
	if !attributesMatch {
		return fmt.Errorf("event attributes not match attributes filter")
	}

	logger.Infof("notify send timeMatch:%v severityMatch:%v tagMatch:%v attributesMatch:%v event:%s notify_config:%+v", timeMatch, severityMatch, tagMatch, attributesMatch, event.Hash, notifyConfig)
	return nil
}
// GetNotifyConfigParams splits notifyConfig.Params into the pieces the senders need.
//
// It returns, in order:
//   - the contact values (phone, email, or the token named by contactKey) of every
//     user referenced by "user_ids" and "user_group_ids", de-duplicated by user ID;
//   - the FlashDuty channel IDs found under "ids";
//   - the PagerDuty routing keys found under "pagerduty_integration_keys";
//   - every remaining parameter rendered as a string.
//
// Values that cannot be decoded into the expected shape are ignored. The
// "pagerduty_integration_ids" key is skipped on purpose.
func GetNotifyConfigParams(notifyConfig *models.NotifyConfig, contactKey string, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) ([]string, []int64, []string, map[string]string) {
	customParams := make(map[string]string)
	var flashDutyChannelIDs []int64
	var pagerDutyRoutingKeys []string
	var userInfoParams models.CustomParams

	for key, value := range notifyConfig.Params {
		switch key {
		case "user_ids", "user_group_ids", "ids":
			// Round-trip through JSON to coerce the loosely typed value into []int64.
			data, err := json.Marshal(value)
			if err != nil {
				continue
			}
			var ids []int64
			if json.Unmarshal(data, &ids) != nil {
				continue
			}
			switch key {
			case "user_ids":
				userInfoParams.UserIDs = ids
			case "user_group_ids":
				userInfoParams.UserGroupIDs = ids
			case "ids":
				flashDutyChannelIDs = ids
			}
		case "pagerduty_integration_ids":
			// Not processed: this field only exists as a marker for the front end.
		case "pagerduty_integration_keys":
			data, err := json.Marshal(value)
			if err != nil {
				continue
			}
			var keys []string
			if json.Unmarshal(data, &keys) == nil {
				pagerDutyRoutingKeys = keys
			}
		default:
			// A bare value.(string) assertion panics on anything that is not a
			// string, so convert each supported type explicitly instead.
			switch v := value.(type) {
			case string:
				customParams[key] = v
			case nil:
				customParams[key] = ""
			case bool:
				customParams[key] = strconv.FormatBool(v)
			case float64:
				// JSON numbers decode to float64; avoid exponent notation.
				customParams[key] = strconv.FormatFloat(v, 'f', -1, 64)
			default:
				if data, err := json.Marshal(v); err == nil {
					customParams[key] = string(data)
				} else {
					customParams[key] = fmt.Sprint(v)
				}
			}
		}
	}

	if len(userInfoParams.UserIDs) == 0 && len(userInfoParams.UserGroupIDs) == 0 {
		return []string{}, flashDutyChannelIDs, pagerDutyRoutingKeys, customParams
	}

	// Expand the user groups into their member IDs and merge with the direct user IDs.
	userIds := make([]int64, 0, len(userInfoParams.UserIDs))
	userIds = append(userIds, userInfoParams.UserIDs...)
	if len(userInfoParams.UserGroupIDs) > 0 {
		userGroups := userGroupCache.GetByUserGroupIds(userInfoParams.UserGroupIDs)
		for _, userGroup := range userGroups {
			userIds = append(userIds, userGroup.UserIds...)
		}
	}

	users := userCache.GetByUserIds(userIds)
	visited := make(map[int64]bool)
	sendtos := make([]string, 0)
	for _, user := range users {
		if visited[user.Id] {
			continue
		}
		var sendto string
		switch contactKey {
		case "phone":
			sendto = user.Phone
		case "email":
			sendto = user.Email
		default:
			sendto, _ = user.ExtractToken(contactKey)
		}
		// Users without a value for the requested contact are skipped.
		if sendto == "" {
			continue
		}
		sendtos = append(sendtos, sendto)
		visited[user.Id] = true
	}
	return sendtos, flashDutyChannelIDs, pagerDutyRoutingKeys, customParams
}
// SendNotifyRuleMessage delivers events through notifyChannel on behalf of the
// notify rule identified by notifyRuleId.
//
// The message template is rendered for every request type except "flashduty".
// Recipients and extra parameters are derived from notifyConfig via
// GetNotifyConfigParams. Dispatch then depends on notifyChannel.RequestType:
//   - "flashduty": one SendFlashDuty call per channel ID (a single 0 when none is configured);
//   - "pagerduty": one SendPagerDuty call per routing key;
//   - "http": the task is enqueued on notifyChannelCache; an enqueue failure is recorded;
//   - "smtp": SendEmail is called;
//   - "script": SendScript is called and its result recorded.
//
// Any other request type only produces a warning log. Nothing is sent when
// events is empty.
func SendNotifyRuleMessage(ctx *ctx.Context, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType, notifyChannelCache *memsto.NotifyChannelCacheType, configCvalCache *memsto.CvalCache,
	events []*models.AlertCurEvent, notifyRuleId int64, notifyConfig *models.NotifyConfig, notifyChannel *models.NotifyChannelConfig, messageTemplate *models.MessageTemplate) {
	if len(events) == 0 {
		logger.Errorf("notify_id: %d events is empty", notifyRuleId)
		return
	}
	siteInfo := configCvalCache.GetSiteInfo()
	tplContent := make(map[string]interface{})
	// The flashduty branch below does not use tplContent, so rendering is skipped for it.
	if notifyChannel.RequestType != "flashduty" {
		tplContent = messageTemplate.RenderEvent(events, siteInfo.SiteUrl)
	}
	var contactKey string
	if notifyChannel.ParamConfig != nil && notifyChannel.ParamConfig.UserInfo != nil {
		contactKey = notifyChannel.ParamConfig.UserInfo.ContactKey
	}
	sendtos, flashDutyChannelIDs, pagerdutyRoutingKeys, customParams := GetNotifyConfigParams(notifyConfig, contactKey, userCache, userGroupCache)
	switch notifyChannel.RequestType {
	case "flashduty":
		if len(flashDutyChannelIDs) == 0 {
			flashDutyChannelIDs = []int64{0} // No flashduty channel configured: use 0 so SendFlashDuty can tell, and omit the channel_id parameter.
		}
		for i := range flashDutyChannelIDs {
			start := time.Now()
			respBody, err := notifyChannel.SendFlashDuty(events, flashDutyChannelIDs[i], notifyChannelCache.GetHttpClient(notifyChannel.ID))
			// Prefix the response with the send time and duration before logging and recording it.
			respBody = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), respBody)
			logger.Infof("duty_sender notify_id: %d, channel_name: %v, event:%s, IntegrationUrl: %v dutychannel_id: %v, respBody: %v, err: %v", notifyRuleId, notifyChannel.Name, events[0].Hash, notifyChannel.RequestConfig.FlashDutyRequestConfig.IntegrationUrl, flashDutyChannelIDs[i], respBody, err)
			sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, strconv.FormatInt(flashDutyChannelIDs[i], 10), respBody, err)
		}
	case "pagerduty":
		for _, routingKey := range pagerdutyRoutingKeys {
			start := time.Now()
			respBody, err := notifyChannel.SendPagerDuty(events, routingKey, siteInfo.SiteUrl, notifyChannelCache.GetHttpClient(notifyChannel.ID))
			respBody = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), respBody)
			logger.Infof("pagerduty_sender notify_id: %d, channel_name: %v, event:%s, respBody: %v, err: %v", notifyRuleId, notifyChannel.Name, events[0].Hash, respBody, err)
			sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, "", respBody, err)
		}
	case "http":
		// HTTP notifications are handled through a queue.
		// Build the notify task.
		task := &memsto.NotifyTask{
			Events:        events,
			NotifyRuleId:  notifyRuleId,
			NotifyChannel: notifyChannel,
			TplContent:    tplContent,
			CustomParams:  customParams,
			Sendtos:       sendtos,
		}
		// Put the task on the queue.
		success := notifyChannelCache.EnqueueNotifyTask(task)
		if !success {
			logger.Errorf("failed to enqueue notify task for channel %d, notify_id: %d", notifyChannel.ID, notifyRuleId)
			// Enqueueing failed: record the notification as an error.
			sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, getSendTarget(customParams, sendtos), "", errors.New("failed to enqueue notify task, queue is full"))
		}
	case "smtp":
		notifyChannel.SendEmail(notifyRuleId, events, tplContent, sendtos, notifyChannelCache.GetSmtpClient(notifyChannel.ID))
	case "script":
		start := time.Now()
		target, res, err := notifyChannel.SendScript(events, tplContent, customParams, sendtos)
		res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res)
		logger.Infof("script_sender notify_id: %d, channel_name: %v, event:%s, tplContent:%s, customParams:%v, target:%s, res:%s, err:%v", notifyRuleId, notifyChannel.Name, events[0].Hash, tplContent, customParams, target, res, err)
		sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, target, res, err)
	default:
		logger.Warningf("notify_id: %d, channel_name: %v, event:%s send type not found", notifyRuleId, notifyChannel.Name, events[0].Hash)
	}
}
// NeedBatchContacts reports whether the JSON encoding of requestConfig
// contains the "$sendtos" placeholder. A marshalling error is ignored, in
// which case the encoded form is empty and the result is false.
func NeedBatchContacts(requestConfig *models.HTTPRequestConfig) bool {
	const placeholder = "$sendtos"
	encoded, _ := json.Marshal(requestConfig)
	return strings.Index(string(encoded), placeholder) >= 0
}
// HandleEventNotify is the main entry point for dispatching an event.
//
// event is the alert or recovery event. isSubscribe tells whether the event
// was produced by a subscribe configuration.
//
// Notify-rule handling is always started in its own goroutine. The rest is
// skipped for recovery events whose NotifyRecovered is 0, and when the alert
// rule is no longer in the cache. Otherwise the notify targets are collected
// from the handlers, sending is started in a goroutine, and, for
// non-subscribe events, the matching subscriptions are processed.
func (e *Dispatch) HandleEventNotify(event *models.AlertCurEvent, isSubscribe bool) {
	go e.HandleEventWithNotifyRule(event)
	if event.IsRecovered && event.NotifyRecovered == 0 {
		return
	}
	if !isSubscribe {
		// The static global webhook receives a deep copy of the event.
		go sender.SendStaticGlobalWebhook(e.ctx, event.DeepCopy(), e.Astats)
	}
	rule := e.alertRuleCache.Get(event.RuleId)
	if rule == nil {
		return
	}
	fillUsers(event, e.userCache, e.userGroupCache)
	var (
		// Handlers map the event to notify targets; their results are combined with OrMerge.
		handlers []NotifyTargetDispatch
		// Interceptors remove targets; their results are combined with AndMerge, e.g. setting
		// channel=false stops sending through that channel after the merge.
		// Add any Dispatch implementations of this kind to interceptorHandlers.
		interceptorHandlers []NotifyTargetDispatch
	)
	if isSubscribe {
		handlers = []NotifyTargetDispatch{NotifyGroupDispatch, EventCallbacksDispatch}
	} else {
		handlers = []NotifyTargetDispatch{NotifyGroupDispatch, GlobalWebhookDispatch, EventCallbacksDispatch}
	}
	notifyTarget := NewNotifyTarget()
	// Subscription relations are combined with OrMerge.
	for _, handler := range handlers {
		notifyTarget.OrMerge(handler(rule, event, notifyTarget, e))
	}
	// Removal logic, e.g. an employee leaving or a channel being silenced temporarily.
	// interceptorHandlers is never populated in this function, so this loop is currently a no-op.
	for _, handler := range interceptorHandlers {
		notifyTarget.AndMerge(handler(rule, event, notifyTarget, e))
	}
	go e.Send(rule, event, notifyTarget, isSubscribe)
	// An event that did not come from a subscribe rule must also be run through the subscribe rules.
	if !isSubscribe {
		e.handleSubs(event)
	}
}
// handleSubs runs event through every alert subscription that may apply to
// it: first those registered for the event's rule, then the global ones
// registered under rule ID 0. Each subscription receives its own copy of the
// event.
func (e *Dispatch) handleSubs(event *models.AlertCurEvent) {
	candidates := make([]*models.AlertSubscribe, 0)

	// Subscriptions bound to this specific rule.
	ruleSubs, found := e.alertSubscribeCache.Get(event.RuleId)
	if found {
		candidates = append(candidates, ruleSubs...)
	}

	// Global subscriptions.
	globalSubs, found := e.alertSubscribeCache.Get(0)
	if found {
		candidates = append(candidates, globalSubs...)
	}

	for i := range candidates {
		e.handleSub(candidates[i], *event)
	}
}
// handleSub applies one subscription to an event. The event is taken by
// value on purpose: it is modified below, and the caller's copy must stay
// untouched.
//
// The event is dropped when the subscription is disabled, when any of its
// cluster, product, category, tag or business-group filters does not match,
// when the event has been firing for less than sub.ForDuration, or when the
// severity list is non-empty and contains neither the event severity nor 0.
// A surviving event is rewritten by the subscription, tagged with its ID,
// logged, and dispatched as a subscribe event.
func (e *Dispatch) handleSub(sub *models.AlertSubscribe, event models.AlertCurEvent) {
	// Cases are evaluated in order and stop at the first one that holds.
	switch {
	case sub.IsDisabled(),
		!sub.MatchCluster(event.DatasourceId),
		!sub.MatchProd(event.RuleProd),
		!sub.MatchCate(event.Cate),
		!common.MatchTags(event.TagsMap, sub.ITags),
		// Business-group filter of the event.
		!common.MatchGroupsName(event.GroupName, sub.IBusiGroups),
		sub.ForDuration > (event.TriggerTime - event.FirstTriggerTime):
		return
	}

	if total := len(sub.SeveritiesJson); total != 0 {
		idx := 0
		for idx < total && sub.SeveritiesJson[idx] != event.Severity && sub.SeveritiesJson[idx] != 0 {
			idx++
		}
		if idx == total {
			return
		}
	}

	e.Astats.CounterSubEventTotal.WithLabelValues(event.GroupName).Inc()
	sub.ModifyEvent(&event)
	event.SubRuleId = sub.Id
	LogEvent(&event, "subscribe")
	e.HandleEventNotify(&event, true)
}
// Send delivers event to everything listed in notifyTarget.
//
// When BeforeSenderHook allows it, the event is handed to the sender
// registered for each channel. After that the event callbacks are invoked,
// the global webhooks are sent (batched or one by one, unless the event
// overrides them), the notify script plugin is started in a goroutine, and,
// for non-subscribe events, the ibex task templates of the rule are triggered.
func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget, isSubscribe bool) {
	if e.BeforeSenderHook(event) {
		for channel, uids := range notifyTarget.ToChannelUserMap() {
			msgCtx := sender.BuildMessageContext(e.ctx, rule, []*models.AlertCurEvent{event},
				uids, e.userCache, e.Astats)

			e.RwLock.RLock()
			s := e.Senders[channel]
			e.RwLock.RUnlock()
			if s == nil {
				logger.Debugf("no sender for channel: %s", channel)
				continue
			}

			// Read the hash defensively: the message context may carry no
			// events, and a nil event must not be dereferenced just for a
			// debug log line.
			var eventHash string
			if len(msgCtx.Events) > 0 && msgCtx.Events[0] != nil {
				eventHash = msgCtx.Events[0].Hash
			}
			logger.Debugf("send to channel:%s event:%s users:%+v", channel, eventHash, msgCtx.Users)
			s.Send(msgCtx)
		}
	}

	// handle event callbacks
	e.SendCallbacks(rule, notifyTarget, event)

	// handle global webhooks
	if !event.OverrideGlobalWebhook() {
		if e.alerting.WebhookBatchSend {
			sender.BatchSendWebhooks(e.ctx, notifyTarget.ToWebhookMap(), event, e.Astats)
		} else {
			sender.SingleSendWebhooks(e.ctx, notifyTarget.ToWebhookMap(), event, e.Astats)
		}
	}

	// handle plugin call
	go sender.MayPluginNotify(e.ctx, e.genNoticeBytes(event), e.notifyConfigCache.
		GetNotifyScript(), e.Astats, event)

	if !isSubscribe {
		// handle ibex callbacks
		e.HandleIbex(rule, event)
	}
}
// SendCallbacks invokes every callback URL collected in notifyTarget for event.
//
// A URL that is also an enabled global webhook is skipped unless the event
// overrides the global webhooks. URLs starting with "${ibex}" go to the ibex
// callback. Everything else is routed by host: Feishu and Lark URLs carrying
// card=1 go to their card callbacks, a host with a registered callback goes
// to it, and the remainder goes to the default callback. URLs without an
// http:// or https:// prefix are treated as http.
func (e *Dispatch) SendCallbacks(rule *models.AlertRule, notifyTarget *NotifyTarget, event *models.AlertCurEvent) {
	uids := notifyTarget.ToUidList()
	urls := notifyTarget.ToCallbackList()
	whMap := notifyTarget.ToWebhookMap()
	ogw := event.OverrideGlobalWebhook()

	for _, urlStr := range urls {
		if len(urlStr) == 0 {
			continue
		}

		cbCtx := sender.BuildCallBackContext(e.ctx, urlStr, rule, []*models.AlertCurEvent{event}, uids, e.userCache, e.alerting.WebhookBatchSend, e.Astats)

		if wh, ok := whMap[cbCtx.CallBackURL]; !ogw && ok && wh.Enable {
			logger.Debugf("SendCallbacks: webhook[%s] is in global conf.", cbCtx.CallBackURL)
			continue
		}

		if strings.HasPrefix(urlStr, "${ibex}") {
			e.CallBacks[models.IbexDomain].CallBack(cbCtx)
			continue
		}

		if !(strings.HasPrefix(urlStr, "http://") || strings.HasPrefix(urlStr, "https://")) {
			cbCtx.CallBackURL = "http://" + urlStr
		}

		// Parse the URL that will actually be called. Parsing the raw urlStr
		// would leave Host empty for scheme-less entries such as
		// "open.feishu.cn/hook?card=1", so the host-based routing below would
		// never match them.
		parsedURL, err := url.Parse(cbCtx.CallBackURL)
		if err != nil {
			logger.Errorf("SendCallbacks: failed to url.Parse(urlStr=%s): %v", cbCtx.CallBackURL, err)
			continue
		}

		// process feishu card
		if parsedURL.Host == models.FeishuDomain && parsedURL.Query().Get("card") == "1" {
			e.CallBacks[models.FeishuCardDomain].CallBack(cbCtx)
			continue
		}

		// process lark card
		if parsedURL.Host == models.LarkDomain && parsedURL.Query().Get("card") == "1" {
			e.CallBacks[models.LarkCardDomain].CallBack(cbCtx)
			continue
		}

		if callBacker, ok := e.CallBacks[parsedURL.Host]; ok {
			callBacker.CallBack(cbCtx)
		} else {
			e.CallBacks[models.DefaultDomain].CallBack(cbCtx)
		}
	}
}
// HandleIbex triggers the ibex task templates listed under "task_tpls" in the
// rule's RuleConfig. Recovery events trigger nothing. Templates with a zero
// TplId are skipped. A template without hosts runs against the event's
// TargetIdent; otherwise it runs once per listed host.
func (e *Dispatch) HandleIbex(rule *models.AlertRule, event *models.AlertCurEvent) {
	// Only the task templates are needed from RuleConfig. A decode error is
	// ignored, which leaves whatever was decoded so far.
	var parsed struct {
		TaskTpls []*models.Tpl `json:"task_tpls"`
	}
	json.Unmarshal([]byte(rule.RuleConfig), &parsed)

	// Recovery events do not go through the self-healing logic.
	if event.IsRecovered {
		return
	}

	for idx := range parsed.TaskTpls {
		tpl := parsed.TaskTpls[idx]
		switch {
		case tpl.TplId == 0:
			// No template referenced.
		case len(tpl.Host) == 0:
			sender.CallIbex(e.ctx, tpl.TplId, event.TargetIdent,
				e.taskTplsCache, e.targetCache, e.userCache, event, "")
		default:
			for _, target := range tpl.Host {
				sender.CallIbex(e.ctx, tpl.TplId, target,
					e.taskTplsCache, e.targetCache, e.userCache, event, "")
			}
		}
	}
}
// Notice is the JSON payload built by genNoticeBytes: the event together with
// the output of every loaded template.
type Notice struct {
	// Event is the alert event being notified.
	Event *models.AlertCurEvent `json:"event"`
	// Tpls maps a template file name to its rendered text, or to the
	// rendering error message when execution failed.
	Tpls map[string]string `json:"tpls"`
}
// genNoticeBytes renders every loaded template against event and returns the
// JSON encoding of the resulting Notice. A template that fails to execute
// contributes its error message instead of output. It returns nil when the
// notice cannot be marshalled. The read lock is held for the whole call.
func (e *Dispatch) genNoticeBytes(event *models.AlertCurEvent) []byte {
	rendered := make(map[string]string)

	e.RwLock.RLock()
	defer e.RwLock.RUnlock()

	for name, tpl := range e.tpls {
		var out bytes.Buffer
		execErr := tpl.Execute(&out, event)
		if execErr != nil {
			rendered[name] = execErr.Error()
			continue
		}
		rendered[name] = out.String()
	}

	payload, err := json.Marshal(Notice{Event: event, Tpls: rendered})
	if err == nil {
		return payload
	}
	logger.Errorf("event_notify: failed to marshal notice: %v", err)
	return nil
}
// fillUsers populates ce.NotifyGroupsObj and ce.NotifyUsersObj for alerting.
// Group IDs are parsed from ce.NotifyGroupsJSON (unparsable entries are
// skipped), the groups are loaded from ugc, and the union of their member
// IDs is loaded from uc.
func fillUsers(ce *models.AlertCurEvent, uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) {
	groupIDs := make([]int64, 0, len(ce.NotifyGroupsJSON))
	for _, raw := range ce.NotifyGroupsJSON {
		if id, err := strconv.ParseInt(raw, 10, 64); err == nil {
			groupIDs = append(groupIDs, id)
		}
	}
	ce.NotifyGroupsObj = ugc.GetByUserGroupIds(groupIDs)

	// The set de-duplicates users that belong to several groups.
	members := make(map[int64]struct{})
	for _, group := range ce.NotifyGroupsObj {
		for _, uid := range group.UserIds {
			members[uid] = struct{}{}
		}
	}
	ce.NotifyUsersObj = uc.GetByUserIds(mapKeys(members))
}
// mapKeys returns the keys of m as a non-nil slice, in unspecified order.
func mapKeys(m map[int64]struct{}) []int64 {
	keys := make([]int64, len(m))
	next := 0
	for key := range m {
		keys[next] = key
		next++
	}
	return keys
}
// getSendTarget builds the comma-separated target string stored in a notify
// record. Without custom parameters it is the joined sendtos. Otherwise it is
// the joined custom parameter values, each longer than four runes having its
// last four runes replaced by "****". The values follow map iteration order.
func getSendTarget(customParams map[string]string, sendtos []string) string {
	const keepPlainUpTo = 4

	if len(customParams) == 0 {
		return strings.Join(sendtos, ",")
	}

	masked := make([]string, 0)
	for _, raw := range customParams {
		chars := []rune(raw)
		if len(chars) > keepPlainUpTo {
			raw = string(chars[:len(chars)-keepPlainUpTo]) + "****"
		}
		masked = append(masked, raw)
	}
	return strings.Join(masked, ",")
}
================================================
FILE: alert/dispatch/log.go
================================================
package dispatch
import (
"github.com/ccfos/nightingale/v6/models"
"github.com/toolkits/pkg/logger"
)
// LogEvent writes one info-level line describing event at the given
// processing location. The status is "recovered" or "triggered" depending on
// event.IsRecovered. When a non-nil error is passed as the first variadic
// argument, its text is appended as "error_message: ...".
func LogEvent(event *models.AlertCurEvent, location string, err ...error) {
	var status, message string

	if event.IsRecovered {
		status = "recovered"
	} else {
		status = "triggered"
	}

	if len(err) > 0 && err[0] != nil {
		message = "error_message: " + err[0].Error()
	}

	const layout = "alert_eval_%d event(%s %s) %s: sub_id:%d notify_rule_ids:%v cluster:%s %v%s@%d last_eval_time:%d %s"
	logger.Infof(layout,
		event.RuleId, event.Hash, status, location,
		event.SubRuleId, event.NotifyRuleIds, event.Cluster,
		event.TagsJSON, event.TriggerValue, event.TriggerTime,
		event.LastEvalTime, message,
	)
}
================================================
FILE: alert/dispatch/notify_channel.go
================================================
package dispatch
// NotifyChannels maps a channel key to whether notifications should be sent
// through that channel.
type NotifyChannels map[string]bool

// NewNotifyChannels returns a NotifyChannels with every listed channel enabled.
func NewNotifyChannels(channels []string) NotifyChannels {
	enabled := NotifyChannels{}
	for i := range channels {
		enabled[channels[i]] = true
	}
	return enabled
}

// OrMerge merges other into nc; a channel present in both is enabled when
// either side enables it.
func (nc NotifyChannels) OrMerge(other NotifyChannels) {
	either := func(mine, theirs bool) bool { return mine || theirs }
	nc.merge(other, either)
}

// AndMerge merges other into nc; a channel present in both stays enabled only
// when both sides enable it.
func (nc NotifyChannels) AndMerge(other NotifyChannels) {
	both := func(mine, theirs bool) bool { return mine && theirs }
	nc.merge(other, both)
}

// merge folds other into nc. Channels known to both are combined with f;
// channels only present in other are copied over unchanged. A nil other is a
// no-op.
func (nc NotifyChannels) merge(other NotifyChannels, f func(bool, bool) bool) {
	if other == nil {
		return
	}
	for channel, incoming := range other {
		current, known := nc[channel]
		if !known {
			nc[channel] = incoming
			continue
		}
		nc[channel] = f(current, incoming)
	}
}
================================================
FILE: alert/dispatch/notify_target.go
================================================
package dispatch
import (
"strconv"
"github.com/ccfos/nightingale/v6/models"
)
// NotifyTarget holds every destination an event must be sent to: user and
// channel pairs, webhooks and callbacks. Because the data is kept in maps,
// duplicates collapse automatically.
type NotifyTarget struct {
	// userMap maps a user ID to the channels enabled for that user.
	userMap map[int64]NotifyChannels
	// webhooks is keyed by webhook URL (see GlobalWebhookDispatch).
	webhooks map[string]*models.Webhook
	// callbacks is the set of callback URLs.
	callbacks map[string]struct{}
}
// NewNotifyTarget returns an empty NotifyTarget whose maps are all
// initialised and ready for writing.
func NewNotifyTarget() *NotifyTarget {
	target := new(NotifyTarget)
	target.userMap = map[int64]NotifyChannels{}
	target.webhooks = map[string]*models.Webhook{}
	target.callbacks = map[string]struct{}{}
	return target
}
// OrMerge merges other into s, combining the per-user channel maps with OR
// semantics. This makes it easy to compose several strategies, for example
// routing on a tag.
func (s *NotifyTarget) OrMerge(other *NotifyTarget) {
	s.merge(other, NotifyChannels.OrMerge)
}
// AndMerge merges other into s, combining the boolean values of the per-user
// channel maps with AND semantics, so a notification can be removed for a
// single user or channel.
// Typical scenarios:
// 1. An employee has left and must no longer receive alerts.
// 2. A notification channel is under maintenance and should temporarily not be used.
// 3. On-call redirection, e.g. additionally sending high-severity alerts to emergency staff.
// Implement your own router according to business needs.
func (s *NotifyTarget) AndMerge(other *NotifyTarget) {
	s.merge(other, NotifyChannels.AndMerge)
}
// merge folds other into s. For a user present on both sides, f combines the
// two channel maps in place; a user only present in other has its channel
// map adopted as is. Webhooks and callbacks from other are always copied
// over, overwriting entries with the same key. A nil other is a no-op.
func (s *NotifyTarget) merge(other *NotifyTarget, f func(NotifyChannels, NotifyChannels)) {
	if other == nil {
		return
	}

	for uid, incoming := range other.userMap {
		existing, known := s.userMap[uid]
		if !known {
			s.userMap[uid] = incoming
			continue
		}
		f(existing, incoming)
	}

	for hookURL, hook := range other.webhooks {
		s.webhooks[hookURL] = hook
	}

	for cb, marker := range other.callbacks {
		s.callbacks[cb] = marker
	}
}
// ToChannelUserMap inverts userMap (uid -> channel -> bool) into
// channel -> []uid, keeping only the pairs whose flag is true.
func (s *NotifyTarget) ToChannelUserMap() map[string][]int64 {
	byChannel := make(map[string][]int64)
	for uid, channels := range s.userMap {
		for name, enabled := range channels {
			if !enabled {
				continue
			}
			byChannel[name] = append(byChannel[name], uid)
		}
	}
	return byChannel
}
// ToCallbackList returns the callback URLs as a non-nil slice, in unspecified
// order.
func (s *NotifyTarget) ToCallbackList() []string {
	list := make([]string, len(s.callbacks))
	pos := 0
	for cbURL := range s.callbacks {
		list[pos] = cbURL
		pos++
	}
	return list
}
// ToWebhookMap returns the webhooks map itself, not a copy, so changes made
// by the caller are visible in s.
func (s *NotifyTarget) ToWebhookMap() map[string]*models.Webhook {
	return s.webhooks
}
// ToUidList returns the IDs of all users in userMap as a non-nil slice, in
// unspecified order. Users are included regardless of their channel flags.
func (s *NotifyTarget) ToUidList() []int64 {
	list := make([]int64, len(s.userMap))
	pos := 0
	for uid := range s.userMap {
		list[pos] = uid
		pos++
	}
	return list
}
// NotifyTargetDispatch abstracts a routing strategy from an alert event to
// the receivers of its notifications.
// rule: the alert rule.
// event: the alert event.
// prev: the previous routing result. An implementation may modify prev
// directly, or return a new NotifyTarget to be combined via AndMerge/OrMerge.
type NotifyTargetDispatch func(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget
// NotifyGroupDispatch resolves the notify groups of the event into users.
// Group IDs are parsed from event.NotifyGroupsJSON (unparsable entries are
// skipped), and every member of those groups gets its own channel map built
// from event.NotifyChannelsJSON. The result is returned as a new
// NotifyTarget; prev is not touched.
func NotifyGroupDispatch(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget {
	ids := make([]int64, 0, len(event.NotifyGroupsJSON))
	for i := range event.NotifyGroupsJSON {
		if id, err := strconv.ParseInt(event.NotifyGroupsJSON[i], 10, 64); err == nil {
			ids = append(ids, id)
		}
	}

	userGroups := dispatch.userGroupCache.GetByUserGroupIds(ids)
	target := NewNotifyTarget()
	for _, ug := range userGroups {
		for _, uid := range ug.UserIds {
			// A fresh map per user, since later merges mutate these maps in place.
			target.userMap[uid] = NewNotifyChannels(event.NotifyChannelsJSON)
		}
	}
	return target
}
// GlobalWebhookDispatch returns a new NotifyTarget containing every enabled
// webhook from the notify config cache, keyed by URL. rule, event and prev
// are not used.
func GlobalWebhookDispatch(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget {
	configured := dispatch.notifyConfigCache.GetWebhooks()
	target := NewNotifyTarget()
	for i := range configured {
		if hook := configured[i]; hook.Enable {
			target.webhooks[hook.Url] = hook
		}
	}
	return target
}
// EventCallbacksDispatch adds every non-empty entry of event.CallbacksJSON
// directly to prev.callbacks. It always returns nil, so there is nothing for
// the caller to merge.
func EventCallbacksDispatch(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget {
	for i := range event.CallbacksJSON {
		if cb := event.CallbacksJSON[i]; cb != "" {
			prev.callbacks[cb] = struct{}{}
		}
	}
	return nil
}
================================================
FILE: alert/eval/alert_rule.go
================================================
package eval
import (
"context"
"fmt"
"strconv"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/naming"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/datasource/commons/eslike"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/prom"
"github.com/toolkits/pkg/logger"
)
// Scheduler keeps the set of running alert rule workers and the set of
// external processors in sync with the alert rule cache (see syncAlertRules).
type Scheduler struct {
	// key: hash
	// alertRules holds the workers currently running, keyed by AlertRuleWorker.Hash().
	alertRules map[string]*AlertRuleWorker
	// ExternalProcessors holds the processors of rules that have no worker
	// here; it is guarded by its own ExternalLock.
	ExternalProcessors *process.ExternalProcessorsType
	aconf              aconf.Alert
	// The caches below are read during syncing and handed to every Processor.
	alertRuleCache          *memsto.AlertRuleCacheType
	targetCache             *memsto.TargetCacheType
	targetsOfAlertRuleCache *memsto.TargetsOfAlertRuleCacheType
	busiGroupCache          *memsto.BusiGroupCacheType
	alertMuteCache          *memsto.AlertMuteCacheType
	datasourceCache         *memsto.DatasourceCacheType
	// promClients is handed to every AlertRuleWorker.
	promClients *prom.PromClientMap
	naming      *naming.Naming
	ctx         *ctx.Context
	stats       *astats.Stats
}
// NewScheduler builds a Scheduler from the given configuration and caches,
// installs the ES index pattern cache into the eslike package, and starts
// LoopSyncRules in a background goroutine bound to context.Background().
func NewScheduler(aconf aconf.Alert, externalProcessors *process.ExternalProcessorsType, arc *memsto.AlertRuleCacheType,
	targetCache *memsto.TargetCacheType, toarc *memsto.TargetsOfAlertRuleCacheType,
	busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType,
	promClients *prom.PromClientMap, naming *naming.Naming, ctx *ctx.Context, stats *astats.Stats) *Scheduler {
	s := new(Scheduler)
	s.aconf = aconf
	s.alertRules = map[string]*AlertRuleWorker{}
	s.ExternalProcessors = externalProcessors
	s.alertRuleCache = arc
	s.targetCache = targetCache
	s.targetsOfAlertRuleCache = toarc
	s.busiGroupCache = busiGroupCache
	s.alertMuteCache = alertMuteCache
	s.datasourceCache = datasourceCache
	s.promClients = promClients
	s.naming = naming
	s.ctx = ctx
	s.stats = stats

	eslike.SetEsIndexPatternCacheType(memsto.NewEsIndexPatternCacheType(ctx))

	go s.LoopSyncRules(context.Background())
	return s
}
// LoopSyncRules first sleeps for the configured EngineDelay seconds, then
// calls syncAlertRules nine seconds after the previous sync finished, until
// ctx is cancelled. The initial sleep does not observe ctx.
func (s *Scheduler) LoopSyncRules(ctx context.Context) {
	time.Sleep(time.Duration(s.aconf.EngineDelay) * time.Second)

	const syncInterval = 9000 * time.Millisecond
	timer := time.NewTimer(syncInterval)
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			s.syncAlertRules()
			// Re-arm only after the sync has finished, so the interval is
			// measured from the end of one sync to the start of the next.
			timer.Reset(syncInterval)
		}
	}
}
// syncAlertRules reconciles the running workers and external processors with
// the alert rule cache.
//
// It first computes the desired state. Prometheus and inner rules get one
// worker per matching, enabled datasource of the right plugin type that the
// datasource hash ring assigns to this endpoint. Host rules get a single
// worker with datasource ID 0 when the hash ring assigns the rule to this
// endpoint. Every other rule gets one external processor per matching,
// enabled datasource.
//
// It then starts workers that are new, stops workers that are no longer
// wanted, and, under ExternalLock, replaces external processors that are new
// or whose hash changed and removes the ones that disappeared.
func (s *Scheduler) syncAlertRules() {
	ids := s.alertRuleCache.GetRuleIds()
	// Desired state, rebuilt from scratch on every sync.
	alertRuleWorkers := make(map[string]*AlertRuleWorker)
	externalRuleWorkers := make(map[string]*process.Processor)
	for _, id := range ids {
		rule := s.alertRuleCache.Get(id)
		if rule == nil {
			continue
		}
		ruleType := rule.GetRuleType()
		if rule.IsPrometheusRule() || rule.IsInnerRule() {
			datasourceIds := s.datasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries)
			for _, dsId := range datasourceIds {
				// Skip (datasource, rule) pairs that the hash ring does not assign to this endpoint.
				if !naming.DatasourceHashRing.IsHit(strconv.FormatInt(dsId, 10), fmt.Sprintf("%d", rule.Id), s.aconf.Heartbeat.Endpoint) {
					continue
				}
				ds := s.datasourceCache.GetById(dsId)
				if ds == nil {
					logger.Debugf("alert_eval_%d datasource %d not found", rule.Id, dsId)
					continue
				}
				if ds.PluginType != ruleType {
					logger.Debugf("alert_eval_%d datasource %d category is %s not %s", rule.Id, dsId, ds.PluginType, ruleType)
					continue
				}
				if ds.Status != "enabled" {
					logger.Debugf("alert_eval_%d datasource %d status is %s", rule.Id, dsId, ds.Status)
					continue
				}
				processor := process.NewProcessor(s.aconf.Heartbeat.EngineName, rule, dsId, s.alertRuleCache, s.targetCache, s.targetsOfAlertRuleCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats)
				alertRule := NewAlertRuleWorker(rule, dsId, processor, s.promClients, s.ctx)
				alertRuleWorkers[alertRule.Hash()] = alertRule
			}
		} else if rule.IsHostRule() {
			// all host rule will be processed by center instance
			if !naming.DatasourceHashRing.IsHit(s.aconf.Heartbeat.EngineName, strconv.FormatInt(rule.Id, 10), s.aconf.Heartbeat.Endpoint) {
				continue
			}
			// Host rules are not tied to a datasource, hence datasource ID 0.
			processor := process.NewProcessor(s.aconf.Heartbeat.EngineName, rule, 0, s.alertRuleCache, s.targetCache, s.targetsOfAlertRuleCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats)
			alertRule := NewAlertRuleWorker(rule, 0, processor, s.promClients, s.ctx)
			alertRuleWorkers[alertRule.Hash()] = alertRule
		} else {
			// if rule is not processed by prometheus engine, create it as externalRule
			dsIds := s.datasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries)
			for _, dsId := range dsIds {
				ds := s.datasourceCache.GetById(dsId)
				if ds == nil {
					logger.Debugf("alert_eval_%d datasource %d not found", rule.Id, dsId)
					continue
				}
				if ds.Status != "enabled" {
					logger.Debugf("alert_eval_%d datasource %d status is %s", rule.Id, dsId, ds.Status)
					continue
				}
				processor := process.NewProcessor(s.aconf.Heartbeat.EngineName, rule, dsId, s.alertRuleCache, s.targetCache, s.targetsOfAlertRuleCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats)
				externalRuleWorkers[processor.Key()] = processor
			}
		}
	}
	// Start workers whose hash is not running yet. Each start is preceded by
	// Prepare and a 20ms pause.
	for hash, rule := range alertRuleWorkers {
		if _, has := s.alertRules[hash]; !has {
			rule.Prepare()
			time.Sleep(time.Duration(20) * time.Millisecond)
			rule.Start()
			s.alertRules[hash] = rule
		}
	}
	// Stop and forget workers that are no longer part of the desired state.
	for hash, rule := range s.alertRules {
		if _, has := alertRuleWorkers[hash]; !has {
			rule.Stop()
			delete(s.alertRules, hash)
		}
	}
	s.ExternalProcessors.ExternalLock.Lock()
	for key, processor := range externalRuleWorkers {
		if curProcessor, has := s.ExternalProcessors.Processors[key]; has {
			// The rule exists and its hash is unchanged, so it is treated as not modified.
			// A hash function covering more related data can be implemented separately if needed.
			if processor.Hash() == curProcessor.Hash() {
				continue
			}
		}
		// Either the rule is not present yet, or it is present with a different hash:
		// both cases require the rule to be updated.
		processor.RecoverAlertCurEventFromDb()
		s.ExternalProcessors.Processors[key] = processor
	}
	// Drop external processors that are no longer part of the desired state.
	for key := range s.ExternalProcessors.Processors {
		if _, has := externalRuleWorkers[key]; !has {
			delete(s.ExternalProcessors.Processors, key)
		}
	}
	s.ExternalProcessors.ExternalLock.Unlock()
}
================================================
FILE: alert/eval/eval.go
================================================
package eval
import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"reflect"
"sort"
"strconv"
"strings"
"sync"
"text/template"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/hash"
"github.com/ccfos/nightingale/v6/pkg/parser"
"github.com/ccfos/nightingale/v6/pkg/poster"
promsdk "github.com/ccfos/nightingale/v6/pkg/prom"
promql2 "github.com/ccfos/nightingale/v6/pkg/promql"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/pkg/unit"
"github.com/ccfos/nightingale/v6/prom"
"github.com/prometheus/common/model"
"github.com/robfig/cron/v3"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
)
// AlertRuleWorker evaluates one alert rule against one datasource on a cron
// schedule and passes the results to its Processor.
type AlertRuleWorker struct {
	// DatasourceId is the datasource this worker queries; NewAlertRuleWorker
	// is called with 0 for host rules.
	DatasourceId int64
	// Quit is closed by Stop.
	Quit chan struct{}
	// Inhibit is copied from the rule configuration while evaluating
	// Prometheus-style rules and is passed on to the Processor.
	Inhibit bool
	Rule    *models.AlertRule
	// Processor receives the anomaly and recovery points produced by Eval.
	Processor   *process.Processor
	PromClients *prom.PromClientMap
	Ctx         *ctx.Context
	// Scheduler runs Eval according to the rule's cron pattern.
	Scheduler *cron.Cron
	// HostAndDeviceIdentCache is reset at the start of every Eval.
	HostAndDeviceIdentCache sync.Map
	// LastSeriesStore is keyed by a uint64; presumably a series hash, its
	// use is not visible in this part of the file.
	LastSeriesStore map[uint64]models.DataResp
	// DeviceIdentHook defaults to a function returning (nil, nil).
	DeviceIdentHook func(arw *AlertRuleWorker, paramQuery models.ParamQuery) ([]string, error)
}
// Names of rule evaluation stages. GET_RULE_CONFIG and GET_CLIENT are passed
// as a label value to the rule evaluation error counter; the remaining names
// presumably serve the same purpose in code outside this view.
const (
	GET_RULE_CONFIG = "get_rule_config"
	GET_Processor   = "get_Processor"
	CHECK_QUERY     = "check_query_config"
	GET_CLIENT      = "get_client"
	QUERY_DATA      = "query_data"
	EXEC_TEMPLATE   = "exec_template"
)
const (
	// JoinMark is a "@@" marker string; its use is not visible in this part
	// of the file (presumably a separator when joining values).
	JoinMark = "@@"
)

// JoinType names a join mode.
type JoinType string

// Supported join modes.
const (
	Left  JoinType = "left"
	Right JoinType = "right"
	Inner JoinType = "inner"
)
// NewAlertRuleWorker creates a worker that evaluates rule against the
// datasource identified by datasourceId and feeds the results to Processor.
//
// When the rule has no cron pattern, one is derived from its PromEvalInterval
// (10 seconds when that is not positive) and written back to
// rule.CronPattern. The evaluation job is registered on a scheduler that
// skips a run while the previous one is still in progress; the scheduler is
// not started here.
//
// If the cron pattern cannot be registered, the error is logged and the
// worker is returned with an empty schedule: it will never evaluate, and
// Processor.ScheduleEntry / Processor.PromEvalInterval are left untouched.
func NewAlertRuleWorker(rule *models.AlertRule, datasourceId int64, Processor *process.Processor, promClients *prom.PromClientMap, ctx *ctx.Context) *AlertRuleWorker {
	arw := &AlertRuleWorker{
		DatasourceId:            datasourceId,
		Quit:                    make(chan struct{}),
		Rule:                    rule,
		Processor:               Processor,
		PromClients:             promClients,
		Ctx:                     ctx,
		HostAndDeviceIdentCache: sync.Map{},
		DeviceIdentHook: func(arw *AlertRuleWorker, paramQuery models.ParamQuery) ([]string, error) {
			return nil, nil
		},
		LastSeriesStore: make(map[uint64]models.DataResp),
	}

	interval := rule.PromEvalInterval
	if interval <= 0 {
		interval = 10
	}
	if rule.CronPattern == "" {
		rule.CronPattern = fmt.Sprintf("@every %ds", interval)
	}

	arw.Scheduler = cron.New(cron.WithSeconds(), cron.WithChain(cron.SkipIfStillRunning(cron.DefaultLogger)))
	entryID, err := arw.Scheduler.AddFunc(rule.CronPattern, func() {
		arw.Eval()
	})
	if err != nil {
		// Nothing was registered, so looking up entryID would yield an Entry
		// with a nil Schedule, and computing the interval from it would
		// panic. Leave the processor's schedule fields unset instead.
		logger.Errorf("alert_eval_%d datasource_%d add cron pattern error: %v", arw.Rule.Id, arw.DatasourceId, err)
		return arw
	}

	Processor.ScheduleEntry = arw.Scheduler.Entry(entryID)
	Processor.PromEvalInterval = getPromEvalInterval(Processor.ScheduleEntry.Schedule)
	return arw
}
// getPromEvalInterval returns the number of whole seconds between the next
// two activations of schedule, measured from now. It returns 0 for a nil
// schedule, which is what an unregistered cron entry carries.
func getPromEvalInterval(schedule cron.Schedule) int {
	if schedule == nil {
		return 0
	}
	first := schedule.Next(time.Now())
	second := schedule.Next(first)
	return int(second.Sub(first).Seconds())
}
// Key returns the rule key built by common.RuleKey from the worker's
// datasource ID and rule ID.
func (arw *AlertRuleWorker) Key() string {
	return common.RuleKey(arw.DatasourceId, arw.Rule.Id)
}
// Hash returns the MD5 of the rule ID, cron pattern, rule config and
// datasource ID. The scheduler uses it to decide whether a worker has to be
// started or stopped, so a change in any of these values replaces the worker.
func (arw *AlertRuleWorker) Hash() string {
	rule := arw.Rule
	fingerprint := fmt.Sprintf("%d_%s_%s_%d", rule.Id, rule.CronPattern, rule.RuleConfig, arw.DatasourceId)
	return str.MD5(fingerprint)
}
// Prepare asks the processor to recover its current alert events from the
// database. The scheduler calls it before Start.
func (arw *AlertRuleWorker) Prepare() {
	arw.Processor.RecoverAlertCurEventFromDb()
}
// Start starts the worker's cron scheduler, which begins triggering Eval.
func (arw *AlertRuleWorker) Start() {
	arw.Scheduler.Start()
}
// Eval runs one evaluation of the rule.
//
// It collects anomaly points (and, for rule types handled by
// GetAnomalyPoint, recovery points), recovers the events behind the recovery
// points, and hands the anomaly points to the processor. With Inhibit set,
// recovery points sharing a tag hash are merged first: the point with the
// lower Severity value is kept and the event of the other one is deleted.
//
// The outcome and duration are logged when the evaluation ends. A worker
// without a rule or a processor does nothing beyond logging.
func (arw *AlertRuleWorker) Eval() {
	begin := time.Now()

	// Check the rule before anything, including the deferred logger below,
	// dereferences it.
	cachedRule := arw.Rule
	if cachedRule == nil {
		logger.Warningf("alert_eval datasource_%d finished, duration:%v, message:%s", arw.DatasourceId, time.Since(begin), "rule not found")
		return
	}

	var message string
	defer func() {
		if len(message) == 0 {
			logger.Infof("alert_eval_%d datasource_%d finished, duration:%v", cachedRule.Id, arw.DatasourceId, time.Since(begin))
		} else {
			logger.Warningf("alert_eval_%d datasource_%d finished, duration:%v, message:%s", cachedRule.Id, arw.DatasourceId, time.Since(begin), message)
		}
	}()

	// The processor is used by every statement from here on, so it must be
	// validated first rather than after the queries have run.
	if arw.Processor == nil {
		message = "processor is nil"
		return
	}

	if arw.Processor.PromEvalInterval == 0 && arw.Processor.ScheduleEntry.Schedule != nil {
		arw.Processor.PromEvalInterval = getPromEvalInterval(arw.Processor.ScheduleEntry.Schedule)
	}

	arw.Processor.Stats.CounterRuleEval.WithLabelValues().Inc()
	arw.HostAndDeviceIdentCache = sync.Map{}

	typ := cachedRule.GetRuleType()
	var (
		anomalyPoints []models.AnomalyPoint
		recoverPoints []models.AnomalyPoint
		err           error
	)
	switch typ {
	case models.PROMETHEUS:
		anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
	case models.HOST:
		anomalyPoints, err = arw.GetHostAnomalyPoint(cachedRule.RuleConfig)
	case models.LOKI:
		anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig)
	default:
		anomalyPoints, recoverPoints, err = arw.GetAnomalyPoint(cachedRule, arw.Processor.DatasourceId())
	}
	if err != nil {
		message = fmt.Sprintf("failed to get anomaly points: %v", err)
		return
	}

	if arw.Inhibit {
		pointsMap := make(map[string]models.AnomalyPoint)
		for _, point := range recoverPoints {
			// Recovery points with the same tags are merged into one.
			tagHash := process.TagHash(point)
			p, exists := pointsMap[tagHash]
			if !exists {
				pointsMap[tagHash] = point
				continue
			}
			if p.Severity > point.Severity {
				hash := process.Hash(cachedRule.Id, arw.Processor.DatasourceId(), p)
				arw.Processor.DeleteProcessEvent(hash)
				models.AlertCurEventDelByHash(arw.Ctx, hash)
				pointsMap[tagHash] = point
			}
		}

		now := time.Now().Unix()
		for _, point := range pointsMap {
			value := fmt.Sprintf("%v", point.Value)
			arw.Processor.RecoverSingle(true, process.Hash(cachedRule.Id, arw.Processor.DatasourceId(), point), now, &value)
		}
	} else {
		now := time.Now().Unix()
		for _, point := range recoverPoints {
			value := fmt.Sprintf("%v", point.Value)
			arw.Processor.RecoverSingle(true, process.Hash(cachedRule.Id, arw.Processor.DatasourceId(), point), now, &value)
		}
	}

	arw.Processor.Handle(anomalyPoints, "inner", arw.Inhibit)
}
// Stop shuts the worker down: it logs the stop, closes Quit, stops the cron
// scheduler and blocks until the context returned by the scheduler's Stop is
// done. Stop must be called at most once, since Quit is closed
// unconditionally.
func (arw *AlertRuleWorker) Stop() {
	logger.Infof("alert_eval_%d datasource_%d stopped", arw.Rule.Id, arw.DatasourceId)
	close(arw.Quit)

	stopped := arw.Scheduler.Stop().Done()
	<-stopped
}
// GetPromAnomalyPoint evaluates every query of a Prometheus-style rule and
// returns the anomaly points produced by them.
//
// ruleConfig is the JSON text of a models.PromRuleConfig. The function returns
// an error when the config cannot be decoded, decodes to nil, or when a
// variable-free query fails; in the last case the points collected from
// earlier queries are returned together with the error. Queries that are
// blank or have no reader client are skipped without failing the whole call.
//
// Side effects: arw.Inhibit is overwritten with the rule's Inhibit flag, and
// several Processor.Stats counters/gauges are updated along the way.
func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) ([]models.AnomalyPoint, error) {
var lst []models.AnomalyPoint
start := time.Now()
// Record the evaluation duration (milliseconds) on every return path.
defer func() {
arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds()))
}()
var rule *models.PromRuleConfig
if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil {
logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:%v", arw.Rule.Id, arw.DatasourceId, ruleConfig, err)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
return lst, err
}
// Unmarshal into a pointer leaves rule nil when the JSON document is "null".
if rule == nil {
logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:rule is nil", arw.Rule.Id, arw.DatasourceId, ruleConfig)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
return lst, errors.New("rule is nil")
}
arw.Inhibit = rule.Inhibit
for i, query := range rule.Queries {
readerClient := arw.PromClients.GetCli(arw.DatasourceId)
if readerClient == nil {
// No client for this datasource: mark the series-count gauge with -2 and skip the query.
logger.Warningf("alert_eval_%d datasource_%d error reader client is nil", arw.Rule.Id, arw.DatasourceId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(-2)
continue
}
if query.VarEnabled && strings.Contains(query.PromQl, "$") {
var anomalyPoints []models.AnomalyPoint
if hasLabelLossAggregator(query) || notExactMatch(query) {
// With an aggregation function or a non-exact matcher the variables must be
// filled in before querying; this path is less efficient.
anomalyPoints = arw.VarFillingBeforeQuery(query, readerClient)
arw.Processor.Stats.CounterVarFillingQuery.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
"BeforeQuery",
).Inc()
} else {
// Query first and filter by variables afterwards; more efficient, but it
// cannot handle queries that use aggregation functions.
anomalyPoints = arw.VarFillingAfterQuery(query, readerClient)
arw.Processor.Stats.CounterVarFillingQuery.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
"AfterQuery",
).Inc()
}
lst = append(lst, anomalyPoints...)
} else {
// No variables: run the PromQL as written.
promql := strings.TrimSpace(query.PromQl)
if promql == "" {
logger.Warningf("alert_eval_%d datasource_%d promql is blank", arw.Rule.Id, arw.DatasourceId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), CHECK_QUERY, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
continue
}
if arw.PromClients.IsNil(arw.DatasourceId) {
logger.Warningf("alert_eval_%d datasource_%d error reader client is nil", arw.Rule.Id, arw.DatasourceId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
continue
}
var warnings promsdk.Warnings
arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", arw.Rule.Id)).Inc()
value, warnings, err := readerClient.Query(context.Background(), promql, time.Now())
if err != nil {
// A failed query aborts the whole evaluation; the gauge is set to -1 for this query index.
logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", arw.Rule.Id, arw.DatasourceId, promql, err)
arw.Processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId)).Inc()
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(-1)
return lst, err
}
// Warnings are logged and counted as errors, but the result is still used.
if len(warnings) > 0 {
logger.Errorf("alert_eval_%d datasource_%d promql:%s, warnings:%v", arw.Rule.Id, arw.DatasourceId, promql, warnings)
arw.Processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId)).Inc()
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
}
logger.Infof("alert_eval_%d datasource_%d query:%+v, value:%v", arw.Rule.Id, arw.DatasourceId, query, value)
points := models.ConvertAnomalyPoints(value)
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(float64(len(points)))
// Stamp each point with the query's severity, the PromQL text and a formatted value.
// Note: this inner i shadows the query index i of the enclosing loop.
for i := 0; i < len(points); i++ {
points[i].Severity = query.Severity
points[i].Query = promql
points[i].ValuesUnit = map[string]unit.FormattedValue{
"v": unit.ValueFormatter(query.Unit, 2, points[i].Value),
}
}
lst = append(lst, points...)
}
// Runs for both branches above and sets the gauge for query index i to the
// cumulative number of points collected so far (len(lst)), replacing the
// per-query count written in the variable-free branch.
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(float64(len(lst)))
}
return lst, nil
}
// sample is the uniform single-point representation that getSamples produces
// from either a vector or a matrix query result.
type sample struct {
// Metric is the label set of the series the point belongs to.
Metric model.Metric `json:"metric"`
// Value is the sample value of the point.
Value model.SampleValue `json:"value"`
// Timestamp is the time of the point; it carries no JSON tag.
Timestamp model.Time
}
// VarFillingAfterQuery fills variables by querying first and matching the
// parameter variables afterwards.
// Example expression: mem_used_percent{host="$host"} > $val, where $host is a
// parameter variable and $val is a value (threshold) variable.
// Steps:
//   - Walk the variable config nodes in order, so that a child filter on the
//     same parameter variable can override the filter of the level above it.
//   - For each node, first run the query with the parameter matchers removed,
//     i.e. mem_used_percent{} > curVal, to get every series that satisfies the
//     value variable.
//   - Series whose labels match one of this node's parameter combinations are
//     added to the anomaly points.
//   - Parameter combinations of this node that produced no series remove the
//     anomaly points created for them by an upper level.
//
// Query errors and parameter-resolution errors are logged and the affected
// node entry is skipped; the function itself never returns an error.
func (arw *AlertRuleWorker) VarFillingAfterQuery(query models.PromQuery, readerClient promsdk.API) []models.AnomalyPoint {
varToLabel := ExtractVarMapping(query.PromQl)
fullQuery := removeVal(query.PromQl)
// Holds all anomaly points keyed by the joined parameter values, which lets a
// child filter overwrite or delete what an upper level produced.
anomalyPointsMap := make(map[string]models.AnomalyPoint)
// Normalise the top-level variable config into the same shape as a child node.
VarConfigForCalc := &models.ChildVarConfig{
ParamVal: make([]map[string]models.ParamQuery, 1),
ChildVarConfigs: query.VarConfig.ChildVarConfigs,
}
VarConfigForCalc.ParamVal[0] = make(map[string]models.ParamQuery)
for _, p := range query.VarConfig.ParamVal {
VarConfigForCalc.ParamVal[0][p.Name] = models.ParamQuery{
ParamType: p.ParamType,
Query: p.Query,
}
}
// Use one fixed (sorted) order of parameter variables for building keys.
var ParamKeys []string
for val, valQuery := range VarConfigForCalc.ParamVal[0] {
if valQuery.ParamType == "threshold" {
continue
}
ParamKeys = append(ParamKeys, val)
}
sort.Slice(ParamKeys, func(i, j int) bool {
return ParamKeys[i] < ParamKeys[j]
})
// Walk the linked list of variable configs.
curNode := VarConfigForCalc
for curNode != nil {
for _, param := range curNode.ParamVal {
// curQuery is this node's query without parameter matchers; it is what gets sent to the TSDB.
curQuery := fullQuery
// realQuery is the query recorded on the anomaly point, used for alert display.
realQuery := query.PromQl
// Collect the threshold variables.
valMap := make(map[string]string)
for val, valQuery := range param {
if valQuery.ParamType == "threshold" {
valMap[val] = getString(valQuery.Query)
}
}
// Substitute the threshold variables.
for key, val := range valMap {
curQuery = strings.Replace(curQuery, fmt.Sprintf("$%s", key), val, -1)
realQuery = strings.Replace(realQuery, fmt.Sprintf("$%s", key), val, -1)
}
// Fetch every series that satisfies the threshold.
arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", arw.Rule.Id)).Inc()
value, _, err := readerClient.Query(context.Background(), curQuery, time.Now())
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", arw.Rule.Id, arw.DatasourceId, curQuery, err)
continue
}
seqVals := getSamples(value)
// Build every combination of parameter values for this node.
paramPermutation, err := arw.getParamPermutation(param, ParamKeys, varToLabel, query.PromQl, readerClient)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d paramPermutation error:%v", arw.Rule.Id, arw.DatasourceId, err)
continue
}
// Decide which returned series match one of the parameter combinations.
for i := range seqVals {
curRealQuery := realQuery
var cur []string
for _, paramKey := range ParamKeys {
val := string(seqVals[i].Metric[model.LabelName(varToLabel[paramKey])])
cur = append(cur, val)
curRealQuery = fillVar(curRealQuery, paramKey, val)
}
if _, ok := paramPermutation[strings.Join(cur, JoinMark)]; ok {
anomalyPointsMap[strings.Join(cur, JoinMark)] = models.AnomalyPoint{
Key: seqVals[i].Metric.String(),
Timestamp: seqVals[i].Timestamp.Unix(),
Value: float64(seqVals[i].Value),
Labels: seqVals[i].Metric,
Severity: query.Severity,
Query: curRealQuery,
}
// Once a point is produced, drop the combination so it is not treated as "no anomaly" below.
delete(paramPermutation, strings.Join(cur, JoinMark))
}
}
// The remaining combinations produced no anomaly at this level, so they
// remove any point an upper level created for the same combination.
for k, _ := range paramPermutation {
delete(anomalyPointsMap, k)
}
}
curNode = curNode.ChildVarConfigs
}
anomalyPoints := make([]models.AnomalyPoint, 0)
for _, point := range anomalyPointsMap {
anomalyPoints = append(anomalyPoints, point)
}
return anomalyPoints
}
// getSamples converts a query result into a uniform slice of samples.
//
// A vector contributes one sample per element. A matrix contributes one sample
// per series, taken from the last value of that series; series without any
// values are skipped. Any other result type, a nil value, or a value whose
// dynamic type does not match its reported Type yields no samples.
func getSamples(value model.Value) []sample {
	var seqVals []sample
	// Calling Type() on a nil interface would panic.
	if value == nil {
		return seqVals
	}
	switch value.Type() {
	case model.ValVector:
		items, ok := value.(model.Vector)
		if !ok {
			break
		}
		for i := range items {
			seqVals = append(seqVals, sample{
				Metric:    items[i].Metric,
				Value:     items[i].Value,
				Timestamp: items[i].Timestamp,
			})
		}
	case model.ValMatrix:
		items, ok := value.(model.Matrix)
		if !ok {
			break
		}
		for i := range items {
			points := items[i].Values
			// Indexing len-1 on an empty series would panic, so skip it.
			if len(points) == 0 {
				continue
			}
			last := points[len(points)-1]
			seqVals = append(seqVals, sample{
				Metric:    items[i].Metric,
				Value:     last.Value,
				Timestamp: last.Timestamp,
			})
		}
	default:
	}
	return seqVals
}
// removeVal strips the label matchers that reference a variable (contain '$')
// from every {...} selector of promql. Text outside braces, including value
// variables such as "> $val", is copied through unchanged.
//
//	mem{test1="$test1",test2="test2"} > $val1 and mem{test3="test3",test4="$test4"} > $val2
//	==> mem{test2="test2"} > $val1 and mem{test3="test3"} > $val2
//
// The scan is byte based: inside braces a matcher ends at the next ',' or '}'.
// Bytes are copied verbatim, so multi-byte UTF-8 text is preserved.
func removeVal(promql string) string {
	out := make([]byte, 0, len(promql))
	inBraces := false // true while scanning inside a {...} selector
	segStart := 0     // index of the '{' or ',' that precedes the current matcher
	hasVar := false   // the current matcher contains a '$'
	for i := 0; i < len(promql); i++ {
		c := promql[i]
		if !inBraces {
			if c == '{' {
				inBraces = true
				segStart = i
			}
			out = append(out, c)
			continue
		}
		switch c {
		case '$':
			hasVar = true
		case ',', '}':
			if !hasVar {
				// When nothing has been kept since the opening brace, drop the
				// leading '{' or ',' of the segment so no stray separator is emitted.
				if out[len(out)-1] == '{' {
					segStart++
				}
				out = append(out, promql[segStart:i]...)
			}
			hasVar = false
			if c == '}' {
				inBraces = false
				out = append(out, c)
			}
			segStart = i
		}
	}
	return string(out)
}
// getParamPermutation resolves the candidate values of every parameter
// variable in paramKeys and returns the set of all value combinations.
//
// paramVal maps a variable name to its query definition; paramKeys fixes the
// order in which values are joined; varToLabel maps a variable name to the
// label it is bound to in originPromql. Each key of the returned set is the
// combination's values joined with JoinMark, in paramKeys order.
//
// An error is returned when a key is missing from paramVal, when its type is
// not "host", "device" or "enum", or when it resolves to no values at all.
func (arw *AlertRuleWorker) getParamPermutation(paramVal map[string]models.ParamQuery, paramKeys []string, varToLabel map[string]string, originPromql string, readerClient promsdk.API) (map[string]struct{}, error) {
// Resolve each parameter variable to its list of values.
paramMap := make(map[string][]string)
for _, paramKey := range paramKeys {
var params []string
paramQuery, ok := paramVal[paramKey]
if !ok {
return nil, fmt.Errorf("param key not found: %s", paramKey)
}
switch paramQuery.ParamType {
case "host":
hostIdents, err := arw.getHostIdents(paramQuery)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d fail to get host idents, error:%v", arw.Rule.Id, arw.DatasourceId, err)
// break leaves the switch only; params stays empty and the check below returns an error.
break
}
params = hostIdents
case "device":
deviceIdents, err := arw.getDeviceIdents(paramQuery)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d fail to get device idents, error:%v", arw.Rule.Id, arw.DatasourceId, err)
// Same as above: fall through to the empty-params check.
break
}
params = deviceIdents
case "enum":
// Round-trip the untyped Query through JSON to read it as a string slice.
q, _ := json.Marshal(paramQuery.Query)
var query []string
err := json.Unmarshal(q, &query)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d query:%s fail to unmarshalling into string slice, error:%v", arw.Rule.Id, arw.DatasourceId, paramQuery.Query, err)
}
if len(query) == 0 {
// No explicit values: use every value of the bound label that the query currently returns.
paramsKeyAllLabel, err := getParamKeyAllLabel(varToLabel[paramKey], originPromql, readerClient, arw.DatasourceId, arw.Rule.Id, arw.Processor.Stats)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d fail to getParamKeyAllLabel, error:%v query:%s", arw.Rule.Id, arw.DatasourceId, err, paramQuery.Query)
}
params = paramsKeyAllLabel
} else {
params = query
}
default:
return nil, fmt.Errorf("unknown param type: %s", paramQuery.ParamType)
}
if len(params) == 0 {
return nil, fmt.Errorf("param key: %s, params is empty", paramKey)
}
logger.Infof("alert_eval_%d datasource_%d paramKey: %s, params: %v", arw.Rule.Id, arw.DatasourceId, paramKey, params)
paramMap[paramKey] = params
}
// Build every combination of values, ordered by paramKeys.
permutation := mapPermutation(paramKeys, paramMap)
res := make(map[string]struct{})
for i := range permutation {
res[strings.Join(permutation[i], JoinMark)] = struct{}{}
}
return res, nil
}
// getParamKeyAllLabel returns the distinct values that the label named
// paramKey takes in the series currently returned for promql's metric.
//
// The selector that is queried is built from promql's metric name plus those
// label matchers whose value does not start with "$". dsId and rid are only
// used as label values for the CounterQueryDataTotal metric in stats.
// The order of the returned values is unspecified.
func getParamKeyAllLabel(paramKey string, promql string, client promsdk.API, dsId int64, rid int64, stats *astats.Stats) ([]string, error) {
	labels, metricName, err := promql2.GetLabelsAndMetricNameWithReplace(promql, "$")
	if err != nil {
		return nil, fmt.Errorf("promql:%s, get labels error:%v", promql, err)
	}

	// Keep only the matchers that are not bound to a variable.
	matchers := make([]string, 0)
	for _, label := range labels {
		if !strings.HasPrefix(label.Value, "$") {
			matchers = append(matchers, label.Name+label.Op+label.Value)
		}
	}
	selector := metricName + "{" + strings.Join(matchers, ",") + "}"

	stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", dsId), fmt.Sprintf("%d", rid)).Inc()
	value, _, err := client.Query(context.Background(), selector, time.Now())
	if err != nil {
		return nil, fmt.Errorf("promql: %s query error: %v", selector, err)
	}

	// seen de-duplicates the label values across all returned series.
	seen := make(map[string]struct{})
	collect := func(metric model.Metric) {
		if labelValue, ok := metric[model.LabelName(paramKey)]; ok {
			seen[string(labelValue)] = struct{}{}
		}
	}
	switch value.Type() {
	case model.ValVector:
		for _, s := range value.(model.Vector) {
			collect(s.Metric)
		}
	case model.ValMatrix:
		for _, series := range value.(model.Matrix) {
			collect(series.Metric)
		}
	}

	result := make([]string, 0)
	for labelValue := range seen {
		result = append(result, labelValue)
	}
	return result, nil
}
// getHostIdents resolves a "host" parameter query to the idents of the
// matching targets.
//
// Results are cached in arw.HostAndDeviceIdentCache under "Host_" plus the
// JSON form of paramQuery.Query. On a cache miss the query is decoded as
// []models.HostQuery; a center node reads targets through
// models.TargetFilterQueryBuild, any other node posts the queries to
// /v1/n9e/targets-of-host-query. Lookup errors are returned and nothing is
// cached in that case.
func (arw *AlertRuleWorker) getHostIdents(paramQuery models.ParamQuery) ([]string, error) {
	raw, _ := json.Marshal(paramQuery.Query)
	cacheKey := "Host_" + string(raw)
	if cached, hit := arw.HostAndDeviceIdentCache.Load(cacheKey); hit {
		if idents, ok := cached.([]string); ok {
			return idents, nil
		}
	}

	var queries []models.HostQuery
	if err := json.Unmarshal(raw, &queries); err != nil {
		return nil, err
	}

	var idents []string
	if arw.Ctx.IsCenter {
		hostsQuery := models.GetHostsQuery(queries)
		session := models.TargetFilterQueryBuild(arw.Ctx, hostsQuery, 0, 0)
		var lst []*models.Target
		if err := session.Find(&lst).Error; err != nil {
			return nil, err
		}
		for i := range lst {
			idents = append(idents, lst[i].Ident)
		}
	} else {
		lst, err := poster.PostByUrlsWithResp[[]*models.Target](arw.Ctx, "/v1/n9e/targets-of-host-query", queries)
		if err != nil {
			return nil, err
		}
		for i := range lst {
			idents = append(idents, lst[i].Ident)
		}
	}

	arw.HostAndDeviceIdentCache.Store(cacheKey, idents)
	return idents, nil
}
// getDeviceIdents resolves a "device" parameter query by delegating to the
// worker's DeviceIdentHook, passing the worker itself and the query through.
func (arw *AlertRuleWorker) getDeviceIdents(paramQuery models.ParamQuery) ([]string, error) {
	hook := arw.DeviceIdentHook
	return hook(arw, paramQuery)
}
// mapPermutation returns every combination that picks one value per key.
// Position i of a combination holds a value of paramKeys[i], taken from
// paraMap. Combinations are produced with the first key varying slowest.
// If any key has no values the result is nil; with no keys at all the result
// is a single empty combination.
func mapPermutation(paramKeys []string, paraMap map[string][]string) [][]string {
	var combos [][]string
	scratch := make([]string, len(paramKeys))
	combine(paramKeys, paraMap, 0, scratch, &combos)
	return combos
}

// combine is the recursive step of mapPermutation. It fills current[index]
// with each value of paramKeys[index] in turn and recurses into the next key;
// once every position is filled it appends a copy of current to *result.
func combine(paramKeys []string, paraMap map[string][]string, index int, current []string, result *[][]string) {
	if index != len(paramKeys) {
		for _, candidate := range paraMap[paramKeys[index]] {
			current[index] = candidate
			combine(paramKeys, paraMap, index+1, current, result)
		}
		return
	}
	// current is reused across branches, so store an independent copy.
	snapshot := make([]string, 0, len(current))
	snapshot = append(snapshot, current...)
	*result = append(*result, snapshot)
}
// GetHostAnomalyPoint evaluates the triggers of a host rule and returns the
// anomaly points they produce.
//
// ruleConfig is the JSON text of a models.HostRuleConfig. An error is returned
// only when the config cannot be decoded or decodes to nil. Three trigger
// types are handled:
//   - "target_miss": one point per target whose update time is older than
//     now - trigger.Duration.
//   - "offset": one point per target whose absolute meta Offset exceeds
//     trigger.Duration.
//   - "pct_target_miss": a single point when the percentage of missing targets
//     reaches trigger.Percent.
//
// Side effects: arw.Inhibit is overwritten with the rule's Inhibit flag, and
// Processor.Stats counters/gauges are updated.
func (arw *AlertRuleWorker) GetHostAnomalyPoint(ruleConfig string) ([]models.AnomalyPoint, error) {
var lst []models.AnomalyPoint
start := time.Now()
// Record the evaluation duration (milliseconds) on every return path.
defer func() {
arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds()))
}()
var rule *models.HostRuleConfig
if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil {
logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:%v", arw.Rule.Id, arw.DatasourceId, ruleConfig, err)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
return lst, err
}
// Unmarshal into a pointer leaves rule nil when the JSON document is "null".
if rule == nil {
logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:rule is nil", arw.Rule.Id, arw.DatasourceId, ruleConfig)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
return lst, errors.New("rule is nil")
}
arw.Inhibit = rule.Inhibit
now := time.Now().Unix()
for _, trigger := range rule.Triggers {
switch trigger.Type {
case "target_miss":
// t is the cut-off: targets updated before it count as missing.
t := now - int64(trigger.Duration)
var idents, engineIdents, missEngineIdents []string
var exists bool
if arw.Ctx.IsCenter {
// On the center node, also include the hosts cached under an empty engine
// name, i.e. machines that no longer report data.
missEngineIdents, exists = arw.Processor.TargetsOfAlertRuleCache.Get("", arw.Rule.Id)
if !exists {
logger.Debugf("alert_eval_%d datasource_%d targets not found engineName:%s", arw.Rule.Id, arw.DatasourceId, arw.Processor.EngineName)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
}
}
idents = append(idents, missEngineIdents...)
engineIdents, exists = arw.Processor.TargetsOfAlertRuleCache.Get(arw.Processor.EngineName, arw.Rule.Id)
if !exists {
// A missing cache entry is logged and counted, but evaluation continues
// with whatever idents were collected above.
logger.Warningf("alert_eval_%d datasource_%d targets not found engineName:%s", arw.Rule.Id, arw.DatasourceId, arw.Processor.EngineName)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
}
idents = append(idents, engineIdents...)
if len(idents) == 0 {
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
continue
}
var missTargets []string
targetUpdateTimeMap := arw.Processor.TargetCache.GetHostUpdateTime(idents)
for ident, updateTime := range targetUpdateTimeMap {
if updateTime < t {
missTargets = append(missTargets, ident)
}
}
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(float64(len(missTargets)))
logger.Debugf("alert_eval_%d datasource_%d missTargets:%v", arw.Rule.Id, arw.DatasourceId, missTargets)
targets := arw.Processor.TargetCache.Gets(missTargets)
for _, target := range targets {
// Copy the target's tags and add its ident as a label; the point value is
// the number of seconds since the target's BeatTime.
m := make(map[string]string)
for k, v := range target.TagsMap {
m[k] = v
}
m["ident"] = target.Ident
lst = append(lst, models.NewAnomalyPoint(trigger.Type, m, now, float64(now-target.BeatTime), trigger.Severity))
}
case "offset":
idents, exists := arw.Processor.TargetsOfAlertRuleCache.Get(arw.Processor.EngineName, arw.Rule.Id)
if !exists {
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
logger.Warningf("alert_eval_%d datasource_%d targets not found", arw.Rule.Id, arw.DatasourceId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
continue
}
targets := arw.Processor.TargetCache.Gets(idents)
targetMap := make(map[string]*models.Target)
for _, target := range targets {
targetMap[target.Ident] = target
}
offsetIdents := make(map[string]int64)
targetsMeta := arw.Processor.TargetCache.GetHostMetas(targets)
for ident, meta := range targetsMeta {
if meta.CpuNum <= 0 {
// This target is not collected by categraf, so its offset is not checked.
continue
}
if target, exists := targetMap[ident]; exists {
if now-target.BeatTime > 120 {
// No heartbeat for more than 120 seconds: not an active host, skip the offset check.
continue
}
}
offset := meta.Offset
if math.Abs(float64(offset)) > float64(trigger.Duration) {
offsetIdents[ident] = offset
}
}
logger.Debugf("alert_eval_%d datasource_%d offsetIdents:%v", arw.Rule.Id, arw.DatasourceId, offsetIdents)
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(float64(len(offsetIdents)))
for host, offset := range offsetIdents {
// Labels are the target's tags (when the target is cached) plus its ident;
// the point value is the offset itself.
m := make(map[string]string)
target, exists := arw.Processor.TargetCache.Get(host)
if exists {
for k, v := range target.TagsMap {
m[k] = v
}
}
m["ident"] = host
lst = append(lst, models.NewAnomalyPoint(trigger.Type, m, now, float64(offset), trigger.Severity))
}
case "pct_target_miss":
// t is the cut-off: targets updated before it count as missing.
t := now - int64(trigger.Duration)
idents, exists := arw.Processor.TargetsOfAlertRuleCache.Get(arw.Processor.EngineName, arw.Rule.Id)
if !exists {
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
logger.Warningf("alert_eval_%d datasource_%d targets not found", arw.Rule.Id, arw.DatasourceId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
continue
}
var missTargets []string
targetUpdateTimeMap := arw.Processor.TargetCache.GetHostUpdateTime(idents)
for ident, updateTime := range targetUpdateTimeMap {
if updateTime < t {
missTargets = append(missTargets, ident)
}
}
logger.Debugf("alert_eval_%d datasource_%d missTargets:%v", arw.Rule.Id, arw.DatasourceId, missTargets)
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(float64(len(missTargets)))
// NOTE(review): when idents is present but empty this is 0/0 = NaN, and the
// comparison below is then false, so no point is emitted — confirm intended.
pct := float64(len(missTargets)) / float64(len(idents)) * 100
if pct >= float64(trigger.Percent) {
lst = append(lst, models.NewAnomalyPoint(trigger.Type, nil, now, pct, trigger.Severity))
}
}
}
return lst, nil
}
// flatten turns the grouped result of a rehash-based join back into a flat
// index. Every inner group becomes one entry keyed by a running counter that
// starts at 0. Because map iteration order is unspecified, which group
// receives which counter value is not deterministic.
func flatten(rehashed map[uint64][][]uint64) map[uint64][]uint64 {
	flat := make(map[uint64][]uint64)
	next := uint64(0)
	for _, groups := range rehashed {
		for _, group := range groups {
			flat[next] = group
			next++
		}
	}
	return flat
}
// onJoin combines two sets that have already been regrouped by rehashSet.
//
// Example: query A, regrouped on data_base, gives
//
//	[[A1{data_base=1, table=alert}, A2{data_base=1, table=alert}], [A5{data_base=1, table=board}]]
//	[[A3{data_base=2, table=board}], [A4{data_base=2, table=alert}]]
//
// and query B, regrouped on data_base, gives
//
//	[[B1{data_base=1, table=alert}]]
//	[[B2{data_base=2, table=alert}]]
//
// Joining them pairs every group of A with every group of B that shares the
// same rehash value:
//
//	[[A1, A2, B1], [A5, B1]]
//	[[A3, B2], [A4, B2]]
//
// Rehash values that exist only in reHashTagIndex1 are dropped when joinType
// is Inner and kept unchanged otherwise. Rehash values that exist only in
// reHashTagIndex2 never appear in the result.
func onJoin(reHashTagIndex1 map[uint64][][]uint64, reHashTagIndex2 map[uint64][][]uint64, joinType JoinType) map[uint64][][]uint64 {
	joined := make(map[uint64][][]uint64)
	for rehash, leftGroups := range reHashTagIndex1 {
		rightGroups, matched := reHashTagIndex2[rehash]
		if !matched {
			// Only non-inner joins keep the unmatched left-hand groups.
			if joinType != Inner {
				joined[rehash] = leftGroups
			}
			continue
		}
		// Same rehash on both sides: merge the groups pairwise.
		for _, left := range leftGroups {
			for _, right := range rightGroups {
				joined[rehash] = append(joined[rehash], mergeNewArray(left, right))
			}
		}
	}
	return joined
}
// rehashSet regroups an index of series by the hash of the labels named in on.
//
// Example: query A returns five records
//
//	A1{data_base=1, table=alert}
//	A2{data_base=1, table=alert}
//	A3{data_base=2, table=board}
//	A4{data_base=2, table=alert}
//	A5{data_base=1, table=board}
//
// which the caller has already grouped by full label set into four groups:
//
//	[A1, A2] [A3] [A4] [A5]
//
// Regrouping on data_base yields a two-level result keyed by the rehash value.
// Groups that share a rehash value are listed side by side, not merged:
//
//	[[A1, A2], [A5]]
//	[[A3], [A4]]
//
// The rehash of a group is computed from the metric of its first series.
// Empty groups, and groups whose first series is not in seriesStore, are
// skipped.
func rehashSet(seriesTagIndex1 map[uint64][]uint64, seriesStore map[uint64]models.DataResp, on []string) map[uint64][][]uint64 {
	regrouped := make(map[uint64][][]uint64)
	for _, group := range seriesTagIndex1 {
		if len(group) == 0 {
			continue
		}
		first, found := seriesStore[group[0]]
		if !found {
			continue
		}
		key := hash.GetTargetTagHash(first.Metric, on)
		regrouped[key] = append(regrouped[key], group)
	}
	return regrouped
}
// cartesianJoin builds the cartesian product of two indexes: every group of
// seriesTagIndex1 is merged with every group of seriesTagIndex2. The result is
// keyed by a running counter starting at 0; the original keys are discarded.
func cartesianJoin(seriesTagIndex1 map[uint64][]uint64, seriesTagIndex2 map[uint64][]uint64) map[uint64][]uint64 {
	product := make(map[uint64][]uint64)
	next := uint64(0)
	for _, left := range seriesTagIndex1 {
		for _, right := range seriesTagIndex2 {
			product[next] = mergeNewArray(left, right)
			next++
		}
	}
	return product
}
// noneJoin concatenates two indexes without combining any groups. Every group
// of seriesTagIndex1, then every group of seriesTagIndex2, is stored under a
// running counter starting at 0; the original keys are discarded.
func noneJoin(seriesTagIndex1 map[uint64][]uint64, seriesTagIndex2 map[uint64][]uint64) map[uint64][]uint64 {
	combined := make(map[uint64][]uint64)
	next := uint64(0)
	for _, source := range []map[uint64][]uint64{seriesTagIndex1, seriesTagIndex2} {
		for _, group := range source {
			combined[next] = group
			next++
		}
	}
	return combined
}
// originalJoin is the original grouping scheme: groups that share the same
// key, i.e. whose labels are all identical, end up in one group. The result
// holds freshly allocated slices; groups of seriesTagIndex2 are appended after
// those of seriesTagIndex1 for the same key.
func originalJoin(seriesTagIndex1 map[uint64][]uint64, seriesTagIndex2 map[uint64][]uint64) map[uint64][]uint64 {
	merged := make(map[uint64][]uint64)
	for _, source := range []map[uint64][]uint64{seriesTagIndex1, seriesTagIndex2} {
		for tagHash, group := range source {
			if existing, ok := merged[tagHash]; ok {
				merged[tagHash] = append(existing, group...)
			} else {
				// First time this key is seen: store a copy of the group.
				merged[tagHash] = mergeNewArray(group)
			}
		}
	}
	return merged
}
// exclude is a left anti-join: it keeps the entries of reHashTagIndex1 whose
// rehash key does not occur in reHashTagIndex2. Kept entries share their group
// slices with reHashTagIndex1.
func exclude(reHashTagIndex1 map[uint64][][]uint64, reHashTagIndex2 map[uint64][][]uint64) map[uint64][][]uint64 {
	kept := make(map[uint64][][]uint64)
	for rehash, groups := range reHashTagIndex1 {
		if _, present := reHashTagIndex2[rehash]; present {
			continue
		}
		kept[rehash] = groups
	}
	return kept
}
// MakeSeriesMap indexes series into the two maps supplied by the caller.
// seriesStore receives each series under the hash of its metric and Ref;
// seriesTagIndex groups those series hashes under the hash of the metric's
// tags, so series with identical tags land in the same group. Both maps are
// modified in place and must be non-nil.
func MakeSeriesMap(series []models.DataResp, seriesTagIndex map[uint64][]uint64, seriesStore map[uint64]models.DataResp) {
	for _, item := range series {
		seriesHash := hash.GetHash(item.Metric, item.Ref)
		seriesStore[seriesHash] = item

		// Group the series by identical tag set.
		tagHash := hash.GetTagHash(item.Metric)
		seriesTagIndex[tagHash] = append(seriesTagIndex[tagHash], seriesHash)
	}
}
// mergeNewArray concatenates the given slices, in order, into a newly
// allocated slice. The result never shares storage with the inputs and is
// non-nil even when there is nothing to concatenate.
func mergeNewArray(arg ...[]uint64) []uint64 {
	total := 0
	for _, part := range arg {
		total += len(part)
	}
	merged := make([]uint64, 0, total)
	for _, part := range arg {
		merged = append(merged, part...)
	}
	return merged
}
// ProcessJoins combines the per-query series indexes of a trigger into one
// index according to the trigger's join configuration.
//
// seriesTagIndexes maps a query ref to that query's index (tag hash -> series
// hashes); seriesStore resolves a series hash to its data and is only used to
// regroup series for the "on"-based joins. ruleId is used in log messages only.
//
// Without joins, all indexes are merged with originalJoin. With joins, the
// index named by trigger.JoinRef is the starting point and each join is
// applied in order. The function returns an empty map when there are no
// indexes, and nil when there are fewer indexes than joins require.
func ProcessJoins(ruleId int64, trigger models.Trigger, seriesTagIndexes map[string]map[uint64][]uint64, seriesStore map[uint64]models.DataResp) map[uint64][]uint64 {
last := make(map[uint64][]uint64)
if len(seriesTagIndexes) == 0 {
return last
}
if len(trigger.Joins) == 0 {
// No join config: fold every index into the first one visited. Map iteration
// order is unspecified, so the visiting order is as well.
idx := 0
for _, seriesTagIndex := range seriesTagIndexes {
if idx == 0 {
last = seriesTagIndex
} else {
last = originalJoin(last, seriesTagIndex)
}
idx++
}
return last
}
// Join conditions are present: apply them one after another.
if len(seriesTagIndexes) < len(trigger.Joins)+1 {
logger.Errorf("alert_eval_%d queries' count: %d not match join condition's count: %d", ruleId, len(seriesTagIndexes), len(trigger.Joins))
return nil
}
last = seriesTagIndexes[trigger.JoinRef]
// NOTE(review): lastRehashed is built once from Joins[0].On and is only
// refreshed by the *_join / *_exclude cases below. The "original", "none" and
// "cartesian" cases update last but not lastRehashed, and later joins with a
// different On reuse the earlier grouping — confirm this is intended.
lastRehashed := rehashSet(last, seriesStore, trigger.Joins[0].On)
for i := range trigger.Joins {
cur := seriesTagIndexes[trigger.Joins[i].Ref]
switch trigger.Joins[i].JoinType {
case "original":
last = originalJoin(last, cur)
case "none":
last = noneJoin(last, cur)
case "cartesian":
last = cartesianJoin(last, cur)
case "inner_join":
curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
lastRehashed = onJoin(lastRehashed, curRehashed, Inner)
last = flatten(lastRehashed)
case "left_join":
curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
lastRehashed = onJoin(lastRehashed, curRehashed, Left)
last = flatten(lastRehashed)
case "right_join":
// A right join is expressed by swapping the operands of onJoin.
curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
lastRehashed = onJoin(curRehashed, lastRehashed, Right)
last = flatten(lastRehashed)
case "left_exclude":
curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
lastRehashed = exclude(lastRehashed, curRehashed)
last = flatten(lastRehashed)
case "right_exclude":
// A right exclude is expressed by swapping the operands of exclude.
curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
lastRehashed = exclude(curRehashed, lastRehashed)
last = flatten(lastRehashed)
default:
// Unknown join types are logged and otherwise ignored; last is left as it was.
logger.Warningf("alert_eval_%d join type:%s not support", ruleId, trigger.Joins[i].JoinType)
}
}
return last
}
// GetQueryRef extracts the ref of a query.
//
// A map[string]interface{} must carry a string under the "ref" key. Any other
// value must be a struct, or a pointer to a struct, with a string field named
// Ref. Everything else yields an error.
func GetQueryRef(query interface{}) (string, error) {
	// Map form: look the ref up by key.
	if asMap, isMap := query.(map[string]interface{}); isMap {
		raw, found := asMap["ref"]
		if !found {
			return "", fmt.Errorf("query 中没有找到 ref 字段")
		}
		ref, isString := raw.(string)
		if !isString {
			return "", fmt.Errorf("ref 字段不是字符串类型")
		}
		return ref, nil
	}

	// Struct form: read the Ref field through reflection.
	rv := reflect.ValueOf(query)
	if rv.Kind() == reflect.Ptr {
		rv = rv.Elem()
	}
	if rv.Kind() != reflect.Struct {
		return "", fmt.Errorf("query not a struct or map")
	}
	field := rv.FieldByName("Ref")
	switch {
	case !field.IsValid():
		return "", fmt.Errorf("not find ref field")
	case field.Kind() != reflect.String:
		return "", fmt.Errorf("ref not a string")
	}
	return field.String(), nil
}
// getString renders a threshold value as text for substitution into PromQL.
// query may be a string or a number (int, int64, float64, ...); numbers are
// formatted in plain decimal without an exponent. Any other type, including
// nil, yields the empty string.
func getString(query interface{}) string {
	switch v := query.(type) {
	case string:
		return v
	case float64:
		return strconv.FormatFloat(v, 'f', -1, 64)
	case float32:
		return strconv.FormatFloat(float64(v), 'f', -1, 32)
	case int:
		return strconv.FormatInt(int64(v), 10)
	case int8:
		return strconv.FormatInt(int64(v), 10)
	case int16:
		return strconv.FormatInt(int64(v), 10)
	case int32:
		return strconv.FormatInt(int64(v), 10)
	case int64:
		return strconv.FormatInt(v, 10)
	case uint:
		return strconv.FormatUint(uint64(v), 10)
	case uint8:
		return strconv.FormatUint(uint64(v), 10)
	case uint16:
		return strconv.FormatUint(uint64(v), 10)
	case uint32:
		return strconv.FormatUint(uint64(v), 10)
	case uint64:
		return strconv.FormatUint(v, 10)
	default:
		return ""
	}
}
// GetQueryRefAndUnit returns the "ref" and "unit" fields of a query.
//
// The query is encoded to JSON and decoded again into a small struct, so any
// value whose JSON form is an object with those keys works. An encoding error
// is returned to the caller. The decoding error is not checked: fields that
// cannot be decoded are returned as empty strings with a nil error.
func GetQueryRefAndUnit(query interface{}) (string, string, error) {
	var decoded struct {
		Ref  string `json:"ref"`
		Unit string `json:"unit"`
	}
	encoded, err := json.Marshal(query)
	if err != nil {
		return "", "", err
	}
	// The result of Unmarshal is ignored here, matching the existing contract.
	_ = json.Unmarshal(encoded, &decoded)
	return decoded.Ref, decoded.Unit, nil
}
// VarFillingBeforeQuery fills variables into the PromQL first and queries
// afterwards; it is intended for expressions that contain aggregations.
//
// Example: avg(mem_used_percent{host="$host"}) > $val, where $host is a
// parameter variable and $val is a threshold variable.
//
// Steps:
//  1. Walk the chain of variable-config nodes in order, so that a child
//     filter on the same parameter variable can override the parent level.
//  2. At every node, fill the variables before querying, i.e. first build the
//     complete PromQL avg(mem_used_percent{host="127.0.0.1"}) > 5.
//  3. Run the query; every returned sample is stored as an anomaly point.
//  4. A parameter combination whose query returns nothing removes the anomaly
//     point stored under the same key by an earlier level.
//
// The returned slice contains the anomaly points that remain after all levels
// have been processed; its order is unspecified.
func (arw *AlertRuleWorker) VarFillingBeforeQuery(query models.PromQuery, readerClient promsdk.API) []models.AnomalyPoint {
varToLabel := ExtractVarMapping(query.PromQl)
// Anomaly points keyed by the combination of parameter-variable values, so
// that a child filter can override the result of its parent level.
anomalyPointsMap := sync.Map{}
// Normalize the top-level variable config into the same shape as child nodes.
VarConfigForCalc := &models.ChildVarConfig{
ParamVal: make([]map[string]models.ParamQuery, 1),
ChildVarConfigs: query.VarConfig.ChildVarConfigs,
}
VarConfigForCalc.ParamVal[0] = make(map[string]models.ParamQuery)
for _, p := range query.VarConfig.ParamVal {
VarConfigForCalc.ParamVal[0][p.Name] = models.ParamQuery{
ParamType: p.ParamType,
Query: p.Query,
}
}
// Use one fixed (sorted) order of parameter variables everywhere.
var ParamKeys []string
for val, valQuery := range VarConfigForCalc.ParamVal[0] {
if valQuery.ParamType == "threshold" {
continue
}
ParamKeys = append(ParamKeys, val)
}
sort.Slice(ParamKeys, func(i, j int) bool {
return ParamKeys[i] < ParamKeys[j]
})
// Walk the linked list of variable configs.
curNode := VarConfigForCalc
for curNode != nil {
for _, param := range curNode.ParamVal {
curPromql := query.PromQl
// Collect the threshold variables.
valMap := make(map[string]string)
for val, valQuery := range param {
if valQuery.ParamType == "threshold" {
valMap[val] = getString(valQuery.Query)
}
}
// Substitute the threshold variables.
// NOTE(review): map iteration order is random and this is a plain substring
// replace, so if one threshold name is a prefix of another (e.g. $val and
// $val2) the shorter one may be substituted first — confirm whether such
// names can occur.
for key, val := range valMap {
curPromql = strings.Replace(curPromql, fmt.Sprintf("$%s", key), val, -1)
}
// Compute every combination of parameter-variable values.
paramPermutation, err := arw.getParamPermutation(param, ParamKeys, varToLabel, query.PromQl, readerClient)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d paramPermutation error:%v", arw.Rule.Id, arw.DatasourceId, err)
continue
}
keyToPromql := make(map[string]string)
for paramPermutationKeys, _ := range paramPermutation {
realPromql := curPromql
// NOTE(review): assumes each permutation key holds one JoinMark-separated
// value per entry of ParamKeys; split[j] panics otherwise — verify against
// getParamPermutation.
split := strings.Split(paramPermutationKeys, JoinMark)
for j := range ParamKeys {
realPromql = fillVar(realPromql, ParamKeys[j], split[j])
}
keyToPromql[paramPermutationKeys] = realPromql
}
// Query concurrently; the buffered channel caps in-flight queries at 200.
wg := sync.WaitGroup{}
semaphore := make(chan struct{}, 200)
for key, promql := range keyToPromql {
wg.Add(1)
semaphore <- struct{}{}
go func(key, promql string) {
defer func() {
<-semaphore
wg.Done()
}()
arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", arw.Rule.Id)).Inc()
value, _, err := readerClient.Query(context.Background(), promql, time.Now())
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", arw.Rule.Id, arw.DatasourceId, promql, err)
return
}
logger.Infof("alert_eval_%d datasource_%d promql:%s, value:%+v", arw.Rule.Id, arw.DatasourceId, promql, value)
points := models.ConvertAnomalyPoints(value)
if len(points) == 0 {
// No result for this combination: drop whatever an earlier level stored
// under the same key.
anomalyPointsMap.Delete(key)
return
}
for i := 0; i < len(points); i++ {
points[i].Severity = query.Severity
points[i].Query = promql
points[i].ValuesUnit = map[string]unit.FormattedValue{
"v": unit.ValueFormatter(query.Unit, 2, points[i].Value),
}
// Every anomaly point gets its own key (built from its label values) so
// that child filters override the parent level by key; this addresses
// https://github.com/ccfos/nightingale/issues/2433
var cur []string
for _, paramKey := range ParamKeys {
val := string(points[i].Labels[model.LabelName(varToLabel[paramKey])])
cur = append(cur, val)
}
anomalyPointsMap.Store(strings.Join(cur, JoinMark), points[i])
}
}(key, promql)
}
wg.Wait()
}
curNode = curNode.ChildVarConfigs
}
anomalyPoints := make([]models.AnomalyPoint, 0)
anomalyPointsMap.Range(func(key, value any) bool {
if point, ok := value.(models.AnomalyPoint); ok {
anomalyPoints = append(anomalyPoints, point)
}
return true
})
return anomalyPoints
}
// hasLabelLossAggregator reports whether the query's PromQL contains a call
// to an aggregation operator that can drop labels. The check is a
// case-insensitive substring match on the operator name immediately followed
// by an opening parenthesis.
func hasLabelLossAggregator(query models.PromQuery) bool {
	lowered := strings.ToLower(query.PromQl)
	for _, call := range [...]string{
		"sum(", "min(", "max(", "avg(",
		"stddev(", "stdvar(",
		"count(", "quantile(",
		"group(",
	} {
		if strings.Contains(lowered, call) {
			return true
		}
	}
	return false
}
// notExactMatch reports whether the query's PromQL contains any matcher
// operator other than exact equality, i.e. one of != =~ !~.
func notExactMatch(query models.PromQuery) bool {
	lowered := strings.ToLower(query.PromQl)
	for _, op := range []string{"!=", "=~", "!~"} {
		if strings.Contains(lowered, op) {
			return true
		}
	}
	return false
}
// ExtractVarMapping extracts the variable-to-label mapping from a PromQL
// expression, so that after a query the label values can be put back into
// the right variable of the PromQL.
//
// Input:  sum(rate(mem_used_percent{host="$my_host"})) by (instance) + avg(node_load1{region="$region"}) > $val
// Output: map[string]string{"my_host":"host", "region":"region"}
//
// Every {...} selector is scanned left to right. Within a selector, each
// comma-separated matcher (=, !=, =~, !~) whose value starts with "$"
// (optionally wrapped in single or double quotes) contributes an entry
// variable-name -> label-name. Scanning stops at the first "{" that has no
// closing "}" after it. The result is never nil.
func ExtractVarMapping(promql string) map[string]string {
	varMapping := make(map[string]string)

	// Walk over every pair of curly braces.
	for {
		start := strings.Index(promql, "{")
		if start == -1 {
			break
		}
		// Look for the closing brace only AFTER the opening one. Searching from
		// the beginning of the string would pick up a stray "}" located before
		// "{" and make the slice expression below panic.
		rel := strings.Index(promql[start+1:], "}")
		if rel == -1 {
			break
		}
		end := start + 1 + rel

		// Split the selector body into individual label matchers.
		for _, pair := range strings.Split(promql[start+1:end], ",") {
			// Pick the matcher operator; plain "=" is the fallback.
			op := "="
			for _, candidate := range []string{"!=", "=~", "!~"} {
				if strings.Contains(pair, candidate) {
					op = candidate
					break
				}
			}
			kv := strings.Split(pair, op)
			if len(kv) != 2 {
				continue
			}

			key := strings.TrimSpace(kv[0])
			value := strings.Trim(strings.TrimSpace(kv[1]), "\"")
			value = strings.Trim(value, "'")

			// A value starting with "$" is a variable reference.
			if strings.HasPrefix(value, "$") {
				varMapping[value[1:]] = key // strip the "$" prefix
			}
		}

		// Continue with the remainder of the expression.
		promql = promql[end+1:]
	}
	return varMapping
}
func fillVar(curRealQuery string, paramKey string, val string) string {
curRealQuery = strings.Replace(curRealQuery, fmt.Sprintf("'$%s'", paramKey), fmt.Sprintf("'%s'", val), -1)
curRealQuery = strings.Replace(curRealQuery, fmt.Sprintf("\"$%s\"", paramKey), fmt.Sprintf("\"%s\"", val), -1)
return curRealQuery
}
// GetAnomalyPoint evaluates a rule against the datasource dsId and returns
// the triggered anomaly points, the recover points, and an error.
//
// It parses rule.RuleConfig into a models.RuleQuery, runs every configured
// query through the datasource plugin, groups the returned series by tag
// hash, joins the groups per trigger via ProcessJoins and evaluates each
// trigger expression on the last value of the joined series. Groups for which
// the expression holds become points; the others become recoverPoints
// (subject to the trigger's RecoverConfig). When the nodata trigger is
// enabled, series present in arw.LastSeriesStore but missing from the current
// result produce additional nodata points.
//
// An error is returned when the rule config is blank or unparsable, when the
// datasource plugin is not found, or when a query fails.
func (arw *AlertRuleWorker) GetAnomalyPoint(rule *models.AlertRule, dsId int64) ([]models.AnomalyPoint, []models.AnomalyPoint, error) {
// Load the queries and the trigger conditions of the rule; the evaluation
// duration is recorded on return.
start := time.Now()
defer func() {
arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds()))
}()
points := []models.AnomalyPoint{}
recoverPoints := []models.AnomalyPoint{}
ruleConfig := strings.TrimSpace(rule.RuleConfig)
if ruleConfig == "" {
logger.Warningf("alert_eval_%d datasource_%d ruleConfig is blank", rule.Id, dsId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
"",
).Set(0)
return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d ruleConfig is blank", rule.Id, dsId)
}
var ruleQuery models.RuleQuery
err := json.Unmarshal([]byte(ruleConfig), &ruleQuery)
if err != nil {
logger.Warningf("alert_eval_%d datasource_%d promql parse error:%s", rule.Id, dsId, err.Error())
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d promql parse error:%s", rule.Id, dsId, err.Error())
}
arw.Inhibit = ruleQuery.Inhibit
if len(ruleQuery.Queries) > 0 {
// seriesStore: series hash -> series; seriesTagIndexes: query ref -> (tag hash -> series hashes).
seriesStore := make(map[uint64]models.DataResp)
seriesTagIndexes := make(map[string]map[uint64][]uint64, 0)
for i, query := range ruleQuery.Queries {
seriesTagIndex := make(map[uint64][]uint64)
plug, exists := dscache.DsCache.Get(rule.Cate, dsId)
if !exists {
logger.Warningf("alert_eval_%d datasource_%d not exists", rule.Id, dsId)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(-2)
return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d not exists", rule.Id, dsId)
}
// A template rendering failure is logged and counted, but evaluation still
// proceeds to QueryData with the query as it is.
if err = ExecuteQueryTemplate(rule.Cate, query, nil); err != nil {
logger.Warningf("alert_eval_%d datasource_%d execute query template error: %v", rule.Id, dsId, err)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), EXEC_TEMPLATE, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(-3)
}
// The rule's delay is handed to the plugin through the context under the key "delay".
ctx := context.WithValue(context.Background(), "delay", int64(rule.Delay))
series, err := plug.QueryData(ctx, query)
arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", rule.Id)).Inc()
if err != nil {
logger.Warningf("alert_eval_%d datasource_%d query data error: %v", rule.Id, dsId, err)
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc()
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(-1)
return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d query data error: %v", rule.Id, dsId, err)
}
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues(
fmt.Sprintf("%v", arw.Rule.Id),
fmt.Sprintf("%v", arw.Processor.DatasourceId()),
fmt.Sprintf("%v", i),
).Set(float64(len(series)))
// This log line is important: it records the raw values the alert decision is based on.
logger.Infof("alert_eval_%d datasource_%d req:%+v resp:%v", rule.Id, dsId, query, series)
for i := 0; i < len(series); i++ {
seriesHash := hash.GetHash(series[i].Metric, series[i].Ref)
tagHash := hash.GetTagHash(series[i].Metric)
seriesStore[seriesHash] = series[i]
// Group the series that share the same tag set.
if _, exists := seriesTagIndex[tagHash]; !exists {
seriesTagIndex[tagHash] = make([]uint64, 0)
}
seriesTagIndex[tagHash] = append(seriesTagIndex[tagHash], seriesHash)
}
ref, err := GetQueryRef(query)
if err != nil {
logger.Warningf("alert_eval_%d datasource_%d query:%+v get ref error:%s", rule.Id, dsId, query, err.Error())
continue
}
seriesTagIndexes[ref] = seriesTagIndex
}
// unitMap: query ref -> configured unit.
unitMap := make(map[string]string)
for _, query := range ruleQuery.Queries {
ref, unit, err := GetQueryRefAndUnit(query)
if err != nil {
logger.Warningf("alert_eval_%d datasource_%d query:%+v get ref and unit error:%s", rule.Id, dsId, query, err.Error())
continue
}
unitMap[ref] = unit
}
if !ruleQuery.ExpTriggerDisable {
for _, trigger := range ruleQuery.Triggers {
seriesTagIndex := ProcessJoins(rule.Id, trigger, seriesTagIndexes, seriesStore)
for _, seriesHash := range seriesTagIndex {
valuesUnitMap := make(map[string]unit.FormattedValue)
sort.Slice(seriesHash, func(i, j int) bool {
return seriesHash[i] < seriesHash[j]
})
// m holds the variables made available to the trigger expression.
m := make(map[string]interface{})
var ts int64
var sample models.DataResp
var value float64
for _, seriesHash := range seriesHash {
series, exists := seriesStore[seriesHash]
if !exists {
logger.Warningf("alert_eval_%d datasource_%d series:%+v not found", rule.Id, dsId, series)
continue
}
t, v, exists := series.Last()
if !exists {
logger.Warningf("alert_eval_%d datasource_%d series:%+v value not found", rule.Id, dsId, series)
continue
}
if !strings.Contains(trigger.Exp, "$"+series.Ref) {
// The expression does not reference this variable.
continue
}
m["$"+series.Ref] = v
m["$"+series.Ref+"."+series.MetricName()] = v
for k, v := range series.Metric {
if k == "__name__" {
continue
}
if !strings.Contains(trigger.Exp, "$"+series.Ref+"."+string(k)) {
// Skip labels that the expression does not reference.
continue
}
m["$"+series.Ref+"."+string(k)] = string(v)
}
if u, exists := unitMap[series.Ref]; exists {
valuesUnitMap["$"+series.Ref+"."+series.MetricName()] = unit.ValueFormatter(u, 2, v)
}
// The last series that contributes to the expression supplies the
// timestamp, labels and value of the resulting point.
ts = int64(t)
sample = series
value = v
logger.Infof("alert_eval_%d datasource_%d origin series labels:%+v", rule.Id, dsId, series.Metric)
}
isTriggered := parser.CalcWithRid(trigger.Exp, m, rule.Id)
// This log line is important: it records the raw values the alert decision is based on.
logger.Infof("alert_eval_%d datasource_%d trigger:%+v exp:%s res:%v m:%v", rule.Id, dsId, trigger, trigger.Exp, isTriggered, m)
// Build the human-readable "key:value " list from the dotted entries of m.
var values string
for k, v := range m {
if !strings.Contains(k, ".") {
continue
}
if u, exists := valuesUnitMap[k]; exists { // a unit is configured: prefer the unit-formatted value
values += fmt.Sprintf("%s:%s ", k, u.Text)
} else {
switch v.(type) {
case float64:
values += fmt.Sprintf("%s:%.3f ", k, v)
case string:
values += fmt.Sprintf("%s:%s ", k, v)
}
}
}
queries := ruleQuery.Queries
if sample.Query != "" {
queries = []interface{}{sample.Query}
}
point := models.AnomalyPoint{
Key: sample.MetricName(),
Labels: sample.Metric,
Timestamp: int64(ts),
Value: value,
Values: values,
Severity: trigger.Severity,
Triggered: isTriggered,
Query: fmt.Sprintf("query:%+v trigger:%+v", queries, trigger),
RecoverConfig: trigger.RecoverConfig,
ValuesUnit: valuesUnitMap,
}
if isTriggered {
points = append(points, point)
} else {
switch trigger.RecoverConfig.JudgeType {
case models.Origin:
// do nothing
case models.RecoverOnCondition:
// Only count as recovered when the dedicated recover expression holds.
fulfill := parser.CalcWithRid(trigger.RecoverConfig.RecoverExp, m, rule.Id)
if !fulfill {
continue
}
}
recoverPoints = append(recoverPoints, point)
}
}
}
}
if ruleQuery.NodataTrigger.Enable {
now := time.Now().Unix()
// Use arw.LastSeriesStore to inspect the result of the previous evaluation.
if len(arw.LastSeriesStore) > 0 {
// Walk the series seen previously.
for hash, lastSeries := range arw.LastSeriesStore {
if ruleQuery.NodataTrigger.ResolveAfterEnable {
lastTs, _, exists := lastSeries.Last()
if !exists {
continue
}
// Forget the series once it has been silent for longer than resolve_after.
if now-int64(lastTs) > int64(ruleQuery.NodataTrigger.ResolveAfter) {
logger.Infof("alert_eval_%d datasource_%d series:%+v resolve after %d seconds now:%d lastTs:%d", rule.Id, dsId, lastSeries, ruleQuery.NodataTrigger.ResolveAfter, now, int64(lastTs))
delete(arw.LastSeriesStore, hash)
continue
}
}
// Check whether the series is present in the current query result.
if _, exists := seriesStore[hash]; !exists {
// Emit a nodata anomaly point.
point := models.AnomalyPoint{
Key: lastSeries.MetricName(),
Labels: lastSeries.Metric,
Timestamp: now,
Value: 0,
Values: fmt.Sprintf("nodata since %v", time.Unix(now, 0).Format("2006-01-02 15:04:05")),
Severity: ruleQuery.NodataTrigger.Severity,
Triggered: true,
Query: fmt.Sprintf("nodata check for %s", lastSeries.LabelsString()),
TriggerType: models.TriggerTypeNodata,
}
points = append(points, point)
logger.Infof("alert_eval_%d datasource_%d nodata point:%+v", rule.Id, dsId, point)
}
}
}
// Refresh arw.LastSeriesStore with the current result.
// NOTE(review): assumes arw.LastSeriesStore is a non-nil map here — confirm it is initialised by the worker constructor.
for hash, series := range seriesStore {
arw.LastSeriesStore[hash] = series
}
}
}
return points, recoverPoints, nil
}
// ExecuteQueryTemplate renders the template contained in a query's "sql"
// field, in place.
//
// cate:  datasource category, e.g. "mysql", "pgsql" (currently not consulted).
// query: the query object; only a map[string]interface{} holding a string
// under the "sql" key is processed, anything else is left untouched and nil
// is returned.
// data:  template data; nil means an empty struct is used (no variable
// rendering), otherwise the given data is used for variable rendering.
func ExecuteQueryTemplate(cate string, query interface{}, data interface{}) error {
	fields, isMap := query.(map[string]interface{})
	if !isMap {
		return nil
	}

	// A missing key yields a nil interface, for which the assertion fails too.
	rawSQL, isString := fields["sql"].(string)
	if !isString {
		return nil
	}

	rendered, err := ExecuteSqlTemplate(rawSQL, data)
	if err != nil {
		return fmt.Errorf("execute sql template error: %w", err)
	}

	// Write the rendered statement back into the query.
	fields["sql"] = rendered
	return nil
}
// ExecuteSqlTemplate evaluates the Go template syntax contained in query.
//
// query: the query string to process; it is returned unchanged unless it
// contains both "{{" and "}}".
// data:  template data; nil means an empty struct is used (no variable
// rendering), otherwise the given data is used for variable rendering.
//
// The template has access to the functions in tplx.TemplateFuncMap. Parse and
// execution failures are returned as wrapped errors.
func ExecuteSqlTemplate(query string, data interface{}) (string, error) {
	hasOpen := strings.Contains(query, "{{")
	hasClose := strings.Contains(query, "}}")
	if !(hasOpen && hasClose) {
		return query, nil
	}

	parsed, err := template.New("query").Funcs(tplx.TemplateFuncMap).Parse(query)
	if err != nil {
		return "", fmt.Errorf("query tmpl parse error: %w", err)
	}

	if data == nil {
		data = struct{}{}
	}

	var rendered strings.Builder
	if err = parsed.Execute(&rendered, data); err != nil {
		return "", fmt.Errorf("query tmpl execute error: %w", err)
	}
	return rendered.String(), nil
}
================================================
FILE: alert/eval/eval_test.go
================================================
package eval
import (
"reflect"
"testing"
"golang.org/x/exp/slices"
)
// Shared fixtures for the join tests below.
//
// reHashTagIndex1/2 have the shape expected by exclude and onJoin (a list of
// series-hash groups per key); seriesTagIndex1/2 have the shape expected by
// noneJoin and cartesianJoin (one flat list of series hashes per key). Both
// pairs share key 1 and differ in their second key (2 vs 3).
var (
reHashTagIndex1 = map[uint64][][]uint64{
1: {
{1, 2}, {3, 4},
},
2: {
{5, 6}, {7, 8},
},
}
reHashTagIndex2 = map[uint64][][]uint64{
1: {
{9, 10}, {11, 12},
},
3: {
{13, 14}, {15, 16},
},
}
seriesTagIndex1 = map[uint64][]uint64{
1: {1, 2, 3, 4},
2: {5, 6, 7, 8},
}
seriesTagIndex2 = map[uint64][]uint64{
1: {9, 10, 11, 12},
3: {13, 14, 15, 16},
}
)
// Test_originalJoin checks that originalJoin merges two tag indexes key by
// key: lists under a shared key are concatenated, other keys are carried over.
func Test_originalJoin(t *testing.T) {
	cases := []struct {
		name        string
		left, right map[uint64][]uint64
		want        map[uint64][]uint64
	}{
		{
			name:  "original join",
			left:  map[uint64][]uint64{1: {1, 2, 3, 4}, 2: {5, 6, 7, 8}},
			right: map[uint64][]uint64{1: {9, 10, 11, 12}, 3: {13, 14, 15, 16}},
			want: map[uint64][]uint64{
				1: {1, 2, 3, 4, 9, 10, 11, 12},
				2: {5, 6, 7, 8},
				3: {13, 14, 15, 16},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := originalJoin(tc.left, tc.right)
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("originalJoin() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_exclude checks that exclude keeps only the groups of its first
// argument whose key is absent from the second argument. Results are compared
// by value only; the keys produced by flatten are ignored.
func Test_exclude(t *testing.T) {
	cases := []struct {
		name        string
		left, right map[uint64][][]uint64
		want        map[uint64][]uint64
	}{
		{
			name:  "left exclude",
			left:  reHashTagIndex1,
			right: reHashTagIndex2,
			want: map[uint64][]uint64{
				0: {5, 6},
				1: {7, 8},
			},
		},
		{
			name:  "right exclude",
			left:  reHashTagIndex2,
			right: reHashTagIndex1,
			want: map[uint64][]uint64{
				3: {13, 14},
				4: {15, 16},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := exclude(tc.left, tc.right)
			if !allValueDeepEqual(flatten(got), tc.want) {
				t.Errorf("exclude() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_noneJoin checks that noneJoin simply splices both tag indexes together
// without combining any entries. Results are compared by value only.
func Test_noneJoin(t *testing.T) {
	cases := []struct {
		name        string
		left, right map[uint64][]uint64
		want        map[uint64][]uint64
	}{
		{
			name:  "none join, direct splicing",
			left:  seriesTagIndex1,
			right: seriesTagIndex2,
			want: map[uint64][]uint64{
				0: {1, 2, 3, 4},
				1: {5, 6, 7, 8},
				2: {9, 10, 11, 12},
				3: {13, 14, 15, 16},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := noneJoin(tc.left, tc.right)
			if !allValueDeepEqual(got, tc.want) {
				t.Errorf("noneJoin() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_cartesianJoin checks that cartesianJoin pairs every entry of the first
// tag index with every entry of the second one. Results are compared by value
// only.
func Test_cartesianJoin(t *testing.T) {
	cases := []struct {
		name        string
		left, right map[uint64][]uint64
		want        map[uint64][]uint64
	}{
		{
			name:  "cartesian join",
			left:  seriesTagIndex1,
			right: seriesTagIndex2,
			want: map[uint64][]uint64{
				0: {1, 2, 3, 4, 9, 10, 11, 12},
				1: {5, 6, 7, 8, 9, 10, 11, 12},
				2: {5, 6, 7, 8, 13, 14, 15, 16},
				3: {1, 2, 3, 4, 13, 14, 15, 16},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := cartesianJoin(tc.left, tc.right)
			if !allValueDeepEqual(got, tc.want) {
				t.Errorf("cartesianJoin() = %v, want %v", got, tc.want)
			}
		})
	}
}
// Test_onJoin exercises onJoin for the Left, Right and Inner join types on the
// shared re-hashed fixtures. Results are flattened and compared by value only.
func Test_onJoin(t *testing.T) {
	// The four combinations produced by the key shared between both fixtures.
	matched := func() map[uint64][]uint64 {
		return map[uint64][]uint64{
			1: {1, 2, 9, 10},
			2: {3, 4, 9, 10},
			3: {1, 2, 11, 12},
			4: {3, 4, 11, 12},
		}
	}
	// withExtra adds the unmatched groups that an outer join carries over.
	withExtra := func(a, b []uint64) map[uint64][]uint64 {
		out := matched()
		out[5] = a
		out[6] = b
		return out
	}

	cases := []struct {
		name        string
		left, right map[uint64][][]uint64
		joinType    JoinType
		want        map[uint64][]uint64
	}{
		{
			name:     "left join",
			left:     reHashTagIndex1,
			right:    reHashTagIndex2,
			joinType: Left,
			want:     withExtra([]uint64{5, 6}, []uint64{7, 8}),
		},
		{
			name:     "right join",
			left:     reHashTagIndex2,
			right:    reHashTagIndex1,
			joinType: Right,
			want:     withExtra([]uint64{13, 14}, []uint64{15, 16}),
		},
		{
			name:     "inner join",
			left:     reHashTagIndex1,
			right:    reHashTagIndex2,
			joinType: Inner,
			want:     matched(),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := onJoin(tc.left, tc.right, tc.joinType)
			if !allValueDeepEqual(flatten(got), tc.want) {
				t.Errorf("onJoin() = %v, want %v", got, tc.want)
			}
		})
	}
}
// allValueDeepEqual reports whether two maps hold the same multiset of
// values, ignoring the keys. The order of elements inside each value slice is
// ignored as well: the slices are sorted in place before being compared.
//
// Every value of got must be matched with a distinct value of want. Without
// that one-to-one pairing, a got map containing the same value twice would
// compare equal to a want map containing it only once plus something else.
func allValueDeepEqual(got, want map[uint64][]uint64) bool {
	if len(got) != len(want) {
		return false
	}

	// Sort the expected values once instead of once per comparison.
	for _, w := range want {
		slices.Sort(w)
	}

	// matched records the want keys that are already paired with a got value.
	matched := make(map[uint64]bool, len(want))
	for _, g := range got {
		slices.Sort(g)
		found := false
		for key, w := range want {
			if matched[key] {
				continue
			}
			if reflect.DeepEqual(g, w) {
				matched[key] = true
				found = true
				break
			}
		}
		if !found {
			return false
		}
	}
	return true
}
// allValueDeepEqualOmitOrder reports whether two string slices contain the
// same elements regardless of order. When the lengths match, both slices are
// sorted in place before the element-wise comparison.
func allValueDeepEqualOmitOrder(got, want []string) bool {
	if len(got) != len(want) {
		return false
	}
	slices.Sort(got)
	slices.Sort(want)
	return slices.Equal(got, want)
}
// Test_removeVal checks that removeVal strips every label matcher whose value
// is a "$variable" (single- or double-quoted) from each selector, while
// keeping literal matchers and the threshold variables outside the braces.
func Test_removeVal(t *testing.T) {
	cases := []struct {
		name   string
		promql string
		want   string
	}{
		// TODO: Add test cases.
		{
			name:   "removeVal1",
			promql: `mem{test1="$test1",test2="$test2",test3="$test3"} > $val`,
			want:   `mem{} > $val`,
		},
		{
			name:   "removeVal2",
			promql: `mem{test1="test1",test2="$test2",test3="$test3"} > $val`,
			want:   `mem{test1="test1"} > $val`,
		},
		{
			name:   "removeVal3",
			promql: `mem{test1="$test1",test2="test2",test3="$test3"} > $val`,
			want:   `mem{test2="test2"} > $val`,
		},
		{
			name:   "removeVal4",
			promql: `mem{test1="$test1",test2="$test2",test3="test3"} > $val`,
			want:   `mem{test3="test3"} > $val`,
		},
		{
			name:   "removeVal5",
			promql: `mem{test1="$test1",test2="test2",test3="test3"} > $val`,
			want:   `mem{test2="test2",test3="test3"} > $val`,
		},
		{
			name:   "removeVal6",
			promql: `mem{test1="test1",test2="$test2",test3="test3"} > $val`,
			want:   `mem{test1="test1",test3="test3"} > $val`,
		},
		{
			name:   "removeVal7",
			promql: `mem{test1="test1",test2="test2",test3='$test3'} > $val`,
			want:   `mem{test1="test1",test2="test2"} > $val`,
		},
		{
			name:   "removeVal8",
			promql: `mem{test1="test1",test2="test2",test3="test3"} > $val`,
			want:   `mem{test1="test1",test2="test2",test3="test3"} > $val`,
		},
		{
			name:   "removeVal9",
			promql: `mem{test1="$test1",test2="test2"} > $val1 and mem{test3="test3",test4="test4"} > $val2`,
			want:   `mem{test2="test2"} > $val1 and mem{test3="test3",test4="test4"} > $val2`,
		},
		{
			name:   "removeVal10",
			promql: `mem{test1="test1",test2='$test2'} > $val1 and mem{test3="test3",test4="test4"} > $val2`,
			want:   `mem{test1="test1"} > $val1 and mem{test3="test3",test4="test4"} > $val2`,
		},
		{
			name:   "removeVal11",
			promql: `mem{test1='test1',test2="test2"} > $val1 and mem{test3="$test3",test4="test4"} > $val2`,
			want:   `mem{test1='test1',test2="test2"} > $val1 and mem{test4="test4"} > $val2`,
		},
		{
			name:   "removeVal12",
			promql: `mem{test1="test1",test2="test2"} > $val1 and mem{test3="test3",test4="$test4"} > $val2`,
			want:   `mem{test1="test1",test2="test2"} > $val1 and mem{test3="test3"} > $val2`,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := removeVal(tc.promql)
			if got != tc.want {
				t.Errorf("removeVal() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestExtractVarMapping verifies the variable -> label mapping extracted from
// PromQL expressions: single and multiple selectors, repeated variables,
// unquoted values, and inputs without variables, without braces, or with an
// unterminated brace.
func TestExtractVarMapping(t *testing.T) {
	type vars = map[string]string

	cases := []struct {
		name   string
		promql string
		want   vars
	}{
		{
			"单个花括号单个变量",
			`mem_used_percent{host="$my_host"} > $val`,
			vars{"my_host": "host"},
		},
		{
			"单个花括号多个变量",
			`mem_used_percent{host="$my_host",region="$region",env="prod"} > $val`,
			vars{"my_host": "host", "region": "region"},
		},
		{
			"多个花括号多个变量",
			`sum(rate(mem_used_percent{host="$my_host"})) by (instance) + avg(node_load1{region="$region"}) > $val`,
			vars{"my_host": "host", "region": "region"},
		},
		{
			"相同变量出现多次",
			`sum(rate(mem_used_percent{host="$my_host"})) + avg(node_load1{host="$my_host"}) > $val`,
			vars{"my_host": "host"},
		},
		{
			"没有变量",
			`mem_used_percent{host="localhost",region="cn"} > 80`,
			vars{},
		},
		{
			"没有花括号",
			`80 > $val`,
			vars{},
		},
		{
			"格式不规范的标签",
			`mem_used_percent{host=$my_host,region = $region} > $val`,
			vars{"my_host": "host", "region": "region"},
		},
		{
			"空花括号",
			`mem_used_percent{} > $val`,
			vars{},
		},
		{
			"不完整的花括号",
			`mem_used_percent{host="$my_host"`,
			vars{},
		},
		{
			"复杂表达式",
			`sum(rate(http_requests_total{handler="$handler",code="$code"}[5m])) by (handler) / sum(rate(http_requests_total{handler="$handler"}[5m])) by (handler) * 100 > $threshold`,
			vars{"handler": "handler", "code": "code"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := ExtractVarMapping(tc.promql); !reflect.DeepEqual(got, tc.want) {
				t.Errorf("ExtractVarMapping() = %v, want %v", got, tc.want)
			}
		})
	}
}
================================================
FILE: alert/mute/mute.go
================================================
package mute
import (
"slices"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// IsMuted decides whether an event produced by rule must be suppressed.
//
// The checks run in a fixed order and stop at the first hit: rule disabled,
// outside the rule's effective time span, ident no longer exists, business
// group mismatch, and finally the configured mute rules. It returns whether
// the event is muted, a human-readable reason, and the id of the matching
// mute rule (0 unless a mute rule matched).
func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) (bool, string, int64) {
	switch {
	case rule.Disabled == 1:
		return true, "rule disabled", 0
	case TimeSpanMuteStrategy(rule, event):
		return true, "rule is not effective for period of time", 0
	case IdentNotExistsMuteStrategy(rule, event, targetCache):
		return true, "ident not exists mute", 0
	case BgNotMatchMuteStrategy(rule, event, targetCache):
		return true, "bg not match mute", 0
	}

	if hit, muteId := EventMuteStrategy(event, alertMuteCache); hit {
		return true, "match mute rule", muteId
	}
	return false, "", 0
}
// TimeSpanMuteStrategy filters events by the rule's effective time spans: an
// event triggered outside every configured span is muted (true is returned).
//
// Spans are left-closed, right-open; the default range is 00:00-24:00.
// If the rule configures a time zone, the trigger time is evaluated in that
// zone; if the zone is empty or cannot be loaded, the system zone is used.
func TimeSpanMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool {
	// Resolve the location in which the trigger time is interpreted.
	loc := time.Local
	if tz := rule.TimeZone; tz != "" {
		loaded, err := time.LoadLocation(tz)
		if err != nil {
			// Loading failed: log it and stay on the system time zone.
			logger.Warningf("Failed to load timezone %s for rule %d, using system timezone: %v", tz, rule.Id, err)
		} else {
			loc = loaded
		}
	}

	// Convert the trigger time into the target location.
	triggered := time.Unix(event.TriggerTime, 0).In(loc)

	if rule.EnableDaysOfWeek == "" {
		// No effective time configured: the rule is effective all day.
		return false
	}

	hhmm := triggered.Format("15:04")
	weekday := strconv.Itoa(int(triggered.Weekday()))

	startTimes := strings.Fields(rule.EnableStime)
	endTimes := strings.Fields(rule.EnableEtime)
	dayGroups := strings.Split(rule.EnableDaysOfWeek, ";")

	// withinWindow reports whether hhmm lies inside the span begin-end.
	withinWindow := func(begin, end string) bool {
		switch {
		case begin < end:
			if end == "23:59" {
				// e.g. 02:00-23:59 is special-cased as a closed interval.
				return hhmm >= begin
			}
			// e.g. 02:00-04:00 or 02:00-24:00
			return hhmm >= begin && hhmm < end
		case begin > end:
			// e.g. 21:00-09:00, a span that wraps around midnight.
			return hhmm >= begin || hhmm < end
		default:
			return true
		}
	}

	// The three lists are expected to have the same length, so iterating over
	// the day groups is sufficient.
	for idx, days := range dayGroups {
		// Sunday may be written as 7; time.Weekday numbers it 0.
		days = strings.Replace(days, "7", "0", 1)
		if !strings.Contains(days, weekday) {
			continue
		}
		if withinWindow(startTimes[idx], endTimes[idx]) {
			// The trigger time falls into one effective span: not muted.
			return false
		}
	}
	return true
}
// IdentNotExistsMuteStrategy mutes target_up alerts for idents that no longer
// exist: if the event carries an "ident" tag that is absent from targetCache
// and the rule's PromQL mentions target_up, the event is filtered out.
func IdentNotExistsMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType) bool {
	ident, tagged := event.TagsMap["ident"]
	if !tagged {
		return false
	}

	if _, known := targetCache.Get(ident); known {
		return false
	}

	// The substring check on the PromQL is rather crude, but there is no better
	// way to recognise a target_up alert at the moment.
	if !strings.Contains(rule.PromQl, "target_up") {
		return false
	}

	logger.Debugf("alert_eval_%d [IdentNotExistsMuteStrategy] mute: cluster:%s ident:%s", rule.Id, event.Cluster, ident)
	return true
}
// BgNotMatchMuteStrategy mutes events of machines outside the rule's business
// group when the rule is restricted to its own business group.
//
// Nothing is filtered when the restriction is off (EnableInBG == 0), when the
// event has no "ident" tag, or when the ident is unknown to targetCache.
func BgNotMatchMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType) bool {
	// Restriction not enabled: never filter.
	if rule.EnableInBG == 0 {
		return false
	}

	ident, tagged := event.TagsMap["ident"]
	if !tagged {
		return false
	}

	// For events carrying an ident, compare the ident's business group with
	// the rule's: a rule limited to its own group must not alert on machines
	// of other groups.
	target, known := targetCache.Get(ident)
	if !known || target.MatchGroupId(rule.GroupId) {
		return false
	}

	logger.Debugf("alert_eval_%d [BgNotMatchMuteStrategy] mute: cluster:%s", rule.Id, event.Cluster)
	return true
}
// EventMuteStrategy checks the event against the mute rules cached for its
// business group. It returns true together with the id of the first matching
// mute rule, or (false, 0) when none matches.
func EventMuteStrategy(event *models.AlertCurEvent, alertMuteCache *memsto.AlertMuteCacheType) (bool, int64) {
	candidates, ok := alertMuteCache.Gets(event.GroupId)
	if !ok {
		return false, 0
	}

	for _, candidate := range candidates {
		// The reason for a non-match is irrelevant here.
		if matched, _ := MatchMute(event, candidate); matched {
			return true, candidate.Id
		}
	}
	return false, 0
}
// MatchMute reports whether the mute rule applies to the event. On a
// non-match the returned error describes the first criterion that failed.
//
// Criteria, in order: the mute is enabled, the datasource matches (unless the
// mute is global), the time matches, the severity matches, and the tags match.
// For periodic mutes the optional clock argument, when given, replaces the
// event's TriggerTime as the time to test; time-range mutes always use
// TriggerTime.
func MatchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int64) (bool, error) {
	if mute.Disabled == 1 {
		return false, errors.New("mute is disabled")
	}

	// Unless the mute is global, the event's datasource id must be listed.
	dsIds := mute.DatasourceIdsJson
	scoped := len(dsIds) != 0 && dsIds[0] != 0 && event.DatasourceId != 0
	if scoped && !slices.Contains(dsIds, event.DatasourceId) {
		return false, errors.New("datasource id not match")
	}

	switch mute.MuteTimeType {
	case models.TimeRange:
		if !mute.IsWithinTimeRange(event.TriggerTime) {
			return false, errors.New("event trigger time not within mute time range")
		}
	case models.Periodic:
		at := event.TriggerTime
		if len(clock) > 0 {
			at = clock[0]
		}
		if !mute.IsWithinPeriodicMute(at) {
			return false, errors.New("event trigger time not within periodic mute range")
		}
	default:
		logger.Warningf("mute time type invalid, %d", mute.MuteTimeType)
		return false, errors.New("mute time type invalid")
	}

	// An empty severity list matches everything; so does a 0 entry.
	severityOK := len(mute.SeveritiesJson) == 0
	for _, s := range mute.SeveritiesJson {
		if s == 0 || s == event.Severity {
			severityOK = true
			break
		}
	}
	if !severityOK {
		return false, errors.New("event severity not match mute severity")
	}

	if len(mute.ITags) > 0 && !common.MatchTags(event.TagsMap, mute.ITags) {
		return false, errors.New("event tags not match mute tags")
	}
	return true, nil
}
================================================
FILE: alert/naming/hashring.go
================================================
package naming
import (
"errors"
"sync"
"github.com/toolkits/pkg/consistent"
"github.com/toolkits/pkg/logger"
)
// NodeReplicas is the replica count set on the hash rings created by
// RebuildConsistentHashRing and by GetNode.
const NodeReplicas = 500
// DatasourceHashRingType holds one consistent-hash ring per datasource id.
// The embedded RWMutex guards the Rings map.
type DatasourceHashRingType struct {
sync.RWMutex
Rings map[string]*consistent.Consistent
}
// for alert_rule sharding
var HostDatasource int64 = 99999999
// DatasourceHashRing is the package-level ring registry, created empty.
var DatasourceHashRing = DatasourceHashRingType{Rings: make(map[string]*consistent.Consistent)}
// NewConsistentHashRing returns a new ring configured with the given replica
// count and populated with every entry of nodes.
func NewConsistentHashRing(replicas int32, nodes []string) *consistent.Consistent {
	ring := consistent.New()
	ring.NumberOfReplicas = int(replicas)
	for _, node := range nodes {
		ring.Add(node)
	}
	return ring
}
// RebuildConsistentHashRing builds a fresh ring (NodeReplicas replicas) from
// nodes, stores it in DatasourceHashRing under datasourceId, and logs the
// resulting membership.
func RebuildConsistentHashRing(datasourceId string, nodes []string) {
	ring := consistent.New()
	ring.NumberOfReplicas = NodeReplicas
	for _, node := range nodes {
		ring.Add(node)
	}

	DatasourceHashRing.Set(datasourceId, ring)
	logger.Infof("hash ring %s rebuild %+v", datasourceId, ring.Members())
}
// GetNode returns the ring member that owns pk on the ring registered under
// datasourceId. If no ring is registered yet, an empty one is created and
// stored first, which is why the write lock is taken.
func (chr *DatasourceHashRingType) GetNode(datasourceId string, pk string) (string, error) {
	chr.Lock()
	defer chr.Unlock()

	ring, found := chr.Rings[datasourceId]
	if !found {
		ring = NewConsistentHashRing(int32(NodeReplicas), []string{})
		chr.Rings[datasourceId] = ring
	}
	return ring.Get(pk)
}
// IsHit reports whether currentNode is the ring member that owns pk on the
// ring registered under datasourceId. Any lookup error yields false; errors
// other than an empty ring are additionally logged.
func (chr *DatasourceHashRingType) IsHit(datasourceId string, pk string, currentNode string) bool {
	owner, err := chr.GetNode(datasourceId, pk)
	if err == nil {
		return owner == currentNode
	}

	// An empty ring is not reported; everything else is.
	if !errors.Is(err, consistent.ErrEmptyCircle) {
		logger.Errorf("rule id:%s is not work, datasource id:%s failed to get node from hashring:%v", pk, datasourceId, err)
	}
	return false
}
// Set stores r as the ring for datasourceId, replacing any previous ring,
// while holding the write lock.
func (chr *DatasourceHashRingType) Set(datasourceId string, r *consistent.Consistent) {
chr.Lock()
defer chr.Unlock()
chr.Rings[datasourceId] = r
}
// Del removes the ring registered under datasourceId, if any, while holding
// the write lock.
func (chr *DatasourceHashRingType) Del(datasourceId string) {
chr.Lock()
defer chr.Unlock()
delete(chr.Rings, datasourceId)
}
// Clear removes every registered ring except the one whose key equals
// engineName.
func (chr *DatasourceHashRingType) Clear(engineName string) {
	chr.Lock()
	defer chr.Unlock()

	for key := range chr.Rings {
		if key != engineName {
			delete(chr.Rings, key)
		}
	}
}
================================================
FILE: alert/naming/heartbeat.go
================================================
package naming
import (
"fmt"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
// Naming sends this alerting engine's heartbeats and keeps the package's
// consistent-hash rings in sync with the set of active servers.
type Naming struct {
// ctx is the application context handed to the models and poster calls.
ctx *ctx.Context
// heartbeatConfig supplies the engine name, endpoint and heartbeat interval.
heartbeatConfig aconf.HeartbeatConfig
// astats holds the counters incremented when a heartbeat step fails.
astats *astats.Stats
}
// NewNaming builds a Naming from its dependencies and immediately starts
// heartbeating via Heartbeats.
func NewNaming(ctx *ctx.Context, heartbeat aconf.HeartbeatConfig, alertStats *astats.Stats) *Naming {
	n := &Naming{ctx: ctx, heartbeatConfig: heartbeat, astats: alertStats}
	// The returned error is discarded; Heartbeats prints the failure itself.
	_ = n.Heartbeats()
	return n
}
// localss caches, per datasource id, the space-joined sorted list of active
// servers that the current hash ring was built from (local servers).
var localss map[int64]string
// localHostServers is the same cache for the host ring, keyed by engine name.
var localHostServers map[string]string
// Heartbeats resets the local server caches, performs one synchronous
// heartbeat, and on success starts the background heartbeat and
// inactive-instance cleanup loops. A failure of the first heartbeat is printed
// to stdout and returned, and no loops are started.
func (n *Naming) Heartbeats() error {
	localss = map[int64]string{}
	localHostServers = map[string]string{}

	err := n.heartbeat()
	if err != nil {
		fmt.Println("failed to heartbeat:", err)
		return err
	}

	go n.loopHeartbeat()
	go n.loopDeleteInactiveInstances()
	return nil
}
// loopDeleteInactiveInstances calls DeleteInactiveInstances every ten minutes,
// forever. It returns immediately on non-center instances.
func (n *Naming) loopDeleteInactiveInstances() {
	if !n.ctx.IsCenter {
		return
	}

	const interval = 10 * time.Minute
	for {
		time.Sleep(interval)
		n.DeleteInactiveInstances()
	}
}
// DeleteInactiveInstances deletes AlertingEngines rows whose clock is more
// than 600 seconds in the past, logging any database error.
func (n *Naming) DeleteInactiveInstances() {
	cutoff := time.Now().Unix() - 600
	if err := models.DB(n.ctx).Where("clock < ?", cutoff).Delete(new(models.AlertingEngines)).Error; err != nil {
		logger.Errorf("delete inactive instances err:%v", err)
	}
}
// loopHeartbeat runs heartbeat forever, sleeping heartbeatConfig.Interval
// milliseconds before each round and logging failures as warnings.
func (n *Naming) loopHeartbeat() {
	pause := time.Duration(n.heartbeatConfig.Interval) * time.Millisecond
	for {
		time.Sleep(pause)
		err := n.heartbeat()
		if err == nil {
			continue
		}
		logger.Warning(err)
	}
}
// heartbeat performs one heartbeat round:
//  1. look up the datasource ids bound to this engine name;
//  2. write a heartbeat row per datasource id (or a single row with id 0 when
//     there are none);
//  3. rebuild the per-datasource hash rings whose active-server list changed
//     and drop rings/caches for datasources that are no longer bound;
//  4. write the host heartbeat (HostDatasource) and rebuild the host ring,
//     keyed by engine name, when its active-server list changed.
//
// Only a failure of the initial datasource lookup is returned as an error;
// later failures are logged, counted in CounterHeartbeatErrorTotal, and the
// function still returns nil.
func (n *Naming) heartbeat() error {
var datasourceIds []int64
var err error
// The instance-to-cluster mapping is maintained from the web UI.
datasourceIds, err = models.GetDatasourceIdsByEngineName(n.ctx, n.heartbeatConfig.EngineName)
if err != nil {
return err
}
if len(datasourceIds) == 0 {
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, 0)
if err != nil {
// NOTE(review): the %s verb is always fed an empty string here.
logger.Warningf("heartbeat with cluster %s err:%v", "", err)
n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc()
}
} else {
for i := 0; i < len(datasourceIds); i++ {
err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, datasourceIds[i])
if err != nil {
logger.Warningf("heartbeat with cluster %d err:%v", datasourceIds[i], err)
n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc()
}
}
}
// With no datasources bound, keep only the ring keyed by the engine name and
// forget every cached per-datasource server list.
if len(datasourceIds) == 0 {
DatasourceHashRing.Clear(n.heartbeatConfig.EngineName)
for dsId := range localss {
delete(localss, dsId)
}
}
// newDatasource records the ids seen in this round so stale ones can be
// removed afterwards.
newDatasource := make(map[int64]struct{})
for i := 0; i < len(datasourceIds); i++ {
newDatasource[datasourceIds[i]] = struct{}{}
servers, err := n.ActiveServers(datasourceIds[i])
if err != nil {
logger.Warningf("heartbeat %d get active server err:%v", datasourceIds[i], err)
n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc()
continue
}
// Compare the sorted, space-joined server list with the cached one; the ring
// is rebuilt only when the membership actually changed.
sort.Strings(servers)
newss := strings.Join(servers, " ")
oldss, exists := localss[datasourceIds[i]]
if exists && oldss == newss {
continue
}
RebuildConsistentHashRing(fmt.Sprintf("%d", datasourceIds[i]), servers)
localss[datasourceIds[i]] = newss
}
// Drop caches and rings for datasources that are no longer bound.
for dsId := range localss {
if _, exists := newDatasource[dsId]; !exists {
delete(localss, dsId)
DatasourceHashRing.Del(fmt.Sprintf("%d", dsId))
}
}
// Host alerting uses the hash ring as well.
err = models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, HostDatasource)
if err != nil {
// NOTE(review): the %s verb is always fed an empty string here.
logger.Warningf("heartbeat with cluster %s err:%v", "", err)
n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc()
}
servers, err := n.ActiveServersByEngineName()
if err != nil {
logger.Warningf("heartbeat %d get active server err:%v", HostDatasource, err)
n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc()
return nil
}
sort.Strings(servers)
newss := strings.Join(servers, " ")
oldss, exists := localHostServers[n.heartbeatConfig.EngineName]
if exists && oldss == newss {
return nil
}
RebuildConsistentHashRing(n.heartbeatConfig.EngineName, servers)
localHostServers[n.heartbeatConfig.EngineName] = newss
return nil
}
// ActiveServers returns the servers currently active for datasourceId.
// A datasourceId of -1 is rejected. Center instances query the database for
// rows with a heartbeat in the last 30 seconds; other instances ask the
// center over HTTP.
func (n *Naming) ActiveServers(datasourceId int64) ([]string, error) {
	if datasourceId == -1 {
		return nil, fmt.Errorf("cluster is empty")
	}

	if n.ctx.IsCenter {
		// A heartbeat within the last 30 seconds counts as alive.
		aliveSince := time.Now().Unix() - 30
		return models.AlertingEngineGetsInstances(n.ctx, "datasource_id = ? and clock > ?", datasourceId, aliveSince)
	}

	path := "/v1/n9e/servers-active?dsid=" + fmt.Sprintf("%d", datasourceId)
	return poster.GetByUrls[[]string](n.ctx, path)
}
// ActiveServersByEngineName returns the servers currently active for this
// instance's engine name. Center instances query the database for rows with a
// heartbeat in the last 30 seconds; other instances ask the center over HTTP.
func (n *Naming) ActiveServersByEngineName() ([]string, error) {
	engine := n.heartbeatConfig.EngineName

	if n.ctx.IsCenter {
		// A heartbeat within the last 30 seconds counts as alive.
		aliveSince := time.Now().Unix() - 30
		return models.AlertingEngineGetsInstances(n.ctx, "engine_cluster = ? and clock > ?", engine, aliveSince)
	}

	return poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?engine_name="+engine)
}
================================================
FILE: alert/naming/leader.go
================================================
package naming
import (
"sort"
"github.com/toolkits/pkg/logger"
)
// IamLeader reports whether this instance is the leader: it must be a center
// instance and its endpoint must be the lexicographically smallest among the
// active servers of its engine name. Lookup failures and an empty server list
// are logged and treated as "not leader".
func (n *Naming) IamLeader() bool {
	if !n.ctx.IsCenter {
		return false
	}

	servers, err := n.ActiveServersByEngineName()
	switch {
	case err != nil:
		logger.Errorf("failed to get active servers: %v", err)
		return false
	case len(servers) == 0:
		logger.Errorf("active servers empty")
		return false
	}

	sort.Strings(servers)
	return servers[0] == n.heartbeatConfig.Endpoint
}
================================================
FILE: alert/pipeline/engine/engine.go
================================================
package engine
import (
"fmt"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/google/uuid"
"github.com/toolkits/pkg/logger"
)
// WorkflowEngine executes event pipelines as node graphs.
type WorkflowEngine struct {
// ctx is passed to every processor invocation and to the execution-record
// persistence call.
ctx *ctx.Context
}
// NewWorkflowEngine returns a WorkflowEngine bound to the given context.
func NewWorkflowEngine(c *ctx.Context) *WorkflowEngine {
return &WorkflowEngine{ctx: c}
}
// Execute runs pipeline against event and returns the (possibly modified)
// event together with the workflow result. The error result is always nil.
//
// A pipeline without nodes succeeds immediately and returns event unchanged.
// Nodes whose RetryInterval or MaxRetries is zero get a default of 1 before
// execution. An execution record is saved only when triggerCtx is non-nil and
// has a non-empty Mode.
func (e *WorkflowEngine) Execute(pipeline *models.EventPipeline, event *models.AlertCurEvent, triggerCtx *models.WorkflowTriggerContext) (*models.AlertCurEvent, *models.WorkflowResult, error) {
	began := time.Now()
	wfCtx := e.initWorkflowContext(pipeline, event, triggerCtx)
	nodes := pipeline.GetWorkflowNodes()
	connections := pipeline.GetWorkflowConnections()

	if len(nodes) == 0 {
		noop := &models.WorkflowResult{
			Event:   event,
			Status:  models.ExecutionStatusSuccess,
			Message: "no nodes to execute",
		}
		return event, noop, nil
	}

	// Index the nodes by id, filling in retry defaults on the way.
	nodeMap := make(map[string]*models.WorkflowNode)
	for i := range nodes {
		node := &nodes[i]
		if node.RetryInterval == 0 {
			node.RetryInterval = 1
		}
		if node.MaxRetries == 0 {
			node.MaxRetries = 1
		}
		nodeMap[node.ID] = node
	}

	result := e.executeDAG(nodeMap, connections, wfCtx)
	result.Event = wfCtx.Event

	elapsedMs := time.Since(began).Milliseconds()
	if triggerCtx != nil && triggerCtx.Mode != "" {
		e.saveExecutionRecord(pipeline, wfCtx, result, triggerCtx, began.Unix(), elapsedMs)
	}
	return wfCtx.Event, result, nil
}
// initWorkflowContext builds the WorkflowContext for one execution: the
// pipeline's inputs with any trigger overrides applied, an empty Vars map for
// passing data between nodes, and metadata describing the run. Streaming is
// enabled only when the trigger context asks for it.
func (e *WorkflowEngine) initWorkflowContext(pipeline *models.EventPipeline, event *models.AlertCurEvent, triggerCtx *models.WorkflowTriggerContext) *models.WorkflowContext {
	// Merge the input parameters: overrides win over pipeline defaults.
	inputs := pipeline.GetInputsMap()
	if triggerCtx != nil && triggerCtx.InputsOverrides != nil {
		for name, override := range triggerCtx.InputsOverrides {
			inputs[name] = override
		}
	}

	wfCtx := &models.WorkflowContext{
		Event:  event,
		Inputs: inputs,
		Vars:   make(map[string]interface{}),
		Metadata: map[string]string{
			"start_time":  fmt.Sprintf("%d", time.Now().Unix()),
			"pipeline_id": fmt.Sprintf("%d", pipeline.ID),
		},
	}

	if triggerCtx != nil {
		wfCtx.Metadata["request_id"] = triggerCtx.RequestID
		wfCtx.Metadata["trigger_mode"] = triggerCtx.Mode
		wfCtx.Metadata["trigger_by"] = triggerCtx.TriggerBy
		wfCtx.Stream = triggerCtx.Stream
	}
	return wfCtx
}
// executeDAG executes the workflow graph with Kahn's algorithm: nodes with an
// in-degree of zero are queued, each executed node decrements the in-degree of
// the targets on the outputs it follows, and targets reaching zero are queued.
//
// It returns early when no start node exists (reported as a circular
// dependency), when a node produces a streaming output, or when a node
// terminates the workflow. NodeResults holds one entry per executed node in
// execution order.
//
// NOTE(review): a failed node without ContinueOnFail only marks the result as
// failed; the loop keeps running and its successors are still scheduled —
// confirm this is intended.
func (e *WorkflowEngine) executeDAG(nodeMap map[string]*models.WorkflowNode, connections models.Connections, wfCtx *models.WorkflowContext) *models.WorkflowResult {
result := &models.WorkflowResult{
Status: models.ExecutionStatusSuccess,
NodeResults: make([]*models.NodeExecutionResult, 0),
Stream: wfCtx.Stream, // inherit the streaming setting from the context
}
// Compute the in-degree of every node.
inDegree := make(map[string]int)
for nodeID := range nodeMap {
inDegree[nodeID] = 0
}
// Walk the connections and count incoming edges.
for _, nodeConns := range connections {
for _, targets := range nodeConns.Main {
for _, target := range targets {
inDegree[target.Node]++
}
}
}
// Collect every node with in-degree 0 (the start nodes).
queue := make([]string, 0)
for nodeID, degree := range inDegree {
if degree == 0 {
queue = append(queue, nodeID)
}
}
// No start node at all means the graph has a circular dependency.
if len(queue) == 0 && len(nodeMap) > 0 {
result.Status = models.ExecutionStatusFailed
result.Message = "workflow has circular dependency"
return result
}
// Nodes that have already been executed.
executed := make(map[string]bool)
// Branch selection recorded per node.
branchResults := make(map[string]*int)
for len(queue) > 0 {
// Pop the node at the head of the queue.
nodeID := queue[0]
queue = queue[1:]
// Skip nodes that were already executed.
if executed[nodeID] {
continue
}
// Ids that appear only as connection targets have no node to run.
node, exists := nodeMap[nodeID]
if !exists {
continue
}
// Execute the node.
nodeResult, nodeOutput := e.executeNode(node, wfCtx)
result.NodeResults = append(result.NodeResults, nodeResult)
if nodeOutput != nil && nodeOutput.Stream && nodeOutput.StreamChan != nil {
// A streaming node is usually the last node.
// Hand its StreamChan to the WorkflowResult directly instead of blocking on it.
result.Stream = true
result.StreamChan = nodeOutput.StreamChan
result.Event = wfCtx.Event
result.Status = "streaming"
result.Message = fmt.Sprintf("streaming output from node: %s", node.Name)
// Mark the node itself as streaming.
nodeResult.Status = "streaming"
nodeResult.Message = "streaming in progress"
// Return right away so the API layer can serve the streaming response.
return result
}
executed[nodeID] = true
// Remember the branch selection, if the node made one.
if nodeResult.BranchIndex != nil {
branchResults[nodeID] = nodeResult.BranchIndex
}
// Check the execution status.
if nodeResult.Status == "failed" {
if !node.ContinueOnFail {
result.Status = models.ExecutionStatusFailed
result.ErrorNode = nodeID
result.Message = fmt.Sprintf("node %s failed: %s", node.Name, nodeResult.Error)
}
}
// Check whether the workflow was terminated.
if nodeResult.Status == "terminated" {
result.Message = fmt.Sprintf("workflow terminated at node %s", node.Name)
return result
}
// Update the in-degree of the successor nodes. Targets on outputs that are
// not followed are not decremented, so they are never queued from here.
if nodeConns, ok := connections[nodeID]; ok {
for outputIndex, targets := range nodeConns.Main {
// Check whether this output should be followed.
if !e.shouldFollowBranch(nodeID, outputIndex, branchResults) {
continue
}
for _, target := range targets {
inDegree[target.Node]--
if inDegree[target.Node] == 0 {
queue = append(queue, target.Node)
}
}
}
}
}
return result
}
// executeNode executes a single node.
// It returns the node's execution result and the node output (used by the
// caller to detect streaming output; nil when there is none).
//
// Disabled nodes are reported as "skipped". A processor lookup failure is
// reported as "failed" without running anything. Otherwise the processor is
// run, retried up to node.MaxRetries times (0 when RetryOnFail is unset) with
// node.RetryInterval seconds between attempts.
//
// NOTE(review): assignments to wfCtx below rebind only this function's
// parameter; the caller keeps its own pointer — confirm processors are
// expected to mutate the context in place.
func (e *WorkflowEngine) executeNode(node *models.WorkflowNode, wfCtx *models.WorkflowContext) (*models.NodeExecutionResult, *models.NodeOutput) {
startTime := time.Now()
nodeResult := &models.NodeExecutionResult{
NodeID: node.ID,
NodeName: node.Name,
NodeType: node.Type,
StartedAt: startTime.Unix(),
}
var nodeOutput *models.NodeOutput
// Skip disabled nodes.
if node.Disabled {
nodeResult.Status = "skipped"
nodeResult.Message = "node is disabled"
nodeResult.FinishedAt = time.Now().Unix()
nodeResult.DurationMs = time.Since(startTime).Milliseconds()
return nodeResult, nil
}
// Look up the processor for this node type.
processor, err := models.GetProcessorByType(node.Type, node.Config)
if err != nil {
nodeResult.Status = "failed"
nodeResult.Error = fmt.Sprintf("failed to get processor: %v", err)
nodeResult.FinishedAt = time.Now().Unix()
nodeResult.DurationMs = time.Since(startTime).Milliseconds()
return nodeResult, nil
}
// Run the processor (with retries).
var retries int
maxRetries := node.MaxRetries
if !node.RetryOnFail {
maxRetries = 0
}
for retries <= maxRetries {
// Branch processors are handled through ProcessWithBranch.
if branchProcessor, ok := processor.(models.BranchProcessor); ok {
output, err := branchProcessor.ProcessWithBranch(e.ctx, wfCtx)
if err != nil {
if retries < maxRetries {
retries++
time.Sleep(time.Duration(node.RetryInterval) * time.Second)
continue
}
nodeResult.Status = "failed"
nodeResult.Error = err.Error()
} else {
nodeResult.Status = "success"
if output != nil {
nodeOutput = output
if output.WfCtx != nil {
wfCtx = output.WfCtx
}
nodeResult.Message = output.Message
nodeResult.BranchIndex = output.BranchIndex
if output.Terminate {
nodeResult.Status = "terminated"
}
}
}
break
}
// Ordinary processor.
newWfCtx, msg, err := processor.Process(e.ctx, wfCtx)
if err != nil {
if retries < maxRetries {
retries++
time.Sleep(time.Duration(node.RetryInterval) * time.Second)
continue
}
nodeResult.Status = "failed"
nodeResult.Error = err.Error()
} else {
nodeResult.Status = "success"
nodeResult.Message = msg
if newWfCtx != nil {
wfCtx = newWfCtx
// Detect the streaming-output marker.
if newWfCtx.Stream && newWfCtx.StreamChan != nil {
nodeOutput = &models.NodeOutput{
WfCtx: newWfCtx,
Message: msg,
Stream: true,
StreamChan: newWfCtx.StreamChan,
}
}
}
// A dropped event (nil context or nil Event) terminates the workflow.
if newWfCtx == nil || newWfCtx.Event == nil {
nodeResult.Status = "terminated"
nodeResult.Message = msg
}
}
break
}
nodeResult.FinishedAt = time.Now().Unix()
nodeResult.DurationMs = time.Since(startTime).Milliseconds()
logger.Infof("workflow: executed node %s (type=%s) status=%s msg=%s duration=%dms",
node.Name, node.Type, nodeResult.Status, nodeResult.Message, nodeResult.DurationMs)
return nodeResult, nodeOutput
}
// shouldFollowBranch decides whether the output at outputIndex of node nodeID
// should be followed:
//   - a node without a recorded branch result is not a branch node and only
//     its first output (index 0) is followed;
//   - a recorded nil branch index follows every output (default branch);
//   - otherwise only the output matching the selected index is followed.
func (e *WorkflowEngine) shouldFollowBranch(nodeID string, outputIndex int, branchResults map[string]*int) bool {
	selected, isBranchNode := branchResults[nodeID]
	switch {
	case !isBranchNode:
		return outputIndex == 0
	case selected == nil:
		return true
	default:
		return *selected == outputIndex
	}
}
// saveExecutionRecord persists a record of one pipeline execution. The record
// id is the trigger's RequestID, or a freshly generated UUID when that is
// empty. Failures while serialising node results or inputs, or while saving,
// are logged and otherwise ignored.
func (e *WorkflowEngine) saveExecutionRecord(pipeline *models.EventPipeline, wfCtx *models.WorkflowContext, result *models.WorkflowResult, triggerCtx *models.WorkflowTriggerContext, startTime int64, duration int64) {
	id := triggerCtx.RequestID
	if id == "" {
		id = uuid.New().String()
	}

	record := &models.EventPipelineExecution{
		ID:           id,
		PipelineID:   pipeline.ID,
		PipelineName: pipeline.Name,
		Mode:         triggerCtx.Mode,
		Status:       result.Status,
		ErrorMessage: result.Message,
		ErrorNode:    result.ErrorNode,
		CreatedAt:    startTime,
		FinishedAt:   time.Now().Unix(),
		DurationMs:   duration,
		TriggerBy:    triggerCtx.TriggerBy,
	}
	// The event may have been dropped by a processor.
	if ev := wfCtx.Event; ev != nil {
		record.EventID = ev.Id
	}

	err := record.SetNodeResults(result.NodeResults)
	if err != nil {
		logger.Errorf("workflow: failed to set node results: pipeline_id=%d, error=%v", pipeline.ID, err)
	}

	err = record.SetInputsSnapshot(wfCtx.Inputs)
	if err != nil {
		logger.Errorf("workflow: failed to set inputs snapshot: pipeline_id=%d, error=%v", pipeline.ID, err)
	}

	err = models.CreateEventPipelineExecution(e.ctx, record)
	if err != nil {
		logger.Errorf("workflow: failed to save execution record: pipeline_id=%d, error=%v", pipeline.ID, err)
	}
}
================================================
FILE: alert/pipeline/pipeline.go
================================================
package pipeline
import (
_ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/aisummary"
_ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback"
_ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/eventdrop"
_ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/eventupdate"
_ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/logic"
_ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/relabel"
)
// Init has an empty body. Importing this package already pulls in the
// processor packages through the blank imports of this file; calling Init
// gives callers a symbol to reference so that the import is kept.
func Init() {
}
================================================
FILE: alert/pipeline/processor/aisummary/ai_summary.go
================================================
package aisummary
import (
"bytes"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"strings"
"text/template"
"time"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
)
const (
// HTTP_STATUS_SUCCESS_MAX is the highest HTTP status code that
// generateAISummary accepts; anything above it is reported as an error.
HTTP_STATUS_SUCCESS_MAX = 299
)
// AISummaryConfig is the configuration of the "ai_summary" processor.
type AISummaryConfig struct {
// HTTPConfig carries the endpoint URL, headers, timeout, proxy and TLS
// settings, plus the lazily created HTTP client.
callback.HTTPConfig
// ModelName is sent as the "model" field of the request.
ModelName string `json:"model_name"`
// APIKey is sent as a Bearer token in the Authorization header.
APIKey string `json:"api_key"`
// PromptTemplate is a text/template rendered against the workflow context;
// $event and $inputs are predefined.
PromptTemplate string `json:"prompt_template"`
// CustomParams are merged into the request body after type conversion.
CustomParams map[string]interface{} `json:"custom_params"`
}
// Message is one entry of the "messages" array sent to the model endpoint.
type Message struct {
Role string `json:"role"`
Content string `json:"content"`
}
// ChatCompletionResponse is the subset of the model endpoint's JSON response
// that is decoded: the message content of each returned choice.
type ChatCompletionResponse struct {
Choices []struct {
Message struct {
Content string `json:"content"`
} `json:"message"`
} `json:"choices"`
}
// init registers the processor under the type name "ai_summary".
func init() {
models.RegisterProcessor("ai_summary", &AISummaryConfig{})
}
// Init builds a new AISummaryConfig processor from settings by round-tripping
// them through JSON. The receiver itself is not modified.
func (c *AISummaryConfig) Init(settings interface{}) (models.Processor, error) {
	return common.InitProcessor[*AISummaryConfig](settings)
}
// Process renders the prompt for the current event, asks the configured model
// for a summary and stores it in the event's annotations under "ai_summary",
// refreshing the serialised Annotations string as well. On any failure the
// context is returned unchanged together with the error.
func (c *AISummaryConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	event := wfCtx.Event

	// The HTTP client is created on first use and then reused.
	if c.Client == nil {
		err := c.initHTTPClient()
		if err != nil {
			return wfCtx, "", fmt.Errorf("failed to initialize HTTP client: %v processor: %v", err, c)
		}
	}

	// Prepare the alert event information (the rendered prompt).
	prompt, err := c.prepareEventInfo(wfCtx)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to prepare event info: %v processor: %v", err, c)
	}

	// Call the AI model to generate the summary.
	summary, err := c.generateAISummary(prompt)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to generate AI summary: %v processor: %v", err, c)
	}

	// Attach the summary to the annotations map, creating it when needed.
	annotations := event.AnnotationsJSON
	if annotations == nil {
		annotations = make(map[string]string)
		event.AnnotationsJSON = annotations
	}
	annotations["ai_summary"] = summary

	// Keep the serialised Annotations field in sync with the map.
	encoded, err := json.Marshal(annotations)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to marshal annotations: %v processor: %v", err, c)
	}
	event.Annotations = string(encoded)

	return wfCtx, "", nil
}
// initHTTPClient creates c.Client from the HTTP settings: TLS verification is
// skipped when SkipSSLVerify is set, requests go through Proxy when it is
// non-empty, and Timeout is interpreted in milliseconds. An unparsable proxy
// URL is reported as an error and leaves c.Client untouched.
func (c *AISummaryConfig) initHTTPClient() error {
	var proxy func(*http.Request) (*url.URL, error)
	if c.Proxy != "" {
		parsed, err := url.Parse(c.Proxy)
		if err != nil {
			return fmt.Errorf("failed to parse proxy url: %v", err)
		}
		proxy = http.ProxyURL(parsed)
	}

	c.Client = &http.Client{
		Timeout: time.Duration(c.Timeout) * time.Millisecond,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: c.SkipSSLVerify},
			Proxy:           proxy,
		},
	}
	return nil
}
// prepareEventInfo renders PromptTemplate against wfCtx and returns the
// resulting text. The template is prefixed with definitions of $event and
// $inputs so prompts can refer to them directly.
func (c *AISummaryConfig) prepareEventInfo(wfCtx *models.WorkflowContext) (string, error) {
	const preamble = "{{$event := .Event}}" + "{{$inputs := .Inputs}}"

	tmpl, err := template.New("prompt").Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(preamble + c.PromptTemplate)
	if err != nil {
		return "", fmt.Errorf("failed to parse prompt template: %v", err)
	}

	var rendered bytes.Buffer
	if err := tmpl.Execute(&rendered, wfCtx); err != nil {
		return "", fmt.Errorf("failed to execute prompt template: %v", err)
	}
	return rendered.String(), nil
}
// generateAISummary sends eventInfo as a single user message to the
// configured chat-completion endpoint and returns the content of the first
// choice. Custom parameters are merged into the request body and may override
// the base fields. Status codes above HTTP_STATUS_SUCCESS_MAX and responses
// without choices are reported as errors.
func (c *AISummaryConfig) generateAISummary(eventInfo string) (string, error) {
	// Base request parameters.
	payload := map[string]interface{}{
		"model":    c.ModelName,
		"messages": []Message{{Role: "user", Content: eventInfo}},
	}

	// Merge the custom parameters.
	for name, raw := range c.CustomParams {
		converted, err := convertCustomParam(raw)
		if err != nil {
			return "", fmt.Errorf("failed to convert custom param %s: %v", name, err)
		}
		payload[name] = converted
	}

	// Serialise the request body.
	encoded, err := json.Marshal(payload)
	if err != nil {
		return "", fmt.Errorf("failed to marshal request body: %v", err)
	}

	// Build the HTTP request.
	req, err := http.NewRequest("POST", c.URL, bytes.NewBuffer(encoded))
	if err != nil {
		return "", fmt.Errorf("failed to create request: %v", err)
	}

	// Default headers first, so configured headers can override them.
	req.Header.Set("Authorization", "Bearer "+c.APIKey)
	req.Header.Set("Content-Type", "application/json")
	for name, val := range c.Headers {
		req.Header.Set(name, val)
	}

	// Send the request.
	resp, err := c.Client.Do(req)
	if err != nil {
		return "", fmt.Errorf("failed to send request: %v", err)
	}
	defer resp.Body.Close()

	// Check the response status code.
	if resp.StatusCode > HTTP_STATUS_SUCCESS_MAX {
		errBody, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("unexpected status code: %d, body: %s", resp.StatusCode, string(errBody))
	}

	// Read the response.
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read response body: %v", err)
	}

	// Decode the response.
	var completion ChatCompletionResponse
	if err := json.Unmarshal(respBody, &completion); err != nil {
		return "", fmt.Errorf("failed to unmarshal response: %v", err)
	}
	if len(completion.Choices) == 0 {
		return "", fmt.Errorf("no response from AI model")
	}
	return completion.Choices[0].Message.Content, nil
}
// convertCustomParam converts a parameter supplied by the front end into the
// type it most plausibly represents, so it is encoded correctly in the JSON
// request body.
//
// Non-string values (including nil) are returned unchanged. A string is tried,
// in order, as:
//  1. a base-10 integer (returned as int64, exact for the full int64 range);
//  2. a finite floating-point number (returned as int64 when it is a whole
//     number that fits in int64, otherwise as float64);
//  3. a boolean accepted by strconv.ParseBool;
//  4. a JSON array (when it starts with '[') or JSON object (when it starts
//     with '{').
//
// Anything else — including "NaN", "Inf" and "Infinity", which encoding/json
// cannot marshal as numbers — is returned as the original string. The error
// result is always nil; it is kept for interface stability.
func convertCustomParam(value interface{}) (interface{}, error) {
	str, isString := value.(string)
	if !isString {
		return value, nil
	}

	// Parse integers directly: going through float64 would silently round
	// values above 2^53.
	if i, err := strconv.ParseInt(str, 10, 64); err == nil {
		return i, nil
	}

	if f, err := strconv.ParseFloat(str, 64); err == nil {
		// f-f is 0 only for finite values; NaN and ±Inf stay strings because
		// json.Marshal rejects them.
		if f-f != 0 {
			return value, nil
		}
		// Whole numbers become int64, but only inside the int64 range, where
		// the float-to-int conversion is well defined.
		const int64Limit = float64(1 << 63)
		if f >= -int64Limit && f < int64Limit && f == float64(int64(f)) {
			return int64(f), nil
		}
		return f, nil
	}

	if b, err := strconv.ParseBool(str); err == nil {
		return b, nil
	}

	trimmed := strings.TrimSpace(str)
	if strings.HasPrefix(trimmed, "[") {
		var arr []interface{}
		if err := json.Unmarshal([]byte(str), &arr); err == nil {
			return arr, nil
		}
	}
	if strings.HasPrefix(trimmed, "{") {
		var obj map[string]interface{}
		if err := json.Unmarshal([]byte(str), &obj); err == nil {
			return obj, nil
		}
	}

	return value, nil
}
================================================
FILE: alert/pipeline/processor/aisummary/ai_summary_test.go
================================================
package aisummary
import (
"testing"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/stretchr/testify/assert"
)
// TestAISummaryConfig_Process exercises prompt rendering, processor
// initialisation and a full Process call.
// NOTE(review): Process sends a real HTTP request to the configured URL with
// a placeholder API key, so the final assertions presumably only pass with
// network access and a valid key — confirm before running in CI.
func TestAISummaryConfig_Process(t *testing.T) {
// Build the test configuration.
config := &AISummaryConfig{
HTTPConfig: callback.HTTPConfig{
URL: "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
Timeout: 30000,
SkipSSLVerify: true,
Headers: map[string]string{
"Content-Type": "application/json",
},
},
ModelName: "gemini-2.0-flash",
APIKey: "*",
PromptTemplate: "告警规则:{{$event.RuleName}}\n严重程度:{{$event.Severity}}",
CustomParams: map[string]interface{}{
"temperature": 0.7,
"max_tokens": 2000,
"top_p": 0.9,
},
}
// Build the test event.
event := &models.AlertCurEvent{
RuleName: "Test Rule",
Severity: 1,
TagsMap: map[string]string{
"host": "test-host",
},
AnnotationsJSON: map[string]string{
"description": "Test alert",
},
}
// Build the WorkflowContext.
wfCtx := &models.WorkflowContext{
Event: event,
Inputs: map[string]string{},
}
// Check template rendering.
eventInfo, err := config.prepareEventInfo(wfCtx)
assert.NoError(t, err)
assert.Contains(t, eventInfo, "Test Rule")
assert.Contains(t, eventInfo, "1")
// Check processor initialisation.
processor, err := config.Init(config)
assert.NoError(t, err)
assert.NotNil(t, processor)
// Check the Process call itself.
result, _, err := processor.Process(&ctx.Context{}, wfCtx)
assert.NoError(t, err)
assert.NotNil(t, result)
assert.NotEmpty(t, result.Event.AnnotationsJSON["ai_summary"])
// Print the processing result.
t.Log("\n=== 处理结果 ===")
t.Logf("告警规则: %s", result.Event.RuleName)
t.Logf("严重程度: %d", result.Event.Severity)
t.Logf("标签: %v", result.Event.TagsMap)
t.Logf("原始注释: %v", result.Event.AnnotationsJSON["description"])
t.Logf("AI总结: %s", result.Event.AnnotationsJSON["ai_summary"])
}
// TestConvertCustomParam checks, case by case, which Go value
// convertCustomParam produces for typical front-end inputs.
func TestConvertCustomParam(t *testing.T) {
	type testCase struct {
		name     string
		input    interface{}
		expected interface{}
		hasError bool
	}

	cases := []testCase{
		{name: "nil value", input: nil, expected: nil, hasError: false},
		{name: "string number to int64", input: "123", expected: int64(123), hasError: false},
		{name: "string float to float64", input: "123.45", expected: 123.45, hasError: false},
		{name: "string boolean to bool", input: "true", expected: true, hasError: false},
		{name: "string false to bool", input: "false", expected: false, hasError: false},
		{
			name:     "JSON array string to slice",
			input:    `["a", "b", "c"]`,
			expected: []interface{}{"a", "b", "c"},
			hasError: false,
		},
		{
			name:     "JSON object string to map",
			input:    `{"key": "value", "num": 123}`,
			expected: map[string]interface{}{"key": "value", "num": float64(123)},
			hasError: false,
		},
		{name: "plain string remains string", input: "hello world", expected: "hello world", hasError: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := convertCustomParam(tc.input)
			if tc.hasError {
				assert.Error(t, err)
				return
			}
			assert.NoError(t, err)
			assert.Equal(t, tc.expected, got)
		})
	}
}
================================================
FILE: alert/pipeline/processor/callback/callback.go
================================================
package callback
import (
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/utils"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// HTTPConfig holds the HTTP settings shared by the processors that call an
// external endpoint.
type HTTPConfig struct {
URL string `json:"url"`
Method string `json:"method,omitempty"`
Body string `json:"body,omitempty"`
// Headers are extra request headers; note the JSON key is "header".
Headers map[string]string `json:"header"`
// AuthUsername and AuthPassword enable HTTP basic auth when both are set.
AuthUsername string `json:"auth_username"`
AuthPassword string `json:"auth_password"`
Timeout int `json:"timeout"` // in milliseconds
SkipSSLVerify bool `json:"skip_ssl_verify"`
Proxy string `json:"proxy"`
// Client is created lazily by the processors and never serialised.
Client *http.Client `json:"-"`
}
// CallbackConfig is the configuration of the "callback" processor; it consists
// solely of the shared HTTP settings.
type CallbackConfig struct {
HTTPConfig
}
// init registers the processor under the type name "callback".
func init() {
models.RegisterProcessor("callback", &CallbackConfig{})
}
// Init builds a new CallbackConfig processor from settings by round-tripping
// them through JSON. The receiver itself is not modified.
func (c *CallbackConfig) Init(settings interface{}) (models.Processor, error) {
	return common.InitProcessor[*CallbackConfig](settings)
}
// Process POSTs the JSON-encoded event to the configured URL (rendered as a
// template against wfCtx) and logs the response body at debug level. The
// workflow context is returned unchanged in every case; on failure the error
// describes the step that failed.
//
// The request is always a POST whose body is the event; c.Method and c.Body
// are not consulted. Basic auth is applied only when both username and
// password are set. The response status code is not inspected.
func (c *CallbackConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	event := wfCtx.Event

	// Build the HTTP client lazily on first use and cache it on the config.
	if c.Client == nil {
		transport := &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: c.SkipSSLVerify},
		}
		if c.Proxy != "" {
			proxyURL, err := url.Parse(c.Proxy)
			if err != nil {
				return wfCtx, "", fmt.Errorf("failed to parse proxy url: %v processor: %v", err, c)
			}
			transport.Proxy = http.ProxyURL(proxyURL)
		}
		c.Client = &http.Client{
			Timeout:   time.Duration(c.Timeout) * time.Millisecond,
			Transport: transport,
		}
	}

	// Configured headers override the default Content-Type.
	headers := make(map[string]string)
	headers["Content-Type"] = "application/json"
	for k, v := range c.Headers {
		headers[k] = v
	}

	target, err := utils.TplRender(wfCtx, c.URL)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to render url template: %v processor: %v", err, c)
	}

	body, err := json.Marshal(event)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to marshal event: %v processor: %v", err, c)
	}

	req, err := http.NewRequest("POST", target, strings.NewReader(string(body)))
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to create request: %v processor: %v", err, c)
	}
	for k, v := range headers {
		req.Header.Set(k, v)
	}
	if c.AuthUsername != "" && c.AuthPassword != "" {
		req.SetBasicAuth(c.AuthUsername, c.AuthPassword)
	}

	resp, err := c.Client.Do(req)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to send request: %v processor: %v", err, c)
	}
	// The body must be closed, otherwise the underlying connection leaks and
	// cannot be reused by the cached client.
	defer resp.Body.Close()

	b, err := io.ReadAll(resp.Body)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to read response body: %v processor: %v", err, c)
	}
	logger.Debugf("callback processor response body: %s", string(b))

	return wfCtx, "callback success", nil
}
================================================
FILE: alert/pipeline/processor/common/common.go
================================================
package common
import (
"encoding/json"
)
// InitProcessor is a generic helper for initialising processors: it marshals
// settings to JSON and unmarshals the result into a fresh value of type T.
// On any marshal or unmarshal error the zero value of T is returned together
// with the error.
//
// T is intended to be a models.Processor implementation, although the type
// constraint itself accepts any type.
func InitProcessor[T any](settings interface{}) (T, error) {
	var (
		decoded T
		zero    T
	)

	raw, err := json.Marshal(settings)
	if err == nil {
		err = json.Unmarshal(raw, &decoded)
	}
	if err != nil {
		return zero, err
	}
	return decoded, nil
}
================================================
FILE: alert/pipeline/processor/eventdrop/event_drop.go
================================================
package eventdrop
import (
"bytes"
"fmt"
"strings"
texttemplate "text/template"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/toolkits/pkg/logger"
)
// EventDropConfig is the configuration of the "event_drop" processor.
type EventDropConfig struct {
// Content is a text/template; the event is dropped when it renders to "true".
Content string `json:"content"`
}
// init registers the processor under the type name "event_drop".
func init() {
models.RegisterProcessor("event_drop", &EventDropConfig{})
}
// Init builds a new EventDropConfig processor from settings by round-tripping
// them through JSON. The receiver itself is not modified.
func (c *EventDropConfig) Init(settings interface{}) (models.Processor, error) {
	return common.InitProcessor[*EventDropConfig](settings)
}
// Process renders the Content template against wfCtx and drops the event
// (sets wfCtx.Event to nil) when the trimmed output is exactly "true".
//
// This processor exists for filtering logic that tag and attribute filters
// cannot express. $event, $labels, $value and $inputs are predefined for the
// template.
func (c *EventDropConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	event := wfCtx.Event

	const preamble = "{{ $event := .Event }}" +
		"{{ $labels := .Event.TagsMap }}" +
		"{{ $value := .Event.TriggerValue }}" +
		"{{ $inputs := .Inputs }}"

	tpl, err := texttemplate.New("eventdrop").Funcs(tplx.TemplateFuncMap).Parse(preamble + c.Content)
	if err != nil {
		return wfCtx, "", fmt.Errorf("processor failed to parse template: %v processor: %v", err, c)
	}

	var rendered bytes.Buffer
	if err = tpl.Execute(&rendered, wfCtx); err != nil {
		return wfCtx, "", fmt.Errorf("processor failed to execute template: %v processor: %v", err, c)
	}

	verdict := strings.TrimSpace(rendered.String())
	logger.Infof("processor eventdrop result: %v", verdict)

	if verdict != "true" {
		return wfCtx, "drop event failed", nil
	}

	// Clearing the event is how a drop is signalled to the caller.
	wfCtx.Event = nil
	logger.Infof("processor eventdrop drop event: %s", event.Hash)
	return wfCtx, "drop event success", nil
}
================================================
FILE: alert/pipeline/processor/eventupdate/event_update.go
================================================
package eventupdate
import (
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// EventUpdateConfig is the configuration of the "event_update" processor; it
// consists solely of the shared HTTP settings.
type EventUpdateConfig struct {
callback.HTTPConfig
}
// init registers the processor under the type name "event_update".
func init() {
models.RegisterProcessor("event_update", &EventUpdateConfig{})
}
// Init builds a new EventUpdateConfig processor from settings by
// round-tripping them through JSON. The receiver itself is not modified.
func (c *EventUpdateConfig) Init(settings interface{}) (models.Processor, error) {
	return common.InitProcessor[*EventUpdateConfig](settings)
}
// Process POSTs the current event as JSON to c.URL and decodes the JSON
// response back into the same event, letting an external service rewrite it.
//
// The HTTP client is created lazily on first use from the embedded HTTP
// settings (TLS verification, optional proxy, timeout in milliseconds) and
// cached on c.Client. Headers from c.Headers are applied after the default
// Content-Type, so a configured Content-Type always takes precedence. Basic
// auth is sent only when both username and password are set.
//
// Every failure is returned as an error together with the incoming wfCtx.
// The response status code is not inspected.
func (c *EventUpdateConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	event := wfCtx.Event

	if c.Client == nil {
		transport := &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: c.SkipSSLVerify},
		}
		if c.Proxy != "" {
			proxyURL, err := url.Parse(c.Proxy)
			if err != nil {
				return wfCtx, "", fmt.Errorf("failed to parse proxy url: %v processor: %v", err, c)
			}
			transport.Proxy = http.ProxyURL(proxyURL)
		}
		c.Client = &http.Client{
			Timeout:   time.Duration(c.Timeout) * time.Millisecond,
			Transport: transport,
		}
	}

	body, err := json.Marshal(event)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to marshal event: %v processor: %v", err, c)
	}

	req, err := http.NewRequest("POST", c.URL, strings.NewReader(string(body)))
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to create request: %v processor: %v", err, c)
	}

	// Set the default first so user-supplied headers deterministically win,
	// even when they differ from it only by letter case.
	req.Header.Set("Content-Type", "application/json")
	for k, v := range c.Headers {
		req.Header.Set(k, v)
	}

	if c.AuthUsername != "" && c.AuthPassword != "" {
		req.SetBasicAuth(c.AuthUsername, c.AuthPassword)
	}

	resp, err := c.Client.Do(req)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to send request: %v processor: %v", err, c)
	}
	// The body must be closed, otherwise the underlying connection leaks and
	// cannot be reused by the transport.
	defer resp.Body.Close()

	b, err := io.ReadAll(resp.Body)
	if err != nil {
		return wfCtx, "", fmt.Errorf("failed to read response body: %v processor: %v", err, c)
	}
	logger.Debugf("event update processor response body: %s", string(b))

	// event is the pointer held by wfCtx.Event; when it is non-nil the decoder
	// fills the pointed-to struct, so the workflow event is updated in place.
	if err = json.Unmarshal(b, &event); err != nil {
		return wfCtx, "", fmt.Errorf("failed to unmarshal response body: %v processor: %v", err, c)
	}

	return wfCtx, "", nil
}
================================================
FILE: alert/pipeline/processor/logic/if.go
================================================
package logic
import (
"bytes"
"fmt"
"strings"
"text/template"
alertCommon "github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
)
// Condition mode constants.
const (
ConditionModeExpression = "expression" // expression mode (the default)
ConditionModeTags = "tags" // label/attribute mode
)
// IfConfig is the configuration of the If condition processor.
type IfConfig struct {
// Condition mode: expression or tags (labels/attributes).
Mode string `json:"mode,omitempty"`
// Expression-mode settings.
// Condition expression (Go template syntax is supported).
// Example: {{ if eq .Severity 1 }}true{{ end }}
Condition string `json:"condition,omitempty"`
// Tags-mode (label/attribute) settings.
LabelKeys []models.TagFilter `json:"label_keys,omitempty"` // label filters
Attributes []models.TagFilter `json:"attributes,omitempty"` // attribute filters
// Internal use: the parsed filters, filled in by Init.
parsedLabelKeys []models.TagFilter `json:"-"`
parsedAttributes []models.TagFilter `json:"-"`
}
// init registers this processor under the "logic.if" type name.
func init() {
	models.RegisterProcessor("logic.if", new(IfConfig))
}
// Init builds an IfConfig from the raw settings and pre-parses its label and
// attribute filters so that evaluation does not have to parse them per event.
//
// A filter list is parsed only when it is non-empty. A parse failure is
// returned as an error naming the offending list.
func (c *IfConfig) Init(settings interface{}) (models.Processor, error) {
	cfg, err := common.InitProcessor[*IfConfig](settings)
	if err != nil {
		return nil, err
	}

	// prepare works on a copy of the slice so the configured filters are not
	// modified, defaults an empty Func to Op, and parses the result.
	prepare := func(filters []models.TagFilter) ([]models.TagFilter, error) {
		working := make([]models.TagFilter, len(filters))
		copy(working, filters)
		for i := range working {
			if working[i].Func == "" {
				working[i].Func = working[i].Op
			}
		}
		return models.ParseTagFilter(working)
	}

	if len(cfg.LabelKeys) > 0 {
		if cfg.parsedLabelKeys, err = prepare(cfg.LabelKeys); err != nil {
			return nil, fmt.Errorf("failed to parse label_keys: %v", err)
		}
	}

	if len(cfg.Attributes) > 0 {
		if cfg.parsedAttributes, err = prepare(cfg.Attributes); err != nil {
			return nil, fmt.Errorf("failed to parse attributes: %v", err)
		}
	}

	return cfg, nil
}
// Process implements the Processor interface (kept for the legacy,
// non-branching mode). It evaluates the condition and reports which branch
// matched in the message; wfCtx is returned unchanged.
func (c *IfConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	matched, err := c.evaluateCondition(wfCtx)
	switch {
	case err != nil:
		return wfCtx, "", fmt.Errorf("if processor: failed to evaluate condition: %v", err)
	case matched:
		return wfCtx, "condition matched (true branch)", nil
	default:
		return wfCtx, "condition not matched (false branch)", nil
	}
}
// ProcessWithBranch implements the BranchProcessor interface. A true
// condition selects output 0 (the true branch); a false condition selects
// output 1 (the false branch).
func (c *IfConfig) ProcessWithBranch(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.NodeOutput, error) {
	matched, err := c.evaluateCondition(wfCtx)
	if err != nil {
		return nil, fmt.Errorf("if processor: failed to evaluate condition: %v", err)
	}

	// Start from the false branch and switch over when the condition holds.
	branch, msg := 1, "condition not matched (false branch)"
	if matched {
		branch, msg = 0, "condition matched (true branch)"
	}

	return &models.NodeOutput{
		WfCtx:       wfCtx,
		BranchIndex: &branch,
		Message:     msg,
	}, nil
}
// evaluateCondition evaluates the configured condition. Only the tags mode is
// special-cased; an empty or unrecognised mode falls back to expression mode.
func (c *IfConfig) evaluateCondition(wfCtx *models.WorkflowContext) (bool, error) {
	if c.Mode == ConditionModeTags {
		return c.evaluateTagsCondition(wfCtx.Event)
	}
	return c.evaluateExpressionCondition(wfCtx)
}
// evaluateExpressionCondition renders c.Condition as a Go template against
// wfCtx and reports whether the output, trimmed and lower-cased, is "true" or
// "1". An empty condition counts as a match.
//
// The template may use $event, $labels, $value and $inputs.
func (c *IfConfig) evaluateExpressionCondition(wfCtx *models.WorkflowContext) (bool, error) {
	if c.Condition == "" {
		return true, nil
	}

	// Variable definitions prepended to every condition template.
	const prelude = "{{ $event := .Event }}" +
		"{{ $labels := .Event.TagsMap }}" +
		"{{ $value := .Event.TriggerValue }}" +
		"{{ $inputs := .Inputs }}"

	tpl, err := template.New("if_condition").Funcs(tplx.TemplateFuncMap).Parse(prelude + c.Condition)
	if err != nil {
		return false, err
	}

	var out bytes.Buffer
	if err = tpl.Execute(&out, wfCtx); err != nil {
		return false, err
	}

	switch strings.TrimSpace(strings.ToLower(out.String())) {
	case "true", "1":
		return true, nil
	}
	return false, nil
}
// evaluateTagsCondition matches the event against the parsed label and
// attribute filters. Both configured filter sets must match; when neither is
// configured the condition is true.
func (c *IfConfig) evaluateTagsCondition(event *models.AlertCurEvent) (bool, error) {
	wantLabels := len(c.parsedLabelKeys) > 0
	wantAttrs := len(c.parsedAttributes) > 0

	// Labels are matched against the event's TagsMap.
	if wantLabels {
		labels := event.TagsMap
		if labels == nil {
			labels = map[string]string{}
		}
		if !alertCommon.MatchTags(labels, c.parsedLabelKeys) {
			return false, nil
		}
	}

	// Attributes are matched against JsonTagsAndValue (all JSON fields).
	if wantAttrs && !alertCommon.MatchTags(event.JsonTagsAndValue(), c.parsedAttributes) {
		return false, nil
	}

	// Reached when every configured filter matched, or none was configured.
	return true, nil
}
================================================
FILE: alert/pipeline/processor/logic/switch.go
================================================
package logic
import (
"bytes"
"fmt"
"strings"
"text/template"
alertCommon "github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
)
// SwitchCase defines one branch of a Switch processor.
type SwitchCase struct {
// Condition mode: expression or tags (labels/attributes).
Mode string `json:"mode,omitempty"`
// Expression-mode settings.
// Condition expression (Go template syntax is supported).
Condition string `json:"condition,omitempty"`
// Tags-mode (label/attribute) settings.
LabelKeys []models.TagFilter `json:"label_keys,omitempty"` // label filters
Attributes []models.TagFilter `json:"attributes,omitempty"` // attribute filters
// Branch name (optional, used in messages).
Name string `json:"name,omitempty"`
// Internal use: the parsed filters, filled in by SwitchConfig.Init.
parsedLabelKeys []models.TagFilter `json:"-"`
parsedAttributes []models.TagFilter `json:"-"`
}
// SwitchConfig is the configuration of the multi-branch Switch processor.
type SwitchConfig struct {
// List of branch conditions.
// They are matched in order; the first one that evaluates to true is selected.
Cases []SwitchCase `json:"cases"`
// Whether several branches may match at once (default false: only the first match is taken).
// NOTE(review): evaluateCases in this file always returns the first match and
// never reads this field — confirm whether it is honoured elsewhere.
AllowMultiple bool `json:"allow_multiple,omitempty"`
}
// init registers this processor under the "logic.switch" type name.
func init() {
	models.RegisterProcessor("logic.switch", new(SwitchConfig))
}
// Init builds a SwitchConfig from the raw settings and pre-parses the label
// and attribute filters of every case.
//
// A filter list is parsed only when it is non-empty. A parse failure is
// returned as an error naming the list and the case index.
func (c *SwitchConfig) Init(settings interface{}) (models.Processor, error) {
	cfg, err := common.InitProcessor[*SwitchConfig](settings)
	if err != nil {
		return nil, err
	}

	// prepare works on a copy of the slice so the configured filters are not
	// modified, defaults an empty Func to Op, and parses the result.
	prepare := func(filters []models.TagFilter) ([]models.TagFilter, error) {
		working := make([]models.TagFilter, len(filters))
		copy(working, filters)
		for j := range working {
			if working[j].Func == "" {
				working[j].Func = working[j].Op
			}
		}
		return models.ParseTagFilter(working)
	}

	for i := range cfg.Cases {
		cs := &cfg.Cases[i]

		if len(cs.LabelKeys) > 0 {
			if cs.parsedLabelKeys, err = prepare(cs.LabelKeys); err != nil {
				return nil, fmt.Errorf("failed to parse label_keys for case[%d]: %v", i, err)
			}
		}

		if len(cs.Attributes) > 0 {
			if cs.parsedAttributes, err = prepare(cs.Attributes); err != nil {
				return nil, fmt.Errorf("failed to parse attributes for case[%d]: %v", i, err)
			}
		}
	}

	return cfg, nil
}
// Process implements the Processor interface (kept for the legacy,
// non-branching mode). It evaluates the cases and reports the matched case,
// or the fall-through to the default branch, in the message; wfCtx is
// returned unchanged.
func (c *SwitchConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	idx, name, err := c.evaluateCases(wfCtx)
	if err != nil {
		return wfCtx, "", fmt.Errorf("switch processor: failed to evaluate cases: %v", err)
	}

	switch {
	case idx < 0:
		// Nothing matched: the default branch (the last output) is taken.
		return wfCtx, "no case matched, using default branch", nil
	case name == "":
		return wfCtx, fmt.Sprintf("matched case[%d]", idx), nil
	default:
		return wfCtx, fmt.Sprintf("matched case[%d]: %s", idx, name), nil
	}
}
// ProcessWithBranch implements the BranchProcessor interface. The branch
// index is the index of the matched case; when no case matches it is
// len(c.Cases), i.e. the last output port, which is the default branch.
func (c *SwitchConfig) ProcessWithBranch(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.NodeOutput, error) {
	idx, name, err := c.evaluateCases(wfCtx)
	if err != nil {
		return nil, fmt.Errorf("switch processor: failed to evaluate cases: %v", err)
	}

	branch := idx
	var msg string
	switch {
	case idx < 0:
		branch = len(c.Cases)
		msg = "no case matched, using default branch"
	case name != "":
		msg = fmt.Sprintf("matched case[%d]: %s", idx, name)
	default:
		msg = fmt.Sprintf("matched case[%d]", idx)
	}

	return &models.NodeOutput{
		WfCtx:       wfCtx,
		BranchIndex: &branch,
		Message:     msg,
	}, nil
}
// evaluateCases evaluates the cases in order and returns the index and name
// of the first one that matches. It returns -1 and an empty name when no case
// matches, and stops at the first evaluation error.
func (c *SwitchConfig) evaluateCases(wfCtx *models.WorkflowContext) (int, string, error) {
	for i := 0; i < len(c.Cases); i++ {
		cs := &c.Cases[i]

		ok, err := c.evaluateCaseCondition(cs, wfCtx)
		if err != nil {
			return -1, "", fmt.Errorf("case[%d] evaluation error: %v", i, err)
		}
		if ok {
			return i, cs.Name, nil
		}
	}
	return -1, "", nil
}
// evaluateCaseCondition evaluates a single case. Only the tags mode is
// special-cased; an empty or unrecognised mode falls back to expression mode.
func (c *SwitchConfig) evaluateCaseCondition(caseItem *SwitchCase, wfCtx *models.WorkflowContext) (bool, error) {
	if caseItem.Mode == ConditionModeTags {
		return c.evaluateTagsCondition(caseItem, wfCtx.Event)
	}
	return c.evaluateExpressionCondition(caseItem.Condition, wfCtx)
}
// evaluateExpressionCondition renders condition as a Go template against
// wfCtx and reports whether the output, trimmed and lower-cased, is "true" or
// "1". An empty condition never matches.
//
// The template may use $event, $labels, $value and $inputs.
func (c *SwitchConfig) evaluateExpressionCondition(condition string, wfCtx *models.WorkflowContext) (bool, error) {
	if condition == "" {
		return false, nil
	}

	// Variable definitions prepended to every condition template.
	const prelude = "{{ $event := .Event }}" +
		"{{ $labels := .Event.TagsMap }}" +
		"{{ $value := .Event.TriggerValue }}" +
		"{{ $inputs := .Inputs }}"

	tpl, err := template.New("switch_condition").Funcs(tplx.TemplateFuncMap).Parse(prelude + condition)
	if err != nil {
		return false, err
	}

	var out bytes.Buffer
	if err = tpl.Execute(&out, wfCtx); err != nil {
		return false, err
	}

	switch strings.TrimSpace(strings.ToLower(out.String())) {
	case "true", "1":
		return true, nil
	}
	return false, nil
}
// evaluateTagsCondition matches the event against the case's parsed label and
// attribute filters. Both configured filter sets must match; a case with no
// filters at all never matches.
func (c *SwitchConfig) evaluateTagsCondition(caseItem *SwitchCase, event *models.AlertCurEvent) (bool, error) {
	wantLabels := len(caseItem.parsedLabelKeys) > 0
	wantAttrs := len(caseItem.parsedAttributes) > 0
	if !wantLabels && !wantAttrs {
		return false, nil
	}

	// Labels are matched against the event's TagsMap.
	if wantLabels {
		labels := event.TagsMap
		if labels == nil {
			labels = map[string]string{}
		}
		if !alertCommon.MatchTags(labels, caseItem.parsedLabelKeys) {
			return false, nil
		}
	}

	// Attributes are matched against JsonTagsAndValue (all JSON fields).
	if wantAttrs && !alertCommon.MatchTags(event.JsonTagsAndValue(), caseItem.parsedAttributes) {
		return false, nil
	}

	return true, nil
}
================================================
FILE: alert/pipeline/processor/relabel/relabel.go
================================================
package relabel
import (
"fmt"
"regexp"
"strings"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
const (
// REPLACE_DOT stands in for "." in label names while relabeling runs; the
// substitution is reversed on the resulting label names afterwards.
REPLACE_DOT = "___"
)
// RelabelConfig holds the settings of the "relabel" processor. Process copies
// these fields into a single pconf.RelabelConfig and applies it to the event
// through EventRelabel.
type RelabelConfig struct {
// "." in these names is replaced with REPLACE_DOT before they are used.
SourceLabels []string `json:"source_labels"`
// EventRelabel substitutes ";" when empty.
Separator string `json:"separator"`
// EventRelabel substitutes "(.*)" when empty.
Regex string `json:"regex"`
RegexCompiled *regexp.Regexp
If string `json:"if"`
IfRegex *regexp.Regexp
Modulus uint64 `json:"modulus"`
TargetLabel string `json:"target_label"`
// EventRelabel substitutes "$1" when empty.
Replacement string `json:"replacement"`
Action string `json:"action"`
}
// init registers this processor under the "relabel" type name.
func init() {
	models.RegisterProcessor("relabel", new(RelabelConfig))
}
// Init builds a RelabelConfig from the raw processor settings by delegating
// to common.InitProcessor.
func (r *RelabelConfig) Init(settings interface{}) (models.Processor, error) {
	return common.InitProcessor[*RelabelConfig](settings)
}
// Process applies this relabel rule to the event carried by wfCtx.
//
// Source label names have "." replaced with REPLACE_DOT, the settings are
// copied into one pconf.RelabelConfig, and EventRelabel rewrites the event's
// tags in place. It always returns wfCtx with an empty message and nil error.
func (r *RelabelConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) {
	names := make([]model.LabelName, 0, len(r.SourceLabels))
	for _, raw := range r.SourceLabels {
		names = append(names, model.LabelName(strings.ReplaceAll(raw, ".", REPLACE_DOT)))
	}

	cfg := &pconf.RelabelConfig{
		SourceLabels:  names,
		Separator:     r.Separator,
		Regex:         r.Regex,
		RegexCompiled: r.RegexCompiled,
		If:            r.If,
		IfRegex:       r.IfRegex,
		Modulus:       r.Modulus,
		TargetLabel:   r.TargetLabel,
		Replacement:   r.Replacement,
		Action:        r.Action,
	}

	EventRelabel(wfCtx.Event, []*pconf.RelabelConfig{cfg})
	return wfCtx, "", nil
}
// EventRelabel applies relabelConfigs to the event's tags and rewrites
// event.TagsJSON, event.TagsMap and event.Tags with the result. The tags that
// were fed into relabeling are saved in event.OriginalTagsJSON.
//
// Tags are "key=value" strings. Entries without "=" are skipped entirely:
// they appear neither in OriginalTagsJSON nor in the relabel input, rather
// than leaving empty placeholder entries behind. "." in label names is
// swapped for REPLACE_DOT while relabeling runs and restored afterwards.
//
// Empty Replacement, Separator and Regex fields are defaulted to "$1", ";"
// and "(.*)". The defaults are written into the passed-in configs, so the
// caller's relabelConfigs are modified.
func EventRelabel(event *models.AlertCurEvent, relabelConfigs []*pconf.RelabelConfig) {
	labels := make([]prompb.Label, 0, len(event.TagsJSON))
	originals := make([]string, 0, len(event.TagsJSON))
	for _, tag := range event.TagsJSON {
		name, value, ok := strings.Cut(tag, "=")
		if !ok {
			continue
		}
		originals = append(originals, tag)
		labels = append(labels, prompb.Label{
			Name:  strings.ReplaceAll(name, ".", REPLACE_DOT),
			Value: value,
		})
	}
	event.OriginalTagsJSON = originals

	for _, cfg := range relabelConfigs {
		if cfg.Replacement == "" {
			cfg.Replacement = "$1"
		}
		if cfg.Separator == "" {
			cfg.Separator = ";"
		}
		if cfg.Regex == "" {
			cfg.Regex = "(.*)"
		}
	}

	gotLabels := writer.Process(labels, relabelConfigs...)

	event.TagsJSON = make([]string, len(gotLabels))
	event.TagsMap = make(map[string]string, len(gotLabels))
	for i, label := range gotLabels {
		name := strings.ReplaceAll(label.Name, REPLACE_DOT, ".")
		event.TagsJSON[i] = fmt.Sprintf("%s=%s", name, label.Value)
		event.TagsMap[name] = label.Value
	}
	event.Tags = strings.Join(event.TagsJSON, ",,")
}
================================================
FILE: alert/pipeline/processor/utils/utils.go
================================================
package utils
import (
"bytes"
"fmt"
"strings"
"text/template"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/tplx"
)
// TplRender renders content as a Go text template against wfCtx and returns
// the output with surrounding whitespace trimmed.
//
// The template may use $event, $labels, $value and $inputs, which are bound
// to .Event, .Event.TagsMap, .Event.TriggerValue and .Inputs. Parse and
// execution failures are returned as wrapped errors with an empty string.
func TplRender(wfCtx *models.WorkflowContext, content string) (string, error) {
	const prelude = "{{ $event := .Event }}" +
		"{{ $labels := .Event.TagsMap }}" +
		"{{ $value := .Event.TriggerValue }}" +
		"{{ $inputs := .Inputs }}"

	tpl, err := template.New("tpl").Funcs(tplx.TemplateFuncMap).Parse(prelude + content)
	if err != nil {
		return "", fmt.Errorf("failed to parse template: %v", err)
	}

	var rendered bytes.Buffer
	if err = tpl.Execute(&rendered, wfCtx); err != nil {
		return "", fmt.Errorf("failed to execute template: %v", err)
	}

	return strings.TrimSpace(rendered.String()), nil
}
================================================
FILE: alert/process/alert_cur_event.go
================================================
package process
import (
"sync"
"github.com/ccfos/nightingale/v6/models"
)
// AlertCurEventMap is a map of alert events keyed by string, paired with an
// embedded RWMutex that its methods take around every access to Data.
type AlertCurEventMap struct {
sync.RWMutex
// Data is exported; code that touches it directly bypasses the locking
// done by the methods.
Data map[string]*models.AlertCurEvent
}
// NewAlertCurEventMap wraps data in an AlertCurEventMap. The map is used
// as-is, not copied; a nil data is replaced with a fresh empty map.
func NewAlertCurEventMap(data map[string]*models.AlertCurEvent) *AlertCurEventMap {
	if data == nil {
		data = make(map[string]*models.AlertCurEvent)
	}
	return &AlertCurEventMap{Data: data}
}
// SetAll replaces the whole underlying map with data while holding the write
// lock. data is stored as-is, not copied.
func (a *AlertCurEventMap) SetAll(data map[string]*models.AlertCurEvent) {
a.Lock()
defer a.Unlock()
a.Data = data
}
// Set stores value under key while holding the write lock, replacing any
// existing entry.
func (a *AlertCurEventMap) Set(key string, value *models.AlertCurEvent) {
a.Lock()
defer a.Unlock()
a.Data[key] = value
}
// Get looks up key under the read lock. The boolean reports whether an entry
// was present.
func (a *AlertCurEventMap) Get(key string) (*models.AlertCurEvent, bool) {
	a.RLock()
	defer a.RUnlock()

	found, ok := a.Data[key]
	return found, ok
}
// UpdateLastEvalTime sets LastEvalTime on the event stored under key while
// holding the write lock. It does nothing when key is absent.
func (a *AlertCurEventMap) UpdateLastEvalTime(key string, lastEvalTime int64) {
	a.Lock()
	defer a.Unlock()

	if stored, ok := a.Data[key]; ok {
		stored.LastEvalTime = lastEvalTime
	}
}
// Delete removes the entry stored under key while holding the write lock.
func (a *AlertCurEventMap) Delete(key string) {
a.Lock()
defer a.Unlock()
delete(a.Data, key)
}
// Keys returns a snapshot of the current keys, taken under the read lock.
// The order is unspecified. The result is a fresh slice, so callers may
// modify the map while iterating over it.
func (a *AlertCurEventMap) Keys() []string {
	a.RLock()
	defer a.RUnlock()

	out := make([]string, len(a.Data))
	i := 0
	for k := range a.Data {
		out[i] = k
		i++
	}
	return out
}
// GetAll returns a snapshot of the current entries.
//
// The result is a shallow copy made under the read lock, so callers can range
// over it without racing against concurrent Set, Delete or SetAll calls.
// Returning the internal map itself would hand out the guarded data after the
// lock is released. The *models.AlertCurEvent values are shared with the
// container; adding or removing keys in the snapshot does not affect it.
func (a *AlertCurEventMap) GetAll() map[string]*models.AlertCurEvent {
	a.RLock()
	defer a.RUnlock()

	snapshot := make(map[string]*models.AlertCurEvent, len(a.Data))
	for key, event := range a.Data {
		snapshot[key] = event
	}
	return snapshot
}
================================================
FILE: alert/process/process.go
================================================
package process
import (
"bytes"
"encoding/json"
"fmt"
"html/template"
"sort"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/dispatch"
"github.com/ccfos/nightingale/v6/alert/mute"
"github.com/ccfos/nightingale/v6/alert/pipeline/processor/relabel"
"github.com/ccfos/nightingale/v6/alert/queue"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/robfig/cron/v3"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
)
// ExternalProcessorsType is a set of processors keyed by rule key (see
// common.RuleKey), together with the lock that guards it.
type ExternalProcessorsType struct {
ExternalLock sync.RWMutex
Processors map[string]*Processor
}
// ExternalProcessors is the package-level processor set.
// NOTE(review): this declaration leaves Processors nil; presumably it is
// populated elsewhere before anything writes to it — confirm.
var ExternalProcessors ExternalProcessorsType
// NewExternalProcessors returns an ExternalProcessorsType whose Processors
// map is initialised and empty.
func NewExternalProcessors() *ExternalProcessorsType {
	set := new(ExternalProcessorsType)
	set.Processors = make(map[string]*Processor)
	return set
}
// GetExternalAlertRule looks up the processor registered for the given
// datasource and rule id, under the read lock. The boolean reports whether
// one was found.
func (e *ExternalProcessorsType) GetExternalAlertRule(datasourceId, id int64) (*Processor, bool) {
	key := common.RuleKey(datasourceId, id)

	e.ExternalLock.RLock()
	defer e.ExternalLock.RUnlock()

	proc, ok := e.Processors[key]
	return proc, ok
}
// HandleEventFunc is the signature of the hooks a Processor calls with an
// event (see HandleFireEventHook and HandleRecoverEventHook).
type HandleEventFunc func(event *models.AlertCurEvent)
// Processor holds the per-rule, per-datasource state used to turn anomaly
// points into fired and recovered alert events.
type Processor struct {
datasourceId int64
EngineName string
// rule is refreshed from alertRuleCache at the start of every Handle call.
rule *models.AlertRule
// fires holds events that have been pushed as firing, keyed by event hash.
fires *AlertCurEventMap
// pendings holds events seen while PromForDuration has not yet elapsed.
pendings *AlertCurEventMap
// pendingsUseByRecover tracks the latest evaluation per hash; RecoverSingle
// consults it when the rule has a RecoverDuration.
pendingsUseByRecover *AlertCurEventMap
// inhibit is set from the argument of the latest Handle call.
inhibit bool
// tagsMap and tagsArr are copied into each event by BuildEvent.
tagsMap map[string]string
tagsArr []string
groupName string
alertRuleCache *memsto.AlertRuleCacheType
TargetCache *memsto.TargetCacheType
TargetsOfAlertRuleCache *memsto.TargetsOfAlertRuleCacheType
BusiGroupCache *memsto.BusiGroupCacheType
alertMuteCache *memsto.AlertMuteCacheType
datasourceCache *memsto.DatasourceCacheType
ctx *ctx.Context
Stats *astats.Stats
// Hooks called by fireEvent and RecoverSingle; NewProcessor installs no-ops.
HandleFireEventHook HandleEventFunc
HandleRecoverEventHook HandleEventFunc
ScheduleEntry cron.Entry
// PromEvalInterval is copied onto every event in handleEvent.
PromEvalInterval int
}
// Key returns the rule key for this processor's datasource and rule id.
func (p *Processor) Key() string {
	ruleID := p.rule.Id
	return common.RuleKey(p.datasourceId, ruleID)
}
// DatasourceId returns the id of the datasource this processor is bound to.
func (p *Processor) DatasourceId() int64 {
return p.datasourceId
}
// Hash returns the MD5 of the rule id, cron pattern, rule config and
// datasource id, joined with underscores.
func (p *Processor) Hash() string {
	fingerprint := fmt.Sprintf("%d_%s_%s_%d", p.rule.Id, p.rule.CronPattern, p.rule.RuleConfig, p.datasourceId)
	return str.MD5(fingerprint)
}
// NewProcessor creates a Processor for rule on the given datasource, wired to
// the supplied caches, context and stats. Both event hooks start as no-ops,
// and mayHandleGroup is called before the processor is returned.
//
// The fires, pendings and pendingsUseByRecover maps are not initialised here.
func NewProcessor(engineName string, rule *models.AlertRule, datasourceId int64, alertRuleCache *memsto.AlertRuleCacheType,
	targetCache *memsto.TargetCacheType, targetsOfAlertRuleCache *memsto.TargetsOfAlertRuleCacheType,
	busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context,
	stats *astats.Stats) *Processor {

	noop := func(event *models.AlertCurEvent) {}

	proc := &Processor{
		EngineName:   engineName,
		datasourceId: datasourceId,
		rule:         rule,
		ctx:          ctx,
		Stats:        stats,

		alertRuleCache:          alertRuleCache,
		alertMuteCache:          alertMuteCache,
		datasourceCache:         datasourceCache,
		TargetCache:             targetCache,
		TargetsOfAlertRuleCache: targetsOfAlertRuleCache,
		BusiGroupCache:          busiGroupCache,

		HandleFireEventHook:    noop,
		HandleRecoverEventHook: noop,
	}

	proc.mayHandleGroup()
	return proc
}
// Handle turns the anomaly points produced for this rule into events and runs
// each one through the event pipeline and the mute checks; events that
// survive are grouped by the tag hash of their anomaly point and passed to
// handleEvent. When from is "inner" it then calls HandleRecover with the
// hashes of all events built in this call.
//
// If the rule is no longer in alertRuleCache, Handle logs a warning, bumps the
// evaluation error counter and returns without processing anything.
func (p *Processor) Handle(anomalyPoints []models.AnomalyPoint, from string, inhibit bool) {
// Some rule settings may have changed, e.g. alert receivers or callbacks.
// Such edits do not restart the worker but do affect how alerts are handled,
// so fetch the rule from memsto.AlertRuleCache here and overwrite p.rule.
p.inhibit = inhibit
cachedRule := p.alertRuleCache.Get(p.rule.Id)
if cachedRule == nil {
logger.Warningf("alert_eval_%d datasource_%d handle error: rule not found, maybe rule has been deleted, anomalyPoints:%+v", p.rule.Id, p.datasourceId, anomalyPoints)
p.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", p.DatasourceId()), "handle_event", p.BusiGroupCache.GetNameByBusiGroupId(p.rule.GroupId), fmt.Sprintf("%v", p.rule.Id)).Inc()
return
}
// Capture ruleHash before the rule is replaced.
ruleHash := p.rule.Hash()
p.rule = cachedRule
now := time.Now().Unix()
alertingKeys := map[string]struct{}{}
// Group events by the tags of their anomaly point so alert inhibition can be handled.
eventsMap := make(map[string][]*models.AlertCurEvent)
for _, anomalyPoint := range anomalyPoints {
event := p.BuildEvent(anomalyPoint, from, now, ruleHash)
event.NotifyRuleIds = cachedRule.NotifyRuleIds
// A muted event is still in the firing state, so always add it to
// alertingKeys; otherwise an already-fired event would auto-recover.
hash := event.Hash
alertingKeys[hash] = struct{}{}
// event processor
eventCopy := event.DeepCopy()
event = dispatch.HandleEventPipeline(cachedRule.PipelineConfigs, eventCopy, event, dispatch.EventProcessorCache, p.ctx, cachedRule.Id, "alert_rule")
if event == nil {
logger.Infof("alert_eval_%d datasource_%d is muted drop by pipeline event:%s", p.rule.Id, p.datasourceId, eventCopy.Hash)
continue
}
// event mute
isMuted, detail, muteId := mute.IsMuted(cachedRule, event, p.TargetCache, p.alertMuteCache)
if isMuted {
logger.Infof("alert_eval_%d datasource_%d is muted, detail:%s event:%s", p.rule.Id, p.datasourceId, detail, event.Hash)
p.Stats.CounterMuteTotal.WithLabelValues(
fmt.Sprintf("%v", event.GroupName),
fmt.Sprintf("%v", p.rule.Id),
fmt.Sprintf("%v", muteId),
fmt.Sprintf("%v", p.datasourceId),
).Inc()
continue
}
if dispatch.EventMuteHook(event) {
logger.Infof("alert_eval_%d datasource_%d is muted by hook event:%s", p.rule.Id, p.datasourceId, event.Hash)
p.Stats.CounterMuteTotal.WithLabelValues(
fmt.Sprintf("%v", event.GroupName),
fmt.Sprintf("%v", p.rule.Id),
fmt.Sprintf("%v", 0),
fmt.Sprintf("%v", p.datasourceId),
).Inc()
continue
}
tagHash := TagHash(anomalyPoint)
eventsMap[tagHash] = append(eventsMap[tagHash], event)
}
for _, events := range eventsMap {
p.handleEvent(events)
}
if from == "inner" {
p.HandleRecover(alertingKeys, now, inhibit)
}
}
// BuildEvent creates a new alert event for anomalyPoint from the current rule.
//
// It fills in the datasource, business group, tags, trigger values, severity
// and rule configuration, applies the rule's relabel settings, and attaches
// the target from TargetCache when the event has a TargetIdent.
// LastEvalTime is now when from is "inner" and the point's timestamp
// otherwise. ruleHash is stored on the event as RuleHash.
func (p *Processor) BuildEvent(anomalyPoint models.AnomalyPoint, from string, now int64, ruleHash string) *models.AlertCurEvent {
p.fillTags(anomalyPoint)
hash := Hash(p.rule.Id, p.datasourceId, anomalyPoint)
ds := p.datasourceCache.GetById(p.datasourceId)
var dsName string
if ds != nil {
dsName = ds.Name
}
event := p.rule.GenerateNewEvent(p.ctx)
bg := p.BusiGroupCache.GetByBusiGroupId(p.rule.GroupId)
if bg != nil {
event.GroupName = bg.Name
}
event.TriggerTime = anomalyPoint.Timestamp
event.TagsMap = p.tagsMap
event.DatasourceId = p.datasourceId
event.Cluster = dsName
event.Hash = hash
event.TriggerValue = anomalyPoint.ReadableValue()
event.TriggerValues = anomalyPoint.Values
event.TriggerValuesJson = models.EventTriggerValues{ValuesWithUnit: anomalyPoint.ValuesUnit}
event.TagsJSON = p.tagsArr
event.Tags = strings.Join(p.tagsArr, ",,")
event.IsRecovered = false
event.Callbacks = p.rule.Callbacks
event.CallbacksJSON = p.rule.CallbacksJSON
event.Annotations = p.rule.Annotations
event.RuleConfig = p.rule.RuleConfig
event.RuleConfigJson = p.rule.RuleConfigJson
event.Severity = anomalyPoint.Severity
event.ExtraConfig = p.rule.ExtraConfigJSON
event.PromQl = anomalyPoint.Query
event.RecoverConfig = anomalyPoint.RecoverConfig
event.RuleHash = ruleHash
// For nodata triggers, report "nodata" as the value and rewrite RuleConfig
// so that it carries the nodata trigger type.
if anomalyPoint.TriggerType == models.TriggerTypeNodata {
event.TriggerValue = "nodata"
ruleConfig := models.RuleQuery{}
// NOTE(review): the Unmarshal and Marshal errors below are ignored.
json.Unmarshal([]byte(p.rule.RuleConfig), &ruleConfig)
ruleConfig.TriggerType = anomalyPoint.TriggerType
b, _ := json.Marshal(ruleConfig)
event.RuleConfig = string(b)
}
if err := json.Unmarshal([]byte(p.rule.Annotations), &event.AnnotationsJSON); err != nil {
event.AnnotationsJSON = make(map[string]string) // fall back to an empty map when parsing fails
logger.Warningf("alert_eval_%d datasource_%d unmarshal annotations json failed: %v", p.rule.Id, p.datasourceId, err)
}
if event.TriggerValues != "" && strings.Count(event.TriggerValues, "$") > 1 {
// TriggerValues holds several variables; put all of them into TriggerValue.
event.TriggerValue = event.TriggerValues
}
if from == "inner" {
event.LastEvalTime = now
} else {
event.LastEvalTime = event.TriggerTime
}
// Run relabeling right after the event has been generated.
Relabel(p.rule, event)
// Placed after Relabel(p.rule, event) to cover the case where ident only
// appears among the labels once relabeling has run.
p.mayHandleIdent(event)
if event.TargetIdent != "" {
if pt, exist := p.TargetCache.Get(event.TargetIdent); exist {
pt.GroupNames = p.BusiGroupCache.GetNamesByBusiGroupIds(pt.GroupIds)
event.Target = pt
} else {
logger.Infof("alert_eval_%d datasource_%d fill event target error, ident: %s doesn't exist in cache.", p.rule.Id, p.datasourceId, event.TargetIdent)
}
}
return event
}
// Relabel records the event's current tags as its original tags and then
// applies the rule's event relabel configs, if there are any. A nil rule
// leaves the event untouched.
func Relabel(rule *models.AlertRule, event *models.AlertCurEvent) {
	if rule == nil {
		return
	}

	// The pre-relabel tags must be preserved on the event.
	event.OriginalTags, event.OriginalTagsJSON = event.Tags, event.TagsJSON

	if len(rule.EventRelabelConfig) > 0 {
		relabel.EventRelabel(event, rule.EventRelabelConfig)
	}
}
// HandleRecover reconciles stored state with the set of hashes that are still
// alerting in the current evaluation.
//
// Pending events whose hash is not in alertingKeys are discarded. Fired
// events whose hash is not in alertingKeys are handed to HandleRecoverEvent
// as recovery candidates.
//
// Both loops iterate over Keys(), a snapshot taken under the map's lock,
// instead of ranging over the live fires map outside the lock.
func (p *Processor) HandleRecover(alertingKeys map[string]struct{}, now int64, inhibit bool) {
	for _, hash := range p.pendings.Keys() {
		if _, alerting := alertingKeys[hash]; !alerting {
			p.pendings.Delete(hash)
		}
	}

	firing := p.fires.Keys()
	recovered := make([]string, 0, len(firing))
	for _, hash := range firing {
		if _, alerting := alertingKeys[hash]; !alerting {
			recovered = append(recovered, hash)
		}
	}

	p.HandleRecoverEvent(recovered, now, inhibit)
}
// HandleRecoverEvent recovers the fired events identified by hashArr.
//
// Without inhibition every hash is recovered individually. With inhibition,
// events are grouped by their Tags string and only the one with the
// numerically lowest Severity per group is recovered; whenever a kept event
// is displaced by one with a lower Severity value, the displaced event is
// removed from fires, pendings and the database without a recovery being sent.
func (p *Processor) HandleRecoverEvent(hashArr []string, now int64, inhibit bool) {
	if p.rule == nil {
		return
	}

	if !inhibit {
		for _, hash := range hashArr {
			p.RecoverSingle(false, hash, now, nil)
		}
		return
	}

	winners := make(map[string]models.AlertCurEvent)
	for _, hash := range hashArr {
		fired, ok := p.fires.Get(hash)
		if !ok {
			continue
		}

		kept, seen := winners[fired.Tags]
		if seen && kept.Severity <= fired.Severity {
			continue
		}
		if seen {
			// The recovery for the previously kept event is inhibited, so drop it.
			p.fires.Delete(kept.Hash)
			p.pendings.Delete(kept.Hash)
			models.AlertCurEventDelByHash(p.ctx, kept.Hash)
		}
		winners[fired.Tags] = *fired
	}

	for _, winner := range winners {
		p.RecoverSingle(false, winner.Hash, now, nil)
	}
}
// RecoverSingle recovers the fired event identified by hash, unless one of
// the guards below holds it back.
//
// byRecover marks a recovery driven by an explicit recover condition. value,
// when non-nil, replaces the event's TriggerValue, and the first element of
// values (if any) replaces TriggerValues. On recovery the event is removed
// from all three state maps, refreshed from the rule, flagged as recovered
// and pushed to the queue.
func (p *Processor) RecoverSingle(byRecover bool, hash string, now int64, value *string, values ...string) {
	rule := p.rule
	if rule == nil {
		return
	}

	fired, ok := p.fires.Get(hash)
	if !ok {
		return
	}

	// With an observation window configured the event must not recover right away.
	if rule.RecoverDuration > 0 {
		lastPending, seen := p.pendingsUseByRecover.Get(hash)
		switch {
		case !seen:
			// No anomaly point was ever recorded, so there is nothing to recover.
			logger.Debugf("alert_eval_%d datasource_%d event:%s do not has pending event, not recover", p.rule.Id, p.datasourceId, fired.Hash)
			return
		case now-lastPending.LastEvalTime < rule.RecoverDuration:
			logger.Debugf("alert_eval_%d datasource_%d event:%s not recover", p.rule.Id, p.datasourceId, fired.Hash)
			return
		}
	}

	// An event with a recover condition can only be recovered through a
	// recover point, never from here.
	if !byRecover && fired.RecoverConfig.JudgeType != models.Origin {
		logger.Debugf("alert_eval_%d datasource_%d event:%s not recover", p.rule.Id, p.datasourceId, fired.Hash)
		return
	}

	if value != nil {
		fired.TriggerValue = *value
		if len(values) > 0 {
			fired.TriggerValues = values[0]
		}
	}

	// No vector crossing the threshold was found, so the value is assumed to
	// have recovered. It is impossible to tell apart "prom has data below the
	// threshold" from "prom lost points and returned nothing".
	p.fires.Delete(hash)
	p.pendings.Delete(hash)
	p.pendingsUseByRecover.Delete(hash)

	// The recovery may be caused by an edited promql, and other rule fields
	// may have changed too, so refresh the event from the rule to avoid
	// confusing users with stale data.
	rule.UpdateEvent(fired)
	fired.IsRecovered = true
	fired.LastEvalTime = now

	p.HandleRecoverEventHook(fired)
	p.pushEventToQueue(fired)
}
// handleEvent processes one group of events and decides which of them are
// ready to fire.
//
// Every non-nil event refreshes pendingsUseByRecover. When the rule has no
// PromForDuration an event is ready immediately; otherwise it is tracked in
// pendings and becomes ready once the time since its first evaluation plus
// one evaluation interval reaches PromForDuration. The ready events and the
// numerically lowest Severity among them are passed to inhibitEvent.
func (p *Processor) handleEvent(events []*models.AlertCurEvent) {
	var ready []*models.AlertCurEvent
	// Start at the lowest priority; any ready event with a smaller Severity
	// value replaces it.
	topSeverity := models.SeverityLowest

	for _, event := range events {
		if event == nil {
			continue
		}

		if _, tracked := p.pendingsUseByRecover.Get(event.Hash); tracked {
			p.pendingsUseByRecover.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
		} else {
			p.pendingsUseByRecover.Set(event.Hash, event)
		}

		event.PromEvalInterval = p.PromEvalInterval

		fireNow := p.rule.PromForDuration == 0
		if !fireNow {
			// firstEval is the evaluation time of the first pending event.
			var firstEval int64
			if pending, ok := p.pendings.Get(event.Hash); ok {
				p.pendings.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
				firstEval = pending.FirstEvalTime
			} else {
				event.FirstEvalTime = event.LastEvalTime
				p.pendings.Set(event.Hash, event)
				firstEval = event.FirstEvalTime
			}
			fireNow = event.LastEvalTime-firstEval+int64(event.PromEvalInterval) >= int64(p.rule.PromForDuration)
		}

		if !fireNow {
			continue
		}
		ready = append(ready, event)
		if event.Severity < topSeverity {
			topSeverity = event.Severity
		}
	}

	p.inhibitEvent(ready, topSeverity)
}
// inhibitEvent fires the given events. When inhibition is enabled, events
// whose Severity value is greater than highSeverity are skipped and only
// logged at debug level.
func (p *Processor) inhibitEvent(events []*models.AlertCurEvent, highSeverity int) {
	for _, ev := range events {
		suppressed := p.inhibit && ev.Severity > highSeverity
		if !suppressed {
			p.fireEvent(ev)
			continue
		}
		logger.Debugf("alert_eval_%d datasource_%d event:%s inhibit highSeverity:%d", p.rule.Id, p.datasourceId, ev.Hash, highSeverity)
	}
}
// fireEvent decides whether event results in a notification and, if so,
// pushes it to the queue.
//
// A hash that is not yet in fires is a first trigger and is always pushed.
// For a hash that is already firing, the stored entry's LastEvalTime is
// refreshed and a repeat notification is pushed only when the rule's
// NotifyRepeatStep (minutes) is non-zero, that interval has elapsed since the
// last send, and NotifyMaxNumber (0 means unlimited) has not been reached.
// The outcome is logged once on return.
func (p *Processor) fireEvent(event *models.AlertCurEvent) {
	rule := p.rule
	if rule == nil {
		return
	}

	outcome := "unknown"
	defer func() {
		logger.Infof("alert_eval_%d datasource_%d event-hash-%s %s", p.rule.Id, p.datasourceId, event.Hash, outcome)
	}()

	fired, alreadyFiring := p.fires.Get(event.Hash)
	if !alreadyFiring {
		event.NotifyCurNumber = 1
		event.FirstTriggerTime = event.TriggerTime
		outcome = fmt.Sprintf("fired, first_trigger_time: %d", event.FirstTriggerTime)
		p.HandleFireEventHook(event)
		p.pushEventToQueue(event)
		return
	}

	p.fires.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
	event.FirstTriggerTime = fired.FirstTriggerTime
	p.HandleFireEventHook(event)

	if rule.NotifyRepeatStep == 0 {
		outcome = "stalled, rule.notify_repeat_step is 0, no need to repeat notify"
		return
	}

	// A notification was sent before; whether to send again depends on
	// whether the repeat interval has passed since the last send.
	if event.LastEvalTime < fired.LastSentTime+int64(rule.NotifyRepeatStep)*60 {
		outcome = fmt.Sprintf("stalled, notify_repeat_step_not_matched(%d < %d + %d * 60)", event.LastEvalTime, fired.LastSentTime, rule.NotifyRepeatStep)
		return
	}

	// With a send limit configured, stop once it has been reached.
	if rule.NotifyMaxNumber != 0 && fired.NotifyCurNumber >= rule.NotifyMaxNumber {
		outcome = fmt.Sprintf("stalled, notify_repeat_step_matched(%d >= %d + %d * 60) notify_max_number_not_matched(#%d / %d)", event.LastEvalTime, fired.LastSentTime, rule.NotifyRepeatStep, fired.NotifyCurNumber, rule.NotifyMaxNumber)
		return
	}

	event.NotifyCurNumber = fired.NotifyCurNumber + 1
	if rule.NotifyMaxNumber == 0 {
		// A limit of 0 means no cap on the number of notifications.
		outcome = fmt.Sprintf("fired, notify_repeat_step_matched(%d >= %d + %d * 60) notify_max_number_ignore(#%d / %d)", event.LastEvalTime, fired.LastSentTime, rule.NotifyRepeatStep, event.NotifyCurNumber, rule.NotifyMaxNumber)
	} else {
		outcome = fmt.Sprintf("fired, notify_repeat_step_matched(%d >= %d + %d * 60) notify_max_number_matched(#%d / %d)", event.LastEvalTime, fired.LastSentTime, rule.NotifyRepeatStep, event.NotifyCurNumber, rule.NotifyMaxNumber)
	}
	p.pushEventToQueue(event)
}
// pushEventToQueue hands e to the global event queue. For an event that is not
// recovered, LastSentTime is first stamped with the evaluation time and the
// event is stored in p.fires. When the queue rejects the event, a warning is
// logged and the rule-eval error counter is incremented.
func (p *Processor) pushEventToQueue(e *models.AlertCurEvent) {
	if stillFiring := !e.IsRecovered; stillFiring {
		e.LastSentTime = e.LastEvalTime
		p.fires.Set(e.Hash, e)
	}

	dispatch.LogEvent(e, "push_queue")

	if accepted := queue.EventQueue.PushFront(e); accepted {
		return
	}

	logger.Warningf("alert_eval_%d datasource_%d event_push_queue: queue is full, event:%s", p.rule.Id, p.datasourceId, e.Hash)

	dsLabel := fmt.Sprintf("%v", p.DatasourceId())
	groupLabel := p.BusiGroupCache.GetNameByBusiGroupId(p.rule.GroupId)
	ruleLabel := fmt.Sprintf("%v", p.rule.Id)
	p.Stats.CounterRuleEvalErrorTotal.WithLabelValues(dsLabel, "push_event_queue", groupLabel, ruleLabel).Inc()
}
// RecoverAlertCurEventFromDb rebuilds the processor's in-memory event maps from
// the current events stored for this rule and datasource.
//
// p.pendings is always reset to an empty map. On a load error p.fires and
// p.pendingsUseByRecover are left empty. Otherwise every loaded event whose
// rule is still present in the alert rule cache (and, for host rules, which
// belongs to this engine) is placed in p.fires, and an independent copy of it
// is placed in p.pendingsUseByRecover.
func (p *Processor) RecoverAlertCurEventFromDb() {
p.pendings = NewAlertCurEventMap(nil)
p.pendingsUseByRecover = NewAlertCurEventMap(nil)
curEvents, err := models.AlertCurEventGetByRuleIdAndDsId(p.ctx, p.rule.Id, p.datasourceId)
if err != nil {
logger.Errorf("alert_eval_%d datasource_%d recover event from db failed, err:%s", p.rule.Id, p.datasourceId, err)
p.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", p.DatasourceId()), "get_recover_event", p.BusiGroupCache.GetNameByBusiGroupId(p.rule.GroupId), fmt.Sprintf("%v", p.rule.Id)).Inc()
p.fires = NewAlertCurEventMap(nil)
return
}
fireMap := make(map[string]*models.AlertCurEvent)
pendingsUseByRecoverMap := make(map[string]*models.AlertCurEvent)
for _, event := range curEvents {
// Events whose rule is no longer in the cache are not restored.
alertRule := p.alertRuleCache.Get(event.RuleId)
if alertRule == nil {
continue
}
event.NotifyRuleIds = alertRule.NotifyRuleIds
if event.Cate == models.HOST {
target, exists := p.TargetCache.Get(event.TargetIdent)
if exists && target.EngineName != p.EngineName && !(p.ctx.IsCenter && target.EngineName == "") {
// Host rule: skip the event when the target belongs to a different
// engine. A target with an empty EngineName is still accepted when
// this process runs as the center.
continue
}
}
event.DB2Mem()
target, exists := p.TargetCache.Get(event.TargetIdent)
if exists {
target.GroupNames = p.BusiGroupCache.GetNamesByBusiGroupIds(target.GroupIds)
event.Target = target
}
fireMap[event.Hash] = event
// Store a shallow copy so the two maps do not share the same struct.
e := *event
pendingsUseByRecoverMap[event.Hash] = &e
}
p.fires = NewAlertCurEventMap(fireMap)
// pendingsUseByRecover has to be reloaded after an alert rule is modified or
// after the process restarts.
p.pendingsUseByRecover = NewAlertCurEventMap(pendingsUseByRecoverMap)
}
// fillTags computes the tag set for an anomaly point and stores it on the
// processor as p.tagsMap (key -> value) and p.tagsArr (sorted "key=value"
// strings).
//
// The series labels are copied first. Then every rule append tag, plus a
// synthetic "rulename=<rule name>" tag, is rendered as a template (with
// $labels bound to the tag map and $value bound to .TriggerValue) and written
// into the map, overriding a series label of the same name. A tag that fails
// to parse or execute gets an error description as its value. A tag without a
// "=" separator is skipped with a warning instead of panicking.
func (p *Processor) fillTags(anomalyPoint models.AnomalyPoint) {
	// handle series tags
	tagsMap := make(map[string]string)
	for label, value := range anomalyPoint.Labels {
		tagsMap[string(label)] = string(value)
	}

	var e = &models.AlertCurEvent{
		TagsMap: tagsMap,
	}

	// handle rule tags
	// Build a private slice: appending directly to p.rule.AppendTagsJSON could
	// write into the backing array of the shared rule object.
	tags := make([]string, 0, len(p.rule.AppendTagsJSON)+1)
	tags = append(tags, p.rule.AppendTagsJSON...)
	tags = append(tags, "rulename="+p.rule.Name)

	// Template prelude exposing $labels and $value to every tag value.
	const defs = "{{$labels := .TagsMap}}" + "{{$value := .TriggerValue}}"

	for _, tag := range tags {
		arr := strings.SplitN(tag, "=", 2)
		if len(arr) != 2 {
			// Malformed tag (no "="): there is no value to render.
			logger.Warningf("alert_eval_%d datasource_%d skip malformed append tag:%s", p.rule.Id, p.datasourceId, tag)
			continue
		}

		key, tagValue := arr[0], arr[1]
		text := defs + tagValue

		t, err := template.New(fmt.Sprint(p.rule.Id)).Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(text)
		if err != nil {
			tagsMap[key] = fmt.Sprintf("parse tag value failed, err:%s", err)
			continue
		}

		var body bytes.Buffer
		if err = t.Execute(&body, e); err != nil {
			tagsMap[key] = fmt.Sprintf("parse tag value failed, err:%s", err)
			continue
		}

		tagsMap[key] = body.String()
	}

	p.tagsMap = tagsMap

	// handle tagsArr
	p.tagsArr = labelMapToArr(tagsMap)
}
// mayHandleIdent fills event.TargetIdent and event.TargetNote from the event's
// "ident" tag. Without that tag both fields are cleared. With the tag, the
// values come from the target cache when the ident is known there; otherwise
// the raw ident is used and the note is left empty.
func (p *Processor) mayHandleIdent(event *models.AlertCurEvent) {
	ident, tagged := event.TagsMap["ident"]
	if !tagged {
		event.TargetIdent = ""
		event.TargetNote = ""
		return
	}

	target, known := p.TargetCache.Get(ident)
	if !known {
		event.TargetIdent = ident
		event.TargetNote = ""
		return
	}

	event.TargetIdent = target.Ident
	event.TargetNote = target.Note
}
// mayHandleGroup caches the rule's business group name on the processor when
// the group can be found in the business group cache; otherwise p.groupName is
// left untouched.
func (p *Processor) mayHandleGroup() {
	if group := p.BusiGroupCache.GetByBusiGroupId(p.rule.GroupId); group != nil {
		p.groupName = group.Name
	}
}
// DeleteProcessEvent removes the event identified by hash from all three
// in-memory event maps of the processor: fires, pendings and
// pendingsUseByRecover.
func (p *Processor) DeleteProcessEvent(hash string) {
p.fires.Delete(hash)
p.pendings.Delete(hash)
p.pendingsUseByRecover.Delete(hash)
}
// labelMapToArr flattens a label map into "key=value" strings. The result is
// sorted lexicographically so that it does not depend on map iteration order.
// An empty or nil map yields an empty, non-nil slice.
func labelMapToArr(m map[string]string) []string {
	pairs := make([]string, 0, len(m))
	for key, val := range m {
		pairs = append(pairs, fmt.Sprintf("%s=%s", key, val))
	}

	// Zero or one element is already in order.
	if len(m) <= 1 {
		return pairs
	}

	sort.Strings(pairs)
	return pairs
}
// Hash returns the MD5 digest identifying an event: it is derived from the
// rule id, the point's label string, the datasource id, the point's severity
// and the point's query, in that order.
func Hash(ruleId, datasourceId int64, vector models.AnomalyPoint) string {
	identity := fmt.Sprintf("%d_%s_%d_%d_%s", ruleId, vector.Labels.String(), datasourceId, vector.Severity, vector.Query)
	return str.MD5(identity)
}
// TagHash returns the MD5 digest of the point's label string only; rule,
// datasource, severity and query do not take part in it.
func TagHash(vector models.AnomalyPoint) string {
	labelText := vector.Labels.String()
	return str.MD5(labelText)
}
================================================
FILE: alert/queue/queue.go
================================================
package queue
import (
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/toolkits/pkg/container/list"
)
// EventQueue is the process-wide queue of alert events waiting to be
// consumed. It is created with a limit of 10,000,000 entries.
var EventQueue = list.NewSafeListLimited(10000000)
// ReportQueueSize publishes the current length of EventQueue to the
// GaugeAlertQueueSize gauge once per second. It never returns, so callers are
// expected to run it in its own goroutine.
func ReportQueueSize(stats *astats.Stats) {
	const reportInterval = time.Second

	for {
		time.Sleep(reportInterval)

		pending := EventQueue.Len()
		stats.GaugeAlertQueueSize.Set(float64(pending))
	}
}
================================================
FILE: alert/record/prom_rule.go
================================================
package record
import (
"context"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/robfig/cron/v3"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
)
// RecordRuleContext is the runtime state of one recording rule evaluated
// against one datasource.
type RecordRuleContext struct {
// datasourceId identifies the datasource the rule is evaluated against.
datasourceId int64
// quit is closed by Stop.
quit chan struct{}
// scheduler triggers Eval according to the rule's cron pattern.
scheduler *cron.Cron
// rule is the recording rule being evaluated.
rule *models.RecordingRule
// promClients provides the query and write clients, looked up by datasourceId.
promClients *prom.PromClientMap
// stats holds the counters updated by Eval.
stats *astats.Stats
}
// NewRecordRuleContext builds the evaluation context for rule on the given
// datasource and registers Eval with a cron scheduler (second resolution,
// overlapping runs skipped). The scheduler is not started here; call Start.
//
// When the rule has no cron pattern but a non-zero PromEvalInterval, the
// pattern is derived from the interval and written back onto rule. A pattern
// that cannot be registered is only logged; the context is returned anyway.
// The writers argument is accepted but not used by this function.
func NewRecordRuleContext(rule *models.RecordingRule, datasourceId int64, promClients *prom.PromClientMap, writers *writer.WritersType, stats *astats.Stats) *RecordRuleContext {
	if rule.CronPattern == "" && rule.PromEvalInterval != 0 {
		rule.CronPattern = fmt.Sprintf("@every %ds", rule.PromEvalInterval)
	}

	skipOverlapping := cron.WithChain(cron.SkipIfStillRunning(cron.DefaultLogger))

	rrc := &RecordRuleContext{
		datasourceId: datasourceId,
		quit:         make(chan struct{}),
		scheduler:    cron.New(cron.WithSeconds(), skipOverlapping),
		rule:         rule,
		promClients:  promClients,
		stats:        stats,
	}

	if _, err := rrc.scheduler.AddFunc(rule.CronPattern, rrc.Eval); err != nil {
		logger.Errorf("add cron pattern error: %v", err)
	}

	return rrc
}
// Key returns a human-readable identifier of the form
// "record-<datasourceId>-<ruleId>", used as a prefix in log messages.
func (rrc *RecordRuleContext) Key() string {
return fmt.Sprintf("record-%d-%d", rrc.datasourceId, rrc.rule.Id)
}
// Hash returns an MD5 digest over the rule id, cron pattern, PromQL,
// datasource id, append tags and rule name. Two contexts with the same hash
// are considered the same running rule; a change in any of these fields
// produces a different hash.
func (rrc *RecordRuleContext) Hash() string {
	rule := rrc.rule
	fingerprint := fmt.Sprintf("%d_%s_%s_%d_%s_%s",
		rule.Id,
		rule.CronPattern,
		rule.PromQl,
		rrc.datasourceId,
		rule.AppendTags,
		rule.Name,
	)
	return str.MD5(fingerprint)
}
// Prepare is a no-op for recording rules.
func (rrc *RecordRuleContext) Prepare() {}
// Start logs that evaluation has started and starts the cron scheduler that
// drives Eval.
func (rrc *RecordRuleContext) Start() {
logger.Infof("eval:%s started", rrc.Key())
rrc.scheduler.Start()
}
// Eval runs the rule's PromQL once against the datasource at the current time
// and writes the converted series back through the datasource's writer client.
//
// The eval counter is incremented on every call. A blank PromQL is logged and
// skipped. A missing reader client, a query error, any query warning, or a
// write error is logged and counted in the eval-error counter; in all of those
// cases except the write error nothing is written.
func (rrc *RecordRuleContext) Eval() {
	dsLabel := fmt.Sprintf("%d", rrc.datasourceId)
	rrc.stats.CounterRecordEval.WithLabelValues(dsLabel).Inc()

	promql := strings.TrimSpace(rrc.rule.PromQl)
	if promql == "" {
		logger.Errorf("eval:%s promql is blank", rrc.Key())
		return
	}

	if rrc.promClients.IsNil(rrc.datasourceId) {
		logger.Errorf("eval:%s reader client is nil", rrc.Key())
		rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(dsLabel).Inc()
		return
	}

	reader := rrc.promClients.GetCli(rrc.datasourceId)
	value, warnings, err := reader.Query(context.Background(), promql, time.Now())
	if err != nil {
		logger.Errorf("eval:%s promql:%s, error:%v", rrc.Key(), promql, err)
		rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(dsLabel).Inc()
		return
	}

	// Warnings are treated like errors: the result is discarded.
	if len(warnings) > 0 {
		logger.Errorf("eval:%s promql:%s, warnings:%v", rrc.Key(), promql, warnings)
		rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(dsLabel).Inc()
		return
	}

	series := ConvertToTimeSeries(value, rrc.rule)
	if len(series) == 0 {
		return
	}

	if err := rrc.promClients.GetWriterCli(rrc.datasourceId).Write(series); err != nil {
		logger.Errorf("eval:%s promql:%s, error:%v", rrc.Key(), promql, err)
		rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(dsLabel).Inc()
	}
}
// Stop stops the cron scheduler, blocks until the context returned by the
// scheduler's Stop is done, and then closes the quit channel. Calling it twice
// would close quit twice.
func (rrc *RecordRuleContext) Stop() {
	logger.Infof("%s stopped", rrc.Key())

	<-rrc.scheduler.Stop().Done()

	close(rrc.quit)
}
================================================
FILE: alert/record/sample.go
================================================
package record
import (
"math"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
const (
// LabelName is the reserved label that carries the metric name of a series.
LabelName = "__name__"
)
// ConvertToTimeSeries converts a query result into remote-write time series
// labelled for the given recording rule.
//
//   - Vector: one series with one sample per element; NaN elements are dropped.
//   - Matrix: one series per element holding all its non-NaN samples; an
//     element with no samples, or whose last sample is NaN, is skipped without
//     affecting the remaining elements.
//   - Scalar: a single label-less series, unless the value is NaN.
//
// Any other value type, a nil value, or a value whose dynamic type does not
// match its reported type yields a nil result. Sample timestamps are truncated
// to whole seconds and expressed in milliseconds.
func ConvertToTimeSeries(value model.Value, rule *models.RecordingRule) (lst []prompb.TimeSeries) {
	if value == nil {
		return
	}

	// toMillis drops the sub-second part of t and returns Unix milliseconds.
	toMillis := func(t model.Time) int64 {
		return time.Unix(t.Unix(), 0).UnixNano() / 1e6
	}

	switch value.Type() {
	case model.ValVector:
		items, ok := value.(model.Vector)
		if !ok {
			return
		}

		for _, item := range items {
			if math.IsNaN(float64(item.Value)) {
				continue
			}

			lst = append(lst, prompb.TimeSeries{
				Labels: labelsToLabelsProto(item.Metric, rule),
				Samples: []prompb.Sample{{
					Timestamp: toMillis(item.Timestamp),
					Value:     float64(item.Value),
				}},
			})
		}
	case model.ValMatrix:
		items, ok := value.(model.Matrix)
		if !ok {
			return
		}

		for _, item := range items {
			// An empty series must only skip itself; returning here would
			// silently drop every series that follows it.
			if len(item.Values) == 0 {
				continue
			}

			last := item.Values[len(item.Values)-1]
			if math.IsNaN(float64(last.Value)) {
				continue
			}

			var slst []prompb.Sample
			for _, v := range item.Values {
				if math.IsNaN(float64(v.Value)) {
					continue
				}
				slst = append(slst, prompb.Sample{
					Timestamp: toMillis(v.Timestamp),
					Value:     float64(v.Value),
				})
			}

			lst = append(lst, prompb.TimeSeries{
				Labels:  labelsToLabelsProto(item.Metric, rule),
				Samples: slst,
			})
		}
	case model.ValScalar:
		item, ok := value.(*model.Scalar)
		if !ok {
			return
		}

		if math.IsNaN(float64(item.Value)) {
			return
		}

		lst = append(lst, prompb.TimeSeries{
			Labels:  nil,
			Samples: []prompb.Sample{{Value: float64(item.Value), Timestamp: toMillis(item.Timestamp)}},
		})
	default:
		return
	}

	return
}
// labelsToLabelsProto builds the remote-write label set for one series of a
// recording rule.
//
// The first label is always __name__ set to the rule name; a __name__ label in
// labels is ignored. The remaining series labels are copied when their name
// matches model.LabelNameRE. Finally the rule's append tags ("name=value") are
// added when the name matches model.LabelNameRE; a tag without "=" is skipped
// instead of causing an out-of-range slice panic.
func labelsToLabelsProto(labels model.Metric, rule *models.RecordingRule) (result []prompb.Label) {
	// The metric name of a recorded series is the rule name.
	result = append(result, prompb.Label{
		Name:  LabelName,
		Value: rule.Name,
	})

	for k, v := range labels {
		if k == LabelName {
			continue
		}
		if model.LabelNameRE.MatchString(string(k)) {
			result = append(result, prompb.Label{
				Name:  string(k),
				Value: string(v),
			})
		}
	}

	for _, tag := range rule.AppendTagsJSON {
		index := strings.Index(tag, "=")
		if index < 0 {
			// Malformed tag: no separator, so there is no name/value pair.
			continue
		}

		name := tag[:index]
		if model.LabelNameRE.MatchString(name) {
			result = append(result, prompb.Label{
				Name:  name,
				Value: tag[index+1:],
			})
		}
	}

	return result
}
================================================
FILE: alert/record/scheduler.go
================================================
package record
import (
"context"
"fmt"
"strconv"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/naming"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/writer"
)
// Scheduler keeps the set of running recording-rule contexts in sync with the
// recording rule cache.
type Scheduler struct {
// recordRules holds the currently running contexts.
// key: hash
recordRules map[string]*RecordRuleContext
// aconf is the alert configuration (engine delay, heartbeat endpoint).
aconf aconf.Alert
// recordingRuleCache is the source of the rules to run.
recordingRuleCache *memsto.RecordingRuleCacheType
// promClients is handed to every rule context for querying and writing.
promClients *prom.PromClientMap
// writers is handed to every rule context.
writers *writer.WritersType
// stats is handed to every rule context.
stats *astats.Stats
// datasourceCache resolves a rule's datasource queries to datasource ids.
datasourceCache *memsto.DatasourceCacheType
}
// NewScheduler creates a Scheduler with an empty set of running rules and
// starts its sync loop in a background goroutine. The loop is bound to
// context.Background(), so it runs for the lifetime of the process.
func NewScheduler(aconf aconf.Alert, rrc *memsto.RecordingRuleCacheType, promClients *prom.PromClientMap, writers *writer.WritersType, stats *astats.Stats, datasourceCache *memsto.DatasourceCacheType) *Scheduler {
	s := new(Scheduler)
	s.aconf = aconf
	s.recordRules = make(map[string]*RecordRuleContext)
	s.recordingRuleCache = rrc
	s.promClients = promClients
	s.writers = writers
	s.stats = stats
	s.datasourceCache = datasourceCache

	go s.LoopSyncRules(context.Background())

	return s
}
// LoopSyncRules first sleeps for the configured engine delay (in seconds) and
// then synchronises the running recording rules every 9 seconds until ctx is
// cancelled. The initial sleep is not interrupted by ctx.
func (s *Scheduler) LoopSyncRules(ctx context.Context) {
	startupDelay := time.Duration(s.aconf.EngineDelay) * time.Second
	time.Sleep(startupDelay)

	const syncInterval = 9000 * time.Millisecond

	for {
		nextSync := time.After(syncInterval)

		select {
		case <-ctx.Done():
			return
		case <-nextSync:
			s.syncRecordRules()
		}
	}
}
// syncRecordRules reconciles the running rule contexts with the rule cache.
//
// It builds the desired set: one context per (rule, prometheus datasource)
// pair that the datasource hash ring assigns to this instance's heartbeat
// endpoint, keyed by the context hash. Desired contexts that are not running
// are prepared and started; running contexts that are no longer desired are
// stopped and removed.
func (s *Scheduler) syncRecordRules() {
	desired := make(map[string]*RecordRuleContext)

	for _, id := range s.recordingRuleCache.GetRuleIds() {
		rule := s.recordingRuleCache.Get(id)
		if rule == nil {
			continue
		}

		ruleKey := fmt.Sprintf("%d", rule.Id)
		for _, dsId := range s.datasourceCache.GetIDsByDsCateAndQueries("prometheus", rule.DatasourceQueries) {
			mine := naming.DatasourceHashRing.IsHit(strconv.FormatInt(dsId, 10), ruleKey, s.aconf.Heartbeat.Endpoint)
			if !mine {
				continue
			}

			candidate := NewRecordRuleContext(rule, dsId, s.promClients, s.writers, s.stats)
			desired[candidate.Hash()] = candidate
		}
	}

	// Start what is desired but not yet running.
	for hash, candidate := range desired {
		if _, alreadyRunning := s.recordRules[hash]; alreadyRunning {
			continue
		}
		candidate.Prepare()
		candidate.Start()
		s.recordRules[hash] = candidate
	}

	// Stop what is running but no longer desired.
	for hash, running := range s.recordRules {
		if _, stillWanted := desired[hash]; stillWanted {
			continue
		}
		running.Stop()
		delete(s.recordRules, hash)
	}
}
================================================
FILE: alert/router/router.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/gin-gonic/gin"
)
// Router bundles the dependencies of the alert engine's service HTTP API.
type Router struct {
// HTTP is the HTTP server configuration (port, service API settings).
HTTP httpx.Config
// Alert is the alert engine configuration (heartbeat IP/endpoint).
Alert aconf.Alert
// AlertMuteCache is consulted to decide whether a pushed event is muted.
AlertMuteCache *memsto.AlertMuteCacheType
TargetCache *memsto.TargetCacheType
BusiGroupCache *memsto.BusiGroupCacheType
AlertStats *astats.Stats
// Ctx is passed to model calls such as event persistence.
Ctx *ctx.Context
// ExternalProcessors resolves the rule worker for externally made events.
ExternalProcessors *process.ExternalProcessorsType
// LogDir is the directory searched by the log-grepping endpoints.
LogDir string
}
// New returns a Router holding the given configuration, caches, stats,
// context, external processors and log directory. It performs no other work;
// routes are registered by Config.
func New(httpConfig httpx.Config, alert aconf.Alert, amc *memsto.AlertMuteCacheType, tc *memsto.TargetCacheType, bgc *memsto.BusiGroupCacheType,
	astats *astats.Stats, ctx *ctx.Context, externalProcessors *process.ExternalProcessorsType, logDir string) *Router {
	rt := new(Router)

	rt.HTTP = httpConfig
	rt.Alert = alert

	rt.AlertMuteCache = amc
	rt.TargetCache = tc
	rt.BusiGroupCache = bgc

	rt.AlertStats = astats
	rt.Ctx = ctx
	rt.ExternalProcessors = externalProcessors
	rt.LogDir = logDir

	return rt
}
// Config registers the service API under /v1/n9e on r. Nothing is registered
// when the service API is disabled. When basic-auth accounts are configured,
// every route in the group requires them.
func (rt *Router) Config(r *gin.Engine) {
	apiConf := rt.HTTP.APIForService
	if !apiConf.Enable {
		return
	}

	service := r.Group("/v1/n9e")
	if len(apiConf.BasicAuth) > 0 {
		service.Use(gin.BasicAuth(apiConf.BasicAuth))
	}

	// Event ingestion.
	service.POST("/event", rt.pushEventToQueue)
	service.POST("/event-persist", rt.eventPersist)
	service.POST("/make-event", rt.makeEvent)

	// Log inspection.
	service.GET("/event-detail/:hash", rt.eventDetail)
	service.GET("/alert-eval-detail/:id", rt.alertEvalDetail)
	service.GET("/trace-logs/:traceid", rt.traceLogs)
}
// Render writes a JSON response with HTTP status 200. With a non-nil msg the
// body is {"error": {"message": msg}}. Otherwise the body is
// {"data": data, "error": ""}, where a nil data is replaced by an empty object.
func Render(c *gin.Context, data, msg interface{}) {
	if msg != nil {
		c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": msg}})
		return
	}

	payload := data
	if payload == nil {
		payload = struct{}{}
	}

	c.JSON(http.StatusOK, gin.H{"data": payload, "error": ""})
}
// Dangerous writes an error response (HTTP status 200, body
// {"error": {"message": ...}}) when v is a non-empty string or an error. A nil
// v, an empty string, or a value of any other type produces no response. The
// code argument is accepted but not used. The function always returns
// normally; it does not abort the handler.
func Dangerous(c *gin.Context, v interface{}, code ...int) {
	if v == nil {
		return
	}

	if text, isString := v.(string); isString {
		if text != "" {
			c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": text}})
		}
		return
	}

	if failure, isError := v.(error); isError {
		c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": failure.Error()}})
	}
}
================================================
FILE: alert/router/router_alert_eval_detail.go
================================================
package router
import (
"fmt"
"github.com/ccfos/nightingale/v6/pkg/loggrep"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// alertEvalDetail returns the log lines found in the router's log directory
// that contain "alert_eval_<id>" for the rule id given in the URL, together
// with this instance's "<heartbeat ip>:<http port>". The id is validated
// before it is used as a search keyword.
func (rt *Router) alertEvalDetail(c *gin.Context) {
	ruleID := ginx.UrlParamStr(c, "id")
	if valid := loggrep.IsValidRuleID(ruleID); !valid {
		ginx.Bomb(200, "invalid rule id format")
	}

	logs, err := loggrep.GrepLogDir(rt.LogDir, "alert_eval_"+ruleID)
	ginx.Dangerous(err)

	resp := loggrep.EventDetailResp{
		Logs:     logs,
		Instance: fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port),
	}
	ginx.NewRender(c).Data(resp, nil)
}
================================================
FILE: alert/router/router_event.go
================================================
package router
import (
"fmt"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/dispatch"
"github.com/ccfos/nightingale/v6/alert/mute"
"github.com/ccfos/nightingale/v6/alert/naming"
"github.com/ccfos/nightingale/v6/alert/process"
"github.com/ccfos/nightingale/v6/alert/queue"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// pushEventToQueue accepts one alert event as JSON and pushes it to the global
// event queue.
//
// The event is rejected when the body decodes to no event or its rule id is 0.
// Tags are parsed from TagsJSON ("key=value" entries; blank or malformed
// entries are ignored). A muted event is logged and acknowledged without being
// queued. Rule name, rule note and annotations are rendered; a rendering
// failure is stored as text in the event instead of aborting. When the queue
// is full, a warning is logged and an error is returned to the caller.
func (rt *Router) pushEventToQueue(c *gin.Context) {
	var event *models.AlertCurEvent
	ginx.BindJSON(c, &event)
	// A JSON "null" body leaves event nil; treat it like any other illegal event.
	if event == nil || event.RuleId == 0 {
		ginx.Bomb(200, "event is illegal")
	}

	event.FE2DB()

	event.TagsMap = make(map[string]string)
	for i := 0; i < len(event.TagsJSON); i++ {
		pair := strings.TrimSpace(event.TagsJSON[i])
		if pair == "" {
			continue
		}

		arr := strings.SplitN(pair, "=", 2)
		if len(arr) != 2 {
			continue
		}

		event.TagsMap[arr[0]] = arr[1]
	}

	hit, _ := mute.EventMuteStrategy(event, rt.AlertMuteCache)
	if hit {
		logger.Infof("event_muted: rule_id=%d %s", event.RuleId, event.Hash)
		ginx.NewRender(c).Message(nil)
		return
	}

	if err := event.ParseRule("rule_name"); err != nil {
		event.RuleName = fmt.Sprintf("failed to parse rule name: %v", err)
	}

	if err := event.ParseRule("rule_note"); err != nil {
		event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err)
	}

	// NOTE(review): an annotations failure is reported through RuleNote with
	// the rule-note wording; kept as is, confirm whether that is intended.
	if err := event.ParseRule("annotations"); err != nil {
		event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err)
	}

	// When rule_note starts with ";", the rest of the note replaces the tags:
	// the note is split on spaces and each part becomes one tag.
	if strings.HasPrefix(event.RuleNote, ";") {
		event.RuleNote = strings.TrimPrefix(event.RuleNote, ";")
		event.Tags = strings.ReplaceAll(event.RuleNote, " ", ",,")
		event.TagsJSON = strings.Split(event.Tags, ",,")
	} else {
		event.Tags = strings.Join(event.TagsJSON, ",,")
	}

	event.Callbacks = strings.Join(event.CallbacksJSON, " ")
	event.NotifyChannels = strings.Join(event.NotifyChannelsJSON, " ")
	event.NotifyGroups = strings.Join(event.NotifyGroupsJSON, " ")

	dispatch.LogEvent(event, "http_push_queue")
	if !queue.EventQueue.PushFront(event) {
		msg := fmt.Sprintf("event:%s push_queue err: queue is full", event.Hash)
		// Log first: Bomb aborts the handler, so anything placed after it
		// would never run. msg is passed as an argument, not as the format.
		logger.Warningf("%s", msg)
		ginx.Bomb(200, msg)
	}
	ginx.NewRender(c).Message(nil)
}
// eventPersist decodes one alert event from the JSON body, converts it with
// FE2DB, persists it through models.EventPersist and responds with the event
// id together with the persistence error, if any.
// NOTE(review): a JSON "null" body would leave event nil before FE2DB is
// called — confirm whether BindJSON rejects that case.
func (rt *Router) eventPersist(c *gin.Context) {
var event *models.AlertCurEvent
ginx.BindJSON(c, &event)
event.FE2DB()
err := models.EventPersist(rt.Ctx, event)
ginx.NewRender(c).Data(event.Id, err)
}
// eventForm is one entry of the /make-event request body.
type eventForm struct {
// Alert selects the action: true handles the points as firing anomalies,
// false recovers the event of each point.
Alert bool `json:"alert"`
// AnomalyPoints are the points to handle; sent as "vectors" in JSON.
AnomalyPoints []models.AnomalyPoint `json:"vectors"`
// RuleId and DatasourceId select the rule worker and the owning instance.
RuleId int64 `json:"rule_id"`
DatasourceId int64 `json:"datasource_id"`
// Inhibit is forwarded to the rule worker when handling firing points.
Inhibit bool `json:"inhibit"`
}
// makeEvent processes a batch of externally produced events.
//
// For each entry the owning instance is looked up on the datasource hash ring.
// Entries owned by another instance are forwarded to it. Entries owned by this
// instance are dispatched asynchronously to the matching external rule worker:
// firing entries through Handle, non-firing entries through RecoverSingle once
// per anomaly point. A ring lookup failure, a forwarding failure or a missing
// rule worker aborts the request with an error.
func (rt *Router) makeEvent(c *gin.Context) {
	var events []*eventForm
	ginx.BindJSON(c, &events)

	for _, ev := range events {
		dsKey := strconv.FormatInt(ev.DatasourceId, 10)
		ruleKey := fmt.Sprintf("%d", ev.RuleId)

		node, err := naming.DatasourceHashRing.GetNode(dsKey, ruleKey)
		if err != nil {
			logger.Warningf("event(rule_id=%d ds_id=%d) get node err:%v", ev.RuleId, ev.DatasourceId, err)
			ginx.Bomb(200, "event node not exists")
		}

		// Not ours: hand the entry over to the owning instance.
		if node != rt.Alert.Heartbeat.Endpoint {
			if fwdErr := forwardEvent(ev, node); fwdErr != nil {
				logger.Warningf("event(rule_id=%d ds_id=%d) forward err:%v", ev.RuleId, ev.DatasourceId, fwdErr)
				ginx.Bomb(200, "event forward error")
			}
			continue
		}

		ruleWorker, exists := rt.ExternalProcessors.GetExternalAlertRule(ev.DatasourceId, ev.RuleId)
		logger.Debugf("handle event(rule_id=%d ds_id=%d) exists:%v", ev.RuleId, ev.DatasourceId, exists)
		if !exists {
			ginx.Bomb(200, "rule not exists")
		}

		if !ev.Alert {
			for _, vector := range ev.AnomalyPoints {
				readable := vector.ReadableValue()
				go ruleWorker.RecoverSingle(false, process.Hash(ev.RuleId, ev.DatasourceId, vector), vector.Timestamp, &readable)
			}
			continue
		}

		go ruleWorker.Handle(ev.AnomalyPoints, "http", ev.Inhibit)
	}

	ginx.NewRender(c).Message(nil)
}
// forwardEvent relays an event that is not handled by this instance to the
// instance that owns it, by POSTing a one-element batch to that instance's
// /v1/n9e/make-event endpoint (5 second timeout, 3 as the last PostJSON
// argument). Only a transport error is returned; the response code is logged
// but not checked.
func forwardEvent(event *eventForm, instance string) error {
	target := fmt.Sprintf("http://%s/v1/n9e/make-event", instance)
	batch := []*eventForm{event}

	res, code, err := poster.PostJSON(target, time.Second*5, batch, 3)
	if err == nil {
		logger.Infof("forward event: result=succ url=%s code=%d rule_id=%d response=%s", target, code, event.RuleId, string(res))
	}

	return err
}
================================================
FILE: alert/router/router_event_detail.go
================================================
package router
import (
"fmt"
"github.com/ccfos/nightingale/v6/pkg/loggrep"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// eventDetail returns the log lines found in the router's log directory that
// contain the event hash given in the URL, together with this instance's
// "<heartbeat ip>:<http port>". The hash is validated before it is used as a
// search keyword.
func (rt *Router) eventDetail(c *gin.Context) {
	eventHash := ginx.UrlParamStr(c, "hash")
	if valid := loggrep.IsValidHash(eventHash); !valid {
		ginx.Bomb(200, "invalid hash format")
	}

	logs, err := loggrep.GrepLogDir(rt.LogDir, eventHash)
	ginx.Dangerous(err)

	resp := loggrep.EventDetailResp{
		Logs:     logs,
		Instance: fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port),
	}
	ginx.NewRender(c).Data(resp, nil)
}
================================================
FILE: alert/router/router_trace_logs.go
================================================
package router
import (
"fmt"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/ccfos/nightingale/v6/pkg/loggrep"
"github.com/gin-gonic/gin"
)
// traceLogs returns the log lines containing "trace_id=<traceid>" found by
// loggrep.GrepLatestLogFiles in the router's log directory, together with this
// instance's "<heartbeat ip>:<http port>". The trace id is validated before it
// is used as a search keyword.
func (rt *Router) traceLogs(c *gin.Context) {
	traceID := ginx.UrlParamStr(c, "traceid")
	if valid := loggrep.IsValidTraceID(traceID); !valid {
		ginx.Bomb(200, "invalid trace id format")
	}

	logs, err := loggrep.GrepLatestLogFiles(rt.LogDir, "trace_id="+traceID)
	ginx.Dangerous(err)

	resp := loggrep.EventDetailResp{
		Logs:     logs,
		Instance: fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port),
	}
	ginx.NewRender(c).Data(resp, nil)
}
================================================
FILE: alert/sender/callback.go
================================================
package sender
import (
"fmt"
"html/template"
"net/url"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
type (
// CallBacker is the interface implemented by everything that can perform
// a callback for a set of events.
CallBacker interface {
CallBack(ctx CallBackContext)
}
// CallBackContext carries the context needed to perform a callback.
CallBackContext struct {
Ctx *ctx.Context
// CallBackURL is the callback address after template rendering.
CallBackURL string
Users []*models.User
Rule *models.AlertRule
// Events are the events the callback is about.
Events []*models.AlertCurEvent
Stats *astats.Stats
// BatchSend makes the default callbacker queue the event for batched
// delivery instead of sending it immediately.
BatchSend bool
}
// DefaultCallBacker posts events to a plain callback URL.
DefaultCallBacker struct{}
)
// BuildCallBackContext assembles the context for one callback.
//
// The users are resolved from uids through userCache. The callback URL is
// rendered against the first event; a rendering error is logged and the value
// returned by ParseURL is used as is. With no events there is nothing to
// render against, so the URL is passed through unrendered instead of panicking
// (callbackers return early on an empty event list anyway).
func BuildCallBackContext(ctx *ctx.Context, callBackURL string, rule *models.AlertRule, events []*models.AlertCurEvent,
	uids []int64, userCache *memsto.UserCacheType, batchSend bool, stats *astats.Stats) CallBackContext {
	users := userCache.GetByUserIds(uids)

	newCallBackUrl := callBackURL
	if len(events) > 0 {
		var err error
		newCallBackUrl, err = events[0].ParseURL(callBackURL)
		if err != nil {
			logger.Errorf("BuildCallBackContext: parse callback url(%s) failed, err: %v", callBackURL, err)
		}
	}

	return CallBackContext{
		Ctx:         ctx,
		CallBackURL: newCallBackUrl,
		Rule:        rule,
		Events:      events,
		Users:       users,
		BatchSend:   batchSend,
		Stats:       stats,
	}
}
// ExtractAtsParams returns the comma-separated values of the "ats" query
// parameter of rawURL. When the URL cannot be parsed (the error is logged) or
// the parameter is absent or empty, an empty non-nil slice is returned.
func ExtractAtsParams(rawURL string) []string {
	u, err := url.Parse(rawURL)
	if err != nil {
		logger.Errorf("ExtractAtsParams(url=%s), err: %v", rawURL, err)
		return make([]string, 0, 1)
	}

	ats := u.Query().Get("ats")
	if ats == "" {
		return make([]string, 0, 1)
	}

	return strings.Split(ats, ",")
}
// NewCallBacker returns the CallBacker registered for the given domain key, or
// nil when the key is not recognised. The Ibex callbacker receives the three
// caches; the chat senders receive their template from tpls, looked up by
// channel name; the default callbacker needs no dependencies.
func NewCallBacker(
key string,
targetCache *memsto.TargetCacheType,
userCache *memsto.UserCacheType,
taskTplCache *memsto.TaskTplCache,
tpls map[string]*template.Template,
) CallBacker {
switch key {
case models.IbexDomain: // Distribute to Ibex
return &IbexCallBacker{
targetCache: targetCache,
userCache: userCache,
taskTplCache: taskTplCache,
}
case models.DefaultDomain: // default callback
return &DefaultCallBacker{}
case models.DingtalkDomain:
return &DingtalkSender{tpl: tpls[models.Dingtalk]}
case models.WecomDomain:
return &WecomSender{tpl: tpls[models.Wecom]}
case models.FeishuDomain:
return &FeishuSender{tpl: tpls[models.Feishu]}
case models.FeishuCardDomain:
return &FeishuCardSender{tpl: tpls[models.FeishuCard]}
//case models.Mm:
// return &MmSender{tpl: tpls[models.Mm]}
case models.TelegramDomain:
return &TelegramSender{tpl: tpls[models.Telegram]}
case models.LarkDomain:
return &LarkSender{tpl: tpls[models.Lark]}
case models.LarkCardDomain:
return &LarkCardSender{tpl: tpls[models.LarkCard]}
}
// Unknown key: no callbacker. Callers must check for nil.
return nil
}
// CallBack delivers the first event of ctx to the callback URL. Nothing
// happens without a URL or without events. In batch mode the event is queued
// on a per-URL webhook queue; otherwise it is posted immediately and a
// notification record is written for all events of ctx.
func (c *DefaultCallBacker) CallBack(ctx CallBackContext) {
	if len(ctx.CallBackURL) == 0 || len(ctx.Events) == 0 {
		return
	}

	first := ctx.Events[0]

	if !ctx.BatchSend {
		doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, first, "callback", ctx.Stats, ctx.Events)
		return
	}

	batchConf := &models.Webhook{
		Type:          models.RuleCallback,
		Enable:        true,
		Url:           ctx.CallBackURL,
		Timeout:       5,
		RetryCount:    3,
		RetryInterval: 10,
		Batch:         1000,
	}
	PushCallbackEvent(ctx.Ctx, batchConf, first, ctx.Stats)
}
// doSendAndRecord posts body to url through doSend and then writes one
// notification record per event via NotifyRecord, using token as the record
// target and 0 as the notify rule id. The send result and error are both
// passed on to the record.
func doSendAndRecord(ctx *ctx.Context, url, token string, body interface{}, channel string,
stats *astats.Stats, events []*models.AlertCurEvent) {
res, err := doSend(url, body, channel, stats)
NotifyRecord(ctx, events, 0, channel, token, res, err)
}
// NotifyRecord stores the outcome of one notification. A single notification
// may cover several events, so one record is created per event. A non-nil err
// marks the record as failed with the error text as details; otherwise a
// non-empty res becomes the details.
//
// On the center the records are queued locally; elsewhere they are posted to
// the center's /v1/n9e/notify-record endpoint, and a posting failure is only
// logged.
func NotifyRecord(ctx *ctx.Context, evts []*models.AlertCurEvent, notifyRuleID int64, channel, target, res string, err error) {
	records := make([]*models.NotificationRecord, 0, len(evts))
	for i := range evts {
		rec := models.NewNotificationRecord(evts[i], notifyRuleID, channel, target)

		switch {
		case err != nil:
			rec.SetStatus(models.NotiStatusFailure)
			rec.SetDetails(err.Error())
		case res != "":
			rec.SetDetails(res)
		}

		records = append(records, rec)
	}

	if ctx.IsCenter {
		PushNotifyRecords(records)
		return
	}

	if postErr := poster.PostByUrls(ctx, "/v1/n9e/notify-record", records); postErr != nil {
		logger.Errorf("add notis:%v failed, err: %v", records, postErr)
	}
}
// doSend posts body as JSON to url (5 second timeout, 3 as the last PostJSON
// argument) and returns a summary string containing the elapsed milliseconds,
// the status code and the raw response, along with the post error. The
// notify counter for channel is incremented on every call and the notify
// error counter on failure.
func doSend(url string, body interface{}, channel string, stats *astats.Stats) (string, error) {
	stats.AlertNotifyTotal.WithLabelValues(channel).Inc()

	begin := time.Now()
	raw, code, err := poster.PostJSON(url, time.Second*5, body, 3)
	summary := fmt.Sprintf("duration: %d ms status_code:%d, response:%s", time.Since(begin).Milliseconds(), code, string(raw))

	if err == nil {
		logger.Infof("%s_sender: result=succ url=%s code=%d req:%v response=%s", channel, url, code, body, summary)
		return summary, nil
	}

	logger.Errorf("%s_sender: result=fail url=%s code=%d error=%v req:%v response=%s", channel, url, code, err, body, summary)
	stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
	return summary, err
}
// TaskCreateReply is the JSON reply to a task creation request.
type TaskCreateReply struct {
// Err is the error text from the reply ("err" in JSON).
Err string `json:"err"`
Dat int64 `json:"dat"` // task.id
}
// PushCallbackEvent appends event to the callback queue of webhook.Url,
// creating the queue and starting its consumer on first use. A full queue is
// reported with a warning and the event is dropped.
//
// Queue creation is re-checked under the write lock, so concurrent callers for
// a new URL agree on a single queue and only the caller that created it starts
// a consumer.
func PushCallbackEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
	// Fast path: the queue usually exists already.
	CallbackEventQueueLock.RLock()
	queue := CallbackEventQueue[webhook.Url]
	CallbackEventQueueLock.RUnlock()

	if queue == nil {
		created := false

		CallbackEventQueueLock.Lock()
		// Another goroutine may have registered the queue between the read
		// unlock above and this write lock.
		queue = CallbackEventQueue[webhook.Url]
		if queue == nil {
			queue = &WebhookQueue{
				eventQueue: NewSafeEventQueue(QueueMaxSize),
				closeCh:    make(chan struct{}),
			}
			CallbackEventQueue[webhook.Url] = queue
			created = true
		}
		CallbackEventQueueLock.Unlock()

		// Started outside the lock, exactly once per queue.
		if created {
			StartConsumer(ctx, queue, webhook.Batch, webhook, stats)
		}
	}

	succ := queue.eventQueue.Push(event)
	if !succ {
		logger.Warningf("Write channel(%s) full, current channel size: %d event:%s", webhook.Url, queue.eventQueue.Len(), event.Hash)
	}
}
================================================
FILE: alert/sender/dingtalk.go
================================================
package sender
import (
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// dingtalkMarkdown is the "markdown" section of a DingTalk message.
type dingtalkMarkdown struct {
Title string `json:"title"`
Text string `json:"text"`
}
// dingtalkAt is the "at" section of a DingTalk message, listing who to mention.
type dingtalkAt struct {
AtMobiles []string `json:"atMobiles"`
IsAtAll bool `json:"isAtAll"`
}
// dingtalk is the JSON body posted to a DingTalk robot. The senders in this
// file always set Msgtype to "markdown".
type dingtalk struct {
Msgtype string `json:"msgtype"`
Markdown dingtalkMarkdown `json:"markdown"`
At dingtalkAt `json:"at"`
}
var (
// Compile-time check that *DingtalkSender implements CallBacker.
_ CallBacker = (*DingtalkSender)(nil)
)
// DingtalkSender delivers alert messages to DingTalk robots.
type DingtalkSender struct {
// tpl renders the message text from the events.
tpl *template.Template
}
// Send renders the events once and posts the message to the DingTalk robot of
// every user that has one. Nothing is sent without users, events or robot
// URLs. Unless a robot URL contains "noat=1", the collected "@phone" mentions
// are appended to the text and listed in the "at" section.
func (ds *DingtalkSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	urls, ats, tokens := ds.extract(ctx.Users)
	if len(urls) == 0 {
		return
	}

	message := BuildTplMessage(models.Dingtalk, ds.tpl, ctx.Events)
	title := ctx.Events[0].RuleName

	for i, robotURL := range urls {
		// Start from the mention-free form and add mentions when allowed.
		body := dingtalk{
			Msgtype: "markdown",
			Markdown: dingtalkMarkdown{
				Title: title,
				Text:  message,
			},
		}

		if mentionsDisabled := strings.Contains(robotURL, "noat=1"); !mentionsDisabled {
			body.Markdown.Text = message + "\n" + strings.Join(ats, " ")
			body.At = dingtalkAt{
				AtMobiles: ats,
				IsAtAll:   false,
			}
		}

		doSendAndRecord(ctx.Ctx, robotURL, tokens[i], body, models.Dingtalk, ctx.Stats, ctx.Events)
	}
}
// CallBack renders the events and posts them to the callback URL as a
// DingTalk markdown message. Mentions are taken from the URL's "ats" query
// parameter: when present they are appended to the text, each prefixed with
// "@", and listed in the "at" section. Nothing is sent without events or URL.
func (ds *DingtalkSender) CallBack(ctx CallBackContext) {
	if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
		return
	}

	title := ctx.Events[0].RuleName
	ats := ExtractAtsParams(ctx.CallBackURL)
	message := BuildTplMessage(models.Dingtalk, ds.tpl, ctx.Events)

	body := dingtalk{
		Msgtype:  "markdown",
		Markdown: dingtalkMarkdown{Title: title, Text: message},
	}

	// Without "ats" the plain message is sent and the "at" section stays empty.
	if len(ats) > 0 {
		body.Markdown.Text = message + "\n@" + strings.Join(ats, "@")
		body.At = dingtalkAt{
			AtMobiles: ats,
			IsAtAll:   false,
		}
	}

	doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, "callback", ctx.Stats, ctx.Events)
}
// extract collects, from users, the DingTalk robot URLs, the "@phone" mentions
// and the raw tokens, in that order. urls and tokens are index-aligned: a
// token that already is an http(s) URL is used as is, any other token is
// turned into the standard robot send URL. Mentions are collected from every
// user with a phone number, whether or not the user has a token.
func (ds *DingtalkSender) extract(users []*models.User) ([]string, []string, []string) {
	n := len(users)
	urls := make([]string, 0, n)
	ats := make([]string, 0, n)
	tokens := make([]string, 0, n)

	for _, user := range users {
		if phone := user.Phone; phone != "" {
			ats = append(ats, "@"+phone)
		}

		token, has := user.ExtractToken(models.Dingtalk)
		if !has {
			continue
		}

		endpoint := token
		isFullURL := strings.HasPrefix(token, "https://") || strings.HasPrefix(token, "http://")
		if !isFullURL {
			endpoint = "https://oapi.dingtalk.com/robot/send?access_token=" + token
		}

		urls = append(urls, endpoint)
		tokens = append(tokens, token)
	}

	return urls, ats, tokens
}
================================================
FILE: alert/sender/email.go
================================================
package sender
import (
"crypto/tls"
"errors"
"html/template"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
"gopkg.in/gomail.v2"
)
// mailch carries composed emails from WriteEmail to the email sender. It is
// created by InitEmailSender.
var mailch chan *EmailContext
// EmailSender renders alert events into emails.
type EmailSender struct {
// subjectTpl renders the subject; when nil the rule name is used instead.
subjectTpl *template.Template
// contentTpl renders the HTML body.
contentTpl *template.Template
// smtp supplies the From address used when composing messages.
smtp aconf.SMTPConfig
}
// EmailContext pairs a composed email with the events it reports.
type EmailContext struct {
events []*models.AlertCurEvent
mail *gomail.Message
}
// Send composes one email for the events of ctx, addressed to every user with
// an email address, and queues it for delivery. The subject is rendered from
// the first event with the subject template, or is the first event's rule name
// when no subject template is set. The notify counter is increased by the
// number of recipients. Nothing happens without users or events.
func (es *EmailSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	recipients := extract(ctx.Users)

	var subject string
	if es.subjectTpl == nil {
		subject = ctx.Events[0].RuleName
	} else {
		subject = BuildTplMessage(models.Email, es.subjectTpl, []*models.AlertCurEvent{ctx.Events[0]})
	}

	body := BuildTplMessage(models.Email, es.contentTpl, ctx.Events)
	es.WriteEmail(subject, body, recipients, ctx.Events)

	sent := float64(len(recipients))
	ctx.Stats.AlertNotifyTotal.WithLabelValues(models.Email).Add(sent)
}
// extract returns the non-empty email addresses of users, in input order.
func extract(users []*models.User) []string {
	addrs := make([]string, 0, len(users))
	for i := range users {
		if addr := users[i].Email; addr != "" {
			addrs = append(addrs, addr)
		}
	}
	return addrs
}
// SendEmail composes an HTML email and delivers it synchronously over a fresh
// SMTP connection built from stmp. It returns an error prefixed with
// "email_sender: failed to send: " when dialing or sending fails.
func SendEmail(subject, content string, tos []string, stmp aconf.SMTPConfig) error {
	dialer := gomail.NewDialer(stmp.Host, stmp.Port, stmp.User, stmp.Pass)
	if stmp.InsecureSkipVerify {
		dialer.TLSConfig = &tls.Config{InsecureSkipVerify: true}
	}

	msg := gomail.NewMessage()
	msg.SetHeader("From", stmp.From)
	msg.SetHeader("To", tos...)
	msg.SetHeader("Subject", subject)
	msg.SetBody("text/html", content)

	if sendErr := dialer.DialAndSend(msg); sendErr != nil {
		return errors.New("email_sender: failed to send: " + sendErr.Error())
	}
	return nil
}
// WriteEmail composes an HTML email from the sender's configured From address
// and pushes it, together with its events, onto mailch for asynchronous
// delivery. The channel send blocks when mailch is full.
func (es *EmailSender) WriteEmail(subject, content string, tos []string, events []*models.AlertCurEvent) {
	msg := gomail.NewMessage()
	msg.SetHeader("From", es.smtp.From)
	msg.SetHeader("To", tos...)
	msg.SetHeader("Subject", subject)
	msg.SetBody("text/html", content)

	queued := &EmailContext{events: events, mail: msg}
	mailch <- queued
}
// dialSmtp keeps dialing the SMTP server, once per second, until it succeeds.
// It returns nil (no connection) if a quit signal arrives on mailQuit between
// attempts, which happens when the sender is being restarted with a new
// configuration.
func dialSmtp(d *gomail.Dialer) gomail.SendCloser {
	for {
		// Non-blocking check for a pending quit request before each attempt.
		select {
		case <-mailQuit:
			return nil
		default:
		}

		conn, err := d.Dial()
		if err == nil {
			return conn
		}
		logger.Errorf("email_sender: failed to dial smtp: %s", err)
		time.Sleep(time.Second)
	}
}
// mailQuit is an unbuffered signal channel: RestartEmailSender sends on it and
// the running sender loop (or dialSmtp) receives from it and exits.
var mailQuit = make(chan struct{})
// RestartEmailSender stops the running sender loop and starts a new one with
// the given SMTP settings. Both steps block the caller: the send waits for a
// receiver on the unbuffered mailQuit channel, and startEmailSender runs on
// the calling goroutine and only returns once its own loop exits.
func RestartEmailSender(ctx *ctx.Context, smtp aconf.SMTPConfig) {
// Ask the currently running sender loop to exit.
mailQuit <- struct{}{}
startEmailSender(ctx, smtp)
}
// smtpConfig is the SMTP configuration the sender was last started with;
// updateSmtp compares it against the cache to detect changes.
var smtpConfig aconf.SMTPConfig
// InitEmailSender allocates the outgoing mail queue, records the current SMTP
// configuration and starts two goroutines: the configuration watcher
// (updateSmtp) and the sender loop (startEmailSender).
func InitEmailSender(ctx *ctx.Context, ncc *memsto.NotifyConfigCacheType) {
	mailch = make(chan *EmailContext, 100000)

	// Publish the baseline configuration before the watcher goroutine exists:
	// the `go` statement orders this write before anything updateSmtp reads,
	// and the sender gets its own copy so it never touches the shared variable.
	conf := ncc.GetSMTP()
	smtpConfig = conf

	go updateSmtp(ctx, ncc)
	go startEmailSender(ctx, conf)
}
// updateSmtp polls the notify-config cache once a minute and restarts the
// email sender whenever any SMTP setting differs from the one in use. It
// never returns.
func updateSmtp(ctx *ctx.Context, ncc *memsto.NotifyConfigCacheType) {
	for {
		time.Sleep(1 * time.Minute)

		smtp := ncc.GetSMTP()
		unchanged := smtpConfig.Host == smtp.Host &&
			smtpConfig.Batch == smtp.Batch &&
			smtpConfig.From == smtp.From &&
			smtpConfig.Pass == smtp.Pass &&
			smtpConfig.User == smtp.User &&
			smtpConfig.Port == smtp.Port &&
			smtpConfig.InsecureSkipVerify == smtp.InsecureSkipVerify
		if unchanged {
			continue
		}
		smtpConfig = smtp

		// Stop the running sender (blocks until it acknowledges, which keeps
		// restarts strictly ordered), then start the replacement on its own
		// goroutine. Running the new sender loop on this goroutine, as
		// RestartEmailSender does, would park the watcher inside that loop and
		// no later configuration change would ever be noticed.
		mailQuit <- struct{}{}
		go startEmailSender(ctx, smtp)
	}
}
// startEmailSender is the email delivery loop. It drains mailch, sending each
// message over a lazily dialed SMTP connection that is reused for up to
// conf.Batch messages and closed after 30 seconds without traffic. A failed
// send is retried once on a fresh connection, and the outcome is recorded per
// recipient through NotifyRecord.
//
// The loop exits when a signal arrives on mailQuit or mailch is closed. Any
// connection that is still open at that point is closed before returning, so
// a restart does not leave the old SMTP session dangling. With an unusable
// configuration (empty host or zero port) the function only waits for the
// quit signal.
func startEmailSender(ctx *ctx.Context, smtp aconf.SMTPConfig) {
	conf := smtp
	if conf.Host == "" || conf.Port == 0 {
		logger.Debug("SMTP configurations invalid")
		// Stay parked so that a later restart has a receiver on mailQuit.
		<-mailQuit
		return
	}
	logger.Infof("start email sender... conf.Host:%+v,conf.Port:%+v", conf.Host, conf.Port)

	d := gomail.NewDialer(conf.Host, conf.Port, conf.User, conf.Pass)
	if conf.InsecureSkipVerify {
		d.TLSConfig = &tls.Config{InsecureSkipVerify: true}
	}

	var (
		s    gomail.SendCloser // current connection; only valid while open
		open bool
		size int // messages sent on the current connection
	)

	// closeConn closes the current connection if there is one and marks it
	// closed; it is safe to call at any time.
	closeConn := func() {
		if !open {
			return
		}
		if err := s.Close(); err != nil {
			logger.Warningf("email_sender: failed to close smtp connection: %s", err)
		}
		open = false
	}
	defer closeConn()

	for {
		select {
		case <-mailQuit:
			return

		case m, ok := <-mailch:
			if !ok {
				return
			}

			if !open {
				s = dialSmtp(d)
				if s == nil {
					// dialSmtp was interrupted by a quit signal: hand the
					// message back to the queue and exit this goroutine.
					mailch <- m
					return
				}
				open = true
			}

			var err error
			if err = gomail.Send(s, m.mail); err != nil {
				logger.Errorf("email_sender: failed to send: %s", err)

				// Drop the (possibly broken) connection and retry once on a
				// fresh one.
				closeConn()
				s = dialSmtp(d)
				if s == nil {
					// Interrupted by a quit signal: requeue and exit.
					mailch <- m
					return
				}
				open = true

				if err = gomail.Send(s, m.mail); err != nil {
					logger.Errorf("email_sender: failed to retry send: %s", err)
				}
			} else {
				logger.Infof("email_sender: result=succ subject=%v to=%v",
					m.mail.GetHeader("Subject"), m.mail.GetHeader("To"))
			}

			// One notification record per recipient, carrying the final error
			// (nil after a successful first send or retry).
			for _, to := range m.mail.GetHeader("To") {
				msg := ""
				if err == nil {
					msg = "ok"
				}
				NotifyRecord(ctx, m.events, 0, models.Email, to, msg, err)
			}

			size++
			if size >= conf.Batch {
				closeConn()
				size = 0
			}

		// Close the connection to the SMTP server if no email was sent in
		// the last 30 seconds.
		case <-time.After(30 * time.Second):
			closeConn()
		}
	}
}
================================================
FILE: alert/sender/feishu.go
================================================
package sender
import (
"fmt"
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// feishuContent is the "content" object of a text message; it carries the
// rendered message text.
type feishuContent struct {
Text string `json:"text"`
}
// feishuAt is the "at" object of a message: the mobile numbers to mention and
// whether to mention everyone.
type feishuAt struct {
AtMobiles []string `json:"atMobiles"`
IsAtAll bool `json:"isAtAll"`
}
// feishu is the JSON body the Feishu and Lark senders post to a bot webhook.
type feishu struct {
// Msgtype is "text" for plain messages and "interactive" for cards.
Msgtype string `json:"msg_type"`
Content feishuContent `json:"content"`
At feishuAt `json:"at"`
}
// Compile-time assertion that *FeishuSender implements CallBacker.
var (
_ CallBacker = (*FeishuSender)(nil)
)
// FeishuSender delivers alert events as Feishu text messages.
type FeishuSender struct {
// tpl renders the events into the message text.
tpl *template.Template
}
// CallBack posts the rendered events as a Feishu text message to the callback
// URL. Every id returned by ExtractAtsParams for that URL is turned into an
// <at user_id="…"></at> mention tag placed in front of the message. It is a
// no-op when there are no events or no callback URL.
func (fs *FeishuSender) CallBack(ctx CallBackContext) {
	if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
		return
	}

	ats := ExtractAtsParams(ctx.CallBackURL)
	message := BuildTplMessage(models.Feishu, fs.tpl, ctx.Events)

	if len(ats) > 0 {
		// The format string must contain a verb for the id; without one the
		// id is rendered as a "%!(EXTRA string=…)" artefact instead of a
		// mention.
		var atTags strings.Builder
		for _, at := range ats {
			fmt.Fprintf(&atTags, "<at user_id=\"%s\"></at> ", at)
		}
		message = atTags.String() + message
	}

	body := feishu{
		Msgtype: "text",
		Content: feishuContent{
			Text: message,
		},
	}

	doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, "callback", ctx.Stats, ctx.Events)
}
// Send posts the rendered events as a Feishu text message to the webhook of
// every user that has a Feishu token. The phone numbers of all users are
// attached as mentions unless the webhook URL contains "noat=1". It is a
// no-op when there are no users or no events.
func (fs *FeishuSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	hookURLs, mobiles, rawTokens := fs.extract(ctx.Users)
	text := BuildTplMessage(models.Feishu, fs.tpl, ctx.Events)

	for idx := 0; idx < len(hookURLs); idx++ {
		target := hookURLs[idx]

		// Zero value means "mention nobody"; filled in unless opted out.
		var mention feishuAt
		if !strings.Contains(target, "noat=1") {
			mention = feishuAt{AtMobiles: mobiles, IsAtAll: false}
		}

		payload := feishu{
			Msgtype: "text",
			Content: feishuContent{Text: text},
			At:      mention,
		}
		doSendAndRecord(ctx.Ctx, target, rawTokens[idx], payload, models.Feishu, ctx.Stats, ctx.Events)
	}
}
// extract returns, in order: the Feishu webhook URL of every user that has a
// Feishu token, the phone number of every user that has one, and the raw
// tokens (parallel to the URLs). A token that is not already an http(s) URL
// is appended to the open.feishu.cn bot hook endpoint.
func (fs *FeishuSender) extract(users []*models.User) ([]string, []string, []string) {
	n := len(users)
	var (
		hookURLs  = make([]string, 0, n)
		mobiles   = make([]string, 0, n)
		rawTokens = make([]string, 0, n)
	)

	for i := range users {
		u := users[i]
		if u.Phone != "" {
			mobiles = append(mobiles, u.Phone)
		}

		tok, ok := u.ExtractToken(models.Feishu)
		if !ok {
			continue
		}

		target := tok
		isURL := strings.HasPrefix(tok, "https://") || strings.HasPrefix(tok, "http://")
		if !isURL {
			target = "https://open.feishu.cn/open-apis/bot/v2/hook/" + tok
		}
		hookURLs = append(hookURLs, target)
		rawTokens = append(rawTokens, tok)
	}

	return hookURLs, mobiles, rawTokens
}
================================================
FILE: alert/sender/feishucard.go
================================================
package sender
import (
"fmt"
"html/template"
"net/url"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// Conf is the "config" object of a message card.
type Conf struct {
WideScreenMode bool `json:"wide_screen_mode"`
EnableForward bool `json:"enable_forward"`
}
// Te is a tagged text node inside a card element (for example tag "lark_md").
type Te struct {
Content string `json:"content"`
Tag string `json:"tag"`
}
// Element is one node of a card body. Which fields are meaningful depends on
// Tag; createFeishuCardBody uses "div" (with Text), "hr", "note" (with nested
// Elements) and "lark_md" (with Content).
type Element struct {
Tag string `json:"tag"`
Text Te `json:"text"`
Content string `json:"content"`
Elements []Element `json:"elements"`
}
// Titles is the title node of a card header.
type Titles struct {
Content string `json:"content"`
Tag string `json:"tag"`
}
// Headers is the header of a card: its title and its colour template.
type Headers struct {
Title Titles `json:"title"`
// Template holds the header colour name (the senders use "red", "orange" or "green").
Template string `json:"template"`
}
// Cards is the "card" object of an interactive message: configuration, body
// elements and header.
type Cards struct {
Config Conf `json:"config"`
Elements []Element `json:"elements"`
Header Headers `json:"header"`
}
// feishuCard is the JSON body of an interactive card message: the common
// feishu envelope (embedded) plus the card itself.
type feishuCard struct {
feishu
Card Cards `json:"card"`
}
// FeishuCardSender delivers alert events as Feishu interactive cards.
type FeishuCardSender struct {
// tpl renders the events into the card's markdown body.
tpl *template.Template
}
// Keywords searched (case-insensitively) in the rendered message to choose the
// card header colour.
const (
Recovered = "recovered"
Triggered = "triggered"
)
// createFeishuCardBody returns the skeleton of an interactive card message:
// wide-screen and forwarding enabled, a plain-text header title, and three
// body elements in fixed order — a lark_md "div" (index 0, the message), an
// "hr" divider (index 1) and a "note" holding one lark_md element (index 2).
// Callers fill in the header and the text of elements 0 and 2.
func createFeishuCardBody() feishuCard {
	messageDiv := Element{Tag: "div", Text: Te{Tag: "lark_md"}}
	divider := Element{Tag: "hr"}
	footNote := Element{
		Tag:      "note",
		Elements: []Element{{Tag: "lark_md"}},
	}

	var card feishuCard
	card.Msgtype = "interactive"
	card.Card.Config = Conf{WideScreenMode: true, EnableForward: true}
	card.Card.Header.Title.Tag = "plain_text"
	card.Card.Elements = []Element{messageDiv, divider, footNote}
	return card
}
// CallBack posts the rendered events as a Feishu interactive card to the
// callback URL. Values returned by ExtractAtsParams for that URL become
// mention tags in front of the message: values containing "@" are treated as
// email addresses, everything else as a user id. The header colour is orange
// when the message mentions both "recovered" and "triggered", green when it
// only mentions "recovered", red otherwise. It is a no-op when there are no
// events or no callback URL, and returns silently if the URL cannot be parsed.
func (fs *FeishuCardSender) CallBack(ctx CallBackContext) {
	if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
		return
	}

	ats := ExtractAtsParams(ctx.CallBackURL)
	message := BuildTplMessage(models.FeishuCard, fs.tpl, ctx.Events)

	if len(ats) > 0 {
		// The format strings must contain a verb for the value; without one
		// the value is rendered as a "%!(EXTRA string=…)" artefact instead of
		// a mention.
		var atTags strings.Builder
		for _, at := range ats {
			if strings.Contains(at, "@") {
				fmt.Fprintf(&atTags, "<at email=\"%s\" ></at>", at)
			} else {
				fmt.Fprintf(&atTags, "<at id=\"%s\" ></at>", at)
			}
		}
		message = atTags.String() + message
	}

	color := "red"
	lowerUnicode := strings.ToLower(message)
	hasRecovered := strings.Contains(lowerUnicode, Recovered)
	switch {
	case hasRecovered && strings.Contains(lowerUnicode, Triggered):
		color = "orange"
	case hasRecovered:
		color = "green"
	}

	SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
	body := createFeishuCardBody()
	body.Card.Header.Title.Content = SendTitle
	body.Card.Header.Template = color
	body.Card.Elements[0].Text.Content = message
	body.Card.Elements[2].Elements[0].Content = SendTitle

	// This is to be compatible with the feishucard interface: with query
	// string parameters the request will fail, so strip them from the URL.
	parsedURL, err := url.Parse(ctx.CallBackURL)
	if err != nil {
		return
	}
	parsedURL.RawQuery = ""
	target := parsedURL.String()

	doSendAndRecord(ctx.Ctx, target, target, body, "callback", ctx.Stats, ctx.Events)
}
// Send posts the rendered events as a Feishu interactive card to the webhook
// of every user that has a FeishuCard token. The header colour is orange when
// the message mentions both "recovered" and "triggered", green when it only
// mentions "recovered", red otherwise. It is a no-op when there are no users
// or no events.
func (fs *FeishuCardSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	hookURLs, rawTokens := fs.extract(ctx.Users)
	text := BuildTplMessage(models.FeishuCard, fs.tpl, ctx.Events)

	lowered := strings.ToLower(text)
	mentionsRecovered := strings.Contains(lowered, Recovered)
	mentionsTriggered := strings.Contains(lowered, Triggered)

	headerColor := "red"
	switch {
	case mentionsRecovered && mentionsTriggered:
		headerColor = "orange"
	case mentionsRecovered:
		headerColor = "green"
	}

	title := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)

	card := createFeishuCardBody()
	card.Card.Header.Template = headerColor
	card.Card.Header.Title.Content = title
	card.Card.Elements[0].Text.Content = text
	card.Card.Elements[2].Elements[0].Content = title

	for idx := range hookURLs {
		doSendAndRecord(ctx.Ctx, hookURLs[idx], rawTokens[idx], card, models.FeishuCard, ctx.Stats, ctx.Events)
	}
}
// extract returns the webhook URL and the raw token of every user that has a
// FeishuCard token, as two parallel slices. A token that is not already an
// http(s) URL is trimmed of surrounding whitespace and appended to the
// open.feishu.cn bot hook endpoint; the raw token is returned untrimmed.
func (fs *FeishuCardSender) extract(users []*models.User) ([]string, []string) {
	hookURLs := make([]string, 0, len(users))
	rawTokens := make([]string, 0, len(users))

	for _, u := range users {
		tok, ok := u.ExtractToken(models.FeishuCard)
		if !ok {
			continue
		}

		target := tok
		isURL := strings.HasPrefix(tok, "https://") || strings.HasPrefix(tok, "http://")
		if !isURL {
			target = "https://open.feishu.cn/open-apis/bot/v2/hook/" + strings.TrimSpace(tok)
		}
		hookURLs = append(hookURLs, target)
		rawTokens = append(rawTokens, tok)
	}

	return hookURLs, rawTokens
}
================================================
FILE: alert/sender/global_webhook.go
================================================
package sender
import (
"bytes"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
// staticGlobalWebhookClient is the HTTP client used for the static global
// webhook. It stays nil until InitStaticGlobalWebhook is called with an
// enabled configuration that has a URL; SendStaticGlobalWebhook does nothing
// while it is nil.
var staticGlobalWebhookClient *http.Client
// staticGlobalWebhookConf is the configuration last passed to InitStaticGlobalWebhook.
var staticGlobalWebhookConf aconf.GlobalWebhook
// staticGlobalWebhookChannel is the channel name used in metric labels, log
// lines and notification records for this webhook.
const staticGlobalWebhookChannel = "static_global_webhook"
// InitStaticGlobalWebhook stores conf and, when the webhook is enabled and has
// a URL, builds the shared HTTP client for it. The client timeout is
// conf.Timeout seconds (10 when not positive), TLS verification follows
// conf.SkipVerify, and the environment proxy is used when poster.UseProxy
// says so for the URL. An odd number of header entries only produces a
// warning here; such headers are ignored at send time.
func InitStaticGlobalWebhook(conf aconf.GlobalWebhook) {
	staticGlobalWebhookConf = conf

	if !conf.Enable {
		return
	}
	if conf.Url == "" {
		return
	}

	if n := len(conf.Headers); n > 0 && n%2 != 0 {
		logger.Warningf("static_global_webhook headers count is odd(%d), headers will be ignored", len(conf.Headers))
	}

	seconds := conf.Timeout
	if seconds <= 0 {
		seconds = 10
	}

	tr := &http.Transport{
		TLSClientConfig:     &tls.Config{InsecureSkipVerify: conf.SkipVerify},
		MaxIdleConns:        100,
		MaxIdleConnsPerHost: 10,
		IdleConnTimeout:     90 * time.Second,
	}
	if poster.UseProxy(conf.Url) {
		tr.Proxy = http.ProxyFromEnvironment
	}

	client := &http.Client{
		Timeout:   time.Duration(seconds) * time.Second,
		Transport: tr,
	}
	staticGlobalWebhookClient = client

	logger.Infof("static_global_webhook initialized, url:%s", conf.Url)
}
// SendStaticGlobalWebhook POSTs event as JSON to the configured static global
// webhook and records the outcome through NotifyRecord. It does nothing when
// the webhook client has not been initialised.
//
// Basic auth is applied when both user and password are configured. Headers
// are taken pairwise from the configuration (ignored when their count is
// odd); a "Host"/"host" entry overrides the request host instead of being set
// as a header. At most 4096 bytes of the response body are read. A transport
// error or a status code >= 400 counts as a failure and bumps the error
// counter.
func SendStaticGlobalWebhook(ctx *ctx.Context, event *models.AlertCurEvent, stats *astats.Stats) {
	if staticGlobalWebhookClient == nil {
		return
	}

	// record writes one notification record for this event.
	record := func(res string, recErr error) {
		NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, res, recErr)
	}

	payload, err := json.Marshal(event)
	if err != nil {
		logger.Errorf("%s failed to marshal event err:%v", staticGlobalWebhookChannel, err)
		record("", err)
		return
	}

	req, err := http.NewRequest("POST", staticGlobalWebhookConf.Url, bytes.NewBuffer(payload))
	if err != nil {
		logger.Warningf("%s failed to new request event:%s err:%v", staticGlobalWebhookChannel, string(payload), err)
		record("", err)
		return
	}

	req.Header.Set("Content-Type", "application/json")
	if staticGlobalWebhookConf.BasicAuthUser != "" && staticGlobalWebhookConf.BasicAuthPass != "" {
		req.SetBasicAuth(staticGlobalWebhookConf.BasicAuthUser, staticGlobalWebhookConf.BasicAuthPass)
	}

	if n := len(staticGlobalWebhookConf.Headers); n > 0 && n%2 == 0 {
		for i := 0; i < len(staticGlobalWebhookConf.Headers); i += 2 {
			key := staticGlobalWebhookConf.Headers[i]
			val := staticGlobalWebhookConf.Headers[i+1]
			switch key {
			case "Host", "host":
				req.Host = val
			default:
				req.Header.Set(key, val)
			}
		}
	}

	stats.AlertNotifyTotal.WithLabelValues(staticGlobalWebhookChannel).Inc()

	resp, err := staticGlobalWebhookClient.Do(req)
	if err != nil {
		stats.AlertNotifyErrorTotal.WithLabelValues(staticGlobalWebhookChannel).Inc()
		logger.Errorf("%s_fail url:%s event:%s error:%v", staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, event.Hash, err)
		record("", err)
		return
	}
	defer resp.Body.Close()

	// Only the first 4096 bytes of the response are kept for the record.
	respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
	summary := fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(respBody))

	if resp.StatusCode < 400 {
		logger.Debugf("%s_succ url:%s status:%d body:%s event:%s", staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, resp.StatusCode, string(respBody), event.Hash)
		record(summary, nil)
		return
	}

	stats.AlertNotifyErrorTotal.WithLabelValues(staticGlobalWebhookChannel).Inc()
	logger.Errorf("%s_fail url:%s status:%d body:%s event:%s", staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, resp.StatusCode, string(respBody), event.Hash)
	record(summary, fmt.Errorf("status code %d", resp.StatusCode))
}
================================================
FILE: alert/sender/global_webhook_test.go
================================================
package sender
import (
"context"
"errors"
"net/http"
"strings"
"testing"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
ctxpkg "github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/prometheus/client_golang/prometheus"
)
// roundTripperFunc adapts a plain function to http.RoundTripper so tests can
// stub the HTTP transport.
type roundTripperFunc func(*http.Request) (*http.Response, error)
// RoundTrip implements http.RoundTripper by calling f with the request.
func (f roundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) {
return f(req)
}
// newStaticWebhookTestStats returns a Stats value carrying only the two
// unregistered, "channel"-labelled counters that SendStaticGlobalWebhook
// touches.
func newStaticWebhookTestStats() *astats.Stats {
	notifyTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "test_static_global_webhook_total"},
		[]string{"channel"},
	)
	notifyErrors := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "test_static_global_webhook_error_total"},
		[]string{"channel"},
	)

	stats := new(astats.Stats)
	stats.AlertNotifyTotal = notifyTotal
	stats.AlertNotifyErrorTotal = notifyErrors
	return stats
}
// TestSendStaticGlobalWebhookRecordsNewRequestFailure checks that a webhook
// URL which cannot be turned into a request produces exactly one failure
// record on the static_global_webhook channel.
func TestSendStaticGlobalWebhookRecordsNewRequestFailure(t *testing.T) {
	// Swap in test globals and restore the previous ones on exit.
	savedClient, savedConf := staticGlobalWebhookClient, staticGlobalWebhookConf
	defer func() {
		staticGlobalWebhookClient = savedClient
		staticGlobalWebhookConf = savedConf
	}()

	// Start from, and leave behind, an empty record queue.
	NotifyRecordQueue.RemoveAll()
	defer NotifyRecordQueue.RemoveAll()

	staticGlobalWebhookClient = &http.Client{}
	staticGlobalWebhookConf = aconf.GlobalWebhook{Enable: true, Url: "://bad-url"}

	event := &models.AlertCurEvent{Id: 1, Hash: "event-1"}
	SendStaticGlobalWebhook(ctxpkg.NewContext(context.Background(), nil, true), event, newStaticWebhookTestStats())

	if queued := NotifyRecordQueue.Len(); queued != 1 {
		t.Fatalf("expected 1 notify record, got %d", queued)
	}

	rec, isRecord := NotifyRecordQueue.PopBack().(*models.NotificationRecord)
	if !isRecord {
		t.Fatalf("expected *models.NotificationRecord in queue")
	}
	if rec.Status != models.NotiStatusFailure {
		t.Fatalf("expected failure status, got %d", rec.Status)
	}
	if rec.Channel != staticGlobalWebhookChannel {
		t.Fatalf("expected channel %q, got %q", staticGlobalWebhookChannel, rec.Channel)
	}
}
// TestSendStaticGlobalWebhookRecordsTransportFailure checks that an error
// returned by the HTTP transport produces exactly one failure record whose
// details contain the transport error text.
func TestSendStaticGlobalWebhookRecordsTransportFailure(t *testing.T) {
	// Swap in test globals and restore the previous ones on exit.
	savedClient, savedConf := staticGlobalWebhookClient, staticGlobalWebhookConf
	defer func() {
		staticGlobalWebhookClient = savedClient
		staticGlobalWebhookConf = savedConf
	}()

	// Start from, and leave behind, an empty record queue.
	NotifyRecordQueue.RemoveAll()
	defer NotifyRecordQueue.RemoveAll()

	// A transport that fails every request.
	failing := roundTripperFunc(func(req *http.Request) (*http.Response, error) {
		return nil, errors.New("transport boom")
	})
	staticGlobalWebhookClient = &http.Client{Transport: failing}
	staticGlobalWebhookConf = aconf.GlobalWebhook{Enable: true, Url: "http://example.com/webhook"}

	event := &models.AlertCurEvent{Id: 2, Hash: "event-2"}
	SendStaticGlobalWebhook(ctxpkg.NewContext(context.Background(), nil, true), event, newStaticWebhookTestStats())

	if queued := NotifyRecordQueue.Len(); queued != 1 {
		t.Fatalf("expected 1 notify record, got %d", queued)
	}

	rec, isRecord := NotifyRecordQueue.PopBack().(*models.NotificationRecord)
	if !isRecord {
		t.Fatalf("expected *models.NotificationRecord in queue")
	}
	if rec.Status != models.NotiStatusFailure {
		t.Fatalf("expected failure status, got %d", rec.Status)
	}
	if !strings.Contains(rec.Details, "transport boom") {
		t.Fatalf("expected transport error details, got %q", rec.Details)
	}
}
================================================
FILE: alert/sender/ibex.go
================================================
// @Author: Ciusyan 6/5/24
package sender
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
imodels "github.com/flashcatcloud/ibex/src/models"
"github.com/flashcatcloud/ibex/src/storage"
"github.com/toolkits/pkg/logger"
)
// Compile-time assertion that *IbexCallBacker implements CallBacker.
var (
_ CallBacker = (*IbexCallBacker)(nil)
)
// IbexCallBacker starts an ibex task in response to an alert callback. The
// caches are handed to CallIbex for the template lookup and permission check.
type IbexCallBacker struct {
targetCache *memsto.TargetCacheType
userCache *memsto.UserCacheType
taskTplCache *memsto.TaskTplCache
}
// CallBack triggers the ibex task described by the callback URL for the first
// event. It only logs and returns when the URL or the event list is empty, or
// when that event is a recovery.
func (c *IbexCallBacker) CallBack(ctx CallBackContext) {
	target, events := ctx.CallBackURL, ctx.Events
	if len(target) == 0 || len(events) == 0 {
		logger.Warningf("event_callback_ibex: url or events is empty, url: %s", ctx.CallBackURL)
		return
	}

	first := events[0]
	if !first.IsRecovered {
		c.handleIbex(ctx.Ctx, target, first)
		return
	}
	logger.Infof("event_callback_ibex: event is recovered, event: %s", first.Hash)
}
// handleIbex parses the callback URL and starts the referenced ibex task.
// The URL is split on "/": the second segment is the task template id and the
// optional third segment is the target host. Without a host in the URL, the
// event's TargetIdent is used, then its "ident" tag. Nothing is started (only
// a log line is written) when the ibex DB is unavailable on a center node,
// the id is not an integer, or no host can be determined.
func (c *IbexCallBacker) handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent) {
	logger.Infof("event_callback_ibex: url: %s, event: %s", url, event.Hash)

	if imodels.DB() == nil && ctx.IsCenter {
		logger.Warningf("event_callback_ibex: db is nil, event: %s", event.Hash)
		return
	}

	var idstr, host string
	segments := strings.Split(url, "/")
	switch {
	case len(segments) > 2:
		idstr, host = segments[1], segments[2]
	case len(segments) > 1:
		idstr = segments[1]
	}

	id, err := strconv.ParseInt(idstr, 10, 64)
	if err != nil {
		logger.Errorf("event_callback_ibex: failed to parse url: %s event: %s", url, event.Hash)
		return
	}

	// No host in the callback URL: resolve it from the event instead.
	if host == "" {
		host = event.TargetIdent
	}
	if host == "" {
		// A missing "ident" tag yields "", which is caught right below.
		host = event.TagsMap["ident"]
	}
	if host == "" {
		logger.Errorf("event_callback_ibex: failed to get host, id: %d, event: %s", id, event.Hash)
		return
	}

	CallIbex(ctx, id, host, c.taskTplCache, c.targetCache, c.userCache, event, "")
}
// CallIbex creates an ibex task from task template id for host and stores a
// task record linking it to event.
//
// The template's last updater (tpl.UpdateBy) is the acting user: it must pass
// CanDoIbex for the host and becomes the task creator. The event's tags, plus
// alert_severity, alert_trigger_value and is_recovered, are passed to the
// script as a JSON object on stdin. A non-empty args overrides the template's
// arguments.
//
// It returns the new task id. On failure before or during task creation the
// id is 0; if only persisting the task record fails, the task id is returned
// together with the error. Every error is also logged.
func CallIbex(ctx *ctx.Context, id int64, host string,
	taskTplCache *memsto.TaskTplCache, targetCache *memsto.TargetCacheType,
	userCache *memsto.UserCacheType, event *models.AlertCurEvent, args string) (int64, error) {
	logger.Infof("event_callback_ibex: id: %d, host: %s, args: %s, event: %s", id, host, args, event.Hash)

	tpl := taskTplCache.Get(id)
	if tpl == nil {
		err := fmt.Errorf("event_callback_ibex: no such tpl(%d), event: %s", id, event.Hash)
		logger.Errorf("%s", err)
		return 0, err
	}

	// Permission check on the (tpl.GroupId, host, account) triple.
	can, err := CanDoIbex(tpl.UpdateBy, tpl, host, targetCache, userCache)
	if err != nil {
		err = fmt.Errorf("event_callback_ibex: check perm fail: %v, event: %s", err, event.Hash)
		logger.Errorf("%s", err)
		return 0, err
	}
	if !can {
		err = fmt.Errorf("event_callback_ibex: user(%s) no permission, event: %s", tpl.UpdateBy, event.Hash)
		logger.Errorf("%s", err)
		return 0, err
	}

	// Turn the "key=value" tag strings into a map; malformed or blank entries
	// are skipped.
	tagsMap := make(map[string]string, len(event.TagsJSON)+3)
	for _, raw := range event.TagsJSON {
		pair := strings.TrimSpace(raw)
		if pair == "" {
			continue
		}
		arr := strings.SplitN(pair, "=", 2)
		if len(arr) != 2 {
			continue
		}
		tagsMap[arr[0]] = arr[1]
	}

	// Append the alert severity, trigger value and recovery state as tags.
	tagsMap["alert_severity"] = strconv.Itoa(event.Severity)
	tagsMap["alert_trigger_value"] = event.TriggerValue
	tagsMap["is_recovered"] = strconv.FormatBool(event.IsRecovered)

	tags, err := json.Marshal(tagsMap)
	if err != nil {
		// Report the marshalling error itself, not just the input map.
		err = fmt.Errorf("event_callback_ibex: failed to marshal tags to json: %v, err: %v, event: %s", tagsMap, err, event.Hash)
		logger.Errorf("%s", err)
		return 0, err
	}

	// Call ibex.
	taskArgs := tpl.Args
	if args != "" {
		taskArgs = args
	}

	in := models.TaskForm{
		Title:          tpl.Title + " FH: " + host,
		Account:        tpl.Account,
		Batch:          tpl.Batch,
		Tolerance:      tpl.Tolerance,
		Timeout:        tpl.Timeout,
		Pause:          tpl.Pause,
		Script:         tpl.Script,
		Args:           taskArgs,
		Stdin:          string(tags),
		Action:         "start",
		Creator:        tpl.UpdateBy,
		Hosts:          []string{host},
		AlertTriggered: true,
	}

	taskID, err := TaskAdd(in, tpl.UpdateBy, ctx.IsCenter)
	if err != nil {
		err = fmt.Errorf("event_callback_ibex: call ibex fail: %v, event: %s", err, event.Hash)
		logger.Errorf("%s", err)
		return 0, err
	}

	// Persist the task record.
	record := models.TaskRecord{
		Id:        taskID,
		EventId:   event.Id,
		GroupId:   tpl.GroupId,
		Title:     in.Title,
		Account:   in.Account,
		Batch:     in.Batch,
		Tolerance: in.Tolerance,
		Timeout:   in.Timeout,
		Pause:     in.Pause,
		Script:    in.Script,
		Args:      in.Args,
		CreateAt:  time.Now().Unix(),
		CreateBy:  in.Creator,
	}

	if err = record.Add(ctx); err != nil {
		err = fmt.Errorf("event_callback_ibex: persist task_record fail: %v, event: %s", err, event.Hash)
		logger.Errorf("%s", err)
		return taskID, err
	}

	return taskID, nil
}
// CanDoIbex reports whether username may run template tpl on host. Admin
// users always may; otherwise the host must be a known target that matches
// the template's group id. The returned error is always nil.
func CanDoIbex(username string, tpl *models.TaskTpl, host string, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType) (bool, error) {
	if u := userCache.GetByUsername(username); u != nil && u.IsAdmin() {
		return true, nil
	}

	target, found := targetCache.Get(host)
	if found {
		return target.MatchGroupId(tpl.GroupId), nil
	}
	return false, nil
}
// TaskAdd creates an ibex task from form f and returns its id. authUser is
// only used in log lines. It fails when the redis cache is not initialised,
// when no usable host remains after cleanHosts, or when field cleaning,
// caching or saving the task fails.
func TaskAdd(f models.TaskForm, authUser string, isCenter bool) (int64, error) {
if storage.Cache == nil {
logger.Warningf("event_callback_ibex: redis cache is nil, task: %+v", f)
return 0, fmt.Errorf("redis cache is nil")
}
hosts := cleanHosts(f.Hosts)
if len(hosts) == 0 {
return 0, fmt.Errorf("arg(hosts) empty")
}
taskMeta := &imodels.TaskMeta{
Title: f.Title,
Account: f.Account,
Batch: f.Batch,
Tolerance: f.Tolerance,
Timeout: f.Timeout,
Pause: f.Pause,
Script: f.Script,
Args: f.Args,
Stdin: f.Stdin,
Creator: f.Creator,
}
err := taskMeta.CleanFields()
if err != nil {
return 0, err
}
taskMeta.HandleFH(hosts[0])
// Tasks come in two kinds: "triggered by an alert rule" and "dispatched by an n9e center user".
// An alert-triggered task in an edge data center needs no scheduling, and the edge may be cut off
// and unable to use DB resources, so the task is put into the redis cache and handed straight to agentd.
if !isCenter && f.AlertTriggered {
if err := taskMeta.Create(); err != nil {
// When the network is unreachable, generate a unique id so different tasks in the edge data center do not share one.
// A redis auto-increment id keeps different n9e edge instances in the same data center from generating the same id.
// It cannot stop different edge data centers from generating the same id, so data carrying a generated id
// is not reported to or stored in the database; it is only used for closed-loop execution.
taskMeta.Id, err = storage.IdGet()
if err != nil {
return 0, err
}
}
taskHost := imodels.TaskHost{
Id: taskMeta.Id,
Host: hosts[0],
Status: "running",
}
// A failure to create the host row is only logged; the task is still cached below.
if err = taskHost.Create(); err != nil {
logger.Warningf("task_add_fail: authUser=%s title=%s err=%s", authUser, taskMeta.Title, err.Error())
}
// Cache the task meta and the task waiting to be dispatched.
err = taskMeta.Cache(hosts[0])
if err != nil {
return 0, err
}
} else {
// For the center data center, keep the previous logic.
err = taskMeta.Save(hosts, f.Action)
if err != nil {
return 0, err
}
}
logger.Infof("task_add_succ: authUser=%s title=%s", authUser, taskMeta.Title)
return taskMeta.Id, nil
}
// cleanHosts returns the usable host entries of formHosts, in order: each
// entry is trimmed of surrounding whitespace, and entries that are then empty
// or start with "#" are dropped. The result is never nil.
func cleanHosts(formHosts []string) []string {
	cleaned := make([]string, 0, len(formHosts))
	for _, raw := range formHosts {
		host := strings.TrimSpace(raw)
		if host == "" || strings.HasPrefix(host, "#") {
			continue
		}
		cleaned = append(cleaned, host)
	}
	return cleaned
}
================================================
FILE: alert/sender/lark.go
================================================
package sender
import (
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// Compile-time assertion that *LarkSender implements CallBacker.
var (
_ CallBacker = (*LarkSender)(nil)
)
// LarkSender delivers alert events as Lark text messages.
type LarkSender struct {
// tpl renders the events into the message text.
tpl *template.Template
}
// CallBack posts the rendered events as a text message to the callback URL.
// It is a no-op when there are no events or no callback URL.
func (lk *LarkSender) CallBack(ctx CallBackContext) {
	if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
		return
	}

	text := BuildTplMessage(models.Lark, lk.tpl, ctx.Events)
	payload := feishu{
		Msgtype: "text",
		Content: feishuContent{Text: text},
	}

	target := ctx.CallBackURL
	doSendAndRecord(ctx.Ctx, target, target, payload, "callback", ctx.Stats, ctx.Events)
}
// Send posts the rendered events as a text message to the webhook of every
// user that has a Lark token. It is a no-op when there are no users or no
// events.
func (lk *LarkSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	hookURLs, rawTokens := lk.extract(ctx.Users)
	text := BuildTplMessage(models.Lark, lk.tpl, ctx.Events)

	for idx := range hookURLs {
		payload := feishu{
			Msgtype: "text",
			Content: feishuContent{Text: text},
		}
		doSendAndRecord(ctx.Ctx, hookURLs[idx], rawTokens[idx], payload, models.Lark, ctx.Stats, ctx.Events)
	}
}
// extract returns the webhook URL and the raw token of every user that has a
// Lark token, as two parallel slices. A token that is not already an http(s)
// URL is appended to the open.larksuite.com bot hook endpoint.
func (lk *LarkSender) extract(users []*models.User) ([]string, []string) {
	hookURLs := make([]string, 0, len(users))
	rawTokens := make([]string, 0, len(users))

	for i := range users {
		tok, ok := users[i].ExtractToken(models.Lark)
		if !ok {
			continue
		}

		target := tok
		isURL := strings.HasPrefix(tok, "https://") || strings.HasPrefix(tok, "http://")
		if !isURL {
			target = "https://open.larksuite.com/open-apis/bot/v2/hook/" + tok
		}
		hookURLs = append(hookURLs, target)
		rawTokens = append(rawTokens, tok)
	}

	return hookURLs, rawTokens
}
================================================
FILE: alert/sender/larkcard.go
================================================
package sender
import (
"fmt"
"html/template"
"net/url"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// LarkCardSender delivers alert events as Lark interactive cards.
type LarkCardSender struct {
// tpl renders the events into the card's markdown body.
tpl *template.Template
}
// CallBack posts the rendered events as a Lark interactive card to the
// callback URL with its query string removed. Values returned by
// ExtractAtsParams for the original URL become mention tags in front of the
// message: values containing "@" are treated as email addresses, everything
// else as a user id. The header colour is orange when the message mentions
// both "recovered" and "triggered", green when it only mentions "recovered",
// red otherwise. It is a no-op when there are no events or no callback URL,
// and returns silently if the URL cannot be parsed.
func (fs *LarkCardSender) CallBack(ctx CallBackContext) {
	if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
		return
	}

	ats := ExtractAtsParams(ctx.CallBackURL)
	message := BuildTplMessage(models.LarkCard, fs.tpl, ctx.Events)

	if len(ats) > 0 {
		// The format strings must contain a verb for the value; without one
		// the value is rendered as a "%!(EXTRA string=…)" artefact instead of
		// a mention.
		var atTags strings.Builder
		for _, at := range ats {
			if strings.Contains(at, "@") {
				fmt.Fprintf(&atTags, "<at email=\"%s\" ></at>", at)
			} else {
				fmt.Fprintf(&atTags, "<at id=\"%s\" ></at>", at)
			}
		}
		message = atTags.String() + message
	}

	color := "red"
	lowerUnicode := strings.ToLower(message)
	hasRecovered := strings.Contains(lowerUnicode, Recovered)
	switch {
	case hasRecovered && strings.Contains(lowerUnicode, Triggered):
		color = "orange"
	case hasRecovered:
		color = "green"
	}

	SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)
	body := createFeishuCardBody()
	body.Card.Header.Title.Content = SendTitle
	body.Card.Header.Template = color
	body.Card.Elements[0].Text.Content = message
	body.Card.Elements[2].Elements[0].Content = SendTitle

	// This is to be compatible with the Larkcard interface: with query string
	// parameters the request will fail, so strip them from the URL.
	parsedURL, err := url.Parse(ctx.CallBackURL)
	if err != nil {
		return
	}
	parsedURL.RawQuery = ""

	// Send to the stripped URL. Sending to ctx.CallBackURL here would put the
	// query string right back and defeat the stripping above.
	target := parsedURL.String()
	doSendAndRecord(ctx.Ctx, target, target, body, "callback", ctx.Stats, ctx.Events)
}
// Send posts the rendered events as a Lark interactive card to the webhook of
// every user returned by extract. The header colour is orange when the
// message mentions both "recovered" and "triggered", green when it only
// mentions "recovered", red otherwise. It is a no-op when there are no users
// or no events.
func (fs *LarkCardSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	hookURLs, rawTokens := fs.extract(ctx.Users)
	text := BuildTplMessage(models.LarkCard, fs.tpl, ctx.Events)

	lowered := strings.ToLower(text)
	mentionsRecovered := strings.Contains(lowered, Recovered)
	mentionsTriggered := strings.Contains(lowered, Triggered)

	headerColor := "red"
	switch {
	case mentionsRecovered && mentionsTriggered:
		headerColor = "orange"
	case mentionsRecovered:
		headerColor = "green"
	}

	title := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName)

	card := createFeishuCardBody()
	card.Card.Header.Template = headerColor
	card.Card.Header.Title.Content = title
	card.Card.Elements[0].Text.Content = text
	card.Card.Elements[2].Elements[0].Content = title

	for idx := range hookURLs {
		doSendAndRecord(ctx.Ctx, hookURLs[idx], rawTokens[idx], card, models.LarkCard, ctx.Stats, ctx.Events)
	}
}
// extract returns the webhook URL and the raw token of every user that has a
// Lark token, as two parallel slices. A token that is not already an http(s)
// URL is trimmed of surrounding whitespace and appended to the
// open.larksuite.com bot hook endpoint; the raw token is returned untrimmed.
//
// NOTE(review): the lookup key is models.Lark although this sender reports
// under models.LarkCard — confirm that sharing the Lark token is intended.
func (fs *LarkCardSender) extract(users []*models.User) ([]string, []string) {
	hookURLs := make([]string, 0, len(users))
	rawTokens := make([]string, 0)

	for _, u := range users {
		tok, ok := u.ExtractToken(models.Lark)
		if !ok {
			continue
		}

		target := tok
		isURL := strings.HasPrefix(tok, "https://") || strings.HasPrefix(tok, "http://")
		if !isURL {
			target = "https://open.larksuite.com/open-apis/bot/v2/hook/" + strings.TrimSpace(tok)
		}
		hookURLs = append(hookURLs, target)
		rawTokens = append(rawTokens, tok)
	}

	return hookURLs, rawTokens
}
================================================
FILE: alert/sender/mm.go
================================================
package sender
import (
"html/template"
"net/url"
"strings"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// MatterMostMessage is the input of SendMM: the message text, the webhook
// URLs to deliver it to, and the stats handed on to the send helper.
type MatterMostMessage struct {
Text string
// Tokens holds webhook URLs; SendMM reads the "channel", "atuser" and
// "username" query parameters from each of them.
Tokens []string
Stats *astats.Stats
}
// mm is the JSON body posted to a Mattermost webhook.
type mm struct {
Channel string `json:"channel"`
Username string `json:"username"`
Text string `json:"text"`
}
// MmSender delivers alert events as Mattermost messages.
type MmSender struct {
// tpl renders the events into the message text.
tpl *template.Template
}
// Send renders the events and hands them to SendMM for every user that has a
// Mattermost token. It is a no-op when there are no users, no events, or no
// user has a token.
func (ms *MmSender) Send(ctx MessageContext) {
	if len(ctx.Users) == 0 || len(ctx.Events) == 0 {
		return
	}

	hooks := ms.extract(ctx.Users)
	if len(hooks) == 0 {
		return
	}

	out := MatterMostMessage{
		Text:   BuildTplMessage(models.Mm, ms.tpl, ctx.Events),
		Tokens: hooks,
		Stats:  ctx.Stats,
	}
	SendMM(ctx.Ctx, out, ctx.Events, models.Mm)
}
// CallBack renders the events and hands them to SendMM with the callback URL
// as the only destination. It is a no-op when there are no events or no
// callback URL.
func (ms *MmSender) CallBack(ctx CallBackContext) {
	if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 {
		return
	}

	out := MatterMostMessage{
		Text:   BuildTplMessage(models.Mm, ms.tpl, ctx.Events),
		Tokens: []string{ctx.CallBackURL},
		Stats:  ctx.Stats,
	}
	SendMM(ctx.Ctx, out, ctx.Events, "callback")
}
// extract returns the Mattermost token of every user that has one, in input
// order.
func (ms *MmSender) extract(users []*models.User) []string {
	found := make([]string, 0, len(users))
	for i := range users {
		tok, ok := users[i].ExtractToken(models.Mm)
		if !ok {
			continue
		}
		found = append(found, tok)
	}
	return found
}
// SendMM delivers message.Text to every webhook URL in message.Tokens.
//
// Each URL's query string drives the delivery: one message is posted per
// "channel" value (none means nothing is sent for that URL), "atuser" values
// are prepended as a comma-separated "@user" line, and "username" sets the
// sender name. The request itself goes to the URL without its query string.
//
// channel is the notify channel under which results are counted and recorded
// (for example models.Mm or "callback"). A URL that cannot be parsed is
// recorded as a failure and skipped; a malformed query string is logged and
// whatever parameters could be parsed are still used.
func SendMM(ctx *ctx.Context, message MatterMostMessage, events []*models.AlertCurEvent, channel string) {
	for _, token := range message.Tokens {
		u, err := url.Parse(token)
		if err != nil {
			logger.Errorf("mm_sender: failed to parse error=%v", err)
			NotifyRecord(ctx, events, 0, channel, token, "", err)
			continue
		}

		v, err := url.ParseQuery(u.RawQuery)
		if err != nil {
			logger.Errorf("mm_sender: failed to parse query error=%v", err)
		}

		mmChannels := v["channel"] // all values, not just the first
		username := v.Get("username")

		txt := ""
		if atuser := v["atuser"]; len(atuser) != 0 {
			txt = strings.Join(MapStrToStr(atuser, func(u string) string {
				return "@" + u
			}), ",") + "\n"
		}

		// simple concatenating
		ur := u.Scheme + "://" + u.Host + u.Path

		// The loop variable must not be called "channel": that would shadow
		// the notify channel and record the Mattermost channel name in its
		// place.
		for _, mmChannel := range mmChannels {
			body := mm{
				Channel:  mmChannel,
				Username: username,
				Text:     txt + message.Text,
			}
			doSendAndRecord(ctx, ur, token, body, channel, message.Stats, events)
		}
	}
}
// MapStrToStr returns a new slice holding fn applied to every element of arr,
// in order. The result is never nil, even for empty input.
func MapStrToStr(arr []string, fn func(s string) string) []string {
	mapped := make([]string, 0, len(arr))
	for i := range arr {
		mapped = append(mapped, fn(arr[i]))
	}
	return mapped
}
================================================
FILE: alert/sender/notify_record_queue.go
================================================
package sender
import (
"errors"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/container/list"
"github.com/toolkits/pkg/logger"
)
// NotifyRecordQueue is the notification record queue; it is created with a
// maximum length of 1000000.
var NotifyRecordQueue = list.NewSafeListLimited(1000000)
// ReportNotifyRecordQueueSize publishes the current length of
// NotifyRecordQueue to the GaugeNotifyRecordQueueSize gauge once per second.
// It never returns.
func ReportNotifyRecordQueueSize(stats *astats.Stats) {
	for {
		time.Sleep(time.Second)
		size := NotifyRecordQueue.Len()
		stats.GaugeNotifyRecordQueueSize.Set(float64(size))
	}
}
// PushNotifyRecords pushes the records onto NotifyRecordQueue one by one.
// It stops at the first record the queue refuses and returns an error;
// records pushed before that point stay in the queue.
func PushNotifyRecords(records []*models.NotificationRecord) error {
	for idx := range records {
		rec := records[idx]
		if NotifyRecordQueue.PushFront(rec) {
			continue
		}
		logger.Warningf("notify record queue is full, record: %+v", rec)
		return errors.New("notify record queue is full")
	}
	return nil
}
// NotifyRecordConsumer drains NotifyRecordQueue (see LoopConsume) and writes
// the records to the database obtained from ctx.
type NotifyRecordConsumer struct {
	ctx *ctx.Context
}
// NewNotifyRecordConsumer returns a consumer bound to the given context.
func NewNotifyRecordConsumer(ctx *ctx.Context) *NotifyRecordConsumer {
	consumer := new(NotifyRecordConsumer)
	consumer.ctx = ctx
	return consumer
}
// LoopConsume polls NotifyRecordQueue every 100ms, pops up to 100 items per
// round and hands them to consume. It never returns.
func (c *NotifyRecordConsumer) LoopConsume() {
	const (
		pollInterval = 100 * time.Millisecond
		batchSize    = 100
	)

	for {
		// Wait on every round, whether or not the queue was empty.
		time.Sleep(pollInterval)

		inotis := NotifyRecordQueue.PopBackBy(batchSize)
		if len(inotis) == 0 {
			continue
		}

		// Convert to the concrete type, otherwise CreateInBatches reports an error.
		notis := make([]*models.NotificationRecord, 0, len(inotis))
		for _, inoti := range inotis {
			noti, ok := inoti.(*models.NotificationRecord)
			if !ok {
				// The queue is exported, so a foreign item must not be able to
				// panic this loop; drop it and keep consuming.
				logger.Warningf("notify record queue: skip unexpected item of type %T", inoti)
				continue
			}
			notis = append(notis, noti)
		}
		if len(notis) == 0 {
			continue
		}

		c.consume(notis)
	}
}
// consume inserts the records in batches of 100 and logs any failure;
// the error is not returned to the caller.
func (c *NotifyRecordConsumer) consume(notis []*models.NotificationRecord) {
	err := models.DB(c.ctx).CreateInBatches(notis, 100).Error
	if err == nil {
		return
	}
	logger.Errorf("add notis:%v failed, err: %v", notis, err)
}
================================================
FILE: alert/sender/plugin.go
================================================
package sender
import (
"bytes"
"fmt"
"os"
"os/exec"
"time"
"unicode/utf8"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/sys"
)
// MayPluginNotify runs the notify script with noticeBytes as its stdin.
// It does nothing when noticeBytes is empty.
func MayPluginNotify(ctx *ctx.Context, noticeBytes []byte, notifyScript models.NotifyScript,
	stats *astats.Stats, event *models.AlertCurEvent) {
	if len(noticeBytes) > 0 {
		alertingCallScript(ctx, noticeBytes, notifyScript, stats, event)
	}
}
// alertingCallScript executes the configured notify script, feeding it
// stdinBytes on stdin, and records the outcome through NotifyRecord.
//
// When notifyScript.Type is 1, Content is used as the path of the script to
// run. Otherwise Content is the script body: it is written to ".notify_script"
// in the working directory (only when the stored content differs), made
// executable and run from there. The script is given notifyScript.Timeout
// seconds to finish. Combined stdout/stderr, truncated to 512 bytes on a UTF-8
// boundary, is stored in the notify record.
//
// Nothing happens when the script is disabled or has no content.
func alertingCallScript(ctx *ctx.Context, stdinBytes []byte, notifyScript models.NotifyScript,
	stats *astats.Stats, event *models.AlertCurEvent) {
	// not enable or no notify.py? do nothing
	config := notifyScript
	if !config.Enable || config.Content == "" {
		return
	}

	channel := "script"
	stats.AlertNotifyTotal.WithLabelValues(channel).Inc()

	fpath := ".notify_script"
	if config.Type == 1 {
		fpath = config.Content
	} else {
		rewrite := true
		if file.IsExist(fpath) {
			oldContent, err := file.ToString(fpath)
			if err != nil {
				logger.Errorf("event_script_notify_fail: read script file err: %v", err)
				stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
				return
			}

			if oldContent == config.Content {
				rewrite = false
			}
		}

		if rewrite {
			_, err := file.WriteString(fpath, config.Content)
			if err != nil {
				logger.Errorf("event_script_notify_fail: write script file err: %v", err)
				stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
				return
			}

			err = os.Chmod(fpath, 0777)
			if err != nil {
				logger.Errorf("event_script_notify_fail: chmod script file err: %v", err)
				stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
				return
			}
		}

		fpath = "./" + fpath
	}

	cmd := exec.Command(fpath)
	cmd.Stdin = bytes.NewReader(stdinBytes)

	// combine stdout and stderr
	var buf bytes.Buffer
	cmd.Stdout = &buf
	cmd.Stderr = &buf

	start := time.Now()
	err := startCmd(cmd)
	if err != nil {
		// Count and record a start failure like every other failure path, so
		// it shows up in the metrics and in the notify history.
		logger.Errorf("event_script_notify_fail: run cmd err: %v", err)
		stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, channel, cmd.String(), "", err)
		return
	}

	err, isTimeout := sys.WrapTimeout(cmd, time.Duration(config.Timeout)*time.Second)

	res := buf.String()
	res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res)

	// Truncate output that exceeds the length limit.
	if len(res) > 512 {
		// Cut only at a valid UTF-8 character boundary.
		validLen := 0
		for i := 0; i < 512 && i < len(res); {
			_, size := utf8.DecodeRuneInString(res[i:])
			if i+size > 512 {
				break
			}
			i += size
			validLen = i
		}
		res = res[:validLen] + "..."
	}

	NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, channel, cmd.String(), res, buildErr(err, isTimeout))

	if isTimeout {
		if err == nil {
			logger.Errorf("event_script_notify_fail: timeout and killed process %s", fpath)
		} else {
			logger.Errorf("event_script_notify_fail: kill process %s occur error %v", fpath, err)
			stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		}

		return
	}

	if err != nil {
		logger.Errorf("event_script_notify_fail: exec script %s occur error: %v, output: %s", fpath, err, res)
		stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		return
	}

	logger.Infof("event_script_notify_ok: exec %s output: %s", fpath, res)
}
// buildErr folds the script result into a single error for the notify record.
// It returns nil only when err is nil and the run did not time out; otherwise
// the returned error text carries both the timeout flag and err.
func buildErr(err error, isTimeout bool) error {
	if isTimeout || err != nil {
		return fmt.Errorf("is_timeout: %v, err: %v", isTimeout, err)
	}
	return nil
}
================================================
FILE: alert/sender/plugin_cmd_unix.go
================================================
//go:build !windows
// +build !windows
package sender
import (
"os/exec"
"syscall"
)
// startCmd starts c with Setpgid enabled, so the child is placed in its own
// process group.
func startCmd(c *exec.Cmd) error {
	attr := new(syscall.SysProcAttr)
	attr.Setpgid = true
	c.SysProcAttr = attr
	return c.Start()
}
================================================
FILE: alert/sender/plugin_cmd_windows.go
================================================
package sender
import "os/exec"
// startCmd starts c without touching its process attributes.
func startCmd(c *exec.Cmd) error {
	err := c.Start()
	return err
}
================================================
FILE: alert/sender/sender.go
================================================
package sender
import (
"bytes"
"html/template"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
type (
	// Sender is the interface for sending message notifications.
	Sender interface {
		Send(ctx MessageContext)
	}

	// MessageContext is the context of the alert notification generated
	// from an event.
	MessageContext struct {
		Users  []*models.User
		Rule   *models.AlertRule
		Events []*models.AlertCurEvent
		Stats  *astats.Stats
		Ctx    *ctx.Context
	}
)
// NewSender returns the Sender for the notify channel named by key, wired to
// the matching template from tpls. It returns nil for an unknown key.
//
// smtp is only consulted for models.Email, where its first element is used.
// When none is supplied the email sender gets a zero-value SMTP config
// instead of panicking on an out-of-range index.
func NewSender(key string, tpls map[string]*template.Template, smtp ...aconf.SMTPConfig) Sender {
	switch key {
	case models.Dingtalk:
		return &DingtalkSender{tpl: tpls[models.Dingtalk]}
	case models.Wecom:
		return &WecomSender{tpl: tpls[models.Wecom]}
	case models.Feishu:
		return &FeishuSender{tpl: tpls[models.Feishu]}
	case models.FeishuCard:
		return &FeishuCardSender{tpl: tpls[models.FeishuCard]}
	case models.Email:
		var smtpConf aconf.SMTPConfig
		if len(smtp) > 0 {
			smtpConf = smtp[0]
		}
		return &EmailSender{subjectTpl: tpls[models.EmailSubject], contentTpl: tpls[models.Email], smtp: smtpConf}
	case models.Mm:
		return &MmSender{tpl: tpls[models.Mm]}
	case models.Telegram:
		return &TelegramSender{tpl: tpls[models.Telegram]}
	case models.Lark:
		return &LarkSender{tpl: tpls[models.Lark]}
	case models.LarkCard:
		return &LarkCardSender{tpl: tpls[models.LarkCard]}
	}
	return nil
}
// BuildMessageContext assembles a MessageContext from its parts; the users
// are looked up in userCache by the given ids.
func BuildMessageContext(ctx *ctx.Context, rule *models.AlertRule, events []*models.AlertCurEvent,
	uids []int64, userCache *memsto.UserCacheType, stats *astats.Stats) MessageContext {
	var mc MessageContext
	mc.Users = userCache.GetByUserIds(uids)
	mc.Rule = rule
	mc.Events = events
	mc.Stats = stats
	mc.Ctx = ctx
	return mc
}
// BuildTplMessageFunc renders the events with tpl for the given channel and
// returns the resulting message text.
type BuildTplMessageFunc func(channel string, tpl *template.Template, events []*models.AlertCurEvent) string

// BuildTplMessage is the renderer used by the senders. It is a package
// variable initialised to buildTplMessage, so it can be reassigned.
var BuildTplMessage BuildTplMessageFunc = buildTplMessage
// buildTplMessage executes tpl once per event and concatenates the outputs,
// each followed by a blank line.
//
// A nil tpl yields a fixed hint message. If any execution fails, the error
// text is returned instead of the rendered content. channel is unused here.
func buildTplMessage(channel string, tpl *template.Template, events []*models.AlertCurEvent) string {
	if tpl == nil {
		return "tpl for current sender not found, please check configuration"
	}

	// Render straight into one buffer instead of re-copying the accumulated
	// string for every event.
	var content bytes.Buffer
	for _, event := range events {
		if err := tpl.Execute(&content, event); err != nil {
			return err.Error()
		}
		content.WriteString("\n\n")
	}

	return content.String()
}
================================================
FILE: alert/sender/telegram.go
================================================
package sender
import (
"errors"
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// TelegramMessage is the input of SendTelegram: the text to send, the tokens
// to send it to and the stats handle passed on to doSendAndRecord.
type TelegramMessage struct {
	Text   string
	Tokens []string
	Stats  *astats.Stats
}

// telegram is the request body posted by SendTelegram.
type telegram struct {
	ParseMode string `json:"parse_mode"`
	Text      string `json:"text"`
}

var (
	// Compile-time check that *TelegramSender implements CallBacker.
	_ CallBacker = (*TelegramSender)(nil)
)

// TelegramSender sends notifications rendered with tpl through SendTelegram.
type TelegramSender struct {
	tpl *template.Template
}
// CallBack renders the events and sends them to the callback URL, recorded
// under the "callback" channel. It does nothing without events or URL.
func (ts *TelegramSender) CallBack(ctx CallBackContext) {
	if len(ctx.CallBackURL) == 0 || len(ctx.Events) == 0 {
		return
	}

	msg := TelegramMessage{
		Text:   BuildTplMessage(models.Telegram, ts.tpl, ctx.Events),
		Tokens: []string{ctx.CallBackURL},
		Stats:  ctx.Stats,
	}
	SendTelegram(ctx.Ctx, msg, ctx.Events, "callback")
}
// Send renders the events and sends them to the Telegram token of every user
// in the context. It does nothing without users or events.
func (ts *TelegramSender) Send(ctx MessageContext) {
	if len(ctx.Events) == 0 || len(ctx.Users) == 0 {
		return
	}

	msg := TelegramMessage{Stats: ctx.Stats}
	msg.Tokens = ts.extract(ctx.Users)
	msg.Text = BuildTplMessage(models.Telegram, ts.tpl, ctx.Events)
	SendTelegram(ctx.Ctx, msg, ctx.Events, models.Telegram)
}
// extract returns the Telegram tokens of the given users, skipping every
// user for which ExtractToken reports that no token is present.
func (ts *TelegramSender) extract(users []*models.User) []string {
	result := make([]string, 0, len(users))
	for idx := range users {
		token, ok := users[idx].ExtractToken(models.Telegram)
		if !ok {
			continue
		}
		result = append(result, token)
	}
	return result
}
// SendTelegram posts message.Text, with parse mode "markdown", once per token.
//
// A token is either a full http(s) URL, used as is, or has the form
// "<botToken>/<chatId>", which is expanded to the api.telegram.org
// sendMessage endpoint. Any other token is logged, recorded as failed through
// NotifyRecord and skipped.
func SendTelegram(ctx *ctx.Context, message TelegramMessage, events []*models.AlertCurEvent, channel string) {
	for _, token := range message.Tokens {
		var url string
		if strings.HasPrefix(token, "https://") || strings.HasPrefix(token, "http://") {
			url = token
		} else {
			// Exactly one "/" is required: it separates bot token and chat id.
			array := strings.Split(token, "/")
			if len(array) != 2 {
				logger.Errorf("telegram_sender: result=fail invalid token=%s", token)
				NotifyRecord(ctx, events, 0, channel, token, "", errors.New("invalid token"))
				continue
			}
			botToken := array[0]
			chatId := array[1]
			url = "https://api.telegram.org/bot" + botToken + "/sendMessage?chat_id=" + chatId
		}

		body := telegram{
			ParseMode: "markdown",
			Text:      message.Text,
		}

		doSendAndRecord(ctx, url, token, body, channel, message.Stats, events)
	}
}
================================================
FILE: alert/sender/webhook.go
================================================
package sender
import (
"bytes"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"sync"
"time"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
// webhookClientCache caches http.Client values so that a new Client is not
// created for every request, which would leak connections.
var webhookClientCache sync.Map // key: clientKey (string), value: *http.Client
// getWebhookClient returns the cached http.Client for the webhook, creating
// and caching one on first use. The cache key is webhook.Hash(), so webhooks
// with the same hash share a Client.
//
// A new client uses webhook.Timeout seconds as its timeout (10 when the value
// is not positive), honours webhook.SkipVerify for TLS and takes the proxy
// from the environment when poster.UseProxy reports true for the URL.
func getWebhookClient(webhook *models.Webhook) *http.Client {
	key := webhook.Hash()
	if cached, found := webhookClientCache.Load(key); found {
		return cached.(*http.Client)
	}

	seconds := webhook.Timeout
	if seconds <= 0 {
		seconds = 10
	}

	tr := &http.Transport{
		TLSClientConfig:     &tls.Config{InsecureSkipVerify: webhook.SkipVerify},
		MaxIdleConns:        100,
		MaxIdleConnsPerHost: 10,
		IdleConnTimeout:     90 * time.Second,
	}
	if poster.UseProxy(webhook.Url) {
		tr.Proxy = http.ProxyFromEnvironment
	}

	fresh := &http.Client{
		Transport: tr,
		Timeout:   time.Duration(seconds) * time.Second,
	}

	// LoadOrStore yields the already stored client when another goroutine
	// won the race, and the fresh one otherwise.
	stored, _ := webhookClientCache.LoadOrStore(key, fresh)
	return stored.(*http.Client)
}
// sendWebhook POSTs event as JSON to the webhook URL.
//
// It returns whether the caller should retry, a "status_code/response"
// summary of the reply (empty when no reply was received) and an error.
// Nothing is sent, and (false, "", nil) is returned, when the webhook is
// disabled or has no URL. Only a transport failure, a request that cannot be
// built, or an HTTP 429 reply ask for a retry; any other status code is
// treated as delivered.
//
// conf.Headers is a flat key/value list. A "Host" key, in any letter case,
// sets the request's Host instead of a header, because net/http ignores a
// Host entry in the header map of outgoing requests.
func sendWebhook(webhook *models.Webhook, event interface{}, stats *astats.Stats) (bool, string, error) {
	channel := "webhook"
	if webhook.Type == models.RuleCallback {
		channel = "callback"
	}

	conf := webhook
	if conf.Url == "" || !conf.Enable {
		return false, "", nil
	}

	bs, err := json.Marshal(event)
	if err != nil {
		logger.Errorf("%s alertingWebhook failed to marshal event err:%v", channel, err)
		return false, "", err
	}

	bf := bytes.NewBuffer(bs)

	req, err := http.NewRequest("POST", conf.Url, bf)
	if err != nil {
		logger.Warningf("%s alertingWebhook failed to new request event:%s err:%v", channel, string(bs), err)
		return true, "", err
	}

	req.Header.Set("Content-Type", "application/json")
	if conf.BasicAuthUser != "" && conf.BasicAuthPass != "" {
		req.SetBasicAuth(conf.BasicAuthUser, conf.BasicAuthPass)
	}

	// Headers are only applied when they form complete key/value pairs.
	if len(conf.Headers) > 0 && len(conf.Headers)%2 == 0 {
		for i := 0; i < len(conf.Headers); i += 2 {
			if http.CanonicalHeaderKey(conf.Headers[i]) == "Host" {
				req.Host = conf.Headers[i+1]
				continue
			}
			req.Header.Set(conf.Headers[i], conf.Headers[i+1])
		}
	}

	// Use the shared client cache so that a new Client is not created per
	// request, which would leak connections.
	client := getWebhookClient(conf)

	stats.AlertNotifyTotal.WithLabelValues(channel).Inc()
	var resp *http.Response
	var body []byte
	resp, err = client.Do(req)

	if err != nil {
		stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		logger.Errorf("event_%s_fail, event:%s, url: [%s], error: [%s]", channel, string(bs), conf.Url, err)
		return true, "", err
	}

	if resp.Body != nil {
		defer resp.Body.Close()
		// A read failure only shortens the logged body; the status code still decides the outcome.
		body, _ = io.ReadAll(resp.Body)
	}

	if resp.StatusCode == 429 {
		// Rate limited: count it as a failure and ask the caller to retry.
		stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		logger.Errorf("event_%s_fail, url: %s, response code: %d, body: %s event:%s", channel, conf.Url, resp.StatusCode, string(body), string(bs))
		return true, fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(body)), fmt.Errorf("status code is 429")
	}

	logger.Debugf("event_%s_succ, url: %s, response code: %d, body: %s event:%s", channel, conf.Url, resp.StatusCode, string(body), string(bs))
	return false, fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(body)), nil
}
// SingleSendWebhooks sends the event to every webhook, one after another, and
// writes a notify record for each attempt.
//
// A webhook gets up to three attempts. Between attempts it waits one minute
// after the first failure and two minutes after the second; it does not wait
// after the last attempt, since nothing follows it.
func SingleSendWebhooks(ctx *ctx.Context, webhooks map[string]*models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
	const maxAttempts = 3

	for _, conf := range webhooks {
		for attempt := 1; attempt <= maxAttempts; attempt++ {
			start := time.Now()
			needRetry, res, err := sendWebhook(conf, event, stats)
			res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res)
			NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, "webhook", conf.Url, res, err)
			if !needRetry || attempt == maxAttempts {
				break
			}
			time.Sleep(time.Minute * time.Duration(attempt))
		}
	}
}
// BatchSendWebhooks hands the event to PushEvent for every webhook, logging
// each hand-off.
func BatchSendWebhooks(ctx *ctx.Context, webhooks map[string]*models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
	for key := range webhooks {
		target := webhooks[key]
		logger.Infof("push event:%s to queue:%v", event.Hash, target)
		PushEvent(ctx, target, event, stats)
	}
}
// EventQueue holds one WebhookQueue per webhook; PushEvent keys it by
// webhook.Url. Access is guarded by EventQueueLock.
var EventQueue = make(map[string]*WebhookQueue)

// CallbackEventQueue and CallbackEventQueueLock are not used by the code in
// this file. NOTE(review): presumably the counterpart of EventQueue for rule
// callbacks; confirm against callers.
var CallbackEventQueue = make(map[string]*WebhookQueue)
var CallbackEventQueueLock sync.RWMutex

// EventQueueLock guards EventQueue.
var EventQueueLock sync.RWMutex

// QueueMaxSize is the size limit given to each webhook's SafeEventQueue.
const QueueMaxSize = 100000

// WebhookQueue pairs the pending events of one webhook with the channel that
// stops its consumer (see StartConsumer).
type WebhookQueue struct {
	eventQueue *SafeEventQueue
	closeCh    chan struct{}
}
// PushEvent appends the event to the queue of the webhook, keyed by
// webhook.Url.
//
// The first push for a URL creates the queue and starts its consumer in a
// separate goroutine. StartConsumer loops until the queue is closed, so it
// must not run on the caller's goroutine. A push rejected by the queue is
// counted under the "push_event_queue" label and logged.
func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) {
	EventQueueLock.RLock()
	queue := EventQueue[webhook.Url]
	EventQueueLock.RUnlock()

	if queue == nil {
		EventQueueLock.Lock()
		// Look again under the write lock: another goroutine may have
		// registered the queue between the two lock sections.
		queue = EventQueue[webhook.Url]
		created := queue == nil
		if created {
			queue = &WebhookQueue{
				eventQueue: NewSafeEventQueue(QueueMaxSize),
				closeCh:    make(chan struct{}),
			}
			EventQueue[webhook.Url] = queue
		}
		EventQueueLock.Unlock()

		if created {
			go StartConsumer(ctx, queue, webhook.Batch, webhook, stats)
		}
	}

	succ := queue.eventQueue.Push(event)
	if !succ {
		stats.AlertNotifyErrorTotal.WithLabelValues("push_event_queue").Inc()
		logger.Warningf("Write channel(%s) full, current channel size: %d event:%s", webhook.Url, queue.eventQueue.Len(), event.Hash)
	}
}
// StartConsumer drains the queue until queue.closeCh is closed.
//
// Each round pops up to popSize events and sends them as one batch through
// sendWebhook, recording every attempt. When the queue is empty it sleeps
// 400ms. A batch is attempted webhook.RetryCount times; between attempts it
// waits webhook.RetryInterval seconds multiplied by the attempt number.
//
// Non-positive popSize or RetryCount values are treated as 1, so popped
// events are always sent at least once and the queue is always drained.
func StartConsumer(ctx *ctx.Context, queue *WebhookQueue, popSize int, webhook *models.Webhook, stats *astats.Stats) {
	if popSize < 1 {
		popSize = 1
	}

	for {
		select {
		case <-queue.closeCh:
			logger.Infof("event queue:%v closed", queue)
			return
		default:
			events := queue.eventQueue.PopN(popSize)
			if len(events) == 0 {
				time.Sleep(time.Millisecond * 400)
				continue
			}

			maxAttempts := webhook.RetryCount
			if maxAttempts < 1 {
				maxAttempts = 1
			}

			for attempt := 1; attempt <= maxAttempts; attempt++ {
				start := time.Now()
				needRetry, res, err := sendWebhook(webhook, events, stats)
				res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res)
				go NotifyRecord(ctx, events, 0, "webhook", webhook.Url, res, err)
				// Do not wait after the final attempt: nothing follows it.
				if !needRetry || attempt == maxAttempts {
					break
				}
				time.Sleep(time.Second * time.Duration(webhook.RetryInterval) * time.Duration(attempt))
			}
		}
	}
}
================================================
FILE: alert/sender/webhook_event_queue.go
================================================
package sender
import (
"container/list"
"sync"
"github.com/ccfos/nightingale/v6/models"
)
// SafeEventQueue is a mutex-guarded event queue with one FIFO list per
// severity. Pop returns high severity events first, then middle, then low.
type SafeEventQueue struct {
	lock        sync.RWMutex
	maxSize     int
	queueHigh   *list.List
	queueMiddle *list.List
	queueLow    *list.List
}

// Severity values accepted by Push; each selects the list of the same name.
const (
	High   = 1
	Middle = 2
	Low    = 3
)
// NewSafeEventQueue returns an empty queue with the given size limit.
func NewSafeEventQueue(maxSize int) *SafeEventQueue {
	q := new(SafeEventQueue) // the zero-value lock is ready to use
	q.maxSize = maxSize
	q.queueHigh = list.New()
	q.queueMiddle = list.New()
	q.queueLow = list.New()
	return q
}
// Len returns the total number of queued events across all severities,
// read under the read lock.
func (spq *SafeEventQueue) Len() int {
	spq.lock.RLock()
	total := spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len()
	spq.lock.RUnlock()
	return total
}
// len returns the total length without taking the lock; do not call it from
// outside this file.
func (spq *SafeEventQueue) len() int {
	total := 0
	for _, q := range [...]*list.List{spq.queueHigh, spq.queueMiddle, spq.queueLow} {
		total += q.Len()
	}
	return total
}
// Push appends the event to the list matching its severity and reports
// whether it was accepted.
//
// It returns false when the event is nil, when the queue already holds
// maxSize events, or when the severity is not High, Middle or Low.
func (spq *SafeEventQueue) Push(event *models.AlertCurEvent) bool {
	if event == nil {
		return false
	}

	spq.lock.Lock()
	defer spq.lock.Unlock()

	// ">=" keeps the queue at maxSize at most; ">" would admit one extra event.
	if spq.len() >= spq.maxSize {
		return false
	}

	switch event.Severity {
	case High:
		spq.queueHigh.PushBack(event)
	case Middle:
		spq.queueMiddle.PushBack(event)
	case Low:
		spq.queueLow.PushBack(event)
	default:
		return false
	}

	return true
}
// pop removes and returns the oldest event of the highest non-empty severity
// without taking the lock; do not call it from outside this file. It returns
// nil when the queue is empty or the removed element is not an event.
func (spq *SafeEventQueue) pop() *models.AlertCurEvent {
	var source *list.List
	switch {
	case spq.queueHigh.Len() > 0:
		source = spq.queueHigh
	case spq.queueMiddle.Len() > 0:
		source = spq.queueMiddle
	case spq.queueLow.Len() > 0:
		source = spq.queueLow
	default:
		return nil
	}

	// A failed assertion leaves event nil, which is what gets returned.
	event, _ := source.Remove(source.Front()).(*models.AlertCurEvent)
	return event
}
// Pop removes and returns the next event by priority, or nil when the queue
// is empty. It holds the write lock for the duration of the call.
func (spq *SafeEventQueue) Pop() *models.AlertCurEvent {
	spq.lock.Lock()
	event := spq.pop()
	spq.lock.Unlock()
	return event
}
// PopN removes and returns up to n events in priority order under a single
// lock acquisition. The result is empty, never nil, when n is not positive or
// the queue is empty.
func (spq *SafeEventQueue) PopN(n int) []*models.AlertCurEvent {
	spq.lock.Lock()
	defer spq.lock.Unlock()

	// Bound the batch by what is actually queued: this avoids allocating for
	// a huge n and a make() panic for a negative one.
	limit := n
	if available := spq.len(); limit > available {
		limit = available
	}
	if limit <= 0 {
		return []*models.AlertCurEvent{}
	}

	events := make([]*models.AlertCurEvent, 0, limit)
	for i := 0; i < limit; i++ {
		if event := spq.pop(); event != nil {
			events = append(events, event)
		}
	}
	return events
}
================================================
FILE: alert/sender/webhook_event_queue_test.go
================================================
package sender
import (
"sync"
"testing"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/stretchr/testify/assert"
)
// TestSafePriorityQueue_ConcurrentPushPop pushes events from many goroutines,
// checks the resulting length, then drains the queue from many goroutines and
// checks that it ends up empty.
func TestSafePriorityQueue_ConcurrentPushPop(t *testing.T) {
	spq := NewSafeEventQueue(100000)

	var wg sync.WaitGroup
	numGoroutines := 100
	numEvents := 1000

	// Concurrent Push
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func(goroutineID int) {
			defer wg.Done()
			for j := 0; j < numEvents; j++ {
				event := &models.AlertCurEvent{
					Severity:    goroutineID%3 + 1,
					TriggerTime: time.Now().UnixNano(),
				}
				spq.Push(event)
			}
		}(i)
	}
	wg.Wait()

	// Check that the queue length is correct
	expectedLen := numGoroutines * numEvents
	assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes")

	// Concurrent Pop
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			defer wg.Done()
			for {
				event := spq.Pop()
				if event == nil {
					return
				}
			}
		}()
	}
	wg.Wait()

	// The queue should be empty in the end
	assert.Equal(t, 0, spq.Len(), "Queue should be empty after concurrent pops")
}
// TestSafePriorityQueue_ConcurrentPopMax fills the queue with 1000 events,
// calls PopN from several goroutines and checks both the per-call bound and
// the remaining length.
func TestSafePriorityQueue_ConcurrentPopMax(t *testing.T) {
	spq := NewSafeEventQueue(100000)

	// Add the initial data
	for i := 0; i < 1000; i++ {
		spq.Push(&models.AlertCurEvent{
			Severity:    i%3 + 1,
			TriggerTime: time.Now().UnixNano(),
		})
	}

	var wg sync.WaitGroup
	numGoroutines := 10
	popMax := 100

	// Concurrent PopN
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			defer wg.Done()
			events := spq.PopN(popMax)
			assert.LessOrEqual(t, len(events), popMax, "PopN exceeded maximum")
		}()
	}
	wg.Wait()

	// Check that the queue length is correct
	expectedRemaining := 1000 - (numGoroutines * popMax)
	if expectedRemaining < 0 {
		expectedRemaining = 0
	}
	assert.Equal(t, expectedRemaining, spq.Len(), "Queue length mismatch after concurrent PopN")
}
// TestSafePriorityQueue_ConcurrentPushPopWithDifferentSeverities pushes events
// of all three severities concurrently and then checks that sequential pops
// return them in non-decreasing severity order.
func TestSafePriorityQueue_ConcurrentPushPopWithDifferentSeverities(t *testing.T) {
	spq := NewSafeEventQueue(100000)

	var wg sync.WaitGroup
	numGoroutines := 50
	numEvents := 500

	// Concurrently push events of different priorities
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func(goroutineID int) {
			defer wg.Done()
			for j := 0; j < numEvents; j++ {
				event := &models.AlertCurEvent{
					Severity:    goroutineID%3 + 1, // simulate different severities
					TriggerTime: time.Now().UnixNano(),
				}
				spq.Push(event)
			}
		}(i)
	}
	wg.Wait()

	// Check that the queue length is correct
	expectedLen := numGoroutines * numEvents
	assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes")

	// Check that the events come out ordered by priority
	var lastEvent *models.AlertCurEvent
	for spq.Len() > 0 {
		event := spq.Pop()
		if lastEvent != nil {
			assert.LessOrEqual(t, lastEvent.Severity, event.Severity, "Events are not in correct priority order")
		}
		lastEvent = event
	}
}
// TestSafePriorityQueue_ExceedMaxSize pushes more events than a queue of size
// 5 can hold and checks that the length stays within maxSize. Severities are
// i % 3, so they range over 0, 1 and 2.
func TestSafePriorityQueue_ExceedMaxSize(t *testing.T) {
	spq := NewSafeEventQueue(5)

	// Push more events than the maximum capacity
	for i := 0; i < 10; i++ {
		spq.Push(&models.AlertCurEvent{
			Severity:    i % 3,
			TriggerTime: int64(i),
		})
	}

	// Verify that the queue length does not exceed maxSize
	assert.LessOrEqual(t, spq.Len(), spq.maxSize)

	// Verify the content of the events left in the queue
	expectedEvents := 5
	if spq.Len() < 5 {
		expectedEvents = spq.Len()
	}

	// Check that the remaining events have a severity of at most 2
	for i := 0; i < expectedEvents; i++ {
		event := spq.Pop()
		if event != nil {
			assert.LessOrEqual(t, event.Severity, 2)
		}
	}
}
================================================
FILE: alert/sender/webhook_queue.go
================================================
package sender
import (
"container/list"
"sync"
"github.com/ccfos/nightingale/v6/models"
)
// SafeList is a container/list.List guarded by an embedded RWMutex. Its
// methods take the lock themselves; direct access to L does not.
type SafeList struct {
	sync.RWMutex
	L *list.List
}
// NewSafeList returns an empty SafeList.
func NewSafeList() *SafeList {
	sl := new(SafeList)
	sl.L = list.New()
	return sl
}
// PushFront inserts v at the front of the list under the write lock and
// returns the new element.
func (sl *SafeList) PushFront(v interface{}) *list.Element {
	sl.Lock()
	defer sl.Unlock()
	return sl.L.PushFront(v)
}
// PushFrontBatch inserts every item of vs at the front of the list, in slice
// order, under a single write lock. The last item ends up frontmost.
func (sl *SafeList) PushFrontBatch(vs []interface{}) {
	sl.Lock()
	defer sl.Unlock()
	for idx := range vs {
		sl.L.PushFront(vs[idx])
	}
}
// PopBack removes up to max elements from the back of the list and returns
// those that are *models.AlertCurEvent, oldest first. Removed elements of any
// other type are dropped. The result is empty, never nil, when the list is
// empty or max is not positive.
func (sl *SafeList) PopBack(max int) []*models.AlertCurEvent {
	sl.Lock()
	defer sl.Unlock()

	count := sl.L.Len()
	if count > max {
		count = max
	}
	// Covers the empty list as well as max <= 0; a negative count would make
	// the allocation below panic.
	if count <= 0 {
		return []*models.AlertCurEvent{}
	}

	items := make([]*models.AlertCurEvent, 0, count)
	for i := 0; i < count; i++ {
		item := sl.L.Remove(sl.L.Back())
		if sample, ok := item.(*models.AlertCurEvent); ok {
			items = append(items, sample)
		}
	}
	return items
}
// RemoveAll empties the list under the write lock.
func (sl *SafeList) RemoveAll() {
	sl.Lock()
	defer sl.Unlock()
	sl.L.Init()
}
// Len returns the number of elements, read under the read lock.
func (sl *SafeList) Len() int {
	sl.RLock()
	defer sl.RUnlock()
	return sl.L.Len()
}
// SafeListLimited is a SafeList with a size limit: the push methods refuse
// new items once the list holds maxSize elements.
type SafeListLimited struct {
	maxSize int
	SL      *SafeList
}
// NewSafeListLimited returns an empty list limited to maxSize elements.
func NewSafeListLimited(maxSize int) *SafeListLimited {
	limited := new(SafeListLimited)
	limited.maxSize = maxSize
	limited.SL = NewSafeList()
	return limited
}
// PopBack removes up to max events from the back of the underlying list.
func (sll *SafeListLimited) PopBack(max int) []*models.AlertCurEvent {
	popped := sll.SL.PopBack(max)
	return popped
}
// PushFront inserts v at the front of the list and reports whether it was
// accepted; it returns false when the list already holds maxSize elements.
//
// The size check and the insert happen under one write lock, so concurrent
// pushes cannot grow the list past maxSize.
func (sll *SafeListLimited) PushFront(v interface{}) bool {
	sll.SL.Lock()
	defer sll.SL.Unlock()

	if sll.SL.L.Len() >= sll.maxSize {
		return false
	}
	sll.SL.L.PushFront(v)
	return true
}
// PushFrontBatch inserts every item of vs at the front of the list, in slice
// order, and reports whether the batch was accepted.
//
// The batch is refused only when the list already holds maxSize elements; an
// accepted batch is inserted whole and may therefore take the list past
// maxSize. The check and the inserts happen under one write lock, so no
// other push can slip in between them.
func (sll *SafeListLimited) PushFrontBatch(vs []interface{}) bool {
	sll.SL.Lock()
	defer sll.SL.Unlock()

	if sll.SL.L.Len() >= sll.maxSize {
		return false
	}
	for _, item := range vs {
		sll.SL.L.PushFront(item)
	}
	return true
}
// RemoveAll empties the underlying list.
func (sll *SafeListLimited) RemoveAll() {
	inner := sll.SL
	inner.RemoveAll()
}
// Len returns the number of elements in the underlying list.
func (sll *SafeListLimited) Len() int {
	size := sll.SL.Len()
	return size
}
================================================
FILE: alert/sender/wecom.go
================================================
package sender
import (
"html/template"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// wecomMarkdown is the "markdown" part of the wecom request body.
type wecomMarkdown struct {
	Content string `json:"content"`
}

// wecom is the request body posted by WecomSender; the senders in this file
// always set Msgtype to "markdown".
type wecom struct {
	Msgtype  string        `json:"msgtype"`
	Markdown wecomMarkdown `json:"markdown"`
}

var (
	// Compile-time check that *WecomSender implements CallBacker.
	_ CallBacker = (*WecomSender)(nil)
)

// WecomSender sends notifications rendered with tpl as wecom markdown messages.
type WecomSender struct {
	tpl *template.Template
}
// CallBack renders the events as a markdown message and posts it to the
// callback URL, recorded under the "callback" channel. It does nothing
// without events or URL.
func (ws *WecomSender) CallBack(ctx CallBackContext) {
	if len(ctx.CallBackURL) == 0 || len(ctx.Events) == 0 {
		return
	}

	var body wecom
	body.Msgtype = "markdown"
	body.Markdown.Content = BuildTplMessage(models.Wecom, ws.tpl, ctx.Events)

	doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, "callback", ctx.Stats, ctx.Events)
}
// Send renders the events as a markdown message and posts it to the wecom
// URL of every user in the context. It does nothing without users or events.
func (ws *WecomSender) Send(ctx MessageContext) {
	if len(ctx.Events) == 0 || len(ctx.Users) == 0 {
		return
	}

	urls, tokens := ws.extract(ctx.Users)
	content := BuildTplMessage(models.Wecom, ws.tpl, ctx.Events)

	// The body is the same for every recipient, so build it once.
	body := wecom{
		Msgtype:  "markdown",
		Markdown: wecomMarkdown{Content: content},
	}
	for idx := range urls {
		doSendAndRecord(ctx.Ctx, urls[idx], tokens[idx], body, models.Wecom, ctx.Stats, ctx.Events)
	}
}
// extract returns, for every user with a wecom token, the URL to post to and
// the raw token, as two slices of equal length. A token that is already an
// http(s) URL is used as is; any other token is appended as the key of the
// qyapi.weixin.qq.com webhook URL.
func (ws *WecomSender) extract(users []*models.User) ([]string, []string) {
	const keyURLPrefix = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key="

	urls := make([]string, 0, len(users))
	tokens := make([]string, 0, len(users))
	for idx := range users {
		token, ok := users[idx].ExtractToken(models.Wecom)
		if !ok {
			continue
		}

		target := keyURLPrefix + token
		if strings.HasPrefix(token, "https://") || strings.HasPrefix(token, "http://") {
			target = token
		}
		urls = append(urls, target)
		tokens = append(tokens, token)
	}
	return urls, tokens
}
================================================
FILE: center/cconf/conf.go
================================================
package cconf
import (
"time"
"github.com/ccfos/nightingale/v6/pkg/httpx"
)
// Center groups the configuration fields of this package. PreCheck fills in
// Plugins when it is left empty.
type Center struct {
	Plugins                   []Plugin
	MetricsYamlFile           string
	OpsYamlFile               string
	BuiltinIntegrationsDir    string
	I18NHeaderKey             string
	MetricDesc                MetricDescType
	AnonymousAccess           AnonymousAccess
	UseFileAssets             bool
	FlashDuty                 FlashDuty
	EventHistoryGroupView     bool
	CleanNotifyRecordDay      int
	CleanPipelineExecutionDay int
	MigrateBusiGroupLabel     bool
	RSA                       httpx.RSAConfig
}

// Plugin describes one plugin entry; the struct tags give its JSON field names.
type Plugin struct {
	Id       int64  `json:"id"`
	Category string `json:"category"`
	Type     string `json:"plugin_type"`
	TypeName string `json:"plugin_type_name"`
}

// FlashDuty holds an API address, a set of headers and a timeout.
// NOTE(review): presumably the settings for calling the FlashDuty service;
// confirm against the code that reads them.
type FlashDuty struct {
	Api     string
	Headers map[string]string
	Timeout time.Duration
}

// AnonymousAccess holds two boolean switches, PromQuerier and AlertDetail.
// NOTE(review): presumably each allows unauthenticated access to the named
// feature; confirm against the router code.
type AnonymousAccess struct {
	PromQuerier bool
	AlertDetail bool
}
// PreCheck falls back to the package-level Plugins when no plugins are
// configured.
func (c *Center) PreCheck() {
	if len(c.Plugins) > 0 {
		return
	}
	c.Plugins = Plugins
}
================================================
FILE: center/cconf/event_example.go
================================================
package cconf
// EVENT_EXAMPLE is a sample alert event encoded as a JSON object.
const EVENT_EXAMPLE = `
{
"id": 1000000,
"cate": "prometheus",
"datasource_id": 1,
"group_id": 1,
"group_name": "Default Busi Group",
"hash": "2cb966f9ba1cdc7af94c3796e855955a",
"rule_id": 23,
"rule_name": "测试告警",
"rule_note": "测试告警",
"rule_prod": "metric",
"rule_config": {
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 3,
"percent": 10,
"severity": 3,
"type": "pct_target_miss"
}
]
},
"prom_for_duration": 60,
"prom_eval_interval": 30,
"callbacks": ["https://n9e.github.io"],
"notify_recovered": 1,
"notify_channels": ["dingtalk"],
"notify_groups": [],
"notify_groups_obj": null,
"target_ident": "host01",
"target_note": "机器备注",
"trigger_time": 1677229517,
"trigger_value": "2273533952",
"tags": [
"__name__=disk_free",
"dc=qcloud-dev",
"device=vda1",
"fstype=ext4",
"ident=tt-fc-dev00.nj"
],
"is_recovered": false,
"notify_users_obj": null,
"last_eval_time": 1677229517,
"last_sent_time": 1677229517,
"notify_cur_number": 1,
"first_trigger_time": 1677229517,
"annotations": {
"summary": "测试告警"
}
}
`
================================================
FILE: center/cconf/metric.go
================================================
package cconf
import (
"path"
"github.com/toolkits/pkg/file"
)
// MetricDescType stores metric descriptions: a language-independent table
// (inlined at the top level of the YAML document) plus one table each for
// "zh" and "en". The maps are loaded before they are read, so there is no
// need for a concurrent map here.
type MetricDescType struct {
	CommonDesc map[string]string `yaml:",inline" json:"common"`
	Zh         map[string]string `yaml:"zh" json:"zh"`
	En         map[string]string `yaml:"en" json:"en"`
}

// MetricDesc is the package-level description store; LoadMetricsYaml fills it
// and GetMetricDesc reads it.
var MetricDesc MetricDescType
// GetMetricDesc returns the description of metric. It looks in the English
// table when lang is "en" and in the Chinese table for any other lang, then
// falls back to the common table. An unregistered metric yields "".
func GetMetricDesc(lang, metric string) string {
	localized := MetricDesc.Zh
	if lang == "en" {
		localized = MetricDesc.En
	}

	// Looking up a nil map is safe and simply reports a miss.
	for _, table := range [...]map[string]string{localized, MetricDesc.CommonDesc} {
		if desc, ok := table[metric]; ok {
			return desc
		}
	}
	return ""
}
// LoadMetricsYaml reads the metric descriptions into MetricDesc. It reads
// metricsYamlFile, or "metrics.yaml" under configDir when that is empty.
// A missing file is not an error.
func LoadMetricsYaml(configDir, metricsYamlFile string) error {
	target := metricsYamlFile
	if target == "" {
		target = path.Join(configDir, "metrics.yaml")
	}

	if file.IsExist(target) {
		return file.ReadYaml(target, &MetricDesc)
	}
	return nil
}
================================================
FILE: center/cconf/ops.go
================================================
package cconf
import (
"fmt"
"path"
"github.com/toolkits/pkg/file"
"gopkg.in/yaml.v2"
)
// Operations is the package-level operation configuration. LoadOpsYaml reads
// it from a file and MergeOperationConf adds the built-in groups it lacks.
var Operations = Operation{}

// Operation is the top-level document: a list of operation groups.
type Operation struct {
	Ops []Ops `yaml:"ops"`
}

// Ops is one named group of operations.
type Ops struct {
	Name  string     `yaml:"name" json:"name"`
	Cname string     `yaml:"cname" json:"cname"`
	Ops   []SingleOp `yaml:"ops" json:"ops"`
}

// SingleOp is a single operation: Name is the op name and Cname is the
// display name, English by default.
type SingleOp struct {
	Name  string `yaml:"name" json:"name"`
	Cname string `yaml:"cname" json:"cname"`
}
// TransformNames maps every entry of name through nameToName and returns the
// mapped values in input order. Entries without a mapping are dropped; the
// result is nil when nothing matched.
func TransformNames(name []string, nameToName map[string]string) []string {
	var mapped []string
	for idx := 0; idx < len(name); idx++ {
		target, ok := nameToName[name[idx]]
		if !ok {
			continue
		}
		mapped = append(mapped, target)
	}
	return mapped
}
// LoadOpsYaml reads the operation configuration into Operations. It reads
// opsYamlFile, or "ops.yaml" under configDir when that is empty. A missing
// file is not an error. A file whose MD5 matches the old default ops.yaml is
// removed instead of being loaded.
func LoadOpsYaml(configDir string, opsYamlFile string) error {
	const legacyDefaultMD5 = "2f91a9ed265cf2024e266dc1d538ee77"

	target := opsYamlFile
	if target == "" {
		target = path.Join(configDir, "ops.yaml")
	}
	if !file.IsExist(target) {
		return nil
	}

	if sum, _ := file.MD5(target); sum == legacyDefaultMD5 {
		// This ops.yaml is the old default file; delete it.
		file.Remove(target)
		return nil
	}

	return file.ReadYaml(target, &Operations)
}
// GetAllOps flattens the groups into one list of their operations, in group
// order. The result is nil when there are no operations.
func GetAllOps(ops []Ops) []SingleOp {
	var flat []SingleOp
	for idx := range ops {
		flat = append(flat, ops[idx].Ops...)
	}
	return flat
}
// MergeOperationConf appends to Operations.Ops every built-in operation group
// whose name is not already configured. Configured groups are left untouched,
// even when a built-in group of the same name has different operations.
//
// It returns an error, wrapping the YAML error, when builtInOps cannot be
// parsed.
func MergeOperationConf() error {
	var opsBuiltIn Operation
	if err := yaml.Unmarshal([]byte(builtInOps), &opsBuiltIn); err != nil {
		return fmt.Errorf("cannot parse builtInOps: %w", err)
	}

	configOpsMap := make(map[string]struct{}, len(Operations.Ops))
	for _, op := range Operations.Ops {
		configOpsMap[op.Name] = struct{}{}
	}

	// Add a built-in group only when no configured group has the same name.
	for _, opBu := range opsBuiltIn.Ops {
		if _, has := configOpsMap[opBu.Name]; !has {
			Operations.Ops = append(Operations.Ops, opBu)
		}
	}

	return nil
}
// builtInOps is the YAML document of built-in operation groups that
// MergeOperationConf parses and merges into Operations. Each group carries a
// name, a display name (cname) and a list of operations, each of which again
// has a name and an English display name. Text after '#' on some entries is a
// YAML comment that lives inside the string literal.
const (
builtInOps = `
ops:
- name: Infrastructure
cname: Infrastructure
ops:
- name: /targets
cname: Host - View
- name: /targets/put
cname: Host - Modify
- name: /targets/del
cname: Host - Delete
- name: /targets/bind
cname: Host - Bind Uncategorized
- name: Explorer
cname: Explorer
ops:
- name: /metric/explorer
cname: Metrics Explorer
- name: /object/explorer
cname: Quick View
- name: /metrics-built-in
cname: Built-in Metric - View
- name: /builtin-metrics/add
cname: Built-in Metric - Add
- name: /builtin-metrics/put
cname: Built-in Metric - Modify
- name: /builtin-metrics/del
cname: Built-in Metric - Delete
- name: /recording-rules
cname: Recording Rule - View
- name: /recording-rules/add
cname: Recording Rule - Add
- name: /recording-rules/put
cname: Recording Rule - Modify
- name: /recording-rules/del
cname: Recording Rule - Delete
- name: /log/explorer
cname: Logs Explorer
- name: /log/index-patterns # 前端有个管理索引模式的页面,所以需要一个权限点来控制,后面应该改成侧拉板
cname: Index Pattern - View
- name: /log/index-patterns/add
cname: Index Pattern - Add
- name: /log/index-patterns/put
cname: Index Pattern - Modify
- name: /log/index-patterns/del
cname: Index Pattern - Delete
- name: /dashboards
cname: Dashboard - View
- name: /dashboards/add
cname: Dashboard - Add
- name: /dashboards/put
cname: Dashboard - Modify
- name: /dashboards/del
cname: Dashboard - Delete
- name: /public-dashboards
cname: Dashboard - View Public
- name: alerting
cname: Alerting
ops:
- name: /alert-rules
cname: Alerting Rule - View
- name: /alert-rules/add
cname: Alerting Rule - Add
- name: /alert-rules/put
cname: Alerting Rule - Modify
- name: /alert-rules/del
cname: Alerting Rule - Delete
- name: /alert-mutes
cname: Mutting Rule - View
- name: /alert-mutes/add
cname: Mutting Rule - Add
- name: /alert-mutes/put
cname: Mutting Rule - Modify
- name: /alert-mutes/del
cname: Mutting Rule - Delete
- name: /alert-subscribes
cname: Subscribing Rule - View
- name: /alert-subscribes/add
cname: Subscribing Rule - Add
- name: /alert-subscribes/put
cname: Subscribing Rule - Modify
- name: /alert-subscribes/del
cname: Subscribing Rule - Delete
- name: /job-tpls
cname: Self-healing-Script - View
- name: /job-tpls/add
cname: Self-healing-Script - Add
- name: /job-tpls/put
cname: Self-healing-Script - Modify
- name: /job-tpls/del
cname: Self-healing-Script - Delete
- name: /job-tasks
cname: Self-healing-Job - View
- name: /job-tasks/add
cname: Self-healing-Job - Add
- name: /job-tasks/put
cname: Self-healing-Job - Modify
- name: /alert-cur-events
cname: Active Event - View
- name: /alert-cur-events/del
cname: Active Event - Delete
- name: /alert-his-events
cname: Historical Event - View
- name: Notification
cname: Notification
ops:
- name: /notification-rules
cname: Notification Rule - View
- name: /notification-rules/add
cname: Notification Rule - Add
- name: /notification-rules/put
cname: Notification Rule - Modify
- name: /notification-rules/del
cname: Notification Rule - Delete
- name: /notification-channels
cname: Media Type - View
- name: /notification-channels/add
cname: Media Type - Add
- name: /notification-channels/put
cname: Media Type - Modify
- name: /notification-channels/del
cname: Media Type - Delete
- name: /notification-templates
cname: Message Template - View
- name: /notification-templates/add
cname: Message Template - Add
- name: /notification-templates/put
cname: Message Template - Modify
- name: /notification-templates/del
cname: Message Template - Delete
- name: /event-pipelines
cname: Event Pipeline - View
- name: /event-pipelines/add
cname: Event Pipeline - Add
- name: /event-pipelines/put
cname: Event Pipeline - Modify
- name: /event-pipelines/del
cname: Event Pipeline - Delete
- name: /help/notification-settings # 用于控制老版本的通知设置菜单是否展示
cname: Notification Settings - View
- name: /help/notification-tpls # 用于控制老版本的通知模板菜单是否展示
cname: Notification Templates - View
- name: Integrations
cname: Integrations
ops:
- name: /datasources # 用于控制能否看到数据源列表页面的菜单。只有 Admin 才能修改、删除数据源
cname: Data Source - View
- name: /components
cname: Component - View
- name: /components/add
cname: Component - Add
- name: /components/put
cname: Component - Modify
- name: /components/del
cname: Component - Delete
- name: /embedded-products
cname: Embedded Product - View
- name: /embedded-product/add
cname: Embedded Product - Add
- name: /embedded-product/put
cname: Embedded Product - Modify
- name: /embedded-product/delete
cname: Embedded Product - Delete
- name: Organization
cname: Organization
ops:
- name: /users
cname: User - View
- name: /users/add
cname: User - Add
- name: /users/put
cname: User - Modify
- name: /users/del
cname: User - Delete
- name: /user-groups
cname: Team - View
- name: /user-groups/add
cname: Team - Add
- name: /user-groups/put
cname: Team - Modify
- name: /user-groups/del
cname: Team - Delete
- name: /busi-groups
cname: Business Group - View
- name: /busi-groups/add
cname: Business Group - Add
- name: /busi-groups/put
cname: Business Group - Modify
- name: /busi-groups/del
cname: Business Group - Delete
- name: /roles
cname: Role - View
- name: /roles/add
cname: Role - Add
- name: /roles/put
cname: Role - Modify
- name: /roles/del
cname: Role - Delete
- name: System Settings
cname: System Settings
ops:
- name: /system/site-settings # 仅用于控制能否展示菜单,只有 Admin 才能修改、删除
cname: View Site Settings
- name: /system/variable-settings
cname: View Variable Settings
- name: /system/sso-settings
cname: View SSO Settings
- name: /system/alerting-engines
cname: View Alerting Engines
- name: /system/version
cname: View Product Version
`
)
================================================
FILE: center/cconf/plugin.go
================================================
package cconf
// Plugins is the static list of plugin descriptors. Each entry has a numeric
// Id, a Category, a Type identifier and a human-readable TypeName.
// NOTE(review): the Loki entry uses Category "loki" while every other entry
// uses "timeseries" or "logging" — confirm this is intended.
var Plugins = []Plugin{
{
Id: 1,
Category: "timeseries",
Type: "prometheus",
TypeName: "Prometheus Like",
},
{
Id: 2,
Category: "logging",
Type: "elasticsearch",
TypeName: "Elasticsearch",
},
{
Id: 3,
Category: "loki",
Type: "loki",
TypeName: "Loki",
},
{
Id: 4,
Category: "timeseries",
Type: "tdengine",
TypeName: "TDengine",
},
{
Id: 5,
Category: "logging",
Type: "ck",
TypeName: "ClickHouse",
},
{
Id: 6,
Category: "timeseries",
Type: "mysql",
TypeName: "MySQL",
},
{
Id: 7,
Category: "timeseries",
Type: "pgsql",
TypeName: "PostgreSQL",
},
{
Id: 8,
Category: "logging",
Type: "doris",
TypeName: "Doris",
},
{
Id: 9,
Category: "logging",
Type: "opensearch",
TypeName: "OpenSearch",
},
{
Id: 10,
Category: "logging",
Type: "victorialogs",
TypeName: "VictoriaLogs",
},
}
================================================
FILE: center/cconf/rsa/rsa_conf.go
================================================
package rsa
import (
"os"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/pkg/errors"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/logger"
)
// InitRSAConfig fills rsaConfig with an RSA password and key pair, trying
// three sources in order:
//
//  1. the database configs (password, private key and public key must all be
//     non-empty to be used);
//  2. the key files at rsaConfig.RSAPrivateKeyPath / RSAPublicKeyPath, when
//     both exist (the password already present in rsaConfig is kept);
//  3. a freshly generated key pair, produced by initRSAKeyPairs.
//
// It returns an error when a database lookup, a file read or the key
// generation fails.
func InitRSAConfig(ctx *ctx.Context, rsaConfig *httpx.RSAConfig) error {
	// 1. Load RSA material from the database.
	dbPassword, err := models.ConfigsGet(ctx, models.RSA_PASSWORD)
	if err != nil {
		return errors.WithMessagef(err, "cannot query config(%s)", models.RSA_PASSWORD)
	}
	dbPrivateKey, err := models.ConfigsGet(ctx, models.RSA_PRIVATE_KEY)
	if err != nil {
		return errors.WithMessagef(err, "cannot query config(%s)", models.RSA_PRIVATE_KEY)
	}
	dbPublicKey, err := models.ConfigsGet(ctx, models.RSA_PUBLIC_KEY)
	if err != nil {
		return errors.WithMessagef(err, "cannot query config(%s)", models.RSA_PUBLIC_KEY)
	}

	storedInDB := dbPassword != "" && dbPrivateKey != "" && dbPublicKey != ""
	if storedInDB {
		rsaConfig.RSAPassWord = dbPassword
		rsaConfig.RSAPrivateKey = []byte(dbPrivateKey)
		rsaConfig.RSAPublicKey = []byte(dbPublicKey)
		return nil
	}

	keyFilesPresent := file.IsExist(rsaConfig.RSAPrivateKeyPath) && file.IsExist(rsaConfig.RSAPublicKeyPath)
	if !keyFilesPresent {
		// 3. Neither the database nor the key files provide a key pair: generate one.
		rsaConfig.RSAPassWord, rsaConfig.RSAPrivateKey, rsaConfig.RSAPublicKey, err = initRSAKeyPairs(ctx, rsaConfig.RSAPassWord)
		if err != nil {
			return errors.WithMessage(err, "failed to generate rsa key pair")
		}
		return nil
	}

	// 2. Read the key pair from the files; the password was already read from config.
	rsaConfig.RSAPrivateKey, rsaConfig.RSAPublicKey, err = readConfigFile(rsaConfig)
	if err != nil {
		return errors.WithMessage(err, "failed to read rsa config from file")
	}
	return nil
}
// initRSAKeyPairs produces an RSA password and key pair and stores them in the
// database configs.
//
// When rsaPassWord is non-empty it is reused and saved under
// models.RSA_PASSWORD; otherwise models.InitRSAPassWord supplies the password.
// The key pair is generated from that password with secu.GenerateRsaKeyPair and
// saved under models.RSA_PRIVATE_KEY and models.RSA_PUBLIC_KEY. On failure the
// wrapped error is returned together with whatever values were obtained so far.
func initRSAKeyPairs(ctx *ctx.Context, rsaPassWord string) (password string, privateByte, publicByte []byte, err error) {
	if rsaPassWord == "" {
		// No password supplied: let the models layer create one.
		if password, err = models.InitRSAPassWord(ctx); err != nil {
			return password, privateByte, publicByte, errors.WithMessage(err, "failed to generate rsa password")
		}
	} else {
		logger.Debug("Using existing RSA password")
		password = rsaPassWord
		if err = models.ConfigsSet(ctx, models.RSA_PASSWORD, password); err != nil {
			return password, privateByte, publicByte, errors.WithMessagef(err, "failed to set config(%s)", models.RSA_PASSWORD)
		}
	}

	if privateByte, publicByte, err = secu.GenerateRsaKeyPair(password); err != nil {
		return password, privateByte, publicByte, errors.WithMessage(err, "failed to generate rsa key pair")
	}

	// Persist the generated keys.
	if err = models.ConfigsSet(ctx, models.RSA_PRIVATE_KEY, string(privateByte)); err != nil {
		return password, privateByte, publicByte, errors.WithMessagef(err, "failed to set config(%s)", models.RSA_PRIVATE_KEY)
	}
	if err = models.ConfigsSet(ctx, models.RSA_PUBLIC_KEY, string(publicByte)); err != nil {
		return password, privateByte, publicByte, errors.WithMessagef(err, "failed to set config(%s)", models.RSA_PUBLIC_KEY)
	}
	return password, privateByte, publicByte, nil
}
// readConfigFile reads the RSA key pair from the paths configured in rsaConfig.
// The public key file is read first, then the private key file; the first read
// failure is returned wrapped with the offending path.
func readConfigFile(rsaConfig *httpx.RSAConfig) (privateBuf, publicBuf []byte, err error) {
	if publicBuf, err = os.ReadFile(rsaConfig.RSAPublicKeyPath); err != nil {
		return privateBuf, publicBuf, errors.WithMessagef(err, "could not read RSAPublicKeyPath %q", rsaConfig.RSAPublicKeyPath)
	}
	if privateBuf, err = os.ReadFile(rsaConfig.RSAPrivateKeyPath); err != nil {
		return privateBuf, publicBuf, errors.WithMessagef(err, "could not read RSAPrivateKeyPath %q", rsaConfig.RSAPrivateKeyPath)
	}
	return privateBuf, publicBuf, nil
}
================================================
FILE: center/cconf/sql_tpl.go
================================================
package cconf
// TDengineSQLTpl maps a template name (for example "load5" or "tcp") to a SQL
// query template. The templates contain placeholders such as $database,
// $server, $from, $to and $interval, and some additionally use $netif or
// $mountpoint.
// NOTE(review): the placeholders are presumably substituted by the caller
// before the query is sent to TDengine — confirm against the code that reads this map.
var TDengineSQLTpl = map[string]string{
"load5": "SELECT _wstart as ts, last(load5) FROM $database.system WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) fill(null)",
"process_total": "SELECT _wstart as ts, last(total) FROM $database.processes WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) fill(null)",
"thread_total": "SELECT _wstart as ts, last(total) FROM $database.threads WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) fill(null)",
"cpu_idle": "SELECT _wstart as ts, last(usage_idle) * -1 + 100 FROM $database.cpu WHERE (host = '$server' and cpu = 'cpu-total') and _ts >= $from and _ts <= $to interval($interval) fill(null)",
"mem_used_percent": "SELECT _wstart as ts, last(used_percent) FROM $database.mem WHERE (host = '$server') and _ts >= $from and _ts <= $to interval($interval) fill(null)",
"disk_used_percent": "SELECT _wstart as ts, last(used_percent) FROM $database.disk WHERE (host = '$server' and path = '/') and _ts >= $from and _ts <= $to interval($interval) fill(null)",
"cpu_context_switches": "select ts, derivative(context_switches, 1s, 0) as context FROM (SELECT _wstart as ts, avg(context_switches) as context_switches FROM $database.kernel WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) )",
"tcp": "SELECT _wstart as ts, avg(tcp_close) as CLOSED, avg(tcp_close_wait) as CLOSE_WAIT, avg(tcp_closing) as CLOSING, avg(tcp_established) as ESTABLISHED, avg(tcp_fin_wait1) as FIN_WAIT1, avg(tcp_fin_wait2) as FIN_WAIT2, avg(tcp_last_ack) as LAST_ACK, avg(tcp_syn_recv) as SYN_RECV, avg(tcp_syn_sent) as SYN_SENT, avg(tcp_time_wait) as TIME_WAIT FROM $database.netstat WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval)",
"net_bytes_recv": "SELECT _wstart as ts, derivative(bytes_recv,1s, 1) as bytes_in FROM $database.net WHERE host = '$server' and interface = '$netif' and _ts >= $from and _ts <= $to group by tbname",
"net_bytes_sent": "SELECT _wstart as ts, derivative(bytes_sent,1s, 1) as bytes_out FROM $database.net WHERE host = '$server' and interface = '$netif' and _ts >= $from and _ts <= $to group by tbname",
"disk_total": "SELECT _wstart as ts, avg(total) AS total, avg(used) as used FROM $database.disk WHERE path = '$mountpoint' and _ts >= $from and _ts <= $to interval($interval) group by host",
}
================================================
FILE: center/center.go
================================================
package center
import (
"context"
"encoding/json"
"fmt"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/alert"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/dispatch"
"github.com/ccfos/nightingale/v6/alert/process"
alertrt "github.com/ccfos/nightingale/v6/alert/router"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/center/cconf/rsa"
"github.com/ccfos/nightingale/v6/center/integration"
"github.com/ccfos/nightingale/v6/center/metas"
centerrt "github.com/ccfos/nightingale/v6/center/router"
"github.com/ccfos/nightingale/v6/center/sso"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/cron"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/models/migrate"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/i18nx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/idents"
pushgwrt "github.com/ccfos/nightingale/v6/pushgw/router"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/storage"
"github.com/flashcatcloud/ibex/src/cmd/ibex"
)
// Initialize boots the center service. It loads the configuration from
// configDir (cryptoKey is handed to conf.InitConfig), initializes logging, the
// database and Redis, builds the in-memory caches, starts the alert engine,
// wires the center, alert and pushgw routers into one gin engine and starts the
// HTTP server.
//
// On success it returns a cleanup function that calls the logging and HTTP
// cleanup callbacks. On failure it returns nil and the error of the failing step.
func Initialize(configDir string, cryptoKey string) (func(), error) {
config, err := conf.InitConfig(configDir, cryptoKey)
if err != nil {
return nil, fmt.Errorf("failed to init config: %v", err)
}
// NOTE(review): the errors returned by these three calls are discarded.
cconf.LoadMetricsYaml(configDir, config.Center.MetricsYamlFile)
cconf.LoadOpsYaml(configDir, config.Center.OpsYamlFile)
cconf.MergeOperationConf()
// Fall back to the engine name "default" when none is configured.
if config.Alert.Heartbeat.EngineName == "" {
config.Alert.Heartbeat.EngineName = "default"
}
// NOTE(review): logxClean is only invoked through the returned cleanup
// function; the error returns below do not call it.
logxClean, err := logx.Init(config.Log)
if err != nil {
return nil, err
}
i18nx.Init(configDir)
flashduty.Init(config.Center.FlashDuty)
db, err := storage.New(config.DB)
if err != nil {
return nil, err
}
ctx := ctx.NewContext(context.Background(), db, true)
migrate.Migrate(db)
isRootInit := models.InitRoot(ctx)
config.HTTP.JWTAuth.SigningKey = models.InitJWTSigningKey(ctx)
err = rsa.InitRSAConfig(ctx, &config.HTTP.RSA)
if err != nil {
return nil, err
}
// Built-in integrations are loaded in a separate goroutine.
go integration.Init(ctx, config.Center.BuiltinIntegrationsDir)
var redis storage.Redis
redis, err = storage.NewRedis(config.Redis)
if err != nil {
return nil, err
}
metas := metas.New(redis)
idents := idents.New(ctx, redis, config.Pushgw)
syncStats := memsto.NewSyncStats()
alertStats := astats.NewSyncStats()
// Optional data migrations, run when enabled in config or reported as possible by models.
if config.Center.MigrateBusiGroupLabel || models.CanMigrateBg(ctx) {
models.MigrateBg(ctx, config.Pushgw.BusiGroupLabelKey)
}
if models.CanMigrateEP(ctx) {
models.MigrateEP(ctx)
}
// Initialize siteUrl; a default value is set when it is empty.
InitSiteUrl(ctx, config.Alert.Heartbeat.IP, config.HTTP.Port)
// In-memory caches shared by the alert engine and the routers below.
configCache := memsto.NewConfigCache(ctx, syncStats, config.HTTP.RSA.RSAPrivateKey, config.HTTP.RSA.RSAPassWord)
busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
targetCache := memsto.NewTargetCache(ctx, syncStats, redis)
dsCache := memsto.NewDatasourceCache(ctx, syncStats)
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx, configCache)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
taskTplCache := memsto.NewTaskTplCache(ctx)
configCvalCache := memsto.NewCvalCache(ctx, syncStats)
notifyRuleCache := memsto.NewNotifyRuleCache(ctx, syncStats)
notifyChannelCache := memsto.NewNotifyChannelCache(ctx, syncStats)
messageTemplateCache := memsto.NewMessageTemplateCache(ctx, syncStats)
userTokenCache := memsto.NewUserTokenCache(ctx, syncStats)
sso := sso.Init(config.Center, ctx, configCache)
promClients := prom.NewPromClient(ctx)
dispatch.InitRegisterQueryFunc(promClients)
externalProcessors := process.NewExternalProcessors()
macros.RegisterMacro(macros.MacroInVain)
dscache.Init(ctx, false)
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, taskTplCache, dsCache, ctx, promClients, userCache, userGroupCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, configCvalCache)
writers := writer.NewWriters(config.Pushgw)
// Background jobs: version lookup and periodic cleanup of old records.
go version.GetGithubVersion()
go cron.CleanNotifyRecord(ctx, config.Center.CleanNotifyRecordDay)
go cron.CleanPipelineExecution(ctx, config.Center.CleanPipelineExecutionDay)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors, config.Log.Dir)
centerRouter := centerrt.New(config.HTTP, config.Center, config.Alert, config.Ibex,
cconf.Operations, dsCache, notifyConfigCache, promClients,
redis, sso, ctx, metas, idents, targetCache, userCache, userGroupCache, userTokenCache, config.Log.Dir)
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, config.Alert, targetCache, busiGroupCache, idents, metas, writers, ctx)
// All routers are registered on the same gin engine.
r := httpx.GinEngine(config.Global.RunMode, config.HTTP, configCvalCache.PrintBodyPaths, configCvalCache.PrintAccessLog)
centerRouter.Config(r)
alertrtRouter.Config(r)
pushgwRouter.Config(r)
dumper.ConfigRouter(r)
// The embedded ibex server is started only when enabled in the configuration.
if config.Ibex.Enable {
migrate.MigrateIbexTables(db)
ibex.ServerStart(true, db, redis, config.HTTP.APIForService.BasicAuth, config.Alert.Heartbeat, &config.CenterApi, r, centerRouter, config.Ibex, config.HTTP.Port)
}
httpClean := httpx.Init(config.HTTP, r)
fmt.Printf("please view n9e at http://%v:%v\n", config.Alert.Heartbeat.IP, config.HTTP.Port)
// isRootInit comes from models.InitRoot; presumably true when the root
// account was just created — verify against models.InitRoot.
if isRootInit {
fmt.Println("username/password: root/root.2020")
}
return func() {
logxClean()
httpClean()
}, nil
}
// InitSiteUrl makes sure the "site_info" config carries a site_url. When the
// config is missing, or present with an empty site_url, the URL
// "http://<serverIP>:<serverPort>" is stored as the default. An existing
// non-empty site_url is left alone. Failures are logged and the function
// returns without changing anything further.
func InitSiteUrl(ctx *ctx.Context, serverIP string, serverPort int) {
	// Default site URL built from the server address.
	fallbackURL := fmt.Sprintf("http://%s:%d", serverIP, serverPort)

	// Fetch the current site_info config.
	raw, err := models.ConfigsGet(ctx, "site_info")
	if err != nil {
		logger.Errorf("failed to get site_info config: %v", err)
		return
	}

	// No site_info yet: create one that only carries the default URL.
	if raw == "" {
		encoded, err := json.Marshal(memsto.SiteInfo{SiteUrl: fallbackURL})
		if err != nil {
			logger.Errorf("failed to marshal site_info: %v", err)
			return
		}
		if err = models.ConfigsSet(ctx, "site_info", string(encoded)); err != nil {
			logger.Errorf("failed to set site_info: %v", err)
			return
		}
		logger.Infof("initialized site_url with default value: %s", fallbackURL)
		return
	}

	// site_info exists: inspect its site_url field.
	var current memsto.SiteInfo
	if err = json.Unmarshal([]byte(raw), &current); err != nil {
		logger.Errorf("failed to unmarshal site_info: %v", err)
		return
	}

	// A site_url that is already set needs no initialization.
	if current.SiteUrl != "" {
		return
	}

	// Fill in the default and write the config back.
	current.SiteUrl = fallbackURL
	encoded, err := json.Marshal(current)
	if err != nil {
		logger.Errorf("failed to marshal updated site_info: %v", err)
		return
	}
	if err = models.ConfigsSet(ctx, "site_info", string(encoded)); err != nil {
		logger.Errorf("failed to update site_info: %v", err)
		return
	}
	logger.Infof("initialized site_url with default value: %s", fallbackURL)
}
================================================
FILE: center/cstats/stats.go
================================================
package cstats
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
// Namespace and subsystem used for every Prometheus metric declared in this file.
const (
namespace = "n9e"
subsystem = "center"
)
// Prometheus collectors of the center service; init registers all of them
// with the default registry.
var (
// uptime is a counter that recordUptime increments once per second.
uptime = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "uptime",
Help: "HTTP service uptime.",
},
)
// RequestDuration is a histogram of HTTP request latencies in seconds,
// labelled by code, path and method, using the default buckets.
RequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: prometheus.DefBuckets,
Name: "http_request_duration_seconds",
Help: "HTTP request latencies in seconds.",
}, []string{"code", "path", "method"},
)
// RedisOperationLatency is a histogram of Redis operation latencies in
// seconds, labelled by operation and status, with buckets from 5ms to 5s.
RedisOperationLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "redis_operation_latency_seconds",
Help: "Histogram of latencies for Redis operations",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5},
},
[]string{"operation", "status"},
)
)
// init registers the uptime counter and both histograms with Prometheus's
// default registry (panicking if a registration fails) and starts the
// goroutine that keeps the uptime counter ticking.
func init() {
	prometheus.MustRegister(uptime)
	prometheus.MustRegister(RequestDuration)
	prometheus.MustRegister(RedisOperationLatency)

	go recordUptime()
}
// recordUptime increments the uptime counter once per second. It never
// returns and is meant to run in its own goroutine.
func recordUptime() {
	everySecond := time.Tick(time.Second)
	for {
		<-everySecond
		uptime.Inc()
	}
}
================================================
FILE: center/integration/init.go
================================================
package integration
import (
"encoding/json"
"path"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/container/set"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/runner"
)
// SYSTEM is the user name recorded as creator/updater of records managed by Init.
const SYSTEM = "system"
// BuiltinPayloadInFile is the package-level store of built-in payloads and
// metrics loaded from the integrations directory. Init replaces it with a
// fresh store before loading.
var BuiltinPayloadInFile *BuiltinPayloadInFileType
// BuiltinPayloadInFileType is an in-memory index of built-in payloads and
// built-in metrics. Data groups payloads by component ID, type and category;
// IndexData looks a payload up by UUID; BuiltinMetrics is keyed by the
// metric's Expression (see Init).
type BuiltinPayloadInFileType struct {
Data map[uint64]map[string]map[string][]*models.BuiltinPayload // map[component_id]map[type]map[cate][]*models.BuiltinPayload
IndexData map[int64]*models.BuiltinPayload // map[uuid]payload
BuiltinMetrics map[string]*models.BuiltinMetric
}
// Init loads the built-in integrations found on disk into BuiltinPayloadInFile
// and synchronises each integration's component record with the database.
//
// builtinIntegrationsDir is the directory holding one sub-directory per
// component; when empty, "integrations" under runner.Cwd is used. A component
// directory may contain "icon", "markdown", "alerts", "dashboards" and
// "metrics" sub-directories.
//
// Init returns early (after logging) when the preliminary database steps or the
// directory listing fail, or when the "disable_integration_init" config is set.
// Failures on individual components or files are logged and the item is skipped.
func Init(ctx *ctx.Context, builtinIntegrationsDir string) {
	BuiltinPayloadInFile = NewBuiltinPayloadInFileType()

	err := models.InitBuiltinPayloads(ctx)
	if err != nil {
		logger.Warning("init old builtinPayloads fail ", err)
		return
	}

	if res, err := models.ConfigsSelectByCkey(ctx, "disable_integration_init"); err != nil {
		logger.Error("fail to get value 'disable_integration_init' from configs", err)
		return
	} else if len(res) != 0 {
		logger.Info("disable_integration_init is set, skip integration init")
		return
	}

	fp := builtinIntegrationsDir
	if fp == "" {
		fp = path.Join(runner.Cwd, "integrations")
	}

	dirList, err := file.DirsUnder(fp)
	if err != nil {
		logger.Warning("read builtin component dir fail ", err)
		return
	}
	if len(dirList) == 0 {
		// No component directories: nothing to load or clean up.
		return
	}

	// The cleanup statements below do not depend on the component being
	// processed, so they run once here instead of once per component directory.

	// Delete payloads whose uuid is empty.
	err = models.DB(ctx).Exec("delete from builtin_payloads where uuid = 0 and type != 'collect' and (updated_by = 'system' or updated_by = '')").Error
	if err != nil {
		logger.Warning("delete builtin payloads fail ", err)
	}

	// Delete builtin metrics whose uuid is empty.
	err = models.DB(ctx).Exec("delete from builtin_metrics where uuid = 0 and (updated_by = 'system' or updated_by = '')").Error
	if err != nil {
		logger.Warning("delete builtin metrics fail ", err)
	}

	// Delete system dashboards with uuid%1000 != 0 and uuid > 1000000000000000000.
	err = models.DB(ctx).Exec("delete from builtin_payloads where uuid%1000 != 0 and uuid > 1000000000000000000 and type = 'dashboard' and updated_by = 'system'").Error
	if err != nil {
		logger.Warning("delete builtin payloads fail ", err)
	}

	for _, dir := range dirList {
		componentDir := fp + "/" + dir
		component := models.BuiltinComponent{
			Ident: dir,
		}

		// Logo: the first file of the icon directory, served under
		// /api/n9e/integrations/icon/<ident>/<file>.
		files, err := file.FilesUnder(componentDir + "/icon")
		if err == nil && len(files) > 0 {
			component.Logo = "/api/n9e/integrations/icon/" + component.Ident + "/" + files[0]
		} else if err != nil {
			logger.Warningf("read builtin component icon dir fail %s %v", component.Ident, err)
		}

		// Description: the first markdown file of the markdown directory.
		files, err = file.FilesUnder(componentDir + "/markdown")
		if err == nil && len(files) > 0 {
			var readmeFile string
			// The loop variable is deliberately not called "file" so that it
			// does not shadow the file package.
			for _, name := range files {
				if strings.HasSuffix(strings.ToLower(name), "md") {
					readmeFile = componentDir + "/markdown/" + name
					break
				}
			}
			if readmeFile != "" {
				component.Readme, _ = file.ReadString(readmeFile)
			}
		} else if err != nil {
			logger.Warningf("read builtin component markdown dir fail %s %v", component.Ident, err)
		}

		// Create the component record, or refresh it when it is still owned by SYSTEM.
		exists, _ := models.BuiltinComponentExists(ctx, &component)
		if !exists {
			err = component.Add(ctx, SYSTEM)
			if err != nil {
				logger.Warning("add builtin component fail ", component, err)
				continue
			}
		} else {
			old, err := models.BuiltinComponentGet(ctx, "ident = ?", component.Ident)
			if err != nil {
				logger.Warning("get builtin component fail ", component, err)
				continue
			}

			if old == nil {
				logger.Warning("get builtin component nil ", component)
				continue
			}

			if old.UpdatedBy == SYSTEM {
				// NOTE(review): CreatedAt is reset together with UpdatedAt on
				// every refresh — confirm this is intended.
				now := time.Now().Unix()
				old.CreatedAt = now
				old.UpdatedAt = now
				old.Readme = component.Readme
				old.UpdatedBy = SYSTEM

				err = models.DB(ctx).Model(old).Select("*").Updates(old).Error
				if err != nil {
					logger.Warning("update builtin component fail ", old, err)
				}
			}
			component.ID = old.ID
		}

		// alerts
		files, err = file.FilesUnder(componentDir + "/alerts")
		if err == nil && len(files) > 0 {
			for _, f := range files {
				fp := componentDir + "/alerts/" + f
				bs, err := file.ReadBytes(fp)
				if err != nil {
					logger.Warning("read builtin component alerts file fail ", f, err)
					continue
				}

				alerts := []models.AlertRule{}
				err = json.Unmarshal(bs, &alerts)
				if err != nil {
					logger.Warning("parse builtin component alerts file fail ", f, err)
					continue
				}

				// The category is the file name without ".json".
				cate := strings.Replace(f, ".json", "", -1)

				for _, alert := range alerts {
					if alert.UUID == 0 {
						// Sleep so that consecutive microsecond timestamps differ.
						time.Sleep(time.Microsecond)
						alert.UUID = time.Now().UnixMicro()
					}

					content, err := json.Marshal(alert)
					if err != nil {
						logger.Warning("marshal builtin alert fail ", alert, err)
						continue
					}

					builtinAlert := models.BuiltinPayload{
						ComponentID: component.ID,
						Type:        "alert",
						Cate:        cate,
						Name:        alert.Name,
						Tags:        alert.AppendTags,
						Content:     string(content),
						UUID:        alert.UUID,
						ID:          alert.UUID,
						CreatedBy:   SYSTEM,
						UpdatedBy:   SYSTEM,
					}
					BuiltinPayloadInFile.AddBuiltinPayload(&builtinAlert)
				}
			}
		} else if err != nil {
			logger.Warningf("read builtin component alerts dir fail %s %v", component.Ident, err)
		}

		// dashboards
		files, err = file.FilesUnder(componentDir + "/dashboards")
		if err == nil && len(files) > 0 {
			for _, f := range files {
				fp := componentDir + "/dashboards/" + f
				bs, err := file.ReadBytes(fp)
				if err != nil {
					logger.Warning("read builtin component dashboards file fail ", f, err)
					continue
				}

				dashboard := BuiltinBoard{}
				err = json.Unmarshal(bs, &dashboard)
				if err != nil {
					logger.Warning("parse builtin component dashboards file fail ", f, err)
					continue
				}

				if dashboard.UUID == 0 {
					// Sleep so that consecutive microsecond timestamps differ.
					time.Sleep(time.Microsecond)
					dashboard.UUID = time.Now().UnixMicro()
					// Write the generated uuid back into the file.
					bs, err = json.MarshalIndent(dashboard, "", "    ")
					if err != nil {
						logger.Warning("marshal builtin dashboard fail ", dashboard, err)
						continue
					}

					_, err = file.WriteBytes(fp, bs)
					if err != nil {
						logger.Warning("write builtin dashboard file fail ", f, err)
					}
				}

				content, err := json.Marshal(dashboard)
				if err != nil {
					logger.Warning("marshal builtin dashboard fail ", dashboard, err)
					continue
				}

				builtinDashboard := models.BuiltinPayload{
					ComponentID: component.ID,
					Type:        "dashboard",
					Cate:        "",
					Name:        dashboard.Name,
					Tags:        dashboard.Tags,
					Note:        dashboard.Note,
					Content:     string(content),
					UUID:        dashboard.UUID,
					ID:          dashboard.UUID,
					CreatedBy:   SYSTEM,
					UpdatedBy:   SYSTEM,
				}
				BuiltinPayloadInFile.AddBuiltinPayload(&builtinDashboard)
			}
		} else if err != nil {
			logger.Warningf("read builtin component dash dir fail %s %v", component.Ident, err)
		}

		// metrics
		files, err = file.FilesUnder(componentDir + "/metrics")
		if err == nil && len(files) > 0 {
			for _, f := range files {
				fp := componentDir + "/metrics/" + f
				bs, err := file.ReadBytes(fp)
				if err != nil {
					logger.Warning("read builtin component metrics file fail", f, err)
					continue
				}

				metrics := []models.BuiltinMetric{}
				err = json.Unmarshal(bs, &metrics)
				if err != nil {
					logger.Warning("parse builtin component metrics file fail", f, err)
					continue
				}

				for i := range metrics {
					// Point at the slice element rather than at the range
					// variable: with pre-1.22 loop semantics the address of the
					// range variable is shared by all iterations, which would
					// make every map entry refer to the last metric.
					metric := &metrics[i]
					// Sleep so that consecutive microsecond timestamps differ.
					time.Sleep(time.Microsecond)
					metric.UUID = time.Now().UnixMicro()
					metric.ID = metric.UUID
					metric.CreatedBy = SYSTEM
					metric.UpdatedBy = SYSTEM

					BuiltinPayloadInFile.BuiltinMetrics[metric.Expression] = metric
				}
			}
		} else if err != nil {
			logger.Warningf("read builtin component metrics dir fail %s %v", component.Ident, err)
		}
	}
}
// BuiltinBoard is the JSON shape of a built-in dashboard file. Init decodes
// each file under a component's dashboards directory into this type and
// re-encodes it as the content of the resulting built-in payload.
type BuiltinBoard struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"`
Name string `json:"name"`
Ident string `json:"ident"`
Tags string `json:"tags"`
Note string `json:"note"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
Configs interface{} `json:"configs" gorm:"-"`
Public int `json:"public"` // 0: false, 1: true
PublicCate int `json:"public_cate"` // 0: anonymous, 1: login, 2: busi
Bgids []int64 `json:"bgids" gorm:"-"`
BuiltIn int `json:"built_in"` // 0: false, 1: true
Hide int `json:"hide"` // 0: false, 1: true
UUID int64 `json:"uuid"`
}
// NewBuiltinPayloadInFileType returns an empty store whose three maps are
// allocated and ready for writes.
func NewBuiltinPayloadInFileType() *BuiltinPayloadInFileType {
	store := new(BuiltinPayloadInFileType)
	store.Data = make(map[uint64]map[string]map[string][]*models.BuiltinPayload)
	store.IndexData = make(map[int64]*models.BuiltinPayload)
	store.BuiltinMetrics = make(map[string]*models.BuiltinMetric)
	return store
}
// AddBuiltinPayload files bp under its component ID, type and category in
// Data, creating the intermediate maps on demand, and indexes it by UUID in
// IndexData.
func (b *BuiltinPayloadInFileType) AddBuiltinPayload(bp *models.BuiltinPayload) {
	byType, ok := b.Data[bp.ComponentID]
	if !ok {
		byType = make(map[string]map[string][]*models.BuiltinPayload)
		b.Data[bp.ComponentID] = byType
	}

	byCate, ok := byType[bp.Type]
	if !ok {
		byCate = make(map[string][]*models.BuiltinPayload)
		byType[bp.Type] = byCate
	}

	// Appending to a missing key starts a new list.
	byCate[bp.Cate] = append(byCate[bp.Cate], bp)

	b.IndexData[bp.UUID] = bp
}
// GetComponentIdentByCate scans all components for a non-empty payload list
// filed under the given type and category and returns the Component field of
// that list's first payload. It returns "" when no component has such a list.
// Components are visited in map iteration order.
func (b *BuiltinPayloadInFileType) GetComponentIdentByCate(typ, cate string) string {
	for _, byType := range b.Data {
		// Indexing a nil or missing map yields an empty list, which is skipped.
		if list := byType[typ][cate]; len(list) > 0 {
			return list[0].Component
		}
	}
	return ""
}
// GetBuiltinPayload returns the payloads of the given component and type,
// sorted by Name. When cate is non-empty only that category is considered;
// otherwise all categories of the type are merged. query is applied through
// filterByQuery. The result is nil when the component, type or category is
// unknown or nothing matches; the error is always nil.
func (b *BuiltinPayloadInFileType) GetBuiltinPayload(typ, cate, query string, componentId uint64) ([]*models.BuiltinPayload, error) {
	// A missing component yields a nil map, whose lookup reports "not found".
	byCate, found := b.Data[componentId][typ]
	if !found {
		return nil, nil
	}

	var matched []*models.BuiltinPayload
	if cate == "" {
		for _, group := range byCate {
			matched = append(matched, filterByQuery(group, query)...)
		}
	} else {
		group, found := byCate[cate]
		if !found {
			return nil, nil
		}
		matched = append(matched, filterByQuery(group, query)...)
	}

	if len(matched) > 0 {
		sort.Slice(matched, func(i, j int) bool {
			return matched[i].Name < matched[j].Name
		})
	}
	return matched, nil
}
// GetBuiltinPayloadCates returns the sorted category names stored for the
// given component and type. The result is nil when the component or type is
// unknown or has no categories; the error is always nil.
func (b *BuiltinPayloadInFileType) GetBuiltinPayloadCates(typ string, componentId uint64) ([]string, error) {
	var cates []string
	// Indexing and ranging over missing (nil) maps is safe and yields nothing.
	for name := range b.Data[componentId][typ] {
		cates = append(cates, name)
	}
	sort.Strings(cates)
	return cates, nil
}
// filterByQuery keeps the payloads whose Name or Tags contain query, compared
// case-insensitively. An empty query returns payloads itself, unfiltered.
func filterByQuery(payloads []*models.BuiltinPayload, query string) []*models.BuiltinPayload {
	if query == "" {
		return payloads
	}

	needle := strings.ToLower(query)
	var matched []*models.BuiltinPayload
	for _, item := range payloads {
		nameHit := strings.Contains(strings.ToLower(item.Name), needle)
		if nameHit || strings.Contains(strings.ToLower(item.Tags), needle) {
			matched = append(matched, item)
		}
	}
	return matched
}
// BuiltinMetricGets merges the metrics passed in metricsInDB with the metrics
// loaded from files (b.BuiltinMetrics), filters them, localizes Name and Note
// for lang, sorts them by Collector, Typ and Expression, and returns one page.
//
// Metrics are merged by expression; for an expression present in both sources
// the file-loaded metric replaces the database one. collector, typ, query and
// unit are handed to applyFilter. Metrics for which no translation can be
// obtained are logged and left out. Negative limit and offset are treated as 0.
//
// It returns the requested page, the total number of metrics after filtering
// (before pagination) and a nil error. The page is an empty, non-nil slice when
// offset is past the end or limit is 0.
//
// The returned metrics are copies: the translated Name and Note are written to
// a per-call copy, so the shared objects in b.BuiltinMetrics and metricsInDB
// are never modified and concurrent calls with different languages cannot
// overwrite each other's result.
func (b *BuiltinPayloadInFileType) BuiltinMetricGets(metricsInDB []*models.BuiltinMetric, lang, collector, typ, query, unit string, limit, offset int) ([]*models.BuiltinMetric, int, error) {
	var filteredMetrics []*models.BuiltinMetric
	expressionSet := set.NewStringSet()
	builtinMetricsByDB := convertBuiltinMetricByDB(metricsInDB)
	builtinMetricsMap := make(map[string]*models.BuiltinMetric)

	// Database metrics first, then file metrics, so the latter win on equal keys.
	for expression, metric := range builtinMetricsByDB {
		builtinMetricsMap[expression] = metric
	}

	for expression, metric := range b.BuiltinMetrics {
		builtinMetricsMap[expression] = metric
	}

	for _, metric := range builtinMetricsMap {
		if !applyFilter(metric, collector, typ, query, unit) {
			continue
		}

		// Skip expressions that were already emitted.
		// NOTE: ignore duplicate expressions; in particular, in old versions
		// users may have created duplicate metrics.
		if expressionSet.Exists(metric.Expression) {
			continue
		}

		// Remember the expression.
		expressionSet.Add(metric.Expression)

		// Apply language.
		trans, err := getTranslationWithLanguage(metric, lang)
		if err != nil {
			logger.Errorf("Error getting translation for metric %s: %v", metric.Name, err)
			continue // Skip if translation not found
		}

		// Localize a shallow copy instead of the shared metric.
		localized := *metric
		localized.Name = trans.Name
		localized.Note = trans.Note

		filteredMetrics = append(filteredMetrics, &localized)
	}

	// Sort metrics.
	sort.Slice(filteredMetrics, func(i, j int) bool {
		if filteredMetrics[i].Collector != filteredMetrics[j].Collector {
			return filteredMetrics[i].Collector < filteredMetrics[j].Collector
		}
		if filteredMetrics[i].Typ != filteredMetrics[j].Typ {
			return filteredMetrics[i].Typ < filteredMetrics[j].Typ
		}
		return filteredMetrics[i].Expression < filteredMetrics[j].Expression
	})

	totalCount := len(filteredMetrics)

	// Validate parameters.
	if offset < 0 {
		offset = 0
	}
	if limit < 0 {
		limit = 0
	}

	// Handle edge cases.
	if offset >= totalCount || limit == 0 {
		return []*models.BuiltinMetric{}, totalCount, nil
	}

	// Apply pagination.
	end := offset + limit
	if end > totalCount {
		end = totalCount
	}

	return filteredMetrics[offset:end], totalCount, nil
}
// BuiltinMetricTypes collects the distinct Typ values of the metrics in
// b.BuiltinMetrics that pass the collector and query filters. lang is
// accepted but not used.
func (b *BuiltinPayloadInFileType) BuiltinMetricTypes(lang, collector, query string) []string {
	types := set.NewStringSet()
	for _, m := range b.BuiltinMetrics {
		if applyFilter(m, collector, "", query, "") {
			types.Add(m.Typ)
		}
	}
	return types.ToSlice()
}
// BuiltinMetricCollectors collects the distinct Collector values of the
// metrics in b.BuiltinMetrics that pass the type and query filters. lang is
// accepted but not used.
func (b *BuiltinPayloadInFileType) BuiltinMetricCollectors(lang, typ, query string) []string {
	collectors := set.NewStringSet()
	for _, m := range b.BuiltinMetrics {
		if applyFilter(m, "", typ, query, "") {
			collectors.Add(m.Collector)
		}
	}
	return collectors.ToSlice()
}
// applyFilter reports whether metric satisfies every non-empty filter:
// exact collector, exact type, membership of its unit in the comma-separated
// unit list, and the free-text query (see applyQueryFilter). Empty filters
// are ignored.
func applyFilter(metric *models.BuiltinMetric, collector, typ, query, unit string) bool {
	switch {
	case collector != "" && collector != metric.Collector:
		return false
	case typ != "" && typ != metric.Typ:
		return false
	case unit != "" && !containsUnit(unit, metric.Unit):
		return false
	case query != "" && !applyQueryFilter(metric, query):
		return false
	}
	return true
}
// containsUnit reports whether metricUnit equals one of the comma-separated
// entries of unit. Entries are compared verbatim: no trimming, no case
// folding.
func containsUnit(unit, metricUnit string) bool {
	entries := strings.Split(unit, ",")
	for i := 0; i < len(entries); i++ {
		if entries[i] == metricUnit {
			return true
		}
	}
	return false
}
// applyQueryFilter evaluates a space-separated query against a metric.
//
// Each term is matched as a case-sensitive substring of the metric's Name,
// Note or Expression. A plain term must match at least one of those fields;
// a term prefixed with "-" is an exclusion and must match none of them.
// All terms have to be satisfied for the metric to pass.
//
// Empty terms — produced by repeated spaces or by a lone "-" — are ignored.
// Without that guard a bare "-" would become an empty exclusion term, and
// since every string contains "", it would reject every metric.
func applyQueryFilter(metric *models.BuiltinMetric, query string) bool {
	matches := func(term string) bool {
		return strings.Contains(metric.Name, term) ||
			strings.Contains(metric.Note, term) ||
			strings.Contains(metric.Expression, term)
	}

	for _, term := range strings.Split(query, " ") {
		exclude := strings.HasPrefix(term, "-")
		if exclude {
			term = strings.TrimPrefix(term, "-")
		}
		if term == "" {
			continue
		}
		if exclude && matches(term) {
			return false
		}
		if !exclude && !matches(term) {
			return false
		}
	}
	return true
}
// getTranslationWithLanguage returns the translation of bm for lang.
//
// The first entry whose Lang equals lang is returned. If there is none, the
// last "en_US" entry seen is used as a fallback. When neither exists an error
// is returned.
//
// The result points at a copy of the chosen entry. The entries are addressed
// by index rather than through the range variable, so the fallback cannot be
// overwritten by later iterations on toolchains where the range variable is
// shared across iterations (Go < 1.22).
func getTranslationWithLanguage(bm *models.BuiltinMetric, lang string) (*models.Translation, error) {
	fallback := -1
	for i := range bm.Translation {
		if bm.Translation[i].Lang == lang {
			found := bm.Translation[i]
			return &found, nil
		}
		if bm.Translation[i].Lang == "en_US" {
			fallback = i
		}
	}

	if fallback >= 0 {
		def := bm.Translation[fallback]
		return &def, nil
	}
	return nil, errors.Errorf("translation not found for metric %s", bm.Name)
}
// convertBuiltinMetricByDB collapses the database metrics to one record per
// expression and returns them keyed by expression.
//
// NOTE: to stay compatible with metrics that users of older versions already
// created, and to converge later edits onto a single record, the record with
// the smallest ID among those sharing an expression is chosen as the primary
// one. If that primary record carries no translations of its own, the
// translations of every record in the group (see getDefaultTranslation) are
// merged into it, one per language; for the same language a record with a
// larger ID replaces the value from a smaller one.
func convertBuiltinMetricByDB(metricsInDB []*models.BuiltinMetric) map[string]*models.BuiltinMetric {
	grouped := make(map[string][]*models.BuiltinMetric)
	for _, m := range metricsInDB {
		grouped[m.Expression] = append(grouped[m.Expression], m)
	}

	primaryByExpr := make(map[string]*models.BuiltinMetric)
	for expr, group := range grouped {
		sort.Slice(group, func(x, y int) bool {
			return group[x].ID < group[y].ID
		})
		primary := group[0]

		// The user has not customized translations, so merging is safe.
		if len(primary.Translation) == 0 {
			byLang := make(map[string]models.Translation)
			for _, member := range group {
				for _, tr := range getDefaultTranslation(member) {
					byLang[tr.Lang] = tr
				}
			}

			merged := make([]models.Translation, 0, len(byLang))
			for _, tr := range byLang {
				merged = append(merged, tr)
			}
			primary.Translation = merged
		}

		primaryByExpr[expr] = primary
	}
	return primaryByExpr
}
// getDefaultTranslation returns bm's own translations when it has any;
// otherwise it synthesizes a single translation from bm's Lang, Name and
// Note fields.
func getDefaultTranslation(bm *models.BuiltinMetric) []models.Translation {
	if len(bm.Translation) == 0 {
		own := models.Translation{
			Lang: bm.Lang,
			Name: bm.Name,
			Note: bm.Note,
		}
		return []models.Translation{own}
	}
	return bm.Translation
}
================================================
FILE: center/metas/metas.go
================================================
package metas
import (
"context"
"encoding/json"
"sync"
"time"
"github.com/ccfos/nightingale/v6/center/cstats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/storage"
"github.com/toolkits/pkg/logger"
)
// Set buffers host metadata in memory until it is written to redis.
// The embedded RWMutex guards items.
type Set struct {
sync.RWMutex
// items holds the pending metadata keyed by ident; persist swaps it for a
// fresh map on every flush.
items map[string]models.HostMeta
// redis is the store updateTargets writes to; when nil, updates are
// skipped with a warning.
redis storage.Redis
}
// New builds a Set backed by the given redis client and, through Init,
// starts its background persistence loop.
func New(redis storage.Redis) *Set {
	s := new(Set)
	s.items = make(map[string]models.HostMeta)
	s.redis = redis

	s.Init()
	return s
}
// Init starts LoopPersist in its own goroutine. LoopPersist never returns,
// so the goroutine lives for the rest of the process.
func (s *Set) Init() {
go s.LoopPersist()
}
// MSet copies every entry of items into the pending buffer under the write
// lock, replacing any entry already buffered for the same ident.
func (s *Set) MSet(items map[string]models.HostMeta) {
	s.Lock()
	defer s.Unlock()

	for key, hostMeta := range items {
		s.items[key] = hostMeta
	}
}
// Set buffers meta for ident under the write lock, replacing any entry
// already buffered for that ident.
func (s *Set) Set(ident string, meta models.HostMeta) {
s.Lock()
defer s.Unlock()
s.items[ident] = meta
}
// LoopPersist flushes the buffer forever: it sleeps one second, calls
// persist, and repeats. It never returns.
func (s *Set) LoopPersist() {
for {
time.Sleep(time.Second)
s.persist()
}
}
// persist detaches the buffered entries and hands them to updateMeta.
// The buffer is swapped for a fresh map while holding the lock; the actual
// write happens after the lock is released so writers are not blocked by it.
// An empty buffer is left in place and nothing is written.
func (s *Set) persist() {
	s.Lock()
	pending := s.items
	if len(pending) == 0 {
		s.Unlock()
		return
	}
	s.items = make(map[string]models.HostMeta)
	s.Unlock()

	s.updateMeta(pending)
}
// updateMeta pushes items to updateTargets in batches. Entries are re-keyed
// by their Hostname field, and a batch is sent after every 100 entries
// consumed; whatever remains is sent at the end. Failures are logged and do
// not stop the remaining batches.
func (s *Set) updateMeta(items map[string]models.HostMeta) {
	const batchSize = 100

	send := func(batch map[string]models.HostMeta) {
		if err := s.updateTargets(batch); err != nil {
			logger.Errorf("failed to update targets: %v", err)
		}
	}

	batch := make(map[string]models.HostMeta, batchSize)
	consumed := 0
	for _, hostMeta := range items {
		batch[hostMeta.Hostname] = hostMeta
		consumed++
		if consumed != batchSize {
			continue
		}
		send(batch)
		batch = make(map[string]models.HostMeta, batchSize)
		consumed = 0
	}

	// updateTargets ignores an empty map, so the trailing call is safe even
	// when the last batch was just sent.
	send(batch)
}
// updateTargets writes one batch of host metadata to redis.
//
// For each entry the main record is stored under models.WrapIdent(ident).
// When an entry has a non-nil ExtendInfo, that map is JSON-encoded and stored
// separately under models.WrapExtendIdent(ident), and the main record is
// written with an empty ExtendInfo instead. Both writes go through
// storage.MSet with a 7*24h duration argument (presumably the key TTL —
// confirm against storage.MSet).
//
// Each of the two writes is timed on its own and reported to
// cstats.RedisOperationLatency with a "success"/"fail" label, so the
// "mset_target_extend" observation no longer includes the time spent on the
// preceding "mset_target_meta" write.
//
// It returns nil without doing anything when redis is nil (a warning is
// logged) or when m is empty, and otherwise the first marshal or redis error
// encountered.
func (s *Set) updateTargets(m map[string]models.HostMeta) error {
	if s.redis == nil {
		logger.Warningf("redis is nil")
		return nil
	}

	count := int64(len(m))
	if count == 0 {
		return nil
	}

	newMap := make(map[string]interface{}, count)
	extendMap := make(map[string]interface{})
	for ident, meta := range m {
		if meta.ExtendInfo != nil {
			extendMeta := meta.ExtendInfo
			// meta is a per-iteration copy, so this only blanks the value
			// stored in newMap, not the caller's entry.
			meta.ExtendInfo = make(map[string]interface{})
			extendMetaStr, err := json.Marshal(extendMeta)
			if err != nil {
				return err
			}
			extendMap[models.WrapExtendIdent(ident)] = extendMetaStr
		}
		newMap[models.WrapIdent(ident)] = meta
	}

	start := time.Now()
	if err := storage.MSet(context.Background(), s.redis, newMap, 7*24*time.Hour); err != nil {
		cstats.RedisOperationLatency.WithLabelValues("mset_target_meta", "fail").Observe(time.Since(start).Seconds())
		return err
	}
	cstats.RedisOperationLatency.WithLabelValues("mset_target_meta", "success").Observe(time.Since(start).Seconds())

	if len(extendMap) == 0 {
		return nil
	}

	// Restart the clock so the extend latency covers only the extend write.
	extendStart := time.Now()
	if err := storage.MSet(context.Background(), s.redis, extendMap, 7*24*time.Hour); err != nil {
		cstats.RedisOperationLatency.WithLabelValues("mset_target_extend", "fail").Observe(time.Since(extendStart).Seconds())
		return err
	}
	cstats.RedisOperationLatency.WithLabelValues("mset_target_extend", "success").Observe(time.Since(extendStart).Seconds())
	return nil
}
================================================
FILE: center/router/router.go
================================================
package router
import (
"fmt"
"net/http"
"path"
"runtime"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/center/cstats"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/center/sso"
"github.com/ccfos/nightingale/v6/conf"
_ "github.com/ccfos/nightingale/v6/front/statik"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/aop"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/storage"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"gorm.io/gorm"
"github.com/gin-gonic/gin"
"github.com/rakyll/statik/fs"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/runner"
)
// Router bundles the configuration sections, caches, clients and hook
// functions that the center HTTP handlers operate on. New populates every
// field and installs no-op defaults for the three hook fields.
type Router struct {
HTTP httpx.Config
Center cconf.Center
Ibex conf.Ibex
Alert aconf.Alert
Operations cconf.Operation
DatasourceCache *memsto.DatasourceCacheType
NotifyConfigCache *memsto.NotifyConfigCacheType
PromClients *prom.PromClientMap
Redis storage.Redis
MetaSet *metas.Set
IdentSet *idents.Set
TargetCache *memsto.TargetCacheType
Sso *sso.SsoClient
UserCache *memsto.UserCacheType
UserGroupCache *memsto.UserGroupCacheType
UserTokenCache *memsto.UserTokenCacheType
Ctx *ctx.Context
LogDir string
// Hook fields; New sets each of them to a function that does nothing.
HeartbeatHook HeartbeatHookFunc
TargetDeleteHook models.TargetDeleteHookFunc
AlertRuleModifyHook AlertRuleModifyHookFunc
}
// New assembles a Router from the supplied configuration sections, caches
// and clients. The three hook fields are initialised with functions that do
// nothing (returning nil where a value is expected).
func New(httpConfig httpx.Config, center cconf.Center, alert aconf.Alert, ibex conf.Ibex,
	operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
	pc *prom.PromClientMap, redis storage.Redis,
	sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set,
	tc *memsto.TargetCacheType, uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType, utc *memsto.UserTokenCacheType, logDir string) *Router {
	rt := &Router{
		// configuration
		HTTP:       httpConfig,
		Center:     center,
		Alert:      alert,
		Ibex:       ibex,
		Operations: operations,
		LogDir:     logDir,

		// caches
		DatasourceCache:   ds,
		NotifyConfigCache: ncc,
		TargetCache:       tc,
		UserCache:         uc,
		UserGroupCache:    ugc,
		UserTokenCache:    utc,

		// clients and shared state
		PromClients: pc,
		Redis:       redis,
		MetaSet:     metaSet,
		IdentSet:    idents,
		Sso:         sso,
		Ctx:         ctx,
	}

	// Default hooks: no-ops.
	rt.HeartbeatHook = func(ident string) map[string]interface{} { return nil }
	rt.TargetDeleteHook = func(tx *gorm.DB, idents []string) error { return nil }
	rt.AlertRuleModifyHook = func(ar *models.AlertRule) {}

	return rt
}
// stat returns a middleware that times the rest of the handler chain and
// records the elapsed seconds in cstats.RequestDuration, labelled with the
// response status code, the matched route (c.FullPath) and the HTTP method.
func stat() gin.HandlerFunc {
	return func(c *gin.Context) {
		begin := time.Now()
		c.Next()

		status := fmt.Sprintf("%d", c.Writer.Status())
		verb := c.Request.Method
		cstats.RequestDuration.
			WithLabelValues(status, c.FullPath(), verb).
			Observe(time.Since(begin).Seconds())
	}
}
// languageDetector returns a middleware that normalises the request language
// into the "X-Language" request header.
//
// When i18NHeaderKey is empty the middleware leaves the request untouched.
// Otherwise the header named by i18NHeaderKey is read and mapped as follows:
// empty value -> "zh_CN"; prefix "zh_HK" -> "zh_HK"; any other "zh" prefix ->
// "zh_CN"; prefix "en" -> "en"; anything else is passed through unchanged.
func languageDetector(i18NHeaderKey string) gin.HandlerFunc {
	normalize := func(lang string) string {
		switch {
		case lang == "":
			return "zh_CN"
		case strings.HasPrefix(lang, "zh_HK"):
			return "zh_HK"
		case strings.HasPrefix(lang, "zh"):
			return "zh_CN"
		case strings.HasPrefix(lang, "en"):
			return "en"
		default:
			return lang
		}
	}

	return func(c *gin.Context) {
		if i18NHeaderKey != "" {
			c.Request.Header.Set("X-Language", normalize(c.GetHeader(i18NHeaderKey)))
		}
		c.Next()
	}
}
// configNoRoute installs the fallback handler for requests that match no
// registered route.
//
// The text after the last "." of the URL path decides what is served: for a
// fixed list of asset extensions the requested path itself is served, for
// everything else the front-end entry page is served. The source is either
// the file system passed in as fs or, when rt.Center.UseFileAssets is set,
// the "pub" directory under runner.Cwd.
func (rt *Router) configNoRoute(r *gin.Engine, fs *http.FileSystem) {
	// pubPath joins <runner.Cwd>/pub with the given segments. On Windows the
	// leading "/" is dropped so the result is not rooted.
	pubPath := func(segments ...string) string {
		parts := []string{"/"}
		if runtime.GOOS == "windows" {
			parts[0] = ""
		}
		parts = append(parts, strings.Split(runner.Cwd, "/")...)
		parts = append(parts, "pub")
		parts = append(parts, segments...)
		return path.Join(parts...)
	}

	r.NoRoute(func(c *gin.Context) {
		urlPath := c.Request.URL.Path
		// With no "." in the path LastIndex yields -1 and the whole path is
		// treated as the suffix, which then falls into the default case.
		suffix := urlPath[strings.LastIndex(urlPath, ".")+1:]

		switch suffix {
		case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map", "ttf", "md":
			if rt.Center.UseFileAssets {
				c.File(pubPath(strings.Split(urlPath, "/")...))
				return
			}
			c.FileFromFS(urlPath, *fs)
		default:
			if rt.Center.UseFileAssets {
				c.File(pubPath("index.html"))
				return
			}
			c.FileFromFS("/", *fs)
		}
	})
}
func (rt *Router) Config(r *gin.Engine) {
r.Use(stat())
r.Use(languageDetector(rt.Center.I18NHeaderKey))
r.Use(aop.Recovery())
statikFS, err := fs.New()
if err != nil {
logger.Errorf("cannot create statik fs: %v", err)
}
if !rt.Center.UseFileAssets {
r.StaticFS("/pub", statikFS)
}
pagesPrefix := "/api/n9e"
pages := r.Group(pagesPrefix)
{
pages.DELETE("/datasource/series", rt.auth(), rt.admin(), rt.deleteDatasourceSeries)
if rt.Center.AnonymousAccess.PromQuerier {
pages.Any("/proxy/:id/*url", rt.dsProxy)
pages.POST("/query-range-batch", rt.promBatchQueryRange)
pages.POST("/query-instant-batch", rt.promBatchQueryInstant)
pages.GET("/datasource/brief", rt.datasourceBriefs)
pages.POST("/datasource/query", rt.datasourceQuery)
pages.POST("/ds-query", rt.QueryData)
pages.POST("/logs-query", rt.QueryLogV2)
pages.POST("/tdengine-databases", rt.tdengineDatabases)
pages.POST("/tdengine-tables", rt.tdengineTables)
pages.POST("/tdengine-columns", rt.tdengineColumns)
pages.POST("/log-query-batch", rt.QueryLogBatch)
// 数据库元数据接口
pages.POST("/db-databases", rt.ShowDatabases)
pages.POST("/db-tables", rt.ShowTables)
pages.POST("/db-desc-table", rt.DescribeTable)
// es 专用接口
pages.POST("/indices", rt.auth(), rt.user(), rt.QueryIndices)
pages.POST("/es-variable", rt.auth(), rt.user(), rt.QueryESVariable)
pages.POST("/fields", rt.auth(), rt.user(), rt.QueryFields)
pages.POST("/log-query", rt.auth(), rt.user(), rt.QueryLog)
} else {
pages.Any("/proxy/:id/*url", rt.auth(), rt.dsProxy)
pages.POST("/query-range-batch", rt.auth(), rt.promBatchQueryRange)
pages.POST("/query-instant-batch", rt.auth(), rt.promBatchQueryInstant)
pages.GET("/datasource/brief", rt.auth(), rt.user(), rt.datasourceBriefs)
pages.POST("/datasource/query", rt.auth(), rt.user(), rt.datasourceQuery)
pages.POST("/ds-query", rt.auth(), rt.user(), rt.QueryData)
pages.POST("/logs-query", rt.auth(), rt.user(), rt.QueryLogV2)
pages.POST("/tdengine-databases", rt.auth(), rt.tdengineDatabases)
pages.POST("/tdengine-tables", rt.auth(), rt.tdengineTables)
pages.POST("/tdengine-columns", rt.auth(), rt.tdengineColumns)
pages.POST("/log-query-batch", rt.auth(), rt.user(), rt.QueryLogBatch)
// 数据库元数据接口
pages.POST("/db-databases", rt.auth(), rt.user(), rt.ShowDatabases)
pages.POST("/db-tables", rt.auth(), rt.user(), rt.ShowTables)
pages.POST("/db-desc-table", rt.auth(), rt.user(), rt.DescribeTable)
// es 专用接口
pages.POST("/indices", rt.auth(), rt.user(), rt.QueryIndices)
pages.POST("/es-variable", rt.QueryESVariable)
pages.POST("/fields", rt.QueryFields)
pages.POST("/log-query", rt.QueryLog)
}
// OpenSearch 专用接口
pages.POST("/os-indices", rt.QueryOSIndices)
pages.POST("/os-variable", rt.QueryOSVariable)
pages.POST("/os-fields", rt.QueryOSFields)
pages.GET("/sql-template", rt.QuerySqlTemplate)
pages.POST("/auth/login", rt.jwtMock(), rt.loginPost)
pages.POST("/auth/logout", rt.jwtMock(), rt.auth(), rt.user(), rt.logoutPost)
pages.POST("/auth/refresh", rt.jwtMock(), rt.refreshPost)
pages.POST("/auth/captcha", rt.jwtMock(), rt.generateCaptcha)
pages.POST("/auth/captcha-verify", rt.jwtMock(), rt.captchaVerify)
pages.GET("/auth/ifshowcaptcha", rt.ifShowCaptcha)
pages.GET("/auth/sso-config", rt.ssoConfigNameGet)
pages.GET("/auth/rsa-config", rt.rsaConfigGet)
pages.GET("/auth/redirect", rt.loginRedirect)
pages.GET("/auth/redirect/cas", rt.loginRedirectCas)
pages.GET("/auth/redirect/oauth", rt.loginRedirectOAuth)
pages.GET("/auth/redirect/dingtalk", rt.loginRedirectDingTalk)
pages.GET("/auth/redirect/feishu", rt.loginRedirectFeiShu)
pages.GET("/auth/callback", rt.loginCallback)
pages.GET("/auth/callback/cas", rt.loginCallbackCas)
pages.GET("/auth/callback/oauth", rt.loginCallbackOAuth)
pages.GET("/auth/callback/dingtalk", rt.loginCallbackDingTalk)
pages.GET("/auth/callback/feishu", rt.loginCallbackFeiShu)
pages.GET("/auth/perms", rt.allPerms)
pages.GET("/metrics/desc", rt.metricsDescGetFile)
pages.POST("/metrics/desc", rt.metricsDescGetMap)
pages.GET("/notify-channels", rt.notifyChannelsGets)
pages.GET("/contact-keys", rt.contactKeysGets)
pages.GET("/install-date", rt.installDateGet)
pages.GET("/self/perms", rt.auth(), rt.user(), rt.permsGets)
pages.GET("/self/profile", rt.auth(), rt.user(), rt.selfProfileGet)
pages.PUT("/self/profile", rt.auth(), rt.user(), rt.selfProfilePut)
pages.PUT("/self/password", rt.auth(), rt.user(), rt.selfPasswordPut)
pages.GET("/self/token", rt.auth(), rt.user(), rt.getToken)
pages.POST("/self/token", rt.auth(), rt.user(), rt.addToken)
pages.DELETE("/self/token/:id", rt.auth(), rt.user(), rt.deleteToken)
pages.GET("/users", rt.auth(), rt.user(), rt.perm("/users"), rt.userGets)
pages.POST("/users", rt.auth(), rt.user(), rt.perm("/users/add"), rt.userAddPost)
pages.GET("/user/:id/profile", rt.auth(), rt.userProfileGet)
pages.PUT("/user/:id/profile", rt.auth(), rt.user(), rt.perm("/users/put"), rt.userProfilePut)
pages.PUT("/user/:id/password", rt.auth(), rt.user(), rt.perm("/users/put"), rt.userPasswordPut)
pages.DELETE("/user/:id", rt.auth(), rt.user(), rt.perm("/users/del"), rt.userDel)
pages.GET("/metric-views", rt.auth(), rt.metricViewGets)
pages.DELETE("/metric-views", rt.auth(), rt.user(), rt.metricViewDel)
pages.POST("/metric-views", rt.auth(), rt.user(), rt.metricViewAdd)
pages.PUT("/metric-views", rt.auth(), rt.user(), rt.metricViewPut)
pages.GET("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterGets)
pages.DELETE("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterDel)
pages.POST("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterAdd)
pages.PUT("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterPut)
pages.POST("/builtin-metric-promql", rt.auth(), rt.user(), rt.getMetricPromql)
pages.POST("/builtin-metrics", rt.auth(), rt.user(), rt.perm("/builtin-metrics/add"), rt.builtinMetricsAdd)
pages.PUT("/builtin-metrics", rt.auth(), rt.user(), rt.perm("/builtin-metrics/put"), rt.builtinMetricsPut)
pages.DELETE("/builtin-metrics", rt.auth(), rt.user(), rt.perm("/builtin-metrics/del"), rt.builtinMetricsDel)
pages.GET("/builtin-metrics", rt.auth(), rt.user(), rt.builtinMetricsGets)
pages.GET("/builtin-metrics/types", rt.auth(), rt.user(), rt.builtinMetricsTypes)
pages.GET("/builtin-metrics/types/default", rt.auth(), rt.user(), rt.builtinMetricsDefaultTypes)
pages.GET("/builtin-metrics/collectors", rt.auth(), rt.user(), rt.builtinMetricsCollectors)
pages.GET("/user-groups", rt.auth(), rt.user(), rt.userGroupGets)
pages.POST("/user-groups", rt.auth(), rt.user(), rt.perm("/user-groups/add"), rt.userGroupAdd)
pages.GET("/user-group/:id", rt.auth(), rt.user(), rt.userGroupGet)
pages.PUT("/user-group/:id", rt.auth(), rt.user(), rt.perm("/user-groups/put"), rt.userGroupWrite(), rt.userGroupPut)
pages.DELETE("/user-group/:id", rt.auth(), rt.user(), rt.perm("/user-groups/del"), rt.userGroupWrite(), rt.userGroupDel)
pages.POST("/user-group/:id/members", rt.auth(), rt.user(), rt.perm("/user-groups/put"), rt.userGroupWrite(), rt.userGroupMemberAdd)
pages.DELETE("/user-group/:id/members", rt.auth(), rt.user(), rt.perm("/user-groups/put"), rt.userGroupWrite(), rt.userGroupMemberDel)
pages.GET("/busi-groups", rt.auth(), rt.user(), rt.busiGroupGets)
pages.POST("/busi-groups", rt.auth(), rt.user(), rt.perm("/busi-groups/add"), rt.busiGroupAdd)
pages.GET("/busi-groups/alertings", rt.auth(), rt.busiGroupAlertingsGets)
pages.GET("/busi-group/:id", rt.auth(), rt.user(), rt.bgro(), rt.busiGroupGet)
pages.PUT("/busi-group/:id", rt.auth(), rt.user(), rt.perm("/busi-groups/put"), rt.bgrw(), rt.busiGroupPut)
pages.POST("/busi-group/:id/members", rt.auth(), rt.user(), rt.perm("/busi-groups/put"), rt.bgrw(), rt.busiGroupMemberAdd)
pages.DELETE("/busi-group/:id/members", rt.auth(), rt.user(), rt.perm("/busi-groups/put"), rt.bgrw(), rt.busiGroupMemberDel)
pages.DELETE("/busi-group/:id", rt.auth(), rt.user(), rt.perm("/busi-groups/del"), rt.bgrw(), rt.busiGroupDel)
pages.GET("/busi-group/:id/perm/:perm", rt.auth(), rt.user(), rt.checkBusiGroupPerm)
pages.GET("/busi-groups/tags", rt.auth(), rt.user(), rt.busiGroupsGetTags)
pages.GET("/targets", rt.auth(), rt.user(), rt.targetGets)
pages.GET("/targets/stats", rt.auth(), rt.user(), rt.targetStats)
pages.POST("/target-update", rt.auth(), rt.targetUpdate)
pages.GET("/target/extra-meta", rt.auth(), rt.user(), rt.targetExtendInfoByIdent)
pages.POST("/target/list", rt.auth(), rt.user(), rt.targetGetsByHostFilter)
pages.DELETE("/targets", rt.auth(), rt.user(), rt.perm("/targets/del"), rt.targetDel)
pages.GET("/targets/tags", rt.auth(), rt.user(), rt.targetGetTags)
pages.POST("/targets/tags", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetBindTagsByFE)
pages.DELETE("/targets/tags", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetUnbindTagsByFE)
pages.PUT("/targets/note", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetUpdateNote)
pages.PUT("/targets/bgids", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetBindBgids)
pages.POST("/builtin-cate-favorite", rt.auth(), rt.user(), rt.builtinCateFavoriteAdd)
pages.DELETE("/builtin-cate-favorite/:name", rt.auth(), rt.user(), rt.builtinCateFavoriteDel)
pages.GET("/integrations/icon/:cate/:name", rt.builtinIcon)
// pages.GET("/builtin-boards", rt.builtinBoardGets)
// pages.GET("/builtin-board/:name", rt.builtinBoardGet)
// pages.GET("/dashboards/builtin/list", rt.builtinBoardGets)
// pages.GET("/builtin-boards-cates", rt.auth(), rt.user(), rt.builtinBoardCateGets)
// pages.POST("/builtin-boards-detail", rt.auth(), rt.user(), rt.builtinBoardDetailGets)
// pages.GET("/integrations/makedown/:cate", rt.builtinMarkdown)
pages.GET("/busi-groups/public-boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.publicBoardGets)
pages.GET("/busi-groups/boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.boardGetsByGids)
pages.GET("/busi-group/:id/boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.bgro(), rt.boardGets)
pages.POST("/busi-group/:id/boards", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.bgrw(), rt.boardAdd)
pages.POST("/busi-group/:id/board/:bid/clone", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.bgrw(), rt.boardClone)
pages.POST("/busi-groups/boards/clones", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.boardBatchClone)
pages.GET("/boards", rt.auth(), rt.user(), rt.boardGetsByBids)
pages.GET("/board/:bid", rt.boardGet)
pages.GET("/board/:bid/pure", rt.boardPureGet)
pages.PUT("/board/:bid", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.boardPut)
pages.PUT("/board/:bid/configs", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.boardPutConfigs)
pages.PUT("/board/:bid/public", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.boardPutPublic)
pages.DELETE("/boards", rt.auth(), rt.user(), rt.perm("/dashboards/del"), rt.boardDel)
pages.GET("/share-charts", rt.chartShareGets)
pages.POST("/share-charts", rt.auth(), rt.chartShareAdd)
pages.POST("/dashboard-annotations", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.dashAnnotationAdd)
pages.GET("/dashboard-annotations", rt.dashAnnotationGets)
pages.PUT("/dashboard-annotation/:id", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.dashAnnotationPut)
pages.DELETE("/dashboard-annotation/:id", rt.auth(), rt.user(), rt.perm("/dashboards/del"), rt.dashAnnotationDel)
// pages.GET("/alert-rules/builtin/alerts-cates", rt.auth(), rt.user(), rt.builtinAlertCateGets)
// pages.GET("/alert-rules/builtin/list", rt.auth(), rt.user(), rt.builtinAlertRules)
pages.GET("/alert-rules/callbacks", rt.auth(), rt.user(), rt.alertRuleCallbacks)
pages.GET("/timezones", rt.auth(), rt.user(), rt.timezonesGet)
pages.GET("/busi-groups/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGetsByGids)
pages.GET("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGets)
pages.POST("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByFE)
pages.POST("/busi-group/:id/alert-rules/import", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByImport)
pages.POST("/busi-group/:id/alert-rules/import-prom-rule", rt.auth(),
rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByImportPromRule)
pages.DELETE("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules/del"), rt.bgrw(), rt.alertRuleDel)
pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields)
pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE)
pages.GET("/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGet)
pages.GET("/alert-rule/:arid/pure", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRulePureGet)
pages.PUT("/busi-group/alert-rule/validate", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRuleValidation)
pages.POST("/relabel-test", rt.auth(), rt.user(), rt.relabelTest)
pages.POST("/busi-group/:id/alert-rules/clone", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.cloneToMachine)
pages.POST("/busi-groups/alert-rules/clones", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.batchAlertRuleClone)
pages.POST("/busi-group/alert-rules/notify-tryrun", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.alertRuleNotifyTryRun)
pages.POST("/busi-group/alert-rules/enable-tryrun", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.alertRuleEnableTryRun)
pages.GET("/busi-groups/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGetsByGids)
pages.GET("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGets)
pages.POST("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/add"), rt.bgrw(), rt.recordingRuleAddByFE)
pages.DELETE("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/del"), rt.bgrw(), rt.recordingRuleDel)
pages.GET("/recording-rule/:rrid", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGet)
pages.PUT("/recording-rule/:rrid", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRulePutByFE)
pages.PUT("/busi-group/:id/recording-rules/fields", rt.auth(), rt.user(), rt.perm("/recording-rules/put"), rt.recordingRulePutFields)
pages.GET("/busi-groups/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes"), rt.alertMuteGetsByGids)
pages.GET("/busi-group/:id/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes"), rt.bgro(), rt.alertMuteGetsByBG)
pages.POST("/busi-group/:id/alert-mutes/preview", rt.auth(), rt.user(), rt.perm("/alert-mutes/add"), rt.bgrw(), rt.alertMutePreview)
pages.POST("/busi-group/:id/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes/add"), rt.bgrw(), rt.alertMuteAdd)
pages.DELETE("/busi-group/:id/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes/del"), rt.bgrw(), rt.alertMuteDel)
pages.PUT("/busi-group/:id/alert-mute/:amid", rt.auth(), rt.user(), rt.perm("/alert-mutes/put"), rt.alertMutePutByFE)
pages.GET("/busi-group/:id/alert-mute/:amid", rt.auth(), rt.user(), rt.perm("/alert-mutes"), rt.alertMuteGet)
pages.PUT("/busi-group/:id/alert-mutes/fields", rt.auth(), rt.user(), rt.perm("/alert-mutes/put"), rt.bgrw(), rt.alertMutePutFields)
pages.POST("/alert-mute-tryrun", rt.auth(), rt.user(), rt.perm("/alert-mutes/add"), rt.alertMuteTryRun)
pages.GET("/busi-groups/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes"), rt.alertSubscribeGetsByGids)
pages.GET("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes"), rt.bgro(), rt.alertSubscribeGets)
pages.GET("/alert-subscribe/:sid", rt.auth(), rt.user(), rt.perm("/alert-subscribes"), rt.alertSubscribeGet)
pages.POST("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes/add"), rt.bgrw(), rt.alertSubscribeAdd)
pages.PUT("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes/put"), rt.bgrw(), rt.alertSubscribePut)
pages.DELETE("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes/del"), rt.bgrw(), rt.alertSubscribeDel)
pages.POST("/alert-subscribe/alert-subscribes-tryrun", rt.auth(), rt.user(), rt.perm("/alert-subscribes/add"), rt.alertSubscribeTryRun)
pages.GET("/alert-cur-event/:eid", rt.alertCurEventGet)
pages.GET("/alert-his-event/:eid", rt.alertHisEventGet)
pages.GET("/event-notify-records/:eid", rt.notificationRecordList)
pages.GET("/event-detail/:hash", rt.eventDetailPage)
pages.GET("/alert-eval-detail/:id", rt.alertEvalDetailPage)
pages.GET("/trace-logs/:traceid", rt.traceLogsPage)
// card logic
pages.GET("/alert-cur-events/list", rt.auth(), rt.user(), rt.alertCurEventsList)
pages.GET("/alert-cur-events/card", rt.auth(), rt.user(), rt.alertCurEventsCard)
pages.POST("/alert-cur-events/card/details", rt.auth(), rt.alertCurEventsCardDetails)
pages.GET("/alert-his-events/list", rt.auth(), rt.user(), rt.alertHisEventsList)
pages.DELETE("/alert-his-events", rt.auth(), rt.admin(), rt.alertHisEventsDelete)
pages.DELETE("/alert-cur-events", rt.auth(), rt.user(), rt.perm("/alert-cur-events/del"), rt.alertCurEventDel)
pages.GET("/alert-cur-events/stats", rt.auth(), rt.alertCurEventsStatistics)
pages.GET("/alert-aggr-views", rt.auth(), rt.alertAggrViewGets)
pages.DELETE("/alert-aggr-views", rt.auth(), rt.user(), rt.alertAggrViewDel)
pages.POST("/alert-aggr-views", rt.auth(), rt.user(), rt.alertAggrViewAdd)
pages.PUT("/alert-aggr-views", rt.auth(), rt.user(), rt.alertAggrViewPut)
pages.GET("/busi-groups/task-tpls", rt.auth(), rt.user(), rt.perm("/job-tpls"), rt.taskTplGetsByGids)
pages.GET("/busi-group/:id/task-tpls", rt.auth(), rt.user(), rt.perm("/job-tpls"), rt.bgro(), rt.taskTplGets)
pages.POST("/busi-group/:id/task-tpls", rt.auth(), rt.user(), rt.perm("/job-tpls/add"), rt.bgrw(), rt.taskTplAdd)
pages.DELETE("/busi-group/:id/task-tpl/:tid", rt.auth(), rt.user(), rt.perm("/job-tpls/del"), rt.bgrw(), rt.taskTplDel)
pages.POST("/busi-group/:id/task-tpls/tags", rt.auth(), rt.user(), rt.perm("/job-tpls/put"), rt.bgrw(), rt.taskTplBindTags)
pages.DELETE("/busi-group/:id/task-tpls/tags", rt.auth(), rt.user(), rt.perm("/job-tpls/put"), rt.bgrw(), rt.taskTplUnbindTags)
pages.GET("/busi-group/:id/task-tpl/:tid", rt.auth(), rt.user(), rt.perm("/job-tpls"), rt.bgro(), rt.taskTplGet)
pages.PUT("/busi-group/:id/task-tpl/:tid", rt.auth(), rt.user(), rt.perm("/job-tpls/put"), rt.bgrw(), rt.taskTplPut)
pages.GET("/busi-groups/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks"), rt.taskGetsByGids)
pages.GET("/busi-group/:id/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks"), rt.bgro(), rt.taskGets)
pages.POST("/busi-group/:id/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks/add"), rt.bgrw(), rt.taskAdd)
pages.GET("/servers", rt.auth(), rt.user(), rt.serversGet)
pages.GET("/server-clusters", rt.auth(), rt.user(), rt.serverClustersGet)
pages.POST("/datasource/list", rt.auth(), rt.user(), rt.datasourceList)
pages.POST("/datasource/plugin/list", rt.auth(), rt.pluginList)
pages.POST("/datasource/upsert", rt.auth(), rt.admin(), rt.datasourceUpsert)
pages.POST("/datasource/desc", rt.auth(), rt.admin(), rt.datasourceGet)
pages.POST("/datasource/status/update", rt.auth(), rt.admin(), rt.datasourceUpdataStatus)
pages.DELETE("/datasource/", rt.auth(), rt.admin(), rt.datasourceDel)
pages.GET("/roles", rt.auth(), rt.user(), rt.roleGets)
pages.POST("/roles", rt.auth(), rt.user(), rt.perm("/roles/add"), rt.roleAdd)
pages.PUT("/roles", rt.auth(), rt.user(), rt.perm("/roles/put"), rt.rolePut)
pages.DELETE("/role/:id", rt.auth(), rt.user(), rt.perm("/roles/del"), rt.roleDel)
pages.GET("/role/:id/ops", rt.auth(), rt.user(), rt.perm("/roles"), rt.operationOfRole)
pages.PUT("/role/:id/ops", rt.auth(), rt.user(), rt.perm("/roles/put"), rt.roleBindOperation)
pages.GET("/operation", rt.operations)
pages.GET("/notify-tpls", rt.auth(), rt.user(), rt.notifyTplGets)
pages.PUT("/notify-tpl/content", rt.auth(), rt.user(), rt.notifyTplUpdateContent)
pages.PUT("/notify-tpl", rt.auth(), rt.user(), rt.notifyTplUpdate)
pages.POST("/notify-tpl", rt.auth(), rt.user(), rt.notifyTplAdd)
pages.DELETE("/notify-tpl/:id", rt.auth(), rt.user(), rt.notifyTplDel)
pages.POST("/notify-tpl/preview", rt.auth(), rt.user(), rt.notifyTplPreview)
pages.GET("/sso-configs", rt.auth(), rt.admin(), rt.ssoConfigGets)
pages.PUT("/sso-config", rt.auth(), rt.admin(), rt.ssoConfigUpdate)
pages.GET("/webhooks", rt.auth(), rt.user(), rt.webhookGets)
pages.PUT("/webhooks", rt.auth(), rt.admin(), rt.webhookPuts)
pages.GET("/notify-script", rt.auth(), rt.user(), rt.perm("/help/notification-settings"), rt.notifyScriptGet)
pages.PUT("/notify-script", rt.auth(), rt.admin(), rt.notifyScriptPut)
pages.GET("/notify-channel", rt.auth(), rt.user(), rt.perm("/help/notification-settings"), rt.notifyChannelGets)
pages.PUT("/notify-channel", rt.auth(), rt.admin(), rt.notifyChannelPuts)
pages.GET("/notify-contact", rt.auth(), rt.user(), rt.notifyContactGets)
pages.PUT("/notify-contact", rt.auth(), rt.admin(), rt.notifyContactPuts)
pages.GET("/notify-config", rt.auth(), rt.user(), rt.perm("/help/notification-settings"), rt.notifyConfigGet)
pages.PUT("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigPut)
pages.PUT("/smtp-config-test", rt.auth(), rt.admin(), rt.attemptSendEmail)
pages.GET("/es-index-pattern", rt.auth(), rt.esIndexPatternGet)
pages.GET("/es-index-pattern-list", rt.auth(), rt.esIndexPatternGetList)
pages.POST("/es-index-pattern", rt.auth(), rt.user(), rt.perm("/log/index-patterns/add"), rt.esIndexPatternAdd)
pages.PUT("/es-index-pattern", rt.auth(), rt.user(), rt.perm("/log/index-patterns/put"), rt.esIndexPatternPut)
pages.DELETE("/es-index-pattern", rt.auth(), rt.user(), rt.perm("/log/index-patterns/del"), rt.esIndexPatternDel)
pages.GET("/embedded-dashboards", rt.auth(), rt.user(), rt.perm("/embedded-dashboards"), rt.embeddedDashboardsGet)
pages.PUT("/embedded-dashboards", rt.auth(), rt.user(), rt.perm("/embedded-dashboards/put"), rt.embeddedDashboardsPut)
// 获取 embedded-product 列表
pages.GET("/embedded-product", rt.auth(), rt.user(), rt.embeddedProductGets)
pages.GET("/embedded-product/:id", rt.auth(), rt.user(), rt.embeddedProductGet)
pages.POST("/embedded-product", rt.auth(), rt.user(), rt.perm("/embedded-product/add"), rt.embeddedProductAdd)
pages.PUT("/embedded-product/:id", rt.auth(), rt.user(), rt.perm("/embedded-product/put"), rt.embeddedProductPut)
pages.DELETE("/embedded-product/:id", rt.auth(), rt.user(), rt.perm("/embedded-product/delete"), rt.embeddedProductDelete)
pages.GET("/user-variable-configs", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigGets)
pages.POST("/user-variable-config", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigAdd)
pages.PUT("/user-variable-config/:id", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigPut)
pages.DELETE("/user-variable-config/:id", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigDel)
pages.GET("/config", rt.auth(), rt.admin(), rt.configGetByKey)
pages.PUT("/config", rt.auth(), rt.admin(), rt.configPutByKey)
pages.GET("/site-info", rt.siteInfo)
// source token 相关路由
pages.POST("/source-token", rt.auth(), rt.user(), rt.sourceTokenAdd)
// for admin api
pages.GET("/user/busi-groups", rt.auth(), rt.admin(), rt.userBusiGroupsGets)
pages.GET("/builtin-components", rt.auth(), rt.user(), rt.builtinComponentsGets)
pages.POST("/builtin-components", rt.auth(), rt.user(), rt.perm("/components/add"), rt.builtinComponentsAdd)
pages.PUT("/builtin-components", rt.auth(), rt.user(), rt.perm("/components/put"), rt.builtinComponentsPut)
pages.DELETE("/builtin-components", rt.auth(), rt.user(), rt.perm("/components/del"), rt.builtinComponentsDel)
pages.GET("/builtin-payloads", rt.auth(), rt.user(), rt.builtinPayloadsGets)
pages.GET("/builtin-payloads/cates", rt.auth(), rt.user(), rt.builtinPayloadcatesGet)
pages.POST("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/components/add"), rt.builtinPayloadsAdd)
pages.PUT("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/components/put"), rt.builtinPayloadsPut)
pages.DELETE("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/components/del"), rt.builtinPayloadsDel)
pages.GET("/builtin-payload", rt.auth(), rt.user(), rt.builtinPayloadsGetByUUID)
pages.POST("/message-templates", rt.auth(), rt.user(), rt.perm("/notification-templates/add"), rt.messageTemplatesAdd)
pages.DELETE("/message-templates", rt.auth(), rt.user(), rt.perm("/notification-templates/del"), rt.messageTemplatesDel)
pages.PUT("/message-template/:id", rt.auth(), rt.user(), rt.perm("/notification-templates/put"), rt.messageTemplatePut)
pages.GET("/message-template/:id", rt.auth(), rt.user(), rt.perm("/notification-templates"), rt.messageTemplateGet)
pages.GET("/message-templates", rt.auth(), rt.user(), rt.messageTemplatesGet)
pages.POST("/events-message", rt.auth(), rt.user(), rt.eventsMessage)
pages.POST("/notify-rules", rt.auth(), rt.user(), rt.perm("/notification-rules/add"), rt.notifyRulesAdd)
pages.DELETE("/notify-rules", rt.auth(), rt.user(), rt.perm("/notification-rules/del"), rt.notifyRulesDel)
pages.PUT("/notify-rule/:id", rt.auth(), rt.user(), rt.perm("/notification-rules/put"), rt.notifyRulePut)
pages.GET("/notify-rule/:id", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyRuleGet)
pages.GET("/notify-rules", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyRulesGet)
pages.POST("/notify-rule/test", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyTest)
pages.GET("/notify-rule/custom-params", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyRuleCustomParamsGet)
pages.POST("/notify-rule/event-pipelines-tryrun", rt.auth(), rt.user(), rt.perm("/notification-rules/add"), rt.tryRunEventProcessorByNotifyRule)
pages.GET("/event-tagkeys", rt.auth(), rt.user(), rt.eventTagKeys)
pages.GET("/event-tagvalues", rt.auth(), rt.user(), rt.eventTagValues)
// 事件Pipeline相关路由
pages.GET("/event-pipelines", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.eventPipelinesList)
pages.POST("/event-pipeline", rt.auth(), rt.user(), rt.perm("/event-pipelines/add"), rt.addEventPipeline)
pages.PUT("/event-pipeline", rt.auth(), rt.user(), rt.perm("/event-pipelines/put"), rt.updateEventPipeline)
pages.GET("/event-pipeline/:id", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipeline)
pages.DELETE("/event-pipelines", rt.auth(), rt.user(), rt.perm("/event-pipelines/del"), rt.deleteEventPipelines)
pages.POST("/event-pipeline-tryrun", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.tryRunEventPipeline)
pages.POST("/event-processor-tryrun", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.tryRunEventProcessor)
// API 触发工作流
pages.POST("/event-pipeline/:id/trigger", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.triggerEventPipelineByAPI)
// SSE 流式执行工作流
pages.POST("/event-pipeline/:id/stream", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.streamEventPipeline)
// 事件Pipeline执行记录路由
pages.GET("/event-pipeline-executions", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.listAllEventPipelineExecutions)
pages.GET("/event-pipeline/:id/executions", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.listEventPipelineExecutions)
pages.GET("/event-pipeline/:id/execution/:exec_id", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipelineExecution)
pages.GET("/event-pipeline-execution/:exec_id", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipelineExecution)
pages.GET("/event-pipeline/:id/execution-stats", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipelineExecutionStats)
pages.POST("/event-pipeline-executions/clean", rt.auth(), rt.user(), rt.admin(), rt.cleanEventPipelineExecutions)
pages.POST("/notify-channel-configs", rt.auth(), rt.user(), rt.perm("/notification-channels/add"), rt.notifyChannelsAdd)
pages.DELETE("/notify-channel-configs", rt.auth(), rt.user(), rt.perm("/notification-channels/del"), rt.notifyChannelsDel)
pages.PUT("/notify-channel-config/:id", rt.auth(), rt.user(), rt.perm("/notification-channels/put"), rt.notifyChannelPut)
pages.GET("/notify-channel-config/:id", rt.auth(), rt.user(), rt.perm("/notification-channels"), rt.notifyChannelGet)
pages.GET("/notify-channel-configs", rt.auth(), rt.user(), rt.perm("/notification-channels"), rt.notifyChannelsGet)
pages.GET("/simplified-notify-channel-configs", rt.notifyChannelsGetForNormalUser)
pages.GET("/flashduty-channel-list/:id", rt.auth(), rt.user(), rt.flashDutyNotifyChannelsGet)
pages.GET("/pagerduty-integration-key/:id/:service_id/:integration_id", rt.auth(), rt.user(), rt.pagerDutyIntegrationKeyGet)
pages.GET("/pagerduty-service-list/:id", rt.auth(), rt.user(), rt.pagerDutyNotifyServicesGet)
pages.GET("/notify-channel-config", rt.auth(), rt.user(), rt.notifyChannelGetBy)
pages.GET("/notify-channel-config/idents", rt.notifyChannelIdentsGet)
// saved view 查询条件保存相关路由
pages.GET("/saved-views", rt.auth(), rt.user(), rt.savedViewGets)
pages.POST("/saved-views", rt.auth(), rt.user(), rt.savedViewAdd)
pages.PUT("/saved-view/:id", rt.auth(), rt.user(), rt.savedViewPut)
pages.DELETE("/saved-view/:id", rt.auth(), rt.user(), rt.savedViewDel)
pages.POST("/saved-view/:id/favorite", rt.auth(), rt.user(), rt.savedViewFavoriteAdd)
pages.DELETE("/saved-view/:id/favorite", rt.auth(), rt.user(), rt.savedViewFavoriteDel)
}
r.GET("/api/n9e/versions", func(c *gin.Context) {
v := version.Version
lastIndex := strings.LastIndex(version.Version, "-")
if lastIndex != -1 {
v = version.Version[:lastIndex]
}
gv := version.GithubVersion.Load()
if gv != nil {
ginx.NewRender(c).Data(gin.H{"version": v, "github_verison": gv.(string)}, nil)
} else {
ginx.NewRender(c).Data(gin.H{"version": v, "github_verison": ""}, nil)
}
})
if rt.HTTP.APIForService.Enable {
service := r.Group("/v1/n9e")
if len(rt.HTTP.APIForService.BasicAuth) > 0 {
service.Use(gin.BasicAuth(rt.HTTP.APIForService.BasicAuth))
}
{
service.Any("/prometheus/*url", rt.dsProxy)
service.POST("/users", rt.userAddPost)
service.PUT("/user/:id", rt.userProfilePutByService)
service.DELETE("/user/:id", rt.userDel)
service.GET("/users", rt.userFindAll)
service.GET("/user-groups", rt.userGroupGetsByService)
service.GET("/user-group-members", rt.userGroupMemberGetsByService)
service.GET("/targets", rt.targetGetsByService)
service.GET("/target/extra-meta", rt.targetExtendInfoByIdent)
service.POST("/target/list", rt.targetGetsByHostFilter)
service.DELETE("/targets", rt.targetDelByService)
service.GET("/targets/tags", rt.targetGetTags)
service.POST("/targets/tags", rt.targetBindTagsByService)
service.DELETE("/targets/tags", rt.targetUnbindTagsByService)
service.PUT("/targets/note", rt.targetUpdateNoteByService)
service.PUT("/targets/bgid", rt.targetUpdateBgidByService)
service.POST("/targets-of-host-query", rt.targetsOfHostQuery)
service.POST("/alert-rules", rt.alertRuleAddByService)
service.POST("/alert-rule-add", rt.alertRuleAddOneByService)
service.DELETE("/alert-rules", rt.alertRuleDelByService)
service.PUT("/alert-rule/:arid", rt.alertRulePutByService)
service.GET("/alert-rule/:arid", rt.alertRuleGet)
service.GET("/alert-rules", rt.alertRulesGetByService)
service.GET("/alert-subscribes", rt.alertSubscribeGetsByService)
service.GET("/busi-groups", rt.busiGroupGetsByService)
service.GET("/datasources", rt.datasourceGetsByService)
service.GET("/datasource-rsa-config", rt.datasourceRsaConfigGet)
service.GET("/datasource-ids", rt.getDatasourceIds)
service.POST("/server-heartbeat", rt.serverHeartbeat)
service.GET("/servers-active", rt.serversActive)
service.GET("/recording-rules", rt.recordingRuleGetsByService)
service.GET("/alert-mutes", rt.alertMuteGets)
service.GET("/active-alert-mutes", rt.activeAlertMuteGets)
service.POST("/alert-mutes", rt.alertMuteAddByService)
service.DELETE("/alert-mutes", rt.alertMuteDel)
service.GET("/alert-cur-events", rt.alertCurEventsList)
service.GET("/alert-cur-events-get-by-rid", rt.alertCurEventsGetByRid)
service.GET("/alert-his-events", rt.alertHisEventsList)
service.GET("/alert-his-event/:eid", rt.alertHisEventGet)
service.GET("/task-tpl/:tid", rt.taskTplGetByService)
service.GET("/task-tpls", rt.taskTplGetsByService)
service.GET("/task-tpl/statistics", rt.taskTplStatistics)
service.GET("/config/:id", rt.configGet)
service.GET("/configs", rt.configsGet)
service.GET("/config", rt.configGetByKey)
service.GET("/all-configs", rt.configGetAll)
service.PUT("/configs", rt.configsPut)
service.POST("/configs", rt.configsPost)
service.DELETE("/configs", rt.configsDel)
service.POST("/conf-prop/encrypt", rt.confPropEncrypt)
service.POST("/conf-prop/decrypt", rt.confPropDecrypt)
service.GET("/statistic", rt.statistic)
service.GET("/notify-tpls", rt.notifyTplGets)
service.POST("/task-record-add", rt.taskRecordAdd)
service.GET("/user-variable/decrypt", rt.userVariableGetDecryptByService)
service.GET("/targets-of-alert-rule", rt.targetsOfAlertRule)
service.POST("/notify-record", rt.notificationRecordAdd)
service.GET("/alert-cur-events-del-by-hash", rt.alertCurEventDelByHash)
service.POST("/center/heartbeat", rt.heartbeat)
service.GET("/es-index-pattern-list", rt.esIndexPatternGetList)
service.GET("/notify-rules", rt.notifyRulesGetByService)
service.GET("/notify-channels", rt.notifyChannelConfigGets)
service.GET("/message-templates", rt.messageTemplateGets)
service.GET("/event-pipelines", rt.eventPipelinesListByService)
service.POST("/event-pipeline/:id/trigger", rt.triggerEventPipelineByService)
service.POST("/event-pipeline/:id/stream", rt.streamEventPipelineByService)
service.POST("/event-pipeline-execution", rt.eventPipelineExecutionAdd)
// 手机号加密存储配置接口
service.POST("/users/phone/encrypt", rt.usersPhoneEncrypt)
service.POST("/users/phone/decrypt", rt.usersPhoneDecrypt)
service.POST("/users/phone/refresh-encryption-config", rt.usersPhoneDecryptRefresh)
service.GET("/builtin-components", rt.builtinComponentsGets)
service.GET("/builtin-payloads", rt.builtinPayloadsGets)
}
}
if rt.HTTP.APIForAgent.Enable {
heartbeat := r.Group("/v1/n9e")
{
if len(rt.HTTP.APIForAgent.BasicAuth) > 0 {
heartbeat.Use(gin.BasicAuth(rt.HTTP.APIForAgent.BasicAuth))
}
heartbeat.POST("/heartbeat", rt.heartbeat)
}
}
rt.configNoRoute(r, &statikFS)
}
// Render writes the JSON response envelope with HTTP status 200.
//
// When msg is nil the body is {"data": data, "error": ""}; an untyped nil
// data is replaced by an empty struct value first. When msg is non-nil the
// body is {"error": {"message": msg}} and data is ignored.
func Render(c *gin.Context, data, msg interface{}) {
	if msg != nil {
		c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": msg}})
		return
	}

	payload := data
	if payload == nil {
		payload = struct{}{}
	}
	c.JSON(http.StatusOK, gin.H{"data": payload, "error": ""})
}
// Dangerous writes an {"error": ...} JSON body with HTTP status 200 when v
// carries a failure.
//
//   - nil: nothing is written.
//   - string: written as the error unless it is empty.
//   - error: its Error() text is written.
//   - any other type: nothing is written.
//
// The variadic code parameter is accepted but not consulted by this
// function; the status is always 200. The function only writes the response,
// it does not return a value to the caller.
func Dangerous(c *gin.Context, v interface{}, code ...int) {
	switch failure := v.(type) {
	case nil:
		return
	case string:
		if failure == "" {
			return
		}
		c.JSON(http.StatusOK, gin.H{"error": failure})
	case error:
		c.JSON(http.StatusOK, gin.H{"error": failure.Error()})
	}
}
================================================
FILE: center/router/router_alert_aggr_view.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// alertAggrViewGets returns the alert aggregation views for the value stored
// under the "userid" key of the gin context. It takes no request parameters.
func (rt *Router) alertAggrViewGets(c *gin.Context) {
	userID := c.MustGet("userid")
	views, err := models.AlertAggrViewGets(rt.Ctx, userID)
	ginx.NewRender(c).Data(views, err)
}
// alertAggrViewAdd creates an alert aggregation view.
//
// Request body: name, rule, cate.
func (rt *Router) alertAggrViewAdd(c *gin.Context) {
	var view models.AlertAggrView
	ginx.BindJSON(c, &view)

	user := c.MustGet("user").(*models.User)

	// Never trust client-supplied identity fields: the id is reset and the
	// creator is always the current user.
	view.Id = 0
	view.CreateBy = user.Id

	// Administrators may choose whether the view is public or private;
	// regular users can only create private views, so their cate is forced to 1.
	if !user.IsAdmin() {
		view.Cate = 1
	}

	ginx.Dangerous(view.Add(rt.Ctx))
	ginx.NewRender(c).Data(view, nil)
}
// alertAggrViewDel deletes alert aggregation views by id.
//
// Request body: ids.
//
// Administrators delete by id alone; for everyone else the current user's id
// is passed to models.AlertAggrViewDel as an additional argument.
func (rt *Router) alertAggrViewDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	user := c.MustGet("user").(*models.User)
	if user.IsAdmin() {
		ginx.NewRender(c).Message(models.AlertAggrViewDel(rt.Ctx, form.Ids))
		return
	}

	ginx.NewRender(c).Message(models.AlertAggrViewDel(rt.Ctx, form.Ids, user.Id))
}
// alertAggrViewPut updates the name, rule and cate of an existing alert
// aggregation view.
//
// Request body: id, name, rule, cate.
//
// Non-admin users may only modify views they created (otherwise the response
// is 403 "forbidden") and always have cate forced to 1.
func (rt *Router) alertAggrViewPut(c *gin.Context) {
	var req models.AlertAggrView
	ginx.BindJSON(c, &req)

	stored, err := models.AlertAggrViewGet(rt.Ctx, "id = ?", req.Id)
	ginx.Dangerous(err)
	if stored == nil {
		ginx.NewRender(c).Message("no such item(id: %d)", req.Id)
		return
	}

	user := c.MustGet("user").(*models.User)
	cate := req.Cate
	if !user.IsAdmin() {
		if stored.CreateBy != user.Id {
			ginx.NewRender(c, http.StatusForbidden).Message("forbidden")
			return
		}
		cate = 1
	}

	stored.Name = req.Name
	stored.Rule = req.Rule
	stored.Cate = cate

	// A view without a recorded creator is adopted by whoever updates it.
	if stored.CreateBy == 0 {
		stored.CreateBy = user.Id
	}

	ginx.NewRender(c).Message(stored.Update(rt.Ctx))
}
================================================
FILE: center/router/router_alert_cur_event.go
================================================
package router
import (
"fmt"
"net/http"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// getUserGroupIds returns the result of models.MyGroupIds for the user stored
// under the "user" key of the gin context when myGroups is true. When
// myGroups is false it returns (nil, nil) without reading the context.
func getUserGroupIds(ctx *gin.Context, rt *Router, myGroups bool) ([]int64, error) {
	if myGroups {
		user := ctx.MustGet("user").(*models.User)
		return models.MyGroupIds(rt.Ctx, user.Id)
	}
	return nil, nil
}
// alertCurEventsCard groups active alert events into cards according to the
// rule of the aggregation view named by the "view_id" query parameter.
//
// Query parameters read here: severity, query, my_groups, view_id, prods
// (falling back to rule_prods), cate ("$all" means no category filter), plus
// whatever getTimeRange and queryDatasourceIds read.
//
// The response is a list of cards ordered by severity ascending, then total
// descending, then title ascending.
func (rt *Router) alertCurEventsCard(c *gin.Context) {
	stime, etime := getTimeRange(c)
	severity := strx.IdsInt64ForAPI(ginx.QueryStr(c, "severity", ""), ",")
	query := ginx.QueryStr(c, "query", "")
	myGroups := ginx.QueryBool(c, "my_groups", false) // whether to show only the caller's own groups; defaults to false

	if myGroups {
		// NOTE(review): the group ids returned here are not consumed by the
		// rest of this handler; the lookup is kept so that a failure still
		// aborts the request.
		_, err := getUserGroupIds(c, rt, myGroups)
		ginx.Dangerous(err)
	}

	viewId := ginx.QueryInt64(c, "view_id")
	alertView, err := models.GetAlertAggrViewByViewID(rt.Ctx, viewId)
	ginx.Dangerous(err)
	if alertView == nil {
		ginx.Bomb(http.StatusNotFound, "alert aggr view not found")
	}

	dsIds := queryDatasourceIds(c)

	prodParam := ginx.QueryStr(c, "prods", "")
	if prodParam == "" {
		prodParam = ginx.QueryStr(c, "rule_prods", "")
	}
	prods := []string{}
	if prodParam != "" {
		prods = strings.Split(prodParam, ",")
	}

	cates := []string{}
	if cateParam := ginx.QueryStr(c, "cate", "$all"); cateParam != "$all" {
		cates = strings.Split(cateParam, ",")
	}

	bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, myGroups)
	ginx.Dangerous(err)

	// Fetch at most 50000 events; fetching more than that is not meaningful.
	events, err := models.AlertCurEventsGet(rt.Ctx, prods, bgids, stime, etime, severity, dsIds,
		cates, 0, query, 50000, 0, []int64{})
	ginx.Dangerous(err)

	byTitle := make(map[string]*AlertCard)
	for _, ev := range events {
		title, err := ev.GenCardTitle(alertView.Rule)
		ginx.Dangerous(err)

		card, seen := byTitle[title]
		if seen {
			card.Total++
			card.EventIds = append(card.EventIds, ev.Id)
			// Keep the smallest severity value seen for the card.
			if ev.Severity < card.Severity {
				card.Severity = ev.Severity
			}
		} else {
			card = &AlertCard{
				Title:    title,
				Total:    1,
				EventIds: []int64{ev.Id},
				Severity: ev.Severity,
			}
			byTitle[title] = card
		}

		// Severity values below 1 are replaced with 3.
		if card.Severity < 1 {
			card.Severity = 3
		}
	}

	cards := make([]*AlertCard, 0, len(byTitle))
	for _, card := range byTitle {
		cards = append(cards, card)
	}

	// Titles are unique map keys, so using the title as the final tie-breaker
	// makes this a total order independent of map iteration order.
	sort.Slice(cards, func(i, j int) bool {
		a, b := cards[i], cards[j]
		if a.Severity != b.Severity {
			return a.Severity < b.Severity
		}
		if a.Total != b.Total {
			return a.Total > b.Total
		}
		return a.Title < b.Title
	})

	ginx.NewRender(c).Data(cards, nil)
}
// AlertCard is one aggregated group of active alert events, as built by
// alertCurEventsCard.
type AlertCard struct {
	// Title is the card title generated from the aggregation view's rule.
	Title string `json:"title"`
	// Total is the number of events grouped under this title.
	Total int `json:"total"`
	// EventIds holds the ids of the grouped events.
	EventIds []int64 `json:"event_ids"`
	// Severity is the smallest severity value among the grouped events;
	// alertCurEventsCard replaces values below 1 with 3.
	Severity int `json:"severity"`
}
// alertCurEventsCardDetails returns the active alert events whose ids are
// listed in the request body. When the lookup succeeds each event gets
// FillNotifyGroups called on it before the list is rendered.
func (rt *Router) alertCurEventsCardDetails(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)

	events, err := models.AlertCurEventGetByIds(rt.Ctx, form.Ids)
	if err != nil {
		ginx.NewRender(c).Data(events, err)
		return
	}

	// One cache is shared across all events of this request.
	groupCache := make(map[int64]*models.UserGroup)
	for i := range events {
		events[i].FillNotifyGroups(rt.Ctx, groupCache)
	}

	ginx.NewRender(c).Data(events, err)
}
// alertCurEventsGetByRid renders the result of
// models.AlertCurEventGetByRuleIdAndDsId for the "rid" (rule id) and "dsid"
// (datasource id) query parameters.
func (rt *Router) alertCurEventsGetByRid(c *gin.Context) {
	ruleID := ginx.QueryInt64(c, "rid")
	datasourceID := ginx.QueryInt64(c, "dsid")

	events, err := models.AlertCurEventGetByRuleIdAndDsId(rt.Ctx, ruleID, datasourceID)
	ginx.NewRender(c).Data(events, err)
}
// alertCurEventsList returns active alert events as a paginated list.
//
// Query parameters read here: severity, query, limit (default 20), my_groups,
// event_ids, prods (falling back to rule_prods), cate ("$all" means no
// category filter), rid (default 0), plus whatever getTimeRange,
// queryDatasourceIds and ginx.Offset read.
//
// The response is {"list": [...], "total": N}.
func (rt *Router) alertCurEventsList(c *gin.Context) {
	stime, etime := getTimeRange(c)
	severity := strx.IdsInt64ForAPI(ginx.QueryStr(c, "severity", ""), ",")
	query := ginx.QueryStr(c, "query", "")
	limit := ginx.QueryInt(c, "limit", 20)
	myGroups := ginx.QueryBool(c, "my_groups", false) // whether to show only the caller's own groups; defaults to false
	dsIds := queryDatasourceIds(c)
	eventIds := strx.IdsInt64ForAPI(ginx.QueryStr(c, "event_ids", ""), ",")

	prodParam := ginx.QueryStr(c, "prods", "")
	if prodParam == "" {
		prodParam = ginx.QueryStr(c, "rule_prods", "")
	}
	prods := []string{}
	if prodParam != "" {
		prods = strings.Split(prodParam, ",")
	}

	cates := []string{}
	if cateParam := ginx.QueryStr(c, "cate", "$all"); cateParam != "$all" {
		cates = strings.Split(cateParam, ",")
	}

	ruleId := ginx.QueryInt64(c, "rid", 0)

	bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, myGroups)
	ginx.Dangerous(err)

	total, err := models.AlertCurEventTotal(rt.Ctx, prods, bgids, stime, etime, severity, dsIds,
		cates, ruleId, query, eventIds)
	ginx.Dangerous(err)

	offset := ginx.Offset(c, limit)
	events, err := models.AlertCurEventsGet(rt.Ctx, prods, bgids, stime, etime, severity, dsIds,
		cates, ruleId, query, limit, offset, eventIds)
	ginx.Dangerous(err)

	// One cache is shared across all events of this page.
	groupCache := make(map[int64]*models.UserGroup)
	for i := range events {
		events[i].FillNotifyGroups(rt.Ctx, groupCache)
	}

	ginx.NewRender(c).Data(gin.H{
		"list":  events,
		"total": total,
	}, nil)
}
// alertCurEventDel deletes the active alert events whose ids are listed in the
// request body, after checkCurEventBusiGroupRWPermission has been run on them.
func (rt *Router) alertCurEventDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	rt.checkCurEventBusiGroupRWPermission(c, form.Ids)

	result := models.AlertCurEventDel(rt.Ctx, form.Ids)
	ginx.NewRender(c).Message(result)
}
// checkCurEventBusiGroupRWPermission runs rt.bgrwCheck once for every distinct
// business group that the given active events belong to.
//
// Ids that resolve to no event are skipped, and events whose group id is 0
// are exempt from the check.
func (rt *Router) checkCurEventBusiGroupRWPermission(c *gin.Context, ids []int64) {
	// Group id 0 is pre-seeded so such events bypass the permission check.
	checked := map[int64]struct{}{0: {}}

	for _, id := range ids {
		event, err := models.AlertCurEventGetById(rt.Ctx, id)
		ginx.Dangerous(err)
		if event == nil {
			continue
		}
		if _, done := checked[event.GroupId]; done {
			continue
		}

		rt.bgrwCheck(c, event.GroupId)
		checked[event.GroupId] = struct{}{}
	}
}
// alertCurEventGet renders the detail of the active alert event identified by
// the ":eid" URL parameter.
//
// The route is registered without auth middleware. When HasPermission
// reports false, rt.auth and rt.user are invoked inline and the caller must
// pass the read check for the event's business group.
func (rt *Router) alertCurEventGet(c *gin.Context) {
	eid := ginx.UrlParamInt64(c, "eid")
	event, err := GetCurEventDetail(rt.Ctx, eid)

	hasPermission := HasPermission(rt.Ctx, c, "event", fmt.Sprintf("%d", eid), rt.Center.AnonymousAccess.AlertDetail)
	if !hasPermission {
		rt.auth()(c)
		rt.user()(c)

		// GetCurEventDetail returns a nil event when the lookup fails (for
		// example when the event does not exist). Dereferencing it here
		// would panic, so the group check only runs when there is an event
		// to check; the error itself is rendered below.
		if event != nil {
			rt.bgroCheck(c, event.GroupId)
		}
	}

	ginx.NewRender(c).Data(event, err)
}
// GetCurEventDetail loads the active alert event with id eid and enriches it
// for display: the rule config is refreshed when models.FillRuleConfigTplName
// asks for a reset, LastEvalTime is set to TriggerTime, and NotifyVersion and
// NotifyRules are filled in.
//
// It returns (nil, err) when the event cannot be loaded and
// (nil, "no such active event") when it does not exist. When a later
// enrichment step fails, the event populated so far is returned together
// with the error.
//
// All failures are reported through the returned error. The function takes
// no gin context and therefore does not panic through ginx.Dangerous.
func GetCurEventDetail(ctx *ctx.Context, eid int64) (*models.AlertCurEvent, error) {
	event, err := models.AlertCurEventGetById(ctx, eid)
	if err != nil {
		return nil, err
	}
	if event == nil {
		return nil, fmt.Errorf("no such active event")
	}

	ruleConfig, needReset := models.FillRuleConfigTplName(ctx, event.RuleConfig)
	if needReset {
		event.RuleConfigJson = ruleConfig
	}

	event.LastEvalTime = event.TriggerTime

	event.NotifyVersion, err = GetEventNotifyVersion(ctx, event.RuleId, event.NotifyRuleIds)
	if err != nil {
		// Returned the same way as a NotifyRules failure below: the event
		// alongside the error, so callers decide how to report it.
		return event, err
	}

	event.NotifyRules, err = GetEventNotifyRuleNames(ctx, event.NotifyRuleIds)
	return event, err
}
// GetEventNotifyRuleNames looks up the notify rules whose ids are in
// notifyRuleIds and returns their id/name pairs. The result is an empty,
// non-nil slice when nothing matches, and nil when the query fails.
func GetEventNotifyRuleNames(ctx *ctx.Context, notifyRuleIds []int64) ([]*models.EventNotifyRule, error) {
	rules, err := models.NotifyRulesGet(ctx, "id in ?", notifyRuleIds)
	if err != nil {
		return nil, err
	}

	names := make([]*models.EventNotifyRule, 0, len(rules))
	for i := range rules {
		names = append(names, &models.EventNotifyRule{
			Id:   rules[i].ID,
			Name: rules[i].Name,
		})
	}
	return names, nil
}
// GetEventNotifyVersion reports which notification version applies to an
// event.
//
// When notifyRuleIds is non-empty the event is considered to use the new
// notification mechanism and 1 is returned without touching the database.
// Otherwise the NotifyVersion of the alert rule ruleId is returned.
//
// If the rule no longer exists (the lookup yields a nil rule and a nil
// error) 0 is returned, so events that outlive their rule can still be
// inspected instead of triggering a nil-pointer panic.
func GetEventNotifyVersion(ctx *ctx.Context, ruleId int64, notifyRuleIds []int64) (int, error) {
	if len(notifyRuleIds) != 0 {
		// The presence of notify_rule_ids means the new notification
		// mechanism is in use.
		return 1, nil
	}

	rule, err := models.AlertRuleGetById(ctx, ruleId)
	if err != nil {
		return 0, err
	}
	if rule == nil {
		return 0, nil
	}
	return rule.NotifyVersion, nil
}
// alertCurEventsStatistics renders the result of
// models.AlertCurEventStatistics evaluated at the current time.
func (rt *Router) alertCurEventsStatistics(c *gin.Context) {
	stats := models.AlertCurEventStatistics(rt.Ctx, time.Now())
	ginx.NewRender(c).Data(stats, nil)
}
// alertCurEventDelByHash deletes active alert events by the "hash" query
// parameter and renders the outcome.
func (rt *Router) alertCurEventDelByHash(c *gin.Context) {
	hash := ginx.QueryStr(c, "hash")
	result := models.AlertCurEventDelByHash(rt.Ctx, hash)
	ginx.NewRender(c).Message(result)
}
// eventTagKeys returns the distinct tag keys found on active alert events
// triggered within the last 24 hours, sorted alphabetically.
//
// At most 200 events are sampled. A fixed default list is returned when the
// business group lookup or the event query fails (the failure is logged),
// when no events are found, or when the events carry no tags.
func (rt *Router) eventTagKeys(c *gin.Context) {
	defaultKeys := []string{"ident", "app", "service", "instance"}

	// Look at active alert events from the last day.
	now := time.Now().Unix()
	stime := now - 24*3600
	etime := now

	// Business groups visible to the caller.
	bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, false)
	if err != nil {
		logger.Warningf("failed to get business group ids: %v", err)
		ginx.NewRender(c).Data(defaultKeys, nil)
		return
	}

	// Cap the number of events to keep the query cheap.
	events, err := models.AlertCurEventsGet(rt.Ctx, []string{}, bgids, stime, etime, []int64{}, []int64{}, []string{}, 0, "", 200, 0, []int64{})
	if err != nil {
		logger.Warningf("failed to get current alert events: %v", err)
		ginx.NewRender(c).Data(defaultKeys, nil)
		return
	}

	if len(events) == 0 {
		ginx.NewRender(c).Data(defaultKeys, nil)
		return
	}

	// Collect and de-duplicate tag keys.
	tagKeys := make(map[string]struct{})
	for _, event := range events {
		for key := range event.TagsMap {
			tagKeys[key] = struct{}{}
		}
	}

	if len(tagKeys) == 0 {
		ginx.NewRender(c).Data(defaultKeys, nil)
		return
	}

	result := make([]string, 0, len(tagKeys))
	for key := range tagKeys {
		result = append(result, key)
	}
	// Map iteration order is random; sort so repeated calls return the keys
	// in the same order.
	sort.Strings(result)

	ginx.NewRender(c).Data(result, nil)
}
// eventTagValues returns up to 20 values of the tag named by the "key" query
// parameter, taken from active alert events triggered within the last 24
// hours and ordered by how often each value occurs (most frequent first,
// ties broken alphabetically).
//
// At most 1000 events are sampled. An empty list is returned when the
// business group lookup or the event query fails (the failure is logged) or
// when no events are found.
func (rt *Router) eventTagValues(c *gin.Context) {
	tagKey := ginx.QueryStr(c, "key")

	// Look at active alert events from the last day.
	now := time.Now().Unix()
	stime := now - 24*3600
	etime := now

	// Business groups visible to the caller.
	bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, false)
	if err != nil {
		logger.Warningf("failed to get business group ids: %v", err)
		ginx.NewRender(c).Data([]string{}, nil)
		return
	}

	// Sample more events than eventTagKeys does so the counts are more
	// representative.
	events, err := models.AlertCurEventsGet(rt.Ctx, []string{}, bgids, stime, etime, []int64{}, []int64{}, []string{}, 0, "", 1000, 0, []int64{})
	if err != nil {
		logger.Warningf("failed to get current alert events: %v", err)
		ginx.NewRender(c).Data([]string{}, nil)
		return
	}

	if len(events) == 0 {
		ginx.NewRender(c).Data([]string{}, nil)
		return
	}

	// Count occurrences of each non-empty value of the requested tag.
	valueCount := make(map[string]int)
	for _, event := range events {
		if value, exists := event.TagsMap[tagKey]; exists && value != "" {
			valueCount[value]++
		}
	}

	type tagValue struct {
		value string
		count int
	}
	tagValues := make([]tagValue, 0, len(valueCount))
	for value, count := range valueCount {
		tagValues = append(tagValues, tagValue{value, count})
	}

	// Order by count descending. The slice is filled from a map, so equal
	// counts would otherwise come out in random order and make the top-20
	// cut-off unstable; the value is used as a deterministic tie-breaker.
	sort.Slice(tagValues, func(i, j int) bool {
		if tagValues[i].count != tagValues[j].count {
			return tagValues[i].count > tagValues[j].count
		}
		return tagValues[i].value < tagValues[j].value
	})

	// Keep only the top 20.
	limit := 20
	if len(tagValues) < limit {
		limit = len(tagValues)
	}

	result := make([]string, 0, limit)
	for i := 0; i < limit; i++ {
		result = append(result, tagValues[i].value)
	}

	ginx.NewRender(c).Data(result, nil)
}
================================================
FILE: center/router/router_alert_eval_detail.go
================================================
package router
import (
"encoding/json"
"fmt"
"io"
"net/http"
"sort"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/loggrep"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// alertEvalDetailPage serves an HTML log viewer for the evaluation logs of
// the alert rule identified by the ":id" URL parameter.
//
// It answers 400 when the id fails loggrep.IsValidRuleID and 500 when the
// logs cannot be collected or the page cannot be rendered.
func (rt *Router) alertEvalDetailPage(c *gin.Context) {
	ruleID := ginx.UrlParamStr(c, "id")
	if !loggrep.IsValidRuleID(ruleID) {
		c.String(http.StatusBadRequest, "invalid rule id format")
		return
	}

	logs, instance, err := rt.getAlertEvalLogs(ruleID)
	if err != nil {
		c.String(http.StatusInternalServerError, "Error: %v", err)
		return
	}

	page := loggrep.AlertEvalPageData{
		RuleID:   ruleID,
		Instance: instance,
		Logs:     logs,
		Total:    len(logs),
	}

	c.Header("Content-Type", "text/html; charset=utf-8")
	if err = loggrep.RenderAlertEvalHTML(c.Writer, page); err != nil {
		c.String(http.StatusInternalServerError, "render error: %v", err)
	}
}
// alertEvalDetailJSON serves the evaluation logs of the alert rule identified
// by the ":id" URL parameter as JSON, together with the instance(s) the logs
// were collected from.
func (rt *Router) alertEvalDetailJSON(c *gin.Context) {
	ruleID := ginx.UrlParamStr(c, "id")
	if !loggrep.IsValidRuleID(ruleID) {
		ginx.Bomb(http.StatusOK, "invalid rule id format")
	}

	logs, instance, err := rt.getAlertEvalLogs(ruleID)
	ginx.Dangerous(err)

	resp := loggrep.EventDetailResp{
		Logs:     logs,
		Instance: instance,
	}
	ginx.NewRender(c).Data(resp, nil)
}
// getAlertEvalLogs resolves the node(s) for alert rule id and collects that
// rule's evaluation log lines from them.
//
// It returns the log lines, a description of the instance(s) they came from,
// and an error when id is not a valid 64-bit integer, when the rule cannot be
// loaded, or when it does not exist.
//
// Logs are grepped from the local log directory when the rule has no
// datasources or when no target node can be resolved. Otherwise each
// resolved node is queried (locally for this instance, over HTTP for the
// others); nodes that fail are skipped on a best-effort basis. The merged
// lines are sorted in descending string order and truncated to
// loggrep.MaxLogLines.
func (rt *Router) getAlertEvalLogs(id string) ([]string, string, error) {
	ruleId, err := strconv.ParseInt(id, 10, 64)
	if err != nil {
		// Do not fall through with ruleId == 0 (or a clamped value on
		// overflow) and look up the wrong rule.
		return nil, "", fmt.Errorf("invalid rule id %q: %w", id, err)
	}

	rule, err := models.AlertRuleGetById(rt.Ctx, ruleId)
	if err != nil {
		return nil, "", err
	}
	if rule == nil {
		return nil, "", fmt.Errorf("no such alert rule")
	}

	instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port)
	keyword := fmt.Sprintf("alert_eval_%s", id)

	// Get datasource IDs for this rule.
	dsIds := rt.DatasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries)
	if len(dsIds) == 0 {
		// No datasources found (e.g. host rule), try local grep.
		logs, err := loggrep.GrepLogDir(rt.LogDir, keyword)
		return logs, instance, err
	}

	// Resolve the distinct target nodes; datasources whose node cannot be
	// resolved are skipped.
	nodeSet := make(map[string]struct{})
	for _, dsId := range dsIds {
		node, err := rt.getNodeForDatasource(dsId, id)
		if err != nil {
			continue
		}
		nodeSet[node] = struct{}{}
	}

	if len(nodeSet) == 0 {
		// No node could be resolved, grep locally.
		logs, err := loggrep.GrepLogDir(rt.LogDir, keyword)
		return logs, instance, err
	}

	// Collect logs from all target nodes.
	var allLogs []string
	instances := make([]string, 0, len(nodeSet))
	for node := range nodeSet {
		if node == instance {
			logs, err := loggrep.GrepLogDir(rt.LogDir, keyword)
			if err == nil {
				allLogs = append(allLogs, logs...)
				instances = append(instances, node)
			}
			continue
		}

		logs, nodeAddr, err := rt.forwardAlertEvalDetail(node, id)
		if err == nil {
			allLogs = append(allLogs, logs...)
			instances = append(instances, nodeAddr)
		}
	}

	// nodeSet is a map, so the collection order above is random; sort the
	// instance names so the reported string is stable between calls.
	sort.Strings(instances)

	// Descending string order; presumably each line starts with a timestamp,
	// which makes this newest-first — TODO confirm against the log format.
	sort.Slice(allLogs, func(i, j int) bool {
		return allLogs[i] > allLogs[j]
	})

	if len(allLogs) > loggrep.MaxLogLines {
		allLogs = allLogs[:loggrep.MaxLogLines]
	}

	return allLogs, strings.Join(instances, ", "), nil
}
// forwardAlertEvalDetail asks another node for the evaluation logs of rule id
// by calling its /v1/n9e/alert-eval-detail/<id> endpoint.
//
// node is the "host:port" address of the peer. On success it returns the log
// lines and the instance string reported by the peer. On failure it returns
// nil logs, the node address that was contacted, and the error.
//
// The response body is capped at 10MB. An application-level error carried in
// the JSON "err" field takes precedence; a non-200 status without such a
// field is reported as an unexpected-status error instead of being mistaken
// for an empty result or surfacing as a bare JSON decode error.
func (rt *Router) forwardAlertEvalDetail(node, id string) ([]string, string, error) {
	target := fmt.Sprintf("http://%s/v1/n9e/alert-eval-detail/%s", node, id)
	req, err := http.NewRequest(http.MethodGet, target, nil)
	if err != nil {
		return nil, node, err
	}

	// Use the first configured service credential, if any.
	for user, pass := range rt.HTTP.APIForService.BasicAuth {
		req.SetBasicAuth(user, pass)
		break
	}

	client := &http.Client{Timeout: 15 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, node, fmt.Errorf("forward to %s failed: %w", node, err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit
	if err != nil {
		return nil, node, err
	}

	var result struct {
		Dat loggrep.EventDetailResp `json:"dat"`
		Err string                  `json:"err"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		if resp.StatusCode != http.StatusOK {
			// The body is not the expected JSON envelope (e.g. a proxy error
			// page); the status line is the more useful diagnostic.
			return nil, node, fmt.Errorf("forward to %s failed: unexpected status %s", node, resp.Status)
		}
		return nil, node, err
	}
	if result.Err != "" {
		return nil, node, fmt.Errorf("%s", result.Err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, node, fmt.Errorf("forward to %s failed: unexpected status %s", node, resp.Status)
	}

	return result.Dat.Logs, result.Dat.Instance, nil
}
================================================
FILE: center/router/router_alert_his_event.go
================================================
package router
import (
"fmt"
"net/http"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
"golang.org/x/exp/slices"
)
// getTimeRange derives a [stime, etime] window (Unix seconds) from the
// "stime", "etime" and "hours" query parameters.
//
// A non-zero "hours" wins over the explicit bounds: the window then starts
// that many hours before now and ends 24h after now. Otherwise, when only
// "stime" is given, the end defaults to 24h after now. In every other case
// the raw query values are returned unchanged.
func getTimeRange(c *gin.Context) (stime, etime int64) {
	const dayInSeconds = 3600 * 24

	stime = ginx.QueryInt64(c, "stime", 0)
	etime = ginx.QueryInt64(c, "etime", 0)
	lookbackHours := ginx.QueryInt64(c, "hours", 0)
	now := time.Now().Unix()

	switch {
	case lookbackHours != 0:
		return now - 3600*lookbackHours, now + dayInSeconds
	case stime != 0 && etime == 0:
		return stime, now + dayInSeconds
	}
	return stime, etime
}
// alertHisEventsList returns one page of historical alert events plus the
// total number of matching events.
//
// Filters are read from the query string: time window (see getTimeRange),
// severity, is_recovered, query, datasource ids, prods (falling back to
// rule_prods), cate ("$all" disables the filter) and rid. The set of visible
// business groups is resolved through GetBusinessGroupIds.
func (rt *Router) alertHisEventsList(c *gin.Context) {
	stime, etime := getTimeRange(c)
	severity := ginx.QueryInt(c, "severity", -1)
	recovered := ginx.QueryInt(c, "is_recovered", -1)
	query := ginx.QueryStr(c, "query", "")
	pageSize := ginx.QueryInt(c, "limit", 20)
	dsIds := queryDatasourceIds(c)

	// "prods" takes precedence; "rule_prods" is consulted only when it is absent.
	prodParam := ginx.QueryStr(c, "prods", "")
	if len(prodParam) == 0 {
		prodParam = ginx.QueryStr(c, "rule_prods", "")
	}
	prods := make([]string, 0)
	if len(prodParam) > 0 {
		prods = strings.Split(prodParam, ",")
	}

	cateParam := ginx.QueryStr(c, "cate", "$all")
	cates := make([]string, 0)
	if cateParam != "$all" {
		cates = strings.Split(cateParam, ",")
	}

	ruleId := ginx.QueryInt64(c, "rid", 0)

	bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, false)
	ginx.Dangerous(err)

	total, err := models.AlertHisEventTotal(rt.Ctx, prods, bgids, stime, etime, severity,
		recovered, dsIds, cates, ruleId, query, []int64{})
	ginx.Dangerous(err)

	events, err := models.AlertHisEventGets(rt.Ctx, prods, bgids, stime, etime, severity, recovered,
		dsIds, cates, ruleId, query, pageSize, ginx.Offset(c, pageSize), []int64{})
	ginx.Dangerous(err)

	// One cache shared across the page so each user group is loaded once.
	groupCache := make(map[int64]*models.UserGroup)
	for i := range events {
		events[i].FillNotifyGroups(rt.Ctx, groupCache)
	}

	ginx.NewRender(c).Data(gin.H{
		"list":  events,
		"total": total,
	}, nil)
}
// alertHisEventsDeleteForm is the JSON request body bound by alertHisEventsDelete.
// Both fields are handed unchanged to models.AlertHisEventBatchDelete.
type alertHisEventsDeleteForm struct {
// Severities is the optional severity filter.
// NOTE(review): presumably an empty list means "all severities" — confirm in models.AlertHisEventBatchDelete.
Severities []int `json:"severities"`
// Timestamp is mandatory (binding:"required"); a zero value is rejected by the handler.
// NOTE(review): presumably a Unix-seconds cutoff before which events are deleted — confirm in the model.
Timestamp int64 `json:"timestamp" binding:"required"`
}
// alertHisEventsDelete kicks off an asynchronous purge of historical alert
// events and answers immediately. The purge runs in a background goroutine
// that deletes in batches of 100, pausing 100ms between batches, and stops
// on the first error or as soon as a batch comes back short.
func (rt *Router) alertHisEventsDelete(c *gin.Context) {
	var form alertHisEventsDeleteForm
	ginx.BindJSON(c, &form)

	// Validate input.
	if form.Timestamp == 0 {
		ginx.Bomb(http.StatusBadRequest, "timestamp parameter is required")
		return
	}

	operator := c.MustGet("user").(*models.User)

	// Start the background cleanup task.
	go func() {
		const batchSize = 100
		for {
			deleted, err := models.AlertHisEventBatchDelete(rt.Ctx, form.Timestamp, form.Severities, batchSize)
			if err != nil {
				logger.Errorf("Failed to delete alert history events: operator=%s, timestamp=%d, severities=%v, error=%v",
					operator.Username, form.Timestamp, form.Severities, err)
				return
			}

			logger.Debugf("Successfully deleted alert history events: operator=%s, timestamp=%d, severities=%v, deleted=%d",
				operator.Username, form.Timestamp, form.Severities, deleted)

			// A short batch means nothing is left to delete.
			if deleted < int64(batchSize) {
				return
			}

			// Pause between batches to avoid holding table locks for too long.
			time.Sleep(100 * time.Millisecond)
		}
	}()

	ginx.NewRender(c).Data("Alert history events deletion started", nil)
}
// TransferEventToCur is the package-level hook used by alertHisEventGet to
// turn a historical event into the current-event shape that is rendered.
// It is a variable rather than a plain function so the conversion can be
// replaced by assigning a different function.
var TransferEventToCur func(*ctx.Context, *models.AlertHisEvent) *models.AlertCurEvent
// init installs the default conversion, transferEventToCur.
func init() {
TransferEventToCur = transferEventToCur
}
// transferEventToCur is the default TransferEventToCur implementation: it
// simply returns event.ToCur(). The context argument is unused.
func transferEventToCur(ctx *ctx.Context, event *models.AlertHisEvent) *models.AlertCurEvent {
	return event.ToCur()
}
// alertHisEventGet returns a single historical alert event, identified by
// the "eid" URL parameter, converted through TransferEventToCur.
//
// When HasPermission does not grant access, the regular auth, user and
// business-group read checks are run inline before the event is returned.
func (rt *Router) alertHisEventGet(c *gin.Context) {
	eventID := ginx.UrlParamInt64(c, "eid")

	event, err := models.AlertHisEventGetById(rt.Ctx, eventID)
	ginx.Dangerous(err)
	if event == nil {
		ginx.Bomb(404, "No such alert event")
	}

	allowed := HasPermission(rt.Ctx, c, "event", fmt.Sprintf("%d", eventID), rt.Center.AnonymousAccess.AlertDetail)
	if !allowed {
		rt.auth()(c)
		rt.user()(c)
		rt.bgroCheck(c, event.GroupId)
	}

	// Replace the rule config only when FillRuleConfigTplName reports a change.
	if filled, changed := models.FillRuleConfigTplName(rt.Ctx, event.RuleConfig); changed {
		event.RuleConfigJson = filled
	}

	event.NotifyVersion, err = GetEventNotifyVersion(rt.Ctx, event.RuleId, event.NotifyRuleIds)
	ginx.Dangerous(err)

	event.NotifyRules, err = GetEventNotifyRuleNames(rt.Ctx, event.NotifyRuleIds)
	ginx.NewRender(c).Data(TransferEventToCur(rt.Ctx, event), err)
}
// GetBusinessGroupIds works out which business group ids a listing request
// may query, based on the optional "bgid" query parameter and the caller.
//
//   - Requests whose path starts with "/v1" skip the user lookup: the result
//     is [bgid] when bgid > 0, otherwise nil.
//   - When myGroups is set, or onlySelfGroupView is set and the user is not
//     an admin, the result is limited to the user's own groups. A user with
//     no groups gets [0] so the query matches nothing rather than
//     everything. A bgid outside the user's groups is rejected unless the
//     user is an admin.
//   - Otherwise the result is [bgid] when bgid > 0, else nil.
func GetBusinessGroupIds(c *gin.Context, ctx *ctx.Context, onlySelfGroupView bool, myGroups bool) ([]int64, error) {
	requested := ginx.QueryInt64(c, "bgid", 0)

	// explicit is the single-group filter when a group was named, nil otherwise.
	var explicit []int64
	if requested > 0 {
		explicit = []int64{requested}
	}

	// Paths starting with /v1 do not look up user information.
	if strings.HasPrefix(c.Request.URL.Path, "/v1") {
		return explicit, nil
	}

	user := c.MustGet("user").(*models.User)

	restricted := myGroups || (onlySelfGroupView && !user.IsAdmin())
	if !restricted {
		return explicit, nil
	}

	// Either "my business groups" was ticked on the page, or the deployment
	// only lets users see their own groups.
	own, err := models.MyBusiGroupIds(ctx, user.Id)
	if err != nil {
		return nil, err
	}
	if len(own) == 0 {
		// Return a single 0 so the caller does not end up querying the whole
		// alert history when the user belongs to no group.
		return []int64{0}, nil
	}
	if requested <= 0 {
		return own, nil
	}
	if !slices.Contains(own, requested) && !user.IsAdmin() {
		return nil, fmt.Errorf("business group ID not allowed")
	}
	return explicit, nil
}
================================================
FILE: center/router/router_alert_rule.go
================================================
package router
import (
"encoding/json"
"fmt"
"net/http"
"regexp"
"strconv"
"strings"
"time"
"gopkg.in/yaml.v2"
"github.com/ccfos/nightingale/v6/alert/mute"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/gin-gonic/gin"
"github.com/jinzhu/copier"
"github.com/pkg/errors"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/i18n"
)
// AlertRuleModifyHookFunc is a callback that receives an alert rule and may
// modify it in place.
// NOTE(review): presumably the type of Router.AlertRuleModifyHook, which alertRuleGet calls before rendering — confirm on the Router struct.
type AlertRuleModifyHookFunc func(ar *models.AlertRule)
// alertRuleGets returns every alert rule of the business group named by the
// "id" URL parameter. Nothing is paged or filtered here; search and paging
// are left to the front-end.
func (rt *Router) alertRuleGets(c *gin.Context) {
	groupID := ginx.UrlParamInt64(c, "id")

	rules, err := models.AlertRuleGets(rt.Ctx, groupID)
	if err != nil {
		ginx.NewRender(c).Data(rules, err)
		return
	}

	// Share one cache across all rules so each user group is loaded once.
	groupCache := make(map[int64]*models.UserGroup)
	for i := range rules {
		rules[i].FillNotifyGroups(rt.Ctx, groupCache)
	}
	models.FillUpdateByNicknames(rt.Ctx, rules)

	ginx.NewRender(c).Data(rules, err)
}
// GetAlertCueEventTimeRange reads the "stime"/"etime" query parameters (Unix
// seconds) and fills in defaults: a missing etime becomes now, and a missing
// stime, or one that is not before etime, becomes 30 days before etime.
func GetAlertCueEventTimeRange(c *gin.Context) (stime, etime int64) {
	// 30 days expressed in seconds.
	const defaultWindow = int64(30 * 24 * time.Hour / time.Second)

	from := ginx.QueryInt64(c, "stime", 0)
	to := ginx.QueryInt64(c, "etime", 0)

	if to == 0 {
		to = time.Now().Unix()
	}
	if from == 0 || from >= to {
		from = to - defaultWindow
	}
	return from, to
}
// alertRuleGetsByGids lists alert rules for the business groups given in the
// comma-separated "gids" query parameter.
//
// Each explicitly requested group goes through the read-permission check.
// When no group is given, non-admin users are limited to their own groups
// (an empty list is returned if they have none), while admins query with an
// empty group list. Every rule is enriched with notify groups, resolved
// datasource ids, the count of current events in the requested time range,
// and updater nicknames.
func (rt *Router) alertRuleGetsByGids(c *gin.Context) {
	gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")

	if len(gids) == 0 {
		me := c.MustGet("user").(*models.User)
		if !me.IsAdmin() {
			var err error
			gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id)
			ginx.Dangerous(err)

			if len(gids) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	} else {
		for _, gid := range gids {
			rt.bgroCheck(c, gid)
		}
	}

	rules, err := models.AlertRuleGetsByBGIds(rt.Ctx, gids)
	if err != nil {
		ginx.NewRender(c).Data(rules, err)
		return
	}

	groupCache := make(map[int64]*models.UserGroup)
	ruleIDs := make([]int64, 0, len(rules))
	for i := range rules {
		rules[i].FillNotifyGroups(rt.Ctx, groupCache)

		if len(rules[i].DatasourceQueries) != 0 {
			rules[i].DatasourceIdsJson = rt.DatasourceCache.GetIDsByDsCateAndQueries(rules[i].Cate, rules[i].DatasourceQueries)
		}

		ruleIDs = append(ruleIDs, rules[i].Id)
	}

	stime, etime := GetAlertCueEventTimeRange(c)
	if counts := models.AlertCurEventCountByRuleId(rt.Ctx, ruleIDs, stime, etime); counts != nil {
		for i := range rules {
			rules[i].CurEventCount = counts[rules[i].Id]
		}
	}

	models.FillUpdateByNicknames(rt.Ctx, rules)
	ginx.NewRender(c).Data(rules, err)
}
// alertRulesGetByService lists alert rules for service callers, filtered by
// the "prods", "query", "algorithm", "cluster", "cate" and "disabled" query
// parameters. "cate" defaults to "$all", which disables the category filter;
// "disabled" defaults to -1. Each rule is enriched with notify groups,
// resolved datasource ids and updater nicknames.
func (rt *Router) alertRulesGetByService(c *gin.Context) {
	prods := make([]string, 0)
	if prodParam := ginx.QueryStr(c, "prods", ""); prodParam != "" {
		prods = strings.Split(prodParam, ",")
	}

	query := ginx.QueryStr(c, "query", "")
	algorithm := ginx.QueryStr(c, "algorithm", "")
	cluster := ginx.QueryStr(c, "cluster", "")

	cates := make([]string, 0)
	if cateParam := ginx.QueryStr(c, "cate", "$all"); cateParam != "$all" {
		cates = strings.Split(cateParam, ",")
	}

	disabled := ginx.QueryInt(c, "disabled", -1)

	rules, err := models.AlertRulesGetsBy(rt.Ctx, prods, query, algorithm, cluster, cates, disabled)
	if err != nil {
		ginx.NewRender(c).Data(rules, err)
		return
	}

	groupCache := make(map[int64]*models.UserGroup)
	for i := range rules {
		rules[i].FillNotifyGroups(rt.Ctx, groupCache)

		if len(rules[i].DatasourceQueries) != 0 {
			rules[i].DatasourceIdsJson = rt.DatasourceCache.GetIDsByDsCateAndQueries(rules[i].Cate, rules[i].DatasourceQueries)
		}
	}
	models.FillUpdateByNicknames(rt.Ctx, rules)

	ginx.NewRender(c).Data(rules, err)
}
// alertRuleAddByFE creates one or many alert rules (single add or import)
// in the business group named by the "id" URL parameter. The response data
// maps each rule name to its error message, empty on success.
func (rt *Router) alertRuleAddByFE(c *gin.Context) {
	operator := c.MustGet("username").(string)

	var rules []models.AlertRule
	ginx.BindJSON(c, &rules)

	if len(rules) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	groupID := ginx.UrlParamInt64(c, "id")
	lang := c.GetHeader("X-Language")

	results := rt.alertRuleAdd(rules, operator, groupID, lang)
	ginx.NewRender(c).Data(results, nil)
}
// AlertRuleTryRunForm is the JSON request body bound by alertRuleNotifyTryRun
// and alertRuleEnableTryRun. Both fields are required by the binding.
type AlertRuleTryRunForm struct {
// EventId is the id of the historical alert event the try-run is evaluated against
// (looked up with models.AlertHisEventGetById).
EventId int64 `json:"event_id" binding:"required"`
// AlertRuleConfig is the rule configuration under test, sent as "config".
AlertRuleConfig models.AlertRule `json:"config" binding:"required"`
}
// alertRuleNotifyTryRun dry-runs the notification part of an alert rule
// configuration against an existing historical event.
//
// For NotifyVersion == 1 the event is actually sent through every notify
// config of every referenced notify rule; any failure aborts the request.
//
// For the old-style configuration nothing is sent. Instead, for each built-in
// channel selected in the submitted rule, the handler checks that at least
// one user of the selected notify groups has a token for that channel, and
// fails listing the channels that nobody can receive. Non-built-in channels
// are ignored.
//
// The channels checked are the ones in the submitted rule configuration, the
// same list the "no notify channels selected" guard looks at, not the
// channels recorded on the historical event.
func (rt *Router) alertRuleNotifyTryRun(c *gin.Context) {
	var f AlertRuleTryRunForm
	ginx.BindJSON(c, &f)

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId)
	ginx.Dangerous(err)
	if hisEvent == nil {
		ginx.Bomb(http.StatusNotFound, "event not found")
	}

	curEvent := *hisEvent.ToCur()
	curEvent.SetTagsMap()

	if f.AlertRuleConfig.NotifyVersion == 1 {
		for _, id := range f.AlertRuleConfig.NotifyRuleIds {
			notifyRule, err := models.GetNotifyRule(rt.Ctx, id)
			ginx.Dangerous(err)
			for _, notifyConfig := range notifyRule.NotifyConfigs {
				_, err = SendNotifyChannelMessage(rt.Ctx, rt.UserCache, rt.UserGroupCache, notifyConfig, []*models.AlertCurEvent{&curEvent})
				ginx.Dangerous(err)
			}
		}
		ginx.NewRender(c).Data("notification test ok", nil)
		return
	}

	// check notify channels of old version
	channels := f.AlertRuleConfig.NotifyChannelsJSON
	if len(channels) == 0 {
		ginx.Bomb(http.StatusOK, "no notify channels selected")
	}
	if len(f.AlertRuleConfig.NotifyGroupsJSON) == 0 {
		ginx.Bomb(http.StatusOK, "no notify groups selected")
	}

	// Resolve the selected notify groups to their users; ids that are not
	// valid integers are skipped.
	ugids := f.AlertRuleConfig.NotifyGroupsJSON
	ngids := make([]int64, 0, len(ugids))
	for i := 0; i < len(ugids); i++ {
		if gid, err := strconv.ParseInt(ugids[i], 10, 64); err == nil {
			ngids = append(ngids, gid)
		}
	}
	userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids)
	uids := make([]int64, 0)
	for i := range userGroups {
		uids = append(uids, userGroups[i].UserIds...)
	}
	users := rt.UserCache.GetByUserIds(uids)

	// ancs collects the built-in channels for which no user has a token.
	ancs := make([]string, 0, len(channels))
	for _, channel := range channels {
		// ignore non-default channels
		switch channel {
		case models.Dingtalk, models.Wecom, models.Feishu, models.Mm,
			models.Telegram, models.Email, models.FeishuCard:
			// do nothing
		default:
			continue
		}

		// default channels: one user with a token is enough
		missing := true
		for ui := range users {
			if _, b := users[ui].ExtractToken(channel); b {
				missing = false
				break
			}
		}
		if missing {
			ancs = append(ancs, channel)
		}
	}

	if len(ancs) > 0 {
		ginx.Dangerous(errors.New(fmt.Sprintf("All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). %v", ancs)))
	}
	ginx.NewRender(c).Data("notification test ok", nil)
}
// alertRuleEnableTryRun checks whether a historical event would be effective
// under the submitted rule configuration. It reports, in order, a disabled
// rule, an event rejected by mute.TimeSpanMuteStrategy, and an event rejected
// by mute.BgNotMatchMuteStrategy; otherwise it answers "event is effective".
func (rt *Router) alertRuleEnableTryRun(c *gin.Context) {
	var form AlertRuleTryRunForm
	ginx.BindJSON(c, &form)

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, form.EventId)
	ginx.Dangerous(err)
	if hisEvent == nil {
		ginx.Bomb(http.StatusNotFound, "event not found")
	}

	event := *hisEvent.ToCur()
	event.SetTagsMap()

	rule := &form.AlertRuleConfig

	if rule.Disabled == 1 {
		ginx.Bomb(http.StatusOK, "rule is disabled")
	}

	if muted := mute.TimeSpanMuteStrategy(rule, &event); muted {
		ginx.Bomb(http.StatusOK, "event is not match for period of time")
	}

	if muted := mute.BgNotMatchMuteStrategy(rule, &event, rt.TargetCache); muted {
		ginx.Bomb(http.StatusOK, "event target busi group not match rule busi group")
	}

	ginx.NewRender(c).Data("event is effective", nil)
}
// alertRuleAddByImport imports alert rules into the business group named by
// the "id" URL parameter. Before saving, every rule without datasource
// queries gets models.DataSourceQueryAll, and every rule is switched to
// notify version 1 with its old-style channels, groups and callbacks cleared.
// The response data maps each rule name to its error message, empty on success.
func (rt *Router) alertRuleAddByImport(c *gin.Context) {
	operator := c.MustGet("username").(string)

	var rules []models.AlertRule
	ginx.BindJSON(c, &rules)

	if len(rules) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	for i := range rules {
		rule := &rules[i]

		if len(rule.DatasourceQueries) == 0 {
			rule.DatasourceQueries = []models.DatasourceQuery{
				models.DataSourceQueryAll,
			}
		}

		// Imported rules are uniformly converted to the new notify-rule configuration.
		rule.NotifyVersion = 1
		rule.NotifyChannelsJSON = []string{}
		rule.NotifyGroupsJSON = []string{}
		rule.NotifyChannels = ""
		rule.NotifyGroups = ""
		rule.Callbacks = ""
		rule.CallbacksJSON = []string{}
	}

	groupID := ginx.UrlParamInt64(c, "id")
	results := rt.alertRuleAdd(rules, operator, groupID, c.GetHeader("X-Language"))
	ginx.NewRender(c).Data(results, nil)
}
// promRuleForm is the JSON request body bound by alertRuleAddByImportPromRule.
type promRuleForm struct {
// Payload is the YAML text to import; it is parsed as a document with "groups",
// as a list of rules, or as a single rule, in that order.
Payload string `json:"payload" binding:"required"`
// DatasourceQueries is passed to models.DealPromGroup together with the parsed groups.
DatasourceQueries []models.DatasourceQuery `json:"datasource_queries" binding:"required"`
// Disabled is restricted to 0 or 1 by the binding and passed to models.DealPromGroup.
Disabled int `json:"disabled" binding:"gte=0,lte=1"`
}
// alertRuleAddByImportPromRule imports Prometheus-style rules from a YAML
// payload into the business group named by the "id" URL parameter.
//
// The payload is tried in three shapes, in order: a document with a top-level
// "groups" key, a bare list of rules, and a single rule. Rules that did not
// come with groups are wrapped in one group named "imported_rules". The
// response data maps each rule name to its error message, empty on success.
func (rt *Router) alertRuleAddByImportPromRule(c *gin.Context) {
var f promRuleForm
ginx.Dangerous(c.BindJSON(&f))
// First try the format that carries a top-level "groups" key.
var pr struct {
Groups []models.PromRuleGroup `yaml:"groups"`
}
err := yaml.Unmarshal([]byte(f.Payload), &pr)
var groups []models.PromRuleGroup
if err != nil || len(pr.Groups) == 0 {
// Parsing failed or there are no groups: try the rule-list format.
var rules []models.PromRule
err = yaml.Unmarshal([]byte(f.Payload), &rules)
if err != nil {
// Last resort: try the single-rule format.
var singleRule models.PromRule
err = yaml.Unmarshal([]byte(f.Payload), &singleRule)
if err != nil {
ginx.Bomb(http.StatusBadRequest, "invalid yaml format. err: %v", err)
}
// A single rule is only valid when it names an alert or a record.
if singleRule.Alert == "" && singleRule.Record == "" {
ginx.Bomb(http.StatusBadRequest, "input yaml is empty or invalid")
}
rules = []models.PromRule{singleRule}
}
// Reject an empty rule list.
if len(rules) == 0 {
ginx.Bomb(http.StatusBadRequest, "input yaml contains no rules")
}
// Wrap the rule list in a single group.
groups = []models.PromRuleGroup{
{
Name: "imported_rules",
Rules: rules,
},
}
} else {
// Use the groups that were parsed.
groups = pr.Groups
}
lst := models.DealPromGroup(groups, f.DatasourceQueries, f.Disabled)
username := c.MustGet("username").(string)
bgid := ginx.UrlParamInt64(c, "id")
ginx.NewRender(c).Data(rt.alertRuleAdd(lst, username, bgid, c.GetHeader("X-Language")), nil)
}
// alertRuleAddByService creates the alert rules posted by a service caller.
// No username is applied, so the rules keep the creator/updater they were
// submitted with. The response data maps each rule name to its error
// message, empty on success.
func (rt *Router) alertRuleAddByService(c *gin.Context) {
	var rules []models.AlertRule
	ginx.BindJSON(c, &rules)

	if len(rules) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	results := rt.alertRuleAddForService(rules, "")
	ginx.NewRender(c).Data(results, nil)
}
// alertRuleAddOneByService creates a single alert rule for a service caller
// and responds with the id of the rule together with any error from Add.
func (rt *Router) alertRuleAddOneByService(c *gin.Context) {
	var rule models.AlertRule
	ginx.BindJSON(c, &rule)

	ginx.Dangerous(rule.FE2DB())

	addErr := rule.Add(rt.Ctx)
	ginx.NewRender(c).Data(rule.Id, addErr)
}
// alertRuleAddForService saves each rule in lst as a new rule (its Id is
// reset to 0) and returns a map from rule name to error message, with an
// empty string for rules that were saved. A non-empty username is recorded
// as both creator and updater. Error messages are returned untranslated.
func (rt *Router) alertRuleAddForService(lst []models.AlertRule, username string) map[string]string {
	// alert rule name -> error string
	results := make(map[string]string)

	for i := range lst {
		rule := &lst[i]

		rule.Id = 0
		if username != "" {
			rule.CreateBy = username
			rule.UpdateBy = username
		}

		err := rule.FE2DB()
		if err == nil {
			err = rule.Add(rt.Ctx)
		}

		if err != nil {
			results[rule.Name] = err.Error()
		} else {
			results[rule.Name] = ""
		}
	}

	return results
}
// alertRuleAdd saves each rule in lst as a new rule in business group bgid
// (Id is reset to 0, GroupId is overwritten) and returns a map from rule
// name to error message, with an empty string for rules that were saved.
// A non-empty username is recorded as both creator and updater. Error
// messages are passed through i18n.Sprintf with lang.
func (rt *Router) alertRuleAdd(lst []models.AlertRule, username string, bgid int64, lang string) map[string]string {
	// alert rule name -> error string
	results := make(map[string]string)

	for i := range lst {
		rule := &lst[i]

		rule.Id = 0
		rule.GroupId = bgid
		if username != "" {
			rule.CreateBy = username
			rule.UpdateBy = username
		}

		err := rule.FE2DB()
		if err == nil {
			err = rule.Add(rt.Ctx)
		}

		if err != nil {
			results[rule.Name] = i18n.Sprintf(lang, err.Error())
		} else {
			results[rule.Name] = ""
		}
	}

	return results
}
// alertRuleDel deletes the alert rules whose ids are posted in the body.
// The business group id from the "id" URL parameter is passed along as a
// safeguard to models.AlertRuleDels.
func (rt *Router) alertRuleDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	// param(busiGroupId) for protect
	render := ginx.NewRender(c)
	render.Message(models.AlertRuleDels(rt.Ctx, form.Ids, ginx.UrlParamInt64(c, "id")))
}
// alertRuleDelByService deletes the alert rules whose ids are posted in the
// body, without a business group restriction.
func (rt *Router) alertRuleDelByService(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	render := ginx.NewRender(c)
	render.Message(models.AlertRuleDels(rt.Ctx, form.Ids))
}
// alertRulePutByFE replaces the alert rule named by the "arid" URL parameter
// with the posted rule. The caller must pass the read-write check for the
// business group of the existing rule; the caller's username is recorded as
// the updater. A missing rule yields 404.
func (rt *Router) alertRulePutByFE(c *gin.Context) {
	var incoming models.AlertRule
	ginx.BindJSON(c, &incoming)

	ruleID := ginx.UrlParamInt64(c, "arid")
	existing, err := models.AlertRuleGetById(rt.Ctx, ruleID)
	ginx.Dangerous(err)

	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	rt.bgrwCheck(c, existing.GroupId)

	incoming.UpdateBy = c.MustGet("username").(string)
	ginx.NewRender(c).Message(existing.Update(rt.Ctx, incoming))
}
// alertRulePutByService replaces the alert rule named by the "arid" URL
// parameter with the posted rule on behalf of a service caller. No business
// group check is made and the updater is left as posted. A missing rule
// yields 404.
func (rt *Router) alertRulePutByService(c *gin.Context) {
	var incoming models.AlertRule
	ginx.BindJSON(c, &incoming)

	ruleID := ginx.UrlParamInt64(c, "arid")
	existing, err := models.AlertRuleGetById(rt.Ctx, ruleID)
	ginx.Dangerous(err)

	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	ginx.NewRender(c).Message(existing.Update(rt.Ctx, incoming))
}
// alertRuleFieldForm is the JSON request body bound by alertRulePutFields.
type alertRuleFieldForm struct {
// Ids lists the alert rules to update; ids that do not resolve to a rule are skipped.
Ids []int64 `json:"ids"`
// Fields maps a field name to its new value; the handler rejects an empty map.
Fields map[string]interface{} `json:"fields"`
// Action optionally selects a special update mode. The handler recognises
// "update_triggers", "annotations_add", "annotations_del", "callback_add",
// "callback_del" and "datasource_change".
Action string `json:"action"`
}
// update one field: cluster note severity disabled prom_eval_interval prom_for_duration notify_channels notify_groups notify_recovered notify_repeat_step callbacks runbook_url append_tags
//
// alertRulePutFields applies a batch field update to every rule in f.Ids.
// Ids that do not resolve to a rule are skipped.
//
// When f.Action names one of the special operations and the field it works
// on is present, only that operation is performed for the rule:
//
//	update_triggers    replace "triggers" inside rule_config
//	annotations_add    merge the posted annotations into the existing ones
//	annotations_del    remove the posted annotation keys
//	callback_add       append one callback address if not already present
//	callback_del       remove one callback address
//	datasource_change  replace datasource_queries
//
// Otherwise every entry of f.Fields is written to the column of the same
// name (slice values are stored as JSON text). The two paths are mutually
// exclusive: running the generic column write after an action would either
// overwrite what the action just stored (callbacks, annotations) or try to
// write a column that does not exist (triggers).
//
// In both cases update_by and update_at are refreshed, because the engine
// only reloads a rule whose update time changed.
//
// Malformed field values are rejected with 400 instead of panicking on a
// failed type assertion.
func (rt *Router) alertRulePutFields(c *gin.Context) {
	var f alertRuleFieldForm
	ginx.BindJSON(c, &f)

	if len(f.Fields) == 0 {
		ginx.Bomb(http.StatusBadRequest, "fields empty")
	}

	updateBy := c.MustGet("username").(string)
	updateAt := time.Now().Unix()

	for i := 0; i < len(f.Ids); i++ {
		ar, err := models.AlertRuleGetById(rt.Ctx, f.Ids[i])
		ginx.Dangerous(err)
		if ar == nil {
			continue
		}

		// handled is set once a special action consumed its field, in which
		// case the generic per-column update below must not run.
		handled := false

		switch f.Action {
		case "update_triggers":
			if triggers, has := f.Fields["triggers"]; has {
				originRule, ok := ar.RuleConfigJson.(map[string]interface{})
				if !ok {
					ginx.Bomb(http.StatusBadRequest, "rule_config of alert rule %d is not a json object", ar.Id)
				}
				originRule["triggers"] = triggers
				b, err := json.Marshal(originRule)
				ginx.Dangerous(err)
				ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"rule_config": string(b)}))
				handled = true
			}

		case "annotations_add":
			if annotations, has := f.Fields["annotations"]; has {
				annotationsMap, ok := annotations.(map[string]interface{})
				if !ok {
					ginx.Bomb(http.StatusBadRequest, "annotations must be a json object")
				}
				// A rule without annotations may carry a nil map; writing to it would panic.
				if ar.AnnotationsJSON == nil {
					ar.AnnotationsJSON = make(map[string]string)
				}
				for k, v := range annotationsMap {
					s, ok := v.(string)
					if !ok {
						ginx.Bomb(http.StatusBadRequest, "annotation %s must be a string", k)
					}
					ar.AnnotationsJSON[k] = s
				}
				b, err := json.Marshal(ar.AnnotationsJSON)
				ginx.Dangerous(err)
				ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"annotations": string(b)}))
				handled = true
			}

		case "annotations_del":
			if annotations, has := f.Fields["annotations"]; has {
				annotationsKeys, ok := annotations.(map[string]interface{})
				if !ok {
					ginx.Bomb(http.StatusBadRequest, "annotations must be a json object")
				}
				for key := range annotationsKeys {
					delete(ar.AnnotationsJSON, key)
				}
				b, err := json.Marshal(ar.AnnotationsJSON)
				ginx.Dangerous(err)
				ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"annotations": string(b)}))
				handled = true
			}

		case "callback_add":
			// Add one callback address.
			if callbacks, has := f.Fields["callbacks"]; has {
				callback, ok := callbacks.(string)
				if !ok {
					ginx.Bomb(http.StatusBadRequest, "callbacks must be a string")
				}
				if !strings.Contains(ar.Callbacks, callback) {
					ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"callbacks": ar.Callbacks + " " + callback}))
				}
				handled = true
			}

		case "callback_del":
			// Remove one callback address.
			if callbacks, has := f.Fields["callbacks"]; has {
				callback, ok := callbacks.(string)
				if !ok {
					ginx.Bomb(http.StatusBadRequest, "callbacks must be a string")
				}
				ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"callbacks": strings.ReplaceAll(ar.Callbacks, callback, "")}))
				handled = true
			}

		case "datasource_change":
			// Change the datasource queries.
			if datasourceQueries, has := f.Fields["datasource_queries"]; has {
				b, err := json.Marshal(datasourceQueries)
				ginx.Dangerous(err)
				// Stored as text like every other JSON column written here.
				ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"datasource_queries": string(b)}))
				handled = true
			}
		}

		if !handled {
			for k, v := range f.Fields {
				// Slice values are stored as their JSON encoding.
				switch v.(type) {
				case []interface{}, []int64, []int, []string:
					b, err := json.Marshal(v)
					ginx.Dangerous(err)
					ginx.Dangerous(ar.UpdateColumn(rt.Ctx, k, string(b)))
				default:
					ginx.Dangerous(ar.UpdateColumn(rt.Ctx, k, v))
				}
			}
		}

		// Always refresh updater and update time: the engine only pulls a
		// rule again when its update time has changed.
		ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{
			"update_by": updateBy,
			"update_at": updateAt,
		}))
	}

	ginx.NewRender(c).Message(nil)
}
// alertRuleGet returns the alert rule named by the "arid" URL parameter,
// enriched with resolved datasource ids and notify groups, after passing it
// through rt.AlertRuleModifyHook. A missing rule yields 404.
func (rt *Router) alertRuleGet(c *gin.Context) {
	ruleID := ginx.UrlParamInt64(c, "arid")

	rule, err := models.AlertRuleGetById(rt.Ctx, ruleID)
	ginx.Dangerous(err)

	if rule == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	if len(rule.DatasourceQueries) != 0 {
		rule.DatasourceIdsJson = rt.DatasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries)
	}

	groupCache := make(map[int64]*models.UserGroup)
	err = rule.FillNotifyGroups(rt.Ctx, groupCache)
	ginx.Dangerous(err)

	rt.AlertRuleModifyHook(rule)

	ginx.NewRender(c).Data(rule, err)
}
// alertRulePureGet returns the alert rule named by the "arid" URL parameter
// exactly as loaded, without any enrichment. A missing rule yields 404.
func (rt *Router) alertRulePureGet(c *gin.Context) {
	ruleID := ginx.UrlParamInt64(c, "arid")

	rule, err := models.AlertRuleGetById(rt.Ctx, ruleID)
	ginx.Dangerous(err)

	if rule == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule")
		return
	}

	ginx.NewRender(c).Data(rule, err)
}
// pre validation before save rule
//
// alertRuleValidation checks, for a rule that selects both notify channels
// and notify groups, that every built-in channel can reach at least one user
// of the selected groups (i.e. some user has a token for it). Channels that
// nobody can receive are reported in the response message; an empty message
// means the rule passed. Non-built-in channels are not checked.
//
// Group ids that are not valid integers are skipped, matching
// alertRuleNotifyTryRun, rather than being looked up as group 0.
func (rt *Router) alertRuleValidation(c *gin.Context) {
	var f models.AlertRule //new
	ginx.BindJSON(c, &f)

	if len(f.NotifyChannelsJSON) > 0 && len(f.NotifyGroupsJSON) > 0 { //Validation NotifyChannels
		ngids := make([]int64, 0, len(f.NotifyGroupsJSON))
		for i := range f.NotifyGroupsJSON {
			id, err := strconv.ParseInt(f.NotifyGroupsJSON[i], 10, 64)
			if err != nil {
				continue
			}
			ngids = append(ngids, id)
		}

		userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids)
		uids := make([]int64, 0)
		for i := range userGroups {
			uids = append(uids, userGroups[i].UserIds...)
		}
		users := rt.UserCache.GetByUserIds(uids)

		//If any users have a certain notify channel's token, it will be okay. Otherwise, this notify channel is absent of tokens.
		ancs := make([]string, 0, len(f.NotifyChannelsJSON)) //absent Notify Channels
		for i := range f.NotifyChannelsJSON {
			//ignore non-default channels
			switch f.NotifyChannelsJSON[i] {
			case models.Dingtalk, models.Wecom, models.Feishu, models.Mm,
				models.Telegram, models.Email, models.FeishuCard:
				// do nothing
			default:
				continue
			}

			//default channels
			flag := true
			for ui := range users {
				if _, b := users[ui].ExtractToken(f.NotifyChannelsJSON[i]); b {
					flag = false
					break
				}
			}
			if flag {
				ancs = append(ancs, f.NotifyChannelsJSON[i])
			}
		}

		if len(ancs) > 0 {
			ginx.NewRender(c).Message("All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). %s", ancs)
			return
		}
	}

	ginx.NewRender(c).Message("")
}
// alertRuleCallbacks returns the distinct callback addresses used by the
// alert rules of the business groups the current user belongs to, in
// first-seen order.
func (rt *Router) alertRuleCallbacks(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	groupIDs, err := models.MyBusiGroupIds(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	rules, err := models.AlertRuleGetsByBGIds(rt.Ctx, groupIDs)
	ginx.Dangerous(err)

	var unique []string
	seen := make(map[string]struct{})
	for i := range rules {
		for _, addr := range rules[i].CallbacksJSON {
			if _, dup := seen[addr]; dup {
				continue
			}
			seen[addr] = struct{}{}
			unique = append(unique, addr)
		}
	}

	ginx.NewRender(c).Data(unique, nil)
}
// alertRuleTestForm is the JSON request body bound by relabelTest.
type alertRuleTestForm struct {
// Configs are the relabel configurations to apply; the handler rejects an empty list.
Configs []*pconf.RelabelConfig `json:"configs"`
// Tags are the input labels, each written as "name=value"; the handler rejects an empty list.
Tags []string `json:"tags"`
}
// relabelTest applies the posted relabel configurations to the posted tags
// and returns the resulting tags as "name=value" strings.
//
// Each input tag must contain "="; it is split at the first one. Empty
// Replacement, Separator and Regex fields in a configuration default to
// "$1", ";" and "(.*)" respectively before processing.
func (rt *Router) relabelTest(c *gin.Context) {
	var form alertRuleTestForm
	ginx.BindJSON(c, &form)

	if len(form.Tags) == 0 || len(form.Configs) == 0 {
		ginx.Bomb(http.StatusBadRequest, "relabel config is empty")
	}

	labels := make([]prompb.Label, 0, len(form.Tags))
	for _, tag := range form.Tags {
		parts := strings.SplitN(tag, "=", 2)
		if len(parts) != 2 {
			ginx.Bomb(http.StatusBadRequest, "tag:%s format error", tag)
		}
		labels = append(labels, prompb.Label{Name: parts[0], Value: parts[1]})
	}

	// Fill in the defaults for fields the caller left empty.
	for _, cfg := range form.Configs {
		if cfg.Replacement == "" {
			cfg.Replacement = "$1"
		}
		if cfg.Separator == "" {
			cfg.Separator = ";"
		}
		if cfg.Regex == "" {
			cfg.Regex = "(.*)"
		}
	}

	var tags []string
	for _, label := range writer.Process(labels, form.Configs...) {
		tags = append(tags, fmt.Sprintf("%s=%s", label.Name, label.Value))
	}

	ginx.NewRender(c).Data(tags, nil)
}
// identListForm is the JSON request body bound by cloneToMachine.
type identListForm struct {
// Ids lists the alert rules to clone.
Ids []int64 `json:"ids"`
// IdentList lists the idents to clone each rule for; the handler rejects an empty list.
IdentList []string `json:"ident_list"`
}
// identOperatorRe matches "ident" followed, after optional whitespace, by
// one of the operators !=, !~ or =~. It is compiled once at package
// initialisation instead of on every call.
var identOperatorRe = regexp.MustCompile(`ident\s*(!=|!~|=~)`)

// containsIdentOperator reports whether s contains an "ident" matcher that
// uses a negative or regular-expression operator (!=, !~, =~) rather than
// plain equality.
func containsIdentOperator(s string) bool {
	return identOperatorRe.MatchString(s)
}
// cloneToMachine clones the selected Prometheus alert rules once per ident in
// the posted ident list, rewriting every `ident=\"...\"` matcher in the rule
// configuration to the target ident and naming the copy "<rule>_<ident>".
//
// Rules that are not of category "prometheus", or whose configuration uses
// an ident matcher with !=, !~ or =~, are rejected as a whole. A copy whose
// name already exists in the rule's business group is skipped. The response
// data maps rule name -> (ident or "all") -> error message; the response
// error is the result of inserting the new rules.
//
// The ident is inserted with ReplaceAllLiteralString so that a "$" in a
// user-supplied ident is kept as is instead of being expanded as a regexp
// replacement template.
func (rt *Router) cloneToMachine(c *gin.Context) {
	var f identListForm
	ginx.BindJSON(c, &f)

	if len(f.IdentList) == 0 {
		ginx.Bomb(http.StatusBadRequest, "ident_list is empty")
	}

	alertRules, err := models.AlertRuleGetsByIds(rt.Ctx, f.Ids)
	ginx.Dangerous(err)

	// The rule config is JSON text, so the quotes around the ident value
	// appear escaped (\").
	re := regexp.MustCompile(`ident\s*=\s*\\".*?\\"`)

	user := c.MustGet("username").(string)
	now := time.Now().Unix()

	newRules := make([]*models.AlertRule, 0)
	reterr := make(map[string]map[string]string)

	for i := range alertRules {
		errMsg := make(map[string]string)

		if alertRules[i].Cate != "prometheus" {
			errMsg["all"] = "Only Prometheus rule can be cloned to machines"
			reterr[alertRules[i].Name] = errMsg
			continue
		}

		if containsIdentOperator(alertRules[i].RuleConfig) {
			errMsg["all"] = "promql is missing ident"
			reterr[alertRules[i].Name] = errMsg
			continue
		}

		for j := range f.IdentList {
			// The source rule's config is rewritten in place; the pattern
			// still matches on the next iteration because only the quoted
			// value changes.
			replacement := fmt.Sprintf(`ident=\"%s\"`, f.IdentList[j])
			alertRules[i].RuleConfig = re.ReplaceAllLiteralString(alertRules[i].RuleConfig, replacement)

			newRule := &models.AlertRule{}
			if err := copier.Copy(newRule, alertRules[i]); err != nil {
				errMsg[f.IdentList[j]] = fmt.Sprintf("fail to clone rule, err: %s", err)
				continue
			}

			newRule.Id = 0
			newRule.Name = alertRules[i].Name + "_" + f.IdentList[j]
			newRule.CreateBy = user
			newRule.UpdateBy = user
			newRule.UpdateAt = now
			newRule.CreateAt = now
			newRule.RuleConfig = alertRules[i].RuleConfig

			exist, err := models.AlertRuleExists(rt.Ctx, 0, newRule.GroupId, newRule.Name)
			if err != nil {
				errMsg[f.IdentList[j]] = err.Error()
				continue
			}
			if exist {
				errMsg[f.IdentList[j]] = fmt.Sprintf("rule already exists, ruleName: %s", newRule.Name)
				continue
			}

			newRules = append(newRules, newRule)
		}

		if len(errMsg) > 0 {
			reterr[alertRules[i].Name] = errMsg
		}
	}

	ginx.NewRender(c).Data(reterr, models.InsertAlertRule(rt.Ctx, newRules))
}
// alertBatchCloneForm is the JSON request body bound by batchAlertRuleClone.
type alertBatchCloneForm struct {
// RuleIds lists the alert rules to clone.
RuleIds []int64 `json:"rule_ids"`
// Bgids lists the target business groups; each rule is cloned into every one of them.
Bgids []int64 `json:"bgids"`
}
// batchAlertRuleClone clones every posted alert rule into every posted
// business group. The caller must pass the read-write check for each target
// group. The response data maps "<ruleId>-<bgid>" to an error message for
// each combination that failed; successful combinations are not listed.
//
// A failed clone into one business group does not affect the remaining
// groups: the error from Add is kept separate from the rule lookup error, so
// it cannot be mistaken for a lookup failure on the next iteration.
func (rt *Router) batchAlertRuleClone(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	var f alertBatchCloneForm
	ginx.BindJSON(c, &f)

	// Check write permission on every target business group.
	for _, bgid := range f.Bgids {
		rt.bgrwCheck(c, bgid)
	}

	reterr := make(map[string]string, len(f.RuleIds))
	lang := c.GetHeader("X-Language")

	for _, arid := range f.RuleIds {
		ar, lookupErr := models.AlertRuleGetById(rt.Ctx, arid)

		for _, bgid := range f.Bgids {
			key := fmt.Sprintf("%d-%d", arid, bgid)

			// The lookup error is reported here, once per target group, so
			// that every rule/group pair gets its own entry.
			if lookupErr != nil {
				reterr[key] = i18n.Sprintf(lang, lookupErr.Error())
				continue
			}
			if ar == nil {
				reterr[key] = i18n.Sprintf(lang, "alert rule not found")
				continue
			}

			newAr := ar.Clone(me.Username, bgid)
			if addErr := newAr.Add(rt.Ctx); addErr != nil {
				reterr[key] = i18n.Sprintf(lang, addErr.Error())
				continue
			}
		}
	}

	ginx.NewRender(c).Data(reterr, nil)
}
// timezonesGet returns a fixed list of commonly used time zone names. The
// list is de-duplicated by UTC offset: apart from "Local" and "UTC", each
// offset is represented by a single zone.
func (rt *Router) timezonesGet(c *gin.Context) {
	commonZones := []string{
		"Local",
		"UTC",
		"Asia/Shanghai",       // UTC+8 (stands in for Asia/Hong_Kong, Asia/Singapore, ...)
		"Asia/Tokyo",          // UTC+9 (stands in for Asia/Seoul, ...)
		"Asia/Dubai",          // UTC+4
		"Asia/Kolkata",        // UTC+5:30
		"Asia/Bangkok",        // UTC+7 (stands in for Asia/Jakarta, ...)
		"Europe/London",       // UTC+0 (stands in for UTC)
		"Europe/Paris",        // UTC+1 (stands in for Europe/Berlin, Europe/Rome, Europe/Madrid, ...)
		"Europe/Moscow",       // UTC+3
		"America/New_York",    // UTC-5 (stands in for America/Toronto, ...)
		"America/Chicago",     // UTC-6 (stands in for America/Mexico_City, ...)
		"America/Denver",      // UTC-7
		"America/Los_Angeles", // UTC-8
		"America/Sao_Paulo",   // UTC-3
		"Australia/Sydney",    // UTC+10 (stands in for Australia/Melbourne, ...)
		"Pacific/Auckland",    // UTC+12
	}

	render := ginx.NewRender(c)
	render.Data(commonZones, nil)
}
================================================
FILE: center/router/router_alert_subscribe.go
================================================
package router
import (
"net/http"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// alertSubscribeGets returns every alert subscription of the business group
// named by the "id" URL parameter, each enriched with user groups, rule
// names and datasource ids and converted to its front-end form. Search and
// paging are left to the front-end.
func (rt *Router) alertSubscribeGets(c *gin.Context) {
	groupID := ginx.UrlParamInt64(c, "id")

	subs, err := models.AlertSubscribeGets(rt.Ctx, groupID)
	ginx.Dangerous(err)

	// Caches shared across the list so each group / rule is loaded once.
	userGroupCache := make(map[int64]*models.UserGroup)
	ruleNameCache := make(map[int64]string)

	for i := range subs {
		ginx.Dangerous(subs[i].FillUserGroups(rt.Ctx, userGroupCache))
		ginx.Dangerous(subs[i].FillRuleNames(rt.Ctx, ruleNameCache))
		ginx.Dangerous(subs[i].FillDatasourceIds(rt.Ctx))
		ginx.Dangerous(subs[i].DB2FE())
	}

	models.FillUpdateByNicknames(rt.Ctx, subs)
	ginx.NewRender(c).Data(subs, err)
}
// alertSubscribeGetsByGids lists alert subscriptions for the business groups
// given in the comma-separated "gids" query parameter.
//
// Each explicitly requested group goes through the read-permission check.
// When no group is given, non-admin users are limited to their own groups
// (an empty list is returned if they have none), while admins query with an
// empty group list.
func (rt *Router) alertSubscribeGetsByGids(c *gin.Context) {
	gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")

	if len(gids) == 0 {
		me := c.MustGet("user").(*models.User)
		if !me.IsAdmin() {
			var err error
			gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id)
			ginx.Dangerous(err)

			if len(gids) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	} else {
		for _, gid := range gids {
			rt.bgroCheck(c, gid)
		}
	}

	subs, err := models.AlertSubscribeGetsByBGIds(rt.Ctx, gids)
	ginx.Dangerous(err)

	userGroupCache := make(map[int64]*models.UserGroup)
	ruleNameCache := make(map[int64]string)

	for i := range subs {
		ginx.Dangerous(subs[i].FillUserGroups(rt.Ctx, userGroupCache))
		ginx.Dangerous(subs[i].FillRuleNames(rt.Ctx, ruleNameCache))
		ginx.Dangerous(subs[i].FillDatasourceIds(rt.Ctx))
		ginx.Dangerous(subs[i].DB2FE())
	}

	models.FillUpdateByNicknames(rt.Ctx, subs)
	ginx.NewRender(c).Data(subs, err)
}
// alertSubscribeGet renders the single alert subscription whose id is the
// "sid" URL parameter, or a 404 message when no such row exists.
func (rt *Router) alertSubscribeGet(c *gin.Context) {
	subID := ginx.UrlParamInt64(c, "sid")

	sub, err := models.AlertSubscribeGet(rt.Ctx, "id=?", subID)
	ginx.Dangerous(err)
	if sub == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such alert subscribe")
		return
	}

	// Fresh, empty lookup maps: only one row is filled here.
	ginx.Dangerous(sub.FillUserGroups(rt.Ctx, make(map[int64]*models.UserGroup)))
	ginx.Dangerous(sub.FillRuleNames(rt.Ctx, make(map[int64]string)))
	ginx.Dangerous(sub.FillDatasourceIds(rt.Ctx))
	ginx.Dangerous(sub.DB2FE())

	ginx.NewRender(c).Data(sub, nil)
}
// alertSubscribeAdd creates an alert subscription from the JSON body. The
// creator/updater is the "username" stored on the gin context and the business
// group is the "id" URL parameter, which must be positive.
func (rt *Router) alertSubscribeAdd(c *gin.Context) {
	var sub models.AlertSubscribe
	ginx.BindJSON(c, &sub)

	operator := c.MustGet("username").(string)
	sub.CreateBy, sub.UpdateBy = operator, operator

	sub.GroupId = ginx.UrlParamInt64(c, "id")
	if sub.GroupId <= 0 {
		ginx.Bomb(http.StatusBadRequest, "group_id invalid")
	}

	ginx.NewRender(c).Message(sub.Add(rt.Ctx))
}
// SubscribeTryRunForm is the JSON request body of alertSubscribeTryRun.
type SubscribeTryRunForm struct {
// EventId is the id of the historical alert event the config is tested against.
EventId int64 `json:"event_id" binding:"required"`
// SubscribeConfig is the subscription configuration being tried out.
SubscribeConfig models.AlertSubscribe `json:"config" binding:"required"`
}
// alertSubscribeTryRun tries a subscription config against one historical
// alert event.
//
// It first checks, in order, that the event matches the config's datasource,
// rule ids, tags, business group names and severities, bombing with a
// translated message on the first mismatch. For notify_version 1 it then sends
// the event through every notify config of every selected notify rule. For the
// legacy path it applies the subscription to the event and reports the default
// notify channels for which none of the subscribed users has a token.
func (rt *Router) alertSubscribeTryRun(c *gin.Context) {
	var form SubscribeTryRunForm
	ginx.BindJSON(c, &form)

	sub := &form.SubscribeConfig
	ginx.Dangerous(sub.Verify())

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, form.EventId)
	ginx.Dangerous(err)
	if hisEvent == nil {
		ginx.Bomb(http.StatusNotFound, "event not found")
	}

	event := *hisEvent.ToCur()
	event.SetTagsMap()

	lang := c.GetHeader("X-Language")

	// Check the matching conditions first.
	if !sub.MatchCluster(event.DatasourceId) {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event datasource not match"))
	}

	if len(sub.RuleIds) != 0 {
		ruleMatched := false
		for _, ruleID := range sub.RuleIds {
			if ruleID == event.RuleId {
				ruleMatched = true
				break
			}
		}
		if !ruleMatched {
			ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event rule id not match"))
		}
	}

	// Match tags.
	sub.Parse()
	if !common.MatchTags(event.TagsMap, sub.ITags) {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event tags not match"))
	}

	// Match business group name.
	if !common.MatchGroupsName(event.GroupName, sub.IBusiGroups) {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event group name not match"))
	}

	// Match severity; a configured severity of 0 matches any event.
	if len(sub.SeveritiesJson) != 0 {
		severityMatched := false
		for _, severity := range sub.SeveritiesJson {
			if severity == 0 || severity == event.Severity {
				severityMatched = true
				break
			}
		}
		if !severityMatched {
			ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event severity not match"))
		}
	}

	// New-style notify rules.
	if sub.NotifyVersion == 1 {
		if len(sub.NotifyRuleIds) == 0 {
			ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "no notify rules selected"))
		}

		for _, ruleID := range sub.NotifyRuleIds {
			notifyRule, ruleErr := models.GetNotifyRule(rt.Ctx, ruleID)
			if ruleErr != nil {
				ginx.Bomb(http.StatusNotFound, i18n.Sprintf(lang, "subscribe notify rule not found: %v", ruleErr))
			}

			for _, notifyConfig := range notifyRule.NotifyConfigs {
				_, sendErr := SendNotifyChannelMessage(rt.Ctx, rt.UserCache, rt.UserGroupCache, notifyConfig, []*models.AlertCurEvent{&event})
				if sendErr != nil {
					ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "notify rule send error: %v", sendErr))
				}
			}
		}

		ginx.NewRender(c).Data(i18n.Sprintf(lang, "event match subscribe and notification test ok"), nil)
		return
	}

	// Legacy notification settings.
	sub.ModifyEvent(&event)
	if len(event.NotifyChannelsJSON) == 0 {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "no notify channels selected"))
	}

	if len(event.NotifyGroupsJSON) == 0 {
		ginx.Bomb(http.StatusOK, i18n.Sprintf(lang, "no notify groups selected"))
	}

	// Parse the whitespace-separated user group ids, skipping unparsable ones.
	groupIDs := make([]int64, 0)
	for _, field := range strings.Fields(sub.UserGroupIds) {
		groupID, parseErr := strconv.ParseInt(field, 10, 64)
		if parseErr != nil {
			continue
		}
		groupIDs = append(groupIDs, groupID)
	}

	userGroups := rt.UserGroupCache.GetByUserGroupIds(groupIDs)
	userIDs := make([]int64, 0)
	for gi := range userGroups {
		userIDs = append(userIDs, userGroups[gi].UserIds...)
	}
	users := rt.UserCache.GetByUserIds(userIDs)

	// Collect the default channels for which no user yields a token.
	missing := make([]string, 0, len(event.NotifyChannelsJSON))
	for _, channel := range event.NotifyChannelsJSON {
		// Non-default channels are ignored.
		switch channel {
		case models.Dingtalk, models.Wecom, models.Feishu, models.Mm,
			models.Telegram, models.Email, models.FeishuCard:
		default:
			continue
		}

		configured := false
		for ui := range users {
			if _, ok := users[ui].ExtractToken(channel); ok {
				configured = true
				break
			}
		}
		if !configured {
			missing = append(missing, channel)
		}
	}

	if len(missing) > 0 {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "all users missing notify channel configurations: %v", missing))
	}

	ginx.NewRender(c).Data(i18n.Sprintf(lang, "event match subscribe and notify settings ok"), nil)
}
// alertSubscribePut updates a batch of alert subscriptions given as a JSON
// array. Each entry is stamped with the current time and the "username" from
// the gin context, and only the listed columns are written.
func (rt *Router) alertSubscribePut(c *gin.Context) {
	var subs []models.AlertSubscribe
	ginx.BindJSON(c, &subs)

	now := time.Now().Unix()
	operator := c.MustGet("username").(string)

	for idx := range subs {
		sub := &subs[idx]
		sub.UpdateBy = operator
		sub.UpdateAt = now
		// Batch subscription uses rule_ids instead of rule_id, so rule_id is
		// reset to 0 on update to keep a stale rule_id from causing a wrong
		// subscription.
		sub.RuleId = 0

		err := sub.Update(
			rt.Ctx,
			"name",
			"disabled",
			"prod",
			"cate",
			"datasource_ids",
			"cluster",
			"rule_id",
			"rule_ids",
			"tags",
			"redefine_severity",
			"new_severity",
			"redefine_channels",
			"new_channels",
			"user_group_ids",
			"update_at",
			"update_by",
			"webhooks",
			"for_duration",
			"redefine_webhooks",
			"severities",
			"extra_config",
			"busi_groups",
			"note",
			"notify_rule_ids",
			"notify_version",
		)
		ginx.Dangerous(err)
	}

	ginx.NewRender(c).Message(nil)
}
// alertSubscribeDel deletes the alert subscriptions whose ids are listed in
// the JSON body and renders the result of models.AlertSubscribeDel.
func (rt *Router) alertSubscribeDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	render := ginx.NewRender(c)
	render.Message(models.AlertSubscribeDel(rt.Ctx, form.Ids))
}
// alertSubscribeGetsByService renders whatever models.AlertSubscribeGetsByService
// returns, list and error alike.
func (rt *Router) alertSubscribeGetsByService(c *gin.Context) {
	subs, err := models.AlertSubscribeGetsByService(rt.Ctx)

	render := ginx.NewRender(c)
	render.Data(subs, err)
}
================================================
FILE: center/router/router_board.go
================================================
package router
import (
"fmt"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// boardForm is the JSON request body shared by boardAdd, boardPut,
// boardPutConfigs and boardPutPublic; each handler reads only some fields.
type boardForm struct {
Name string `json:"name"`
Ident string `json:"ident"`
Tags string `json:"tags"`
Note string `json:"note"`
// Configs is the dashboard payload, stored via models.BoardPayloadSave.
Configs string `json:"configs"`
// Public, PublicCate and Bgids are read by boardPutPublic only.
Public int `json:"public"`
PublicCate int `json:"public_cate"`
// Bgids is written with models.BoardBusigroupUpdate when PublicCate is models.PublicBusi.
Bgids []int64 `json:"bgids"`
}
// boardAdd creates a dashboard in the business group named by the "id" URL
// parameter from the JSON body, stores its payload when configs is non-empty,
// and renders the new board.
func (rt *Router) boardAdd(c *gin.Context) {
	var form boardForm
	ginx.BindJSON(c, &form)

	user := c.MustGet("user").(*models.User)

	board := new(models.Board)
	board.GroupId = ginx.UrlParamInt64(c, "id")
	board.Name = form.Name
	board.Ident = form.Ident
	board.Tags = form.Tags
	board.Note = form.Note
	board.Configs = form.Configs
	board.CreateBy = user.Username
	board.UpdateBy = user.Username

	ginx.Dangerous(board.Add(rt.Ctx))

	if form.Configs != "" {
		ginx.Dangerous(models.BoardPayloadSave(rt.Ctx, board.Id, form.Configs))
	}

	ginx.NewRender(c).Data(board, nil)
}
// boardGet renders one dashboard. The "bid" URL parameter is tried as an ident
// first and as an id second.
//
// A non-public board runs the auth and user middlewares inline and, for
// non-admins, rt.bgroCheck on the board's group. After that, PublicCate decides
// further checks: PublicLogin runs auth only; PublicBusi additionally requires
// a non-admin user to belong to one of the board's business groups.
func (rt *Router) boardGet(c *gin.Context) {
	key := ginx.UrlParamStr(c, "bid")

	board, err := models.BoardGet(rt.Ctx, "ident = ?", key)
	ginx.Dangerous(err)
	if board == nil {
		board, err = models.BoardGet(rt.Ctx, "id = ?", key)
		ginx.Dangerous(err)
	}
	if board == nil {
		ginx.Bomb(http.StatusNotFound, "No such dashboard")
	}

	if board.Public == 0 {
		rt.auth()(c)
		rt.user()(c)

		// Non-admins need read permission on the board's business group.
		if user := c.MustGet("user").(*models.User); !user.IsAdmin() {
			rt.bgroCheck(c, board.GroupId)
		}
	}

	switch board.PublicCate {
	case models.PublicLogin:
		rt.auth()(c)
	case models.PublicBusi:
		rt.auth()(c)
		rt.user()(c)

		user := c.MustGet("user").(*models.User)
		if !user.IsAdmin() {
			groupIDs, err := models.MyBusiGroupIds(rt.Ctx, user.Id)
			ginx.Dangerous(err)
			if len(groupIDs) == 0 {
				ginx.Bomb(http.StatusForbidden, "forbidden")
			}

			allowed, err := models.BoardBusigroupCheck(rt.Ctx, board.Id, groupIDs)
			ginx.Dangerous(err)
			if !allowed {
				ginx.Bomb(http.StatusForbidden, "forbidden")
			}
		}
	}

	ginx.NewRender(c).Data(board, nil)
}
// boardGetsByBids renders the boards whose ids are listed in the
// comma-separated "bids" query parameter.
func (rt *Router) boardGetsByBids(c *gin.Context) {
	raw := ginx.QueryStr(c, "bids", "")
	boardIDs := strx.IdsInt64ForAPI(raw, ",")

	boards, err := models.BoardGetsByBids(rt.Ctx, boardIDs)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(boards, err)
}
// boardPureGet renders the board whose id is the "bid" URL parameter with its
// creator and updater fields blanked out.
func (rt *Router) boardPureGet(c *gin.Context) {
	boardID := ginx.UrlParamInt64(c, "bid")

	board, err := models.BoardGetByID(rt.Ctx, boardID)
	ginx.Dangerous(err)
	if board == nil {
		ginx.Bomb(http.StatusNotFound, "No such dashboard")
	}

	// Clear creator and updater information before returning the board.
	board.CreateBy, board.UpdateBy = "", ""

	ginx.NewRender(c).Data(board, nil)
}
// boardDel deletes the boards whose ids are listed in the JSON body. Ids with
// no matching board are skipped; for non-admin users rt.bgrwCheck is run on
// each board's business group before it is deleted.
func (rt *Router) boardDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	for _, boardID := range form.Ids {
		board, err := models.BoardGet(rt.Ctx, "id = ?", boardID)
		ginx.Dangerous(err)
		if board == nil {
			continue
		}

		// Non-admins need write permission on the board's business group.
		if user := c.MustGet("user").(*models.User); !user.IsAdmin() {
			rt.bgrwCheck(c, board.GroupId)
		}

		ginx.Dangerous(board.Del(rt.Ctx))
	}

	ginx.NewRender(c).Message(nil)
}
// Board loads the dashboard with the given id, bombing with 404 when it does
// not exist.
func (rt *Router) Board(id int64) *models.Board {
	board, err := models.BoardGet(rt.Ctx, "id=?", id)
	ginx.Dangerous(err)

	if board == nil {
		ginx.Bomb(http.StatusNotFound, "No such dashboard")
	}

	return board
}
// boardPut updates name, ident, tags and note of the board named by the "bid"
// URL parameter. Non-admins go through rt.bgrwCheck on the board's group, and
// the new ident must pass CanRenameIdent.
func (rt *Router) boardPut(c *gin.Context) {
	var form boardForm
	ginx.BindJSON(c, &form)

	user := c.MustGet("user").(*models.User)
	board := rt.Board(ginx.UrlParamInt64(c, "bid"))

	// Non-admins need write permission on the board's business group.
	if !user.IsAdmin() {
		rt.bgrwCheck(c, board.GroupId)
	}

	renamable, err := board.CanRenameIdent(rt.Ctx, form.Ident)
	ginx.Dangerous(err)
	if !renamable {
		ginx.Bomb(http.StatusOK, "Ident duplicate")
	}

	board.Name, board.Ident = form.Name, form.Ident
	board.Tags, board.Note = form.Tags, form.Note
	board.UpdateBy, board.UpdateAt = user.Username, time.Now().Unix()

	err = board.Update(rt.Ctx, "name", "ident", "tags", "note", "update_by", "update_at")
	ginx.NewRender(c).Data(board, err)
}
// boardPutConfigs replaces the payload of the board matched by the "bid" URL
// parameter (compared against both id and ident) with the configs from the
// JSON body, after stamping update_by/update_at on the board row.
func (rt *Router) boardPutConfigs(c *gin.Context) {
	var form boardForm
	ginx.BindJSON(c, &form)

	user := c.MustGet("user").(*models.User)

	key := ginx.UrlParamStr(c, "bid")
	board, err := models.BoardGet(rt.Ctx, "id = ? or ident = ?", key, key)
	ginx.Dangerous(err)
	if board == nil {
		ginx.Bomb(http.StatusNotFound, "No such dashboard")
	}

	// Non-admins need write permission on the board's business group.
	if !user.IsAdmin() {
		rt.bgrwCheck(c, board.GroupId)
	}

	board.UpdateBy, board.UpdateAt = user.Username, time.Now().Unix()
	ginx.Dangerous(board.Update(rt.Ctx, "update_by", "update_at"))

	board.Configs = form.Configs
	ginx.Dangerous(models.BoardPayloadSave(rt.Ctx, board.Id, form.Configs))

	ginx.NewRender(c).Data(board, nil)
}
// boardPutPublic updates the public and public_cate settings of the board
// named by the "bid" URL parameter. For PublicBusi the board's business-group
// bindings are replaced with bgids from the body; for any other category the
// bindings are removed.
func (rt *Router) boardPutPublic(c *gin.Context) {
	var form boardForm
	ginx.BindJSON(c, &form)

	user := c.MustGet("user").(*models.User)
	board := rt.Board(ginx.UrlParamInt64(c, "bid"))

	// Non-admins need write permission on the board's business group.
	if !user.IsAdmin() {
		rt.bgrwCheck(c, board.GroupId)
	}

	board.Public, board.PublicCate = form.Public, form.PublicCate

	var err error
	if board.PublicCate != models.PublicBusi {
		err = models.BoardBusigroupDelByBoardId(rt.Ctx, board.Id)
	} else {
		err = models.BoardBusigroupUpdate(rt.Ctx, board.Id, form.Bgids)
	}
	ginx.Dangerous(err)

	board.UpdateBy, board.UpdateAt = user.Username, time.Now().Unix()

	err = board.Update(rt.Ctx, "public", "public_cate", "update_by", "update_at")
	ginx.NewRender(c).Data(board, err)
}
// boardGets renders the boards of the business group named by the "id" URL
// parameter, filtered by the optional "query" parameter. Updater nicknames are
// filled in only when the lookup succeeded.
func (rt *Router) boardGets(c *gin.Context) {
	groupID := ginx.UrlParamInt64(c, "id")
	keyword := ginx.QueryStr(c, "query", "")

	boards, err := models.BoardGetsByGroupId(rt.Ctx, groupID, keyword)
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, boards)
	}

	ginx.NewRender(c).Data(boards, err)
}
// publicBoardGets renders the public boards (public=1) whose public_cate is 0
// or 1, plus those bound to one of the current user's business groups.
func (rt *Router) publicBoardGets(c *gin.Context) {
	user := c.MustGet("user").(*models.User)

	groupIDs, err := models.MyBusiGroupIds(rt.Ctx, user.Id)
	ginx.Dangerous(err)

	boundBoardIDs, err := models.BoardIdsByBusiGroupIds(rt.Ctx, groupIDs)
	ginx.Dangerous(err)

	openCates := []int64{0, 1}
	boards, err := models.BoardGets(rt.Ctx, "", "public=1 and (public_cate in (?) or id in (?))", openCates, boundBoardIDs)
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, boards)
	}

	ginx.NewRender(c).Data(boards, err)
}
// boardGetsByGids renders the boards of the business groups listed in the
// comma-separated "gids" query parameter, filtered by the optional "query"
// parameter, with each board's bound business-group ids attached.
//
// When gids is given, rt.bgroCheck is run for each group. When it is empty, a
// non-admin user is limited to the groups returned by models.MyBusiGroupIds
// (an empty list is rendered if there are none).
func (rt *Router) boardGetsByGids(c *gin.Context) {
	groupIDs := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")
	keyword := ginx.QueryStr(c, "query", "")

	if len(groupIDs) == 0 {
		user := c.MustGet("user").(*models.User)
		if !user.IsAdmin() {
			var err error
			groupIDs, err = models.MyBusiGroupIds(rt.Ctx, user.Id)
			ginx.Dangerous(err)

			if len(groupIDs) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	} else {
		for _, groupID := range groupIDs {
			rt.bgroCheck(c, groupID)
		}
	}

	bindings, err := models.BoardBusigroupGets(rt.Ctx)
	ginx.Dangerous(err)

	// Board id -> ids of the business groups bound to it.
	groupsByBoard := make(map[int64][]int64)
	for _, binding := range bindings {
		groupsByBoard[binding.BoardId] = append(groupsByBoard[binding.BoardId], binding.BusiGroupId)
	}

	boards, err := models.BoardGetsByBGIds(rt.Ctx, groupIDs, keyword)
	ginx.Dangerous(err)

	for idx := range boards {
		bound, found := groupsByBoard[boards[idx].Id]
		if !found {
			continue
		}
		boards[idx].Bgids = bound
	}

	models.FillUpdateByNicknames(rt.Ctx, boards)
	ginx.NewRender(c).Data(boards, err)
}
// boardClone copies the board named by the "bid" URL parameter into the same
// business group with the suffix " Cloned" passed to Clone, and copies its
// payload when it is non-empty.
func (rt *Router) boardClone(c *gin.Context) {
	user := c.MustGet("user").(*models.User)
	source := rt.Board(ginx.UrlParamInt64(c, "bid"))

	cloned := source.Clone(user.Username, source.GroupId, " Cloned")
	ginx.Dangerous(cloned.Add(rt.Ctx))

	// Carry the payload over to the clone.
	payload, err := models.BoardPayloadGet(rt.Ctx, source.Id)
	ginx.Dangerous(err)
	if payload != "" {
		ginx.Dangerous(models.BoardPayloadSave(rt.Ctx, cloned.Id, payload))
	}

	ginx.NewRender(c).Message(nil)
}
// boardsForm is the JSON request body of boardBatchClone.
type boardsForm struct {
// BoardIds are the ids of the boards to clone.
BoardIds []int64 `json:"board_ids"`
// Bgids are the target business groups; every board is cloned into each of them.
Bgids []int64 `json:"bgids"`
}
// boardBatchClone clones every board in board_ids into every business group in
// bgids, after running rt.bgrwCheck on each target group. Per-clone failures do
// not abort the batch: they are collected into a map keyed by
// "<board name>-<group id>" and rendered as the response data.
func (rt *Router) boardBatchClone(c *gin.Context) {
	user := c.MustGet("user").(*models.User)

	var form boardsForm
	ginx.BindJSON(c, &form)

	for _, groupID := range form.Bgids {
		rt.bgrwCheck(c, groupID)
	}

	failures := make(map[string]string, len(form.BoardIds))
	lang := c.GetHeader("X-Language")

	for _, groupID := range form.Bgids {
		for _, boardID := range form.BoardIds {
			source := rt.Board(boardID)
			cloned := source.Clone(user.Username, groupID, "")

			payload, err := models.BoardPayloadGet(rt.Ctx, source.Id)
			if err == nil {
				err = cloned.AtomicAdd(rt.Ctx, payload)
			}
			if err != nil {
				failures[fmt.Sprintf("%s-%d", cloned.Name, groupID)] = i18n.Sprintf(lang, err.Error())
			}
		}
	}

	ginx.NewRender(c).Data(failures, nil)
}
================================================
FILE: center/router/router_builtin.go
================================================
package router
import (
"encoding/json"
"fmt"
"net/http"
"path"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/runner"
)
// builtinCateFavoriteAdd creates a builtin_cate record for the current user
// from the JSON body; the name must not be empty.
func (rt *Router) builtinCateFavoriteAdd(c *gin.Context) {
	var cate models.BuiltinCate
	ginx.BindJSON(c, &cate)

	if cate.Name == "" {
		ginx.Bomb(http.StatusBadRequest, "name is empty")
	}

	user := c.MustGet("user").(*models.User)
	cate.UserId = user.Id

	ginx.NewRender(c).Message(cate.Create(rt.Ctx))
}
// builtinCateFavoriteDel deletes the current user's builtin_cate record whose
// name is the "name" URL parameter.
func (rt *Router) builtinCateFavoriteDel(c *gin.Context) {
	cateName := ginx.UrlParamStr(c, "name")
	user := c.MustGet("user").(*models.User)

	render := ginx.NewRender(c)
	render.Message(models.BuiltinCateDelete(rt.Ctx, cateName, user.Id))
}
// Payload is a built-in dashboard as decoded from a JSON file under
// <integrations dir>/<cate>/dashboards. It is also the request body of
// builtinBoardDetailGets, which reads Cate and Fname to locate the file.
type Payload struct {
// Cate is the integration directory name; builtinBoardCateGets sets it from the directory.
Cate string `json:"cate"`
// Fname is the dashboard file name; builtinBoardCateGets sets it from the file.
Fname string `json:"fname"`
Name string `json:"name"`
// Configs is blanked to "" by builtinBoardCateGets when listing.
Configs interface{} `json:"configs"`
Tags string `json:"tags"`
}
// BoardCate is one entry of the builtinBoardCateGets response: an integration
// directory together with the dashboards found under it.
type BoardCate struct {
// Name is the integration directory name.
Name string `json:"name"`
// IconUrl points at the first file under the directory's icon folder, if any.
IconUrl string `json:"icon_url"`
Boards []Payload `json:"boards"`
// Favorite is true when the directory name is in the current user's builtin favorites.
Favorite bool `json:"favorite"`
}
// builtinBoardDetailGets reads one built-in dashboard file, selected by the
// "cate" and "fname" fields of the JSON request body, from
// <integrations dir>/<cate>/dashboards/<fname>, decodes it into the payload and
// renders it. The integrations dir is rt.Center.BuiltinIntegrationsDir, or
// <cwd>/integrations when that is empty.
//
// cate and fname are supplied by the client, so each must be a single path
// segment: a value equal to ".." or containing a path separator is rejected
// with 400 so the read cannot leave the integrations directory.
func (rt *Router) builtinBoardDetailGets(c *gin.Context) {
	var payload Payload
	ginx.BindJSON(c, &payload)

	// Guard against path traversal through client-controlled path segments.
	for _, seg := range []string{payload.Cate, payload.Fname} {
		if seg == ".." || strings.ContainsAny(seg, `/\`) {
			ginx.Bomb(http.StatusBadRequest, "invalid cate or fname")
		}
	}

	fp := rt.Center.BuiltinIntegrationsDir
	if fp == "" {
		fp = path.Join(runner.Cwd, "integrations")
	}

	fn := fp + "/" + payload.Cate + "/dashboards/" + payload.Fname
	content, err := file.ReadBytes(fn)
	ginx.Dangerous(err)

	err = json.Unmarshal(content, &payload)
	ginx.NewRender(c).Data(payload, err)
}
// builtinBoardCateGets lists the built-in dashboards grouped by integration
// directory. Directories without dashboard files are skipped, unreadable or
// undecodable files are logged and skipped, and each listed dashboard has its
// configs blanked. Every category also carries the current user's favorite
// flag and, when an icon file exists, an icon URL.
func (rt *Router) builtinBoardCateGets(c *gin.Context) {
	root := rt.Center.BuiltinIntegrationsDir
	if root == "" {
		root = path.Join(runner.Cwd, "integrations")
	}

	user := c.MustGet("user").(*models.User)
	favorites, err := models.BuiltinCateGetByUserId(rt.Ctx, user.Id)
	if err != nil {
		// Favorites are best-effort: log and carry on.
		logger.Warningf("get builtin favorites fail: %v", err)
	}

	var cates []BoardCate

	cateDirs, err := file.DirsUnder(root)
	ginx.Dangerous(err)

	for _, cateDir := range cateDirs {
		dashboardDir := root + "/" + cateDir + "/dashboards"

		names, err := file.FilesUnder(dashboardDir)
		ginx.Dangerous(err)
		if len(names) == 0 {
			continue
		}

		var boards []Payload
		for _, name := range names {
			fullPath := dashboardDir + "/" + name

			raw, err := file.ReadBytes(fullPath)
			if err != nil {
				logger.Warningf("add board fail: %v", err)
				continue
			}

			var board Payload
			if err = json.Unmarshal(raw, &board); err != nil {
				logger.Warningf("add board:%s fail: %v", fullPath, err)
				continue
			}

			board.Cate = cateDir
			board.Fname = name
			board.Configs = ""
			boards = append(boards, board)
		}

		cate := BoardCate{Name: cateDir, Boards: boards}
		_, cate.Favorite = favorites[cateDir]

		if icons, _ := file.FilesUnder(root + "/" + cateDir + "/icon"); len(icons) > 0 {
			cate.IconUrl = fmt.Sprintf("/api/n9e/integrations/icon/%s/%s", cateDir, icons[0])
		}

		cates = append(cates, cate)
	}

	ginx.NewRender(c).Data(cates, nil)
}
// builtinBoardGets renders the names of all built-in dashboards: every ".json"
// file found under any integration directory's dashboards folder, with the
// extension removed.
func (rt *Router) builtinBoardGets(c *gin.Context) {
	root := rt.Center.BuiltinIntegrationsDir
	if root == "" {
		root = path.Join(runner.Cwd, "integrations")
	}

	cateDirs, err := file.DirsUnder(root)
	ginx.Dangerous(err)

	// Gather every dashboard file name first, then filter.
	var allFiles []string
	for _, cateDir := range cateDirs {
		found, err := file.FilesUnder(root + "/" + cateDir + "/dashboards")
		ginx.Dangerous(err)
		allFiles = append(allFiles, found...)
	}

	boardNames := make([]string, 0, len(allFiles))
	for _, fileName := range allFiles {
		if strings.HasSuffix(fileName, ".json") {
			boardNames = append(boardNames, strings.TrimSuffix(fileName, ".json"))
		}
	}

	ginx.NewRender(c).Data(boardNames, nil)
}
// AlertCate is one entry of the builtinAlertCateGets response: an integration
// directory together with all alert rules found under its alerts folder.
type AlertCate struct {
// Name is the integration directory name.
Name string `json:"name"`
// IconUrl points at the first file under the directory's icon folder, if any.
IconUrl string `json:"icon_url"`
// AlertRules is the concatenation of the rules from every readable alerts file.
AlertRules []models.AlertRule `json:"alert_rules"`
// Favorite is true when the directory name is in the current user's builtin favorites.
Favorite bool `json:"favorite"`
}
// builtinAlertCateGets lists the built-in alert rules grouped by integration
// directory. The rules of all files under a directory's alerts folder are
// concatenated; unreadable or undecodable files are logged and skipped. Every
// category also carries the current user's favorite flag and, when an icon
// file exists, an icon URL.
func (rt *Router) builtinAlertCateGets(c *gin.Context) {
	fp := rt.Center.BuiltinIntegrationsDir
	if fp == "" {
		fp = path.Join(runner.Cwd, "integrations")
	}

	me := c.MustGet("user").(*models.User)
	builtinFavoritesMap, err := models.BuiltinCateGetByUserId(rt.Ctx, me.Id)
	if err != nil {
		// Favorites are best-effort: log and carry on.
		logger.Warningf("get builtin favorites fail: %v", err)
	}

	var alertCates []AlertCate
	dirList, err := file.DirsUnder(fp)
	ginx.Dangerous(err)

	for _, dir := range dirList {
		var alertCate AlertCate
		alertCate.Name = dir

		files, err := file.FilesUnder(fp + "/" + dir + "/alerts")
		ginx.Dangerous(err)

		var alertRules []models.AlertRule
		for _, f := range files {
			fn := fp + "/" + dir + "/alerts/" + f
			content, err := file.ReadBytes(fn)
			if err != nil {
				// The messages name alert rules; they previously said "add board",
				// copied from the dashboard handlers.
				logger.Warningf("read alert rules fail: %v", err)
				continue
			}

			var ars []models.AlertRule
			err = json.Unmarshal(content, &ars)
			if err != nil {
				logger.Warningf("parse alert rules:%s fail: %v", fn, err)
				continue
			}

			alertRules = append(alertRules, ars...)
		}

		alertCate.AlertRules = alertRules

		iconFiles, _ := file.FilesUnder(fp + "/" + dir + "/icon")
		if len(iconFiles) > 0 {
			alertCate.IconUrl = fmt.Sprintf("/api/n9e/integrations/icon/%s/%s", dir, iconFiles[0])
		}

		if _, ok := builtinFavoritesMap[dir]; ok {
			alertCate.Favorite = true
		}

		alertCates = append(alertCates, alertCate)
	}

	ginx.NewRender(c).Data(alertCates, nil)
}
// builtinAlertRulesList is one entry of the builtinAlertRules response: an
// integration directory with its alert rules grouped per source file.
type builtinAlertRulesList struct {
// Name is the integration directory name.
Name string `json:"name"`
// IconUrl points at the first file under the directory's icon folder, if any.
IconUrl string `json:"icon_url"`
// AlertRules maps an alerts file name (".json" suffix trimmed) to the rules decoded from it.
AlertRules map[string][]models.AlertRule `json:"alert_rules"`
// Favorite is true when the directory name is in the current user's builtin favorites.
Favorite bool `json:"favorite"`
}
// builtinAlertRules lists the built-in alert rules grouped by integration
// directory and, within each directory, by source file (file name with the
// ".json" suffix trimmed). Directories without alerts files are skipped;
// unreadable or undecodable files are logged and skipped. Every category also
// carries the current user's favorite flag and, when an icon file exists, an
// icon URL.
func (rt *Router) builtinAlertRules(c *gin.Context) {
	fp := rt.Center.BuiltinIntegrationsDir
	if fp == "" {
		fp = path.Join(runner.Cwd, "integrations")
	}

	me := c.MustGet("user").(*models.User)
	builtinFavoritesMap, err := models.BuiltinCateGetByUserId(rt.Ctx, me.Id)
	if err != nil {
		// Favorites are best-effort: log and carry on.
		logger.Warningf("get builtin favorites fail: %v", err)
	}

	var alertCates []builtinAlertRulesList
	dirList, err := file.DirsUnder(fp)
	ginx.Dangerous(err)

	for _, dir := range dirList {
		var alertCate builtinAlertRulesList
		alertCate.Name = dir

		files, err := file.FilesUnder(fp + "/" + dir + "/alerts")
		ginx.Dangerous(err)
		if len(files) == 0 {
			continue
		}

		alertRules := make(map[string][]models.AlertRule)
		for _, f := range files {
			fn := fp + "/" + dir + "/alerts/" + f
			content, err := file.ReadBytes(fn)
			if err != nil {
				// The messages name alert rules; they previously said "add board",
				// copied from the dashboard handlers.
				logger.Warningf("read alert rules fail: %v", err)
				continue
			}

			var ars []models.AlertRule
			err = json.Unmarshal(content, &ars)
			if err != nil {
				logger.Warningf("parse alert rules:%s fail: %v", fn, err)
				continue
			}

			alertRules[strings.TrimSuffix(f, ".json")] = ars
		}

		alertCate.AlertRules = alertRules

		iconFiles, _ := file.FilesUnder(fp + "/" + dir + "/icon")
		if len(iconFiles) > 0 {
			alertCate.IconUrl = fmt.Sprintf("/api/n9e/integrations/icon/%s/%s", dir, iconFiles[0])
		}

		if _, ok := builtinFavoritesMap[dir]; ok {
			alertCate.Favorite = true
		}

		alertCates = append(alertCates, alertCate)
	}

	ginx.NewRender(c).Data(alertCates, nil)
}
// builtinBoardGet renders the raw JSON text of the built-in dashboard called
// <name>.json, where name is the "name" URL parameter. Integration directories
// are searched in the order file.DirsUnder returns them and the first existing
// file wins; when none exists the request bombs with 400.
func (rt *Router) builtinBoardGet(c *gin.Context) {
	boardName := ginx.UrlParamStr(c, "name")

	root := rt.Center.BuiltinIntegrationsDir
	if root == "" {
		root = path.Join(runner.Cwd, "integrations")
	}

	cateDirs, err := file.DirsUnder(root)
	ginx.Dangerous(err)

	for _, cateDir := range cateDirs {
		candidate := root + "/" + cateDir + "/dashboards/" + boardName + ".json"
		if !file.IsExist(candidate) {
			continue
		}

		body, err := file.ReadString(candidate)
		ginx.NewRender(c).Data(body, err)
		return
	}

	ginx.Bomb(http.StatusBadRequest, "%s not found", boardName)
}
// builtinIcon serves the file <integrations dir>/<cate>/icon/<name>, with cate
// and name taken from the URL parameters, through c.File.
func (rt *Router) builtinIcon(c *gin.Context) {
	root := rt.Center.BuiltinIntegrationsDir
	if root == "" {
		root = path.Join(runner.Cwd, "integrations")
	}

	cate := ginx.UrlParamStr(c, "cate")
	name := ginx.UrlParamStr(c, "name")

	// path.Join with a single element just cleans the path.
	c.File(path.Join(root + "/" + cate + "/icon/" + name))
}
// builtinMarkdown renders, as a string, the content of the first file under
// <integrations dir>/<cate>/markdown, where cate is the "cate" URL parameter.
// Listing or read failures are only logged; in that case, or when the folder
// holds no files, an empty string is rendered.
func (rt *Router) builtinMarkdown(c *gin.Context) {
	fp := rt.Center.BuiltinIntegrationsDir
	if fp == "" {
		fp = path.Join(runner.Cwd, "integrations")
	}

	cate := ginx.UrlParamStr(c, "cate")

	var markdown []byte
	markdownDir := fp + "/" + cate + "/markdown"
	markdownFiles, err := file.FilesUnder(markdownDir)
	if err != nil {
		logger.Warningf("get markdown fail: %v", err)
	} else if len(markdownFiles) > 0 {
		f := markdownFiles[0]
		fn := markdownDir + "/" + f
		markdown, err = file.ReadBytes(fn)
		if err != nil {
			// Name the file being read; the message previously said "get collect
			// fail", which does not describe this handler.
			logger.Warningf("read markdown:%s fail: %v", fn, err)
		}
	}

	ginx.NewRender(c).Data(string(markdown), nil)
}
================================================
FILE: center/router/router_builtin_component.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"gorm.io/gorm"
)
// SYSTEM is the CreatedBy value that builtinComponentsPut checks for: a
// component created by it keeps its existing ident on update.
const SYSTEM = "system"
// builtinComponentsAdd creates the builtin components given as a JSON array.
// An empty array is rejected with 400. Individual failures do not abort the
// batch; they are rendered as a map from component ident to error text.
func (rt *Router) builtinComponentsAdd(c *gin.Context) {
	var components []models.BuiltinComponent
	ginx.BindJSON(c, &components)

	operator := Username(c)

	if len(components) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	failures := make(map[string]string)
	for idx := range components {
		err := components[idx].Add(rt.Ctx, operator)
		if err == nil {
			continue
		}
		failures[components[idx].Ident] = err.Error()
	}

	ginx.NewRender(c).Data(failures, nil)
}
// builtinComponentsGets renders the builtin components selected by the
// optional "query" and "disabled" query parameters (disabled defaults to -1).
func (rt *Router) builtinComponentsGets(c *gin.Context) {
	keyword := ginx.QueryStr(c, "query", "")
	disabledFlag := ginx.QueryInt(c, "disabled", -1)

	components, err := models.BuiltinComponentGets(rt.Ctx, keyword, disabledFlag)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(components, nil)
}
// builtinComponentsPut updates the builtin component identified by the id in
// the JSON body. A component created by SYSTEM keeps its ident. Inside one
// transaction the "typ" column of builtin metrics is moved from the old ident
// to the new one and then the component itself is updated.
func (rt *Router) builtinComponentsPut(c *gin.Context) {
	var req models.BuiltinComponent
	ginx.BindJSON(c, &req)

	existing, err := models.BuiltinComponentGet(rt.Ctx, "id = ?", req.ID)
	ginx.Dangerous(err)
	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such builtin component")
		return
	}

	// The ident of a system-created component must not change.
	if existing.CreatedBy == SYSTEM {
		req.Ident = existing.Ident
	}

	req.UpdatedBy = Username(c)

	err = models.DB(rt.Ctx).Transaction(func(tx *gorm.DB) error {
		// Run both statements on the transaction's DB handle.
		txCtx := &ctx.Context{DB: tx}

		if txErr := models.BuiltinMetricBatchUpdateColumn(txCtx, "typ", existing.Ident, req.Ident, req.UpdatedBy); txErr != nil {
			return txErr
		}

		if txErr := existing.Update(txCtx, req); txErr != nil {
			return txErr
		}

		return nil
	})

	ginx.NewRender(c).Message(err)
}
// builtinComponentsDel deletes the builtin components whose ids are listed in
// the JSON body and renders the result of models.BuiltinComponentDels.
func (rt *Router) builtinComponentsDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	render := ginx.NewRender(c)
	render.Message(models.BuiltinComponentDels(rt.Ctx, form.Ids))
}
================================================
FILE: center/router/router_builtin_metric_filter.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/prom"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// metricFilterGets renders the metric filters visible to the current user:
// those the user created plus those whose group permissions include one of
// the user's groups (read access is enough).
func (rt *Router) metricFilterGets(c *gin.Context) {
	all, err := models.MetricFilterGets(rt.Ctx, "")
	ginx.Dangerous(err)

	user := c.MustGet("user").(*models.User)
	groupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
	ginx.Dangerous(err)

	visible := make([]models.MetricFilter, 0)
	for _, filter := range all {
		// Short-circuit keeps HasPerm from running for the creator's own filters.
		if user.Username == filter.CreateBy || HasPerm(groupIDs, filter.GroupsPerm, false) {
			visible = append(visible, filter)
		}
	}

	models.FillUpdateByNicknames(rt.Ctx, visible)
	ginx.NewRender(c).Data(visible, err)
}
// metricFilterAdd creates a metric filter from the JSON body with the current
// user as creator and updater, and renders the stored filter.
func (rt *Router) metricFilterAdd(c *gin.Context) {
	var filter models.MetricFilter
	ginx.BindJSON(c, &filter)

	user := c.MustGet("user").(*models.User)
	filter.CreateBy, filter.UpdateBy = user.Username, user.Username

	ginx.Dangerous(filter.Add(rt.Ctx))
	ginx.NewRender(c).Data(filter, nil)
}
// metricFilterDel deletes the metric filters whose ids are listed in the JSON
// body. Every filter must either have been created by the current user or
// grant write permission to one of the user's groups; otherwise "forbidden" is
// rendered and nothing is deleted.
func (rt *Router) metricFilterDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	user := c.MustGet("user").(*models.User)

	for _, filterID := range form.Ids {
		filter, err := models.MetricFilterGet(rt.Ctx, filterID)
		ginx.Dangerous(err)

		// Creators may always delete their own filters.
		if user.Username == filter.CreateBy {
			continue
		}

		groupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
		ginx.Dangerous(err)

		if !HasPerm(groupIDs, filter.GroupsPerm, true) {
			ginx.NewRender(c).Message("forbidden")
			return
		}
	}

	ginx.NewRender(c).Message(models.MetricFilterDel(rt.Ctx, form.Ids))
}
// metricFilterPut updates the metric filter given in the JSON body. Unless the
// current user created the stored filter, one of the user's groups must have
// write permission on it; otherwise "forbidden" is rendered.
func (rt *Router) metricFilterPut(c *gin.Context) {
	var filter models.MetricFilter
	ginx.BindJSON(c, &filter)

	user := c.MustGet("user").(*models.User)

	stored, err := models.MetricFilterGet(rt.Ctx, filter.ID)
	ginx.Dangerous(err)

	if isCreator := user.Username == stored.CreateBy; !isCreator {
		groupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
		ginx.Dangerous(err)

		if !HasPerm(groupIDs, stored.GroupsPerm, true) {
			ginx.NewRender(c).Message("forbidden")
			return
		}
	}

	filter.UpdateBy = user.Username
	ginx.NewRender(c).Message(filter.Update(rt.Ctx))
}
// metricPromqlReq is the JSON request body of getMetricPromql; both fields are
// passed to prom.AddLabelToPromQL.
type metricPromqlReq struct {
LabelFilter string `json:"label_filter"`
Promql string `json:"promql"`
}
// getMetricPromql renders the result of prom.AddLabelToPromQL applied to the
// label filter and PromQL expression from the JSON body.
func (rt *Router) getMetricPromql(c *gin.Context) {
	var body metricPromqlReq
	ginx.BindJSON(c, &body)

	rewritten := prom.AddLabelToPromQL(body.LabelFilter, body.Promql)

	ginx.NewRender(c).Data(rewritten, nil)
}
// HasPerm reports whether any id in gids appears in the group permissions gps.
// With checkWrite set, only permissions whose Write flag is true count.
func HasPerm(gids []int64, gps []models.GroupPerm, checkWrite bool) bool {
	// Set of group ids that grant the requested access.
	granted := make(map[int64]struct{})
	for idx := range gps {
		if !checkWrite || gps[idx].Write {
			granted[gps[idx].Gid] = struct{}{}
		}
	}

	for _, gid := range gids {
		if _, found := granted[gid]; found {
			return true
		}
	}

	return false
}
================================================
FILE: center/router/router_builtin_metrics.go
================================================
package router
import (
"net/http"
"sort"
"time"
"github.com/ccfos/nightingale/v6/center/integration"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// builtinMetricsAdd creates the builtin metrics given as a JSON array; it
// serves both single creation and import. An empty array is rejected with 400.
// Each metric gets the request language (X-Language header, "zh_CN" when
// absent) and a UUID taken from the current time in microseconds. Individual
// failures are rendered as a map from metric name to translated error text.
func (rt *Router) builtinMetricsAdd(c *gin.Context) {
	var metrics []models.BuiltinMetric
	ginx.BindJSON(c, &metrics)

	operator := Username(c)

	if len(metrics) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	lang := c.GetHeader("X-Language")
	if lang == "" {
		lang = "zh_CN"
	}

	failures := make(map[string]string)
	for idx := range metrics {
		metric := &metrics[idx]
		metric.Lang = lang
		metric.UUID = time.Now().UnixMicro()

		err := metric.Add(rt.Ctx, operator)
		if err == nil {
			continue
		}
		// The error text is translated with the raw header value, not the defaulted lang.
		failures[metric.Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error())
	}

	ginx.NewRender(c).Data(failures, nil)
}
// builtinMetricsGets renders one page of builtin metrics as {"list", "total"}.
// Metrics stored in the database are fetched with the collector, typ, query
// and unit filters and handed to integration.BuiltinPayloadInFile together
// with the language (X-Language header, "zh_CN" when absent), the limit
// (default 20) and the offset.
func (rt *Router) builtinMetricsGets(c *gin.Context) {
	collector := ginx.QueryStr(c, "collector", "")
	typ := ginx.QueryStr(c, "typ", "")
	keyword := ginx.QueryStr(c, "query", "")
	pageSize := ginx.QueryInt(c, "limit", 20)
	unit := ginx.QueryStr(c, "unit", "")

	lang := c.GetHeader("X-Language")
	if lang == "" {
		lang = "zh_CN"
	}

	stored, err := models.BuiltinMetricGets(rt.Ctx, "", collector, typ, keyword, unit)
	ginx.Dangerous(err)

	page, total, err := integration.BuiltinPayloadInFile.BuiltinMetricGets(stored, lang, collector, typ, keyword, unit, pageSize, ginx.Offset(c, pageSize))
	ginx.Dangerous(err)

	result := gin.H{
		"list":  page,
		"total": total,
	}
	ginx.NewRender(c).Data(result, nil)
}
// builtinMetricsPut updates the builtin metric identified by the id in the
// JSON body, recording the current user as updater. A 404 message is rendered
// when no such metric exists.
func (rt *Router) builtinMetricsPut(c *gin.Context) {
	var req models.BuiltinMetric
	ginx.BindJSON(c, &req)

	existing, err := models.BuiltinMetricGet(rt.Ctx, "id = ?", req.ID)
	ginx.Dangerous(err)
	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such builtin metric")
		return
	}

	req.UpdatedBy = Username(c)

	ginx.NewRender(c).Message(existing.Update(rt.Ctx, req))
}
// builtinMetricsDel deletes the builtin metrics whose ids are listed in the
// JSON body and renders the result of models.BuiltinMetricDels.
func (rt *Router) builtinMetricsDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	render := ginx.NewRender(c)
	render.Message(models.BuiltinMetricDels(rt.Ctx, form.Ids))
}
// builtinMetricsDefaultTypes renders a fixed list of default metric types.
func (rt *Router) builtinMetricsDefaultTypes(c *gin.Context) {
	defaultTypes := []string{"Linux", "Procstat", "cAdvisor", "Ping", "MySQL", "ClickHouse"}

	ginx.NewRender(c).Data(defaultTypes, nil)
}
// builtinMetricsTypes renders the sorted, de-duplicated union of the metric
// types found in the database and in integration.BuiltinPayloadInFile for the
// given language (X-Language header), "collector" and "query" parameters.
func (rt *Router) builtinMetricsTypes(c *gin.Context) {
	collector := ginx.QueryStr(c, "collector", "")
	keyword := ginx.QueryStr(c, "query", "")
	lang := c.GetHeader("X-Language")

	fromDB, err := models.BuiltinMetricTypes(rt.Ctx, lang, collector, keyword)
	ginx.Dangerous(err)

	fromFile := integration.BuiltinPayloadInFile.BuiltinMetricTypes(lang, collector, keyword)

	// Merge both sources through a set to drop duplicates.
	seen := make(map[string]struct{})
	for _, name := range fromDB {
		seen[name] = struct{}{}
	}
	for _, name := range fromFile {
		seen[name] = struct{}{}
	}

	merged := make([]string, 0, len(seen))
	for name := range seen {
		merged = append(merged, name)
	}
	sort.Strings(merged)

	ginx.NewRender(c).Data(merged, nil)
}
// builtinMetricsCollectors renders the sorted, de-duplicated union of the
// metric collectors found in the database and in
// integration.BuiltinPayloadInFile for the given language (X-Language header),
// "typ" and "query" parameters.
func (rt *Router) builtinMetricsCollectors(c *gin.Context) {
	typ := ginx.QueryStr(c, "typ", "")
	keyword := ginx.QueryStr(c, "query", "")
	lang := c.GetHeader("X-Language")

	fromDB, err := models.BuiltinMetricCollectors(rt.Ctx, lang, typ, keyword)
	ginx.Dangerous(err)

	fromFile := integration.BuiltinPayloadInFile.BuiltinMetricCollectors(lang, typ, keyword)

	// Merge both sources through a set to drop duplicates.
	seen := make(map[string]struct{})
	for _, name := range fromDB {
		seen[name] = struct{}{}
	}
	for _, name := range fromFile {
		seen[name] = struct{}{}
	}

	merged := make([]string, 0, len(seen))
	for name := range seen {
		merged = append(merged, name)
	}
	sort.Strings(merged)

	ginx.NewRender(c).Data(merged, nil)
}
================================================
FILE: center/router/router_builtin_payload.go
================================================
package router
import (
"encoding/json"
"net/http"
"strings"
"time"
"github.com/BurntSushi/toml"
"github.com/ccfos/nightingale/v6/center/integration"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// Board is the set of dashboard fields decoded from the JSON content of a
// builtin payload of type "dashboard". Configs is kept as an opaque value and
// is re-marshalled unchanged when the payload is stored.
type Board struct {
Name string `json:"name"`
Tags string `json:"tags"`
Configs interface{} `json:"configs"`
UUID int64 `json:"uuid"`
Note string `json:"note"`
}
// builtinPayloadsAdd creates builtin payloads from the JSON array in the
// request body.
//
// For "alert" and "dashboard" payloads the content is decoded first; content
// that starts with "[" is treated as a list and stored as one payload per
// element. Elements without a UUID get one derived from the current time in
// microseconds. "collect" content must decode as TOML. Per-item failures do not
// abort the request: they are collected into a name -> message map that is
// returned as the response data.
func (rt *Router) builtinPayloadsAdd(c *gin.Context) {
	var lst []models.BuiltinPayload
	ginx.BindJSON(c, &lst)

	username := Username(c)

	if len(lst) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	lang := c.GetHeader("X-Language")
	reterr := make(map[string]string)

	// save stores one payload and records a translated error under its name.
	save := func(bp *models.BuiltinPayload) {
		if err := bp.Add(rt.Ctx, username); err != nil {
			reterr[bp.Name] = i18n.Sprintf(lang, err.Error())
		}
	}

	// addAlertRule stores a single alert rule as a payload that inherits type,
	// component and category from src.
	addAlertRule := func(src *models.BuiltinPayload, rule models.AlertRule) {
		if rule.UUID == 0 {
			rule.UUID = time.Now().UnixMicro()
		}
		contentBytes, err := json.Marshal(rule)
		if err != nil {
			reterr[rule.Name] = err.Error()
			return
		}
		save(&models.BuiltinPayload{
			Type:        src.Type,
			ComponentID: src.ComponentID,
			Cate:        src.Cate,
			Name:        rule.Name,
			Tags:        rule.AppendTags,
			UUID:        rule.UUID,
			Content:     string(contentBytes),
			CreatedBy:   username,
			UpdatedBy:   username,
		})
	}

	// addDashboard stores a single dashboard as a payload that inherits type,
	// component and category from src.
	addDashboard := func(src *models.BuiltinPayload, dashboard Board) {
		if dashboard.UUID == 0 {
			dashboard.UUID = time.Now().UnixMicro()
		}
		contentBytes, err := json.Marshal(dashboard)
		if err != nil {
			reterr[dashboard.Name] = err.Error()
			return
		}
		save(&models.BuiltinPayload{
			Type:        src.Type,
			ComponentID: src.ComponentID,
			Cate:        src.Cate,
			Name:        dashboard.Name,
			Tags:        dashboard.Tags,
			UUID:        dashboard.UUID,
			Note:        dashboard.Note,
			Content:     string(contentBytes),
			CreatedBy:   username,
			UpdatedBy:   username,
		})
	}

	for i := range lst {
		item := &lst[i]

		switch item.Type {
		case "alert":
			if strings.HasPrefix(strings.TrimSpace(item.Content), "[") {
				// The content holds several alert rule templates.
				var rules []models.AlertRule
				if err := json.Unmarshal([]byte(item.Content), &rules); err != nil {
					reterr[item.Name] = err.Error()
					// Do not store a partially decoded list.
					continue
				}
				for _, rule := range rules {
					addAlertRule(item, rule)
				}
				continue
			}

			var rule models.AlertRule
			if err := json.Unmarshal([]byte(item.Content), &rule); err != nil {
				reterr[item.Name] = err.Error()
				continue
			}
			addAlertRule(item, rule)

		case "dashboard":
			if strings.HasPrefix(strings.TrimSpace(item.Content), "[") {
				// The content holds several dashboard templates.
				var dashboards []Board
				if err := json.Unmarshal([]byte(item.Content), &dashboards); err != nil {
					reterr[item.Name] = err.Error()
					// Do not store a partially decoded list.
					continue
				}
				for _, dashboard := range dashboards {
					addDashboard(item, dashboard)
				}
				continue
			}

			var dashboard Board
			if err := json.Unmarshal([]byte(item.Content), &dashboard); err != nil {
				reterr[item.Name] = i18n.Sprintf(lang, err.Error())
				continue
			}
			addDashboard(item, dashboard)

		default:
			if item.Type == "collect" {
				// Collect payloads are only validated as TOML; the decoded value is discarded.
				decoded := make(map[string]interface{})
				if _, err := toml.Decode(item.Content, &decoded); err != nil {
					reterr[item.Name] = err.Error()
					continue
				}
			}
			save(item)
		}
	}

	ginx.NewRender(c).Data(reterr, nil)
}
// builtinPayloadsGets lists builtin payloads of the required "type", optionally
// narrowed by "component_id", "cate" and "query". Results from the database are
// returned first, followed by those found in the integration files.
func (rt *Router) builtinPayloadsGets(c *gin.Context) {
	payloadType := ginx.QueryStr(c, "type", "")
	if payloadType == "" {
		ginx.Bomb(http.StatusBadRequest, "type is required")
		return
	}

	componentID := uint64(ginx.QueryInt64(c, "component_id", 0))
	cate := ginx.QueryStr(c, "cate", "")
	query := ginx.QueryStr(c, "query", "")

	payloads, err := models.BuiltinPayloadGets(rt.Ctx, componentID, payloadType, cate, query)
	ginx.Dangerous(err)

	fromFile, err := integration.BuiltinPayloadInFile.GetBuiltinPayload(payloadType, cate, query, componentID)
	ginx.Dangerous(err)

	// Appending an empty slice leaves payloads untouched.
	payloads = append(payloads, fromFile...)

	ginx.NewRender(c).Data(payloads, nil)
}
// builtinPayloadcatesGet responds with the distinct payload categories for the
// given "type" and "component_id", merging the database with the integration
// files.
//
// Categories are returned in first-seen order (database entries first, then
// file entries) so that repeated calls yield a stable list instead of the
// random order produced by iterating a map.
func (rt *Router) builtinPayloadcatesGet(c *gin.Context) {
	typ := ginx.QueryStr(c, "type", "")
	componentID := uint64(ginx.QueryInt64(c, "component_id", 0))

	cates, err := models.BuiltinPayloadCates(rt.Ctx, typ, componentID)
	ginx.Dangerous(err)

	catesInFile, err := integration.BuiltinPayloadInFile.GetBuiltinPayloadCates(typ, componentID)
	ginx.Dangerous(err)

	seen := make(map[string]struct{}, len(cates)+len(catesInFile))
	// Non-nil so that an empty result is rendered as [] rather than null.
	result := make([]string, 0, len(cates)+len(catesInFile))

	// add appends cate unless it has already been emitted.
	add := func(cate string) {
		if _, dup := seen[cate]; dup {
			return
		}
		seen[cate] = struct{}{}
		result = append(result, cate)
	}

	for _, cate := range cates {
		add(cate)
	}
	for _, cate := range catesInFile {
		add(cate)
	}

	ginx.NewRender(c).Data(result, nil)
}
// builtinPayloadsPut updates the builtin payload identified by the ID in the
// request body. For "alert" and "dashboard" payloads the name, tags (and note
// for dashboards) are taken from the decoded content; "collect" content must
// decode as TOML. Undecodable content is rejected with 400.
func (rt *Router) builtinPayloadsPut(c *gin.Context) {
	var req models.BuiltinPayload
	ginx.BindJSON(c, &req)

	bp, err := models.BuiltinPayloadGet(rt.Ctx, "id = ?", req.ID)
	ginx.Dangerous(err)
	if bp == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such builtin payload")
		return
	}

	switch req.Type {
	case "alert":
		var rule models.AlertRule
		if err := json.Unmarshal([]byte(req.Content), &rule); err != nil {
			ginx.Bomb(http.StatusBadRequest, err.Error())
		}
		req.Name = rule.Name
		req.Tags = rule.AppendTags

	case "dashboard":
		var board Board
		if err := json.Unmarshal([]byte(req.Content), &board); err != nil {
			ginx.Bomb(http.StatusBadRequest, err.Error())
		}
		req.Name = board.Name
		req.Tags = board.Tags
		req.Note = board.Note

	case "collect":
		// Only validated; the decoded value is not used.
		decoded := make(map[string]interface{})
		if _, err := toml.Decode(req.Content, &decoded); err != nil {
			ginx.Bomb(http.StatusBadRequest, err.Error())
		}
	}

	req.UpdatedBy = Username(c)

	ginx.NewRender(c).Message(bp.Update(rt.Ctx, req))
}
// builtinPayloadsDel deletes the builtin payloads whose IDs are listed in the
// request body, after the form has been verified.
func (rt *Router) builtinPayloadsDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	render := ginx.NewRender(c)
	render.Message(models.BuiltinPayloadDels(rt.Ctx, form.Ids))
}
// builtinPayloadsGetByUUID looks a builtin payload up by the "uuid" query
// parameter. A database record wins; otherwise the entry indexed under that
// UUID in the integration files is returned.
func (rt *Router) builtinPayloadsGetByUUID(c *gin.Context) {
	uuid := ginx.QueryInt64(c, "uuid")

	bp, err := models.BuiltinPayloadGet(rt.Ctx, "uuid = ?", uuid)
	ginx.Dangerous(err)

	if bp == nil {
		ginx.NewRender(c).Data(integration.BuiltinPayloadInFile.IndexData[uuid], nil)
		return
	}

	ginx.NewRender(c).Data(bp, nil)
}
================================================
FILE: center/router/router_busi_group.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// busiGroupForm is the JSON request body bound by busiGroupAdd and
// busiGroupPut. Name is mandatory; Members is only read when creating a group.
type busiGroupForm struct {
Name string `json:"name" binding:"required"`
LabelEnable int `json:"label_enable"`
LabelValue string `json:"label_value"`
Members []models.BusiGroupMember `json:"members"`
}
// busiGroupAdd creates a business group from the request body and responds
// with the ID of the new group. The member list must be non-empty and contain
// at least one entry whose PermFlag is "rw".
func (rt *Router) busiGroupAdd(c *gin.Context) {
	var form busiGroupForm
	ginx.BindJSON(c, &form)

	if len(form.Members) == 0 {
		ginx.Bomb(http.StatusBadRequest, "members empty")
	}

	hasRW := false
	for _, member := range form.Members {
		if member.PermFlag == "rw" {
			hasRW = true
			break
		}
	}
	if !hasRW {
		ginx.Bomb(http.StatusBadRequest, "At least one team have rw permission")
	}

	operator := c.MustGet("username").(string)
	err := models.BusiGroupAdd(rt.Ctx, form.Name, form.LabelEnable, form.LabelValue, form.Members, operator)
	ginx.Dangerous(err)

	// After a successful insert the group must be retrievable by its name.
	created, err := models.BusiGroupGet(rt.Ctx, "name=?", form.Name)
	ginx.Dangerous(err)

	if created == nil {
		ginx.NewRender(c).Message("Failed to create BusiGroup(%s)", form.Name)
		return
	}

	ginx.NewRender(c).Data(created.Id, nil)
}
// busiGroupPut updates name and label settings of the business group stored in
// the request context under "busi_group".
func (rt *Router) busiGroupPut(c *gin.Context) {
	var form busiGroupForm
	ginx.BindJSON(c, &form)

	operator := c.MustGet("username").(string)
	group := c.MustGet("busi_group").(*models.BusiGroup)

	err := group.Update(rt.Ctx, form.Name, form.LabelEnable, form.LabelValue, operator)
	ginx.NewRender(c).Message(err)
}
// busiGroupMemberAdd adds the members in the request body to the business
// group stored in the request context. Every member must reference that
// group's ID, otherwise the request is rejected with 400.
func (rt *Router) busiGroupMemberAdd(c *gin.Context) {
	var members []models.BusiGroupMember
	ginx.BindJSON(c, &members)

	operator := c.MustGet("username").(string)
	group := c.MustGet("busi_group").(*models.BusiGroup)

	for _, member := range members {
		if member.BusiGroupId != group.Id {
			ginx.Bomb(http.StatusBadRequest, "business group id invalid")
		}
	}

	err := group.AddMembers(rt.Ctx, members, operator)
	ginx.NewRender(c).Message(err)
}
// busiGroupMemberDel removes the members in the request body from the business
// group stored in the request context. Every member must reference that
// group's ID, otherwise the request is rejected with 400.
func (rt *Router) busiGroupMemberDel(c *gin.Context) {
	var members []models.BusiGroupMember
	ginx.BindJSON(c, &members)

	operator := c.MustGet("username").(string)
	group := c.MustGet("busi_group").(*models.BusiGroup)

	for _, member := range members {
		if member.BusiGroupId != group.Id {
			ginx.Bomb(http.StatusBadRequest, "business group id invalid")
		}
	}

	err := group.DelMembers(rt.Ctx, members, operator)
	ginx.NewRender(c).Message(err)
}
// busiGroupDel deletes the business group stored in the request context, logs
// the outcome together with the operator, and renders the deletion error (if
// any) to the client.
func (rt *Router) busiGroupDel(c *gin.Context) {
	operator := c.MustGet("username").(string)
	group := c.MustGet("busi_group").(*models.BusiGroup)

	err := group.Del(rt.Ctx)
	switch err {
	case nil:
		logger.Infof("busi_group_delete succ: operator=%s, group_name=%s", operator, group.Name)
	default:
		logger.Infof("busi_group_delete fail: operator=%s, group_name=%s error=%v", operator, group.Name, err)
	}

	ginx.NewRender(c).Message(err)
}
// busiGroupGets lists the business groups of the current user, as decided by
// User.BusiGroups (original note: the caller is a super admin or a member of
// the group). An empty result is rendered as an empty list.
func (rt *Router) busiGroupGets(c *gin.Context) {
	var (
		limit = ginx.QueryInt(c, "limit", defaultLimit)
		query = ginx.QueryStr(c, "query", "")
		all   = ginx.QueryBool(c, "all", false)
	)

	me := c.MustGet("user").(*models.User)
	groups, err := me.BusiGroups(rt.Ctx, limit, query, all)

	if len(groups) == 0 {
		groups = []models.BusiGroup{}
	}

	// Nicknames are only filled in when the lookup succeeded.
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, groups)
	}

	ginx.NewRender(c).Data(groups, err)
}
// busiGroupGetsByService responds with every business group.
func (rt *Router) busiGroupGetsByService(c *gin.Context) {
	groups, err := models.BusiGroupGetAll(rt.Ctx)

	render := ginx.NewRender(c)
	render.Data(groups, err)
}
// busiGroupAlertingsGets returns the number of active alerts for each business
// group ID listed in the "ids" query parameter. Per the original note it is
// only called from the active-alerts page.
func (rt *Router) busiGroupAlertingsGets(c *gin.Context) {
	rawIDs := ginx.QueryStr(c, "ids", "")
	groupIDs := strx.IdsInt64ForAPI(rawIDs)

	counts, err := models.AlertNumbers(rt.Ctx, groupIDs)
	ginx.NewRender(c).Data(counts, err)
}
// busiGroupGet responds with the business group named by the ":id" URL
// parameter, with its user groups filled in.
func (rt *Router) busiGroupGet(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")
	group := BusiGroup(rt.Ctx, id)

	err := group.FillUserGroups(rt.Ctx)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(group, nil)
}
// busiGroupsGetTags responds with the tags of all targets that belong to the
// business groups listed (comma-separated) in the "gids" query parameter.
func (rt *Router) busiGroupsGetTags(c *gin.Context) {
	rawIDs := ginx.QueryStr(c, "gids", "")
	groupIDs := strx.IdsInt64ForAPI(rawIDs, ",")

	idents, err := models.TargetIndentsGetByBgids(rt.Ctx, groupIDs)
	ginx.Dangerous(err)

	tags, err := models.TargetGetTags(rt.Ctx, idents, true, "busigroup")
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(tags, nil)
}
================================================
FILE: center/router/router_captcha.go
================================================
package router
import (
"context"
"time"
"github.com/ccfos/nightingale/v6/storage"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
captcha "github.com/mojocn/base64Captcha"
"github.com/toolkits/pkg/logger"
)
// CaptchaRedisStore keeps captcha answers in Redis. Its Set, Get and Verify
// methods let it be passed as the store argument of captcha.NewCaptcha.
type CaptchaRedisStore struct {
redis storage.Redis
}
// Set stores value under id in Redis with a lifetime of 300 seconds. A Redis
// failure is logged and returned.
func (s *CaptchaRedisStore) Set(id string, value string) error {
	const ttl = 300 * time.Second

	if err := s.redis.Set(context.Background(), id, value, ttl).Err(); err != nil {
		logger.Errorf("captcha id set to redis error : %s", err.Error())
		return err
	}

	return nil
}
// Get returns the value stored under id, or "" when Redis reports an error
// (which is logged). When clear is true the key is deleted after a successful
// read.
func (s *CaptchaRedisStore) Get(id string, clear bool) string {
	ctx := context.Background()

	stored, err := s.redis.Get(ctx, id).Result()
	if err != nil {
		logger.Errorf("captcha id get from redis error : %s", err.Error())
		return ""
	}

	if !clear {
		return stored
	}

	s.redis.Del(ctx, id)
	return stored
}
// Verify reports whether answer matches the value stored under id. When clear
// is true the stored value is removed by the lookup.
//
// A missing, expired or unreadable id yields an empty stored value from Get;
// that is never accepted, so an empty answer cannot pass verification for an
// unknown id.
func (s *CaptchaRedisStore) Verify(id, answer string, clear bool) bool {
	stored := s.Get(id, clear)
	if stored == "" {
		return false
	}
	return stored == answer
}
// newCaptchaRedisStore returns the package-level captcha store, creating it
// from the router's Redis client on first use.
func (rt *Router) newCaptchaRedisStore() *CaptchaRedisStore {
	if captchaStore != nil {
		return captchaStore
	}

	captchaStore = &CaptchaRedisStore{redis: rt.Redis}
	return captchaStore
}
// captchaStore is the package-level captcha store. It is nil until
// Router.newCaptchaRedisStore assigns it.
var captchaStore *CaptchaRedisStore
// CaptchaReqBody is the JSON body bound by captchaVerify: the captcha ID and
// the answer supplied by the user. The fields carry no JSON tags, so the keys
// are the field names themselves.
type CaptchaReqBody struct {
Id string
VerifyValue string
}
// generateCaptcha creates a math captcha image, stores its answer through the
// Redis-backed store, and responds with the base64 image data and captcha ID.
func (rt *Router) generateCaptcha(c *gin.Context) {
	fonts := []string{"wqy-microhei.ttc"}
	driver := captcha.NewDriverMath(60, 200, 0, captcha.OptionShowHollowLine, nil, nil, fonts)
	generator := captcha.NewCaptcha(driver, rt.newCaptchaRedisStore())

	// b64s is a data URI of the form data:image/png;base64,...
	id, b64s, _, err := generator.Generate()
	if err != nil {
		ginx.NewRender(c).Message(err)
		return
	}

	payload := gin.H{
		"imgdata":   b64s,
		"captchaid": id,
	}
	ginx.NewRender(c).Data(payload, nil)
}
// 验证
func (rt *Router) captchaVerify(c *gin.Context) {
var param CaptchaReqBody
ginx.BindJSON(c, ¶m)
//verify the captcha
if captchaStore.Verify(param.Id, param.VerifyValue, true) {
ginx.NewRender(c).Message("")
return
}
ginx.NewRender(c).Message("incorrect verification code")
}
// ifShowCaptcha reports, under the "show" key, whether the captcha is enabled
// in the HTTP configuration.
func (rt *Router) ifShowCaptcha(c *gin.Context) {
	show := false
	if rt.HTTP.ShowCaptcha.Enable {
		show = true
	}

	ginx.NewRender(c).Data(gin.H{
		"show": show,
	}, nil)
}
// CaptchaVerify reports whether value is the correct answer for captcha id.
// The stored answer is cleared by the check.
//
// When the package-level store has not been created yet there is nothing to
// compare against, so the answer is rejected instead of dereferencing a nil
// store.
func CaptchaVerify(id string, value string) bool {
	if captchaStore == nil {
		return false
	}
	return captchaStore.Verify(id, value, true)
}
================================================
FILE: center/router/router_chart_share.go
================================================
package router
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// chartShareGets responds with the shared charts whose IDs are listed
// (comma-separated) in the "ids" query parameter.
func (rt *Router) chartShareGets(c *gin.Context) {
	rawIDs := ginx.QueryStr(c, "ids", "")
	shareIDs := strx.IdsInt64ForAPI(rawIDs, ",")

	shares, err := models.ChartShareGetsByIds(rt.Ctx, shareIDs)
	ginx.NewRender(c).Data(shares, err)
}
// chartShareForm is one element of the JSON array bound by chartShareAdd.
type chartShareForm struct {
DatasourceId int64 `json:"datasource_id"`
Configs string `json:"configs"`
}
// chartShareAdd stores one shared chart per element of the request body and
// responds with the IDs of the created records, in request order. All records
// share the same creation timestamp and creator.
func (rt *Router) chartShareAdd(c *gin.Context) {
	creator := c.MustGet("username").(string)

	var forms []chartShareForm
	ginx.BindJSON(c, &forms)

	createdAt := time.Now().Unix()
	// Non-nil so that an empty request renders as [] rather than null.
	ids := make([]int64, 0, len(forms))

	for i := range forms {
		share := models.ChartShare{
			DatasourceId: forms[i].DatasourceId,
			Configs:      forms[i].Configs,
			CreateBy:     creator,
			CreateAt:     createdAt,
		}
		ginx.Dangerous(share.Add(rt.Ctx))
		ids = append(ids, share.Id)
	}

	ginx.NewRender(c).Data(ids, nil)
}
================================================
FILE: center/router/router_config.go
================================================
package router
import (
"encoding/json"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
func (rt *Router) notifyChannelsGets(c *gin.Context) {
var labelAndKeys []models.LabelAndKey
cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCHANNEL)
ginx.Dangerous(err)
if cval == "" {
ginx.NewRender(c).Data(labelAndKeys, nil)
return
}
var notifyChannels []models.NotifyChannel
err = json.Unmarshal([]byte(cval), ¬ifyChannels)
ginx.Dangerous(err)
for _, v := range notifyChannels {
if v.Hide {
continue
}
var labelAndKey models.LabelAndKey
labelAndKey.Label = v.Name
labelAndKey.Key = v.Ident
labelAndKeys = append(labelAndKeys, labelAndKey)
}
ginx.NewRender(c).Data(labelAndKeys, nil)
}
func (rt *Router) contactKeysGets(c *gin.Context) {
var labelAndKeys []models.LabelAndKey
cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCONTACT)
ginx.Dangerous(err)
if cval == "" {
ginx.NewRender(c).Data(labelAndKeys, nil)
return
}
var notifyContacts []models.NotifyContact
err = json.Unmarshal([]byte(cval), ¬ifyContacts)
ginx.Dangerous(err)
for _, v := range notifyContacts {
if v.Hide {
continue
}
var labelAndKey models.LabelAndKey
labelAndKey.Label = v.Name
labelAndKey.Key = v.Ident
labelAndKeys = append(labelAndKeys, labelAndKey)
}
ginx.NewRender(c).Data(labelAndKeys, nil)
}
// siteInfo responds with the configuration value stored under "site_info".
func (rt *Router) siteInfo(c *gin.Context) {
	info, err := models.ConfigsGet(rt.Ctx, "site_info")

	render := ginx.NewRender(c)
	render.Data(info, err)
}
================================================
FILE: center/router/router_configs.go
================================================
package router
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// EMBEDDEDDASHBOARD is the configuration key under which the embedded
// dashboards value is read and written.
const EMBEDDEDDASHBOARD = "embedded-dashboards"
// configsGet responds with one page of configuration entries, selected by the
// "prefix" and "limit" (default 10) query parameters. Updater nicknames are
// filled in when the lookup succeeded.
func (rt *Router) configsGet(c *gin.Context) {
	prefix := ginx.QueryStr(c, "prefix", "")
	limit := ginx.QueryInt(c, "limit", 10)
	offset := ginx.Offset(c, limit)

	entries, err := models.ConfigsGets(rt.Ctx, prefix, limit, offset)
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, entries)
	}

	ginx.NewRender(c).Data(entries, err)
}
// configGet responds with the configuration entry named by the ":id" URL
// parameter.
func (rt *Router) configGet(c *gin.Context) {
	entry, err := models.ConfigGet(rt.Ctx, ginx.UrlParamInt64(c, "id"))
	ginx.NewRender(c).Data(entry, err)
}
// configGetAll responds with every configuration entry.
func (rt *Router) configGetAll(c *gin.Context) {
	entries, err := models.ConfigsGetAll(rt.Ctx)

	render := ginx.NewRender(c)
	render.Data(entries, err)
}
// configGetByKey responds with the configuration value stored under the "key"
// query parameter.
func (rt *Router) configGetByKey(c *gin.Context) {
	key := ginx.QueryStr(c, "key")

	value, err := models.ConfigsGet(rt.Ctx, key)
	ginx.NewRender(c).Data(value, err)
}
// configPutByKey stores the key/value pair from the request body, recording
// the current user as the updater.
func (rt *Router) configPutByKey(c *gin.Context) {
	var form models.Configs
	ginx.BindJSON(c, &form)

	operator := c.MustGet("username").(string)

	err := models.ConfigsSetWithUname(rt.Ctx, form.Ckey, form.Cval, operator)
	ginx.NewRender(c).Message(err)
}
// embeddedDashboardsGet responds with the configuration value stored under
// EMBEDDEDDASHBOARD.
func (rt *Router) embeddedDashboardsGet(c *gin.Context) {
	value, err := models.ConfigsGet(rt.Ctx, EMBEDDEDDASHBOARD)

	render := ginx.NewRender(c)
	render.Data(value, err)
}
// embeddedDashboardsPut stores the value from the request body under
// EMBEDDEDDASHBOARD, recording the current user as the updater. The key in the
// request body is ignored.
func (rt *Router) embeddedDashboardsPut(c *gin.Context) {
	var form models.Configs
	ginx.BindJSON(c, &form)

	operator := c.MustGet("username").(string)

	err := models.ConfigsSetWithUname(rt.Ctx, EMBEDDEDDASHBOARD, form.Cval, operator)
	ginx.NewRender(c).Message(err)
}
// configsDel deletes the configuration entries whose IDs are listed in the
// request body.
func (rt *Router) configsDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)

	render := ginx.NewRender(c)
	render.Message(models.ConfigsDel(rt.Ctx, form.Ids))
}
// configsPut updates every configuration entry in the request body (service
// API). The updater is the string stored in the context under "user", or
// "default" when that is empty; all entries share one update timestamp.
func (rt *Router) configsPut(c *gin.Context) { //for APIForService
	var entries []models.Configs
	ginx.BindJSON(c, &entries)

	operator := c.GetString("user")
	if operator == "" {
		operator = "default"
	}

	updatedAt := time.Now().Unix()
	for i := range entries {
		entry := &entries[i]
		entry.UpdateBy = operator
		entry.UpdateAt = updatedAt
		ginx.Dangerous(entry.Update(rt.Ctx))
	}

	ginx.NewRender(c).Message(nil)
}
// configsPost creates every configuration entry in the request body (service
// API). Creator and updater are the string stored in the context under "user",
// or "default" when that is empty; all entries share one timestamp.
func (rt *Router) configsPost(c *gin.Context) { //for APIForService
	var entries []models.Configs
	ginx.BindJSON(c, &entries)

	operator := c.GetString("user")
	if operator == "" {
		operator = "default"
	}

	createdAt := time.Now().Unix()
	for i := range entries {
		entry := &entries[i]
		entry.CreateBy = operator
		entry.UpdateBy = operator
		entry.CreateAt = createdAt
		entry.UpdateAt = createdAt
		ginx.Dangerous(entry.Add(rt.Ctx))
	}

	ginx.NewRender(c).Message(nil)
}
================================================
FILE: center/router/router_crypto.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// confPropCrypto is the JSON body bound by confPropEncrypt and
// confPropDecrypt: the text to process and the key to use. Both fields are
// mandatory.
type confPropCrypto struct {
Data string `json:"data" binding:"required"`
Key string `json:"key" binding:"required"`
}
// confPropEncrypt encrypts the "data" field of the request body with the given
// "key" and responds with the source text, the key and the encrypted result.
//
// The key must be 16, 24 or 32 bytes long, otherwise a 400 is returned. An
// encryption failure yields a 500 carrying the error text and nothing else.
func (rt *Router) confPropEncrypt(c *gin.Context) {
	var f confPropCrypto
	ginx.BindJSON(c, &f)

	switch len(f.Key) {
	case 16, 24, 32:
		// accepted key length
	default:
		c.String(400, "The key length should be 16, 24 or 32")
		return
	}

	s, err := secu.DealWithEncrypt(f.Data, f.Key)
	if err != nil {
		c.String(500, err.Error())
		// Stop here so that no second (200) body is written after the error.
		return
	}

	c.JSON(200, gin.H{
		"src":     f.Data,
		"key":     f.Key,
		"encrypt": s,
	})
}
// confPropDecrypt decrypts the "data" field of the request body with the given
// "key" and responds with the source text, the key and the decrypted result.
//
// The key must be 16, 24 or 32 bytes long, otherwise a 400 is returned. A
// decryption failure yields a 500 carrying the error text and nothing else.
func (rt *Router) confPropDecrypt(c *gin.Context) {
	var f confPropCrypto
	ginx.BindJSON(c, &f)

	switch len(f.Key) {
	case 16, 24, 32:
		// accepted key length
	default:
		c.String(400, "The key length should be 16, 24 or 32")
		return
	}

	s, err := secu.DealWithDecrypt(f.Data, f.Key)
	if err != nil {
		c.String(500, err.Error())
		// Stop here so that no second (200) body is written after the error.
		return
	}

	c.JSON(200, gin.H{
		"src":     f.Data,
		"key":     f.Key,
		"decrypt": s,
	})
}
================================================
FILE: center/router/router_dash_annotation.go
================================================
package router
import (
"fmt"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// checkAnnotationPermission aborts the request unless the current user has
// "rw" permission on the business group that owns the given dashboard. It
// bombs with 500 when the dashboard lookup fails, 404 when the dashboard does
// not exist and 403 when permission is missing.
func checkAnnotationPermission(c *gin.Context, ctx *ctx.Context, dashboardId int64) {
	board, err := models.BoardGetByID(ctx, dashboardId)
	switch {
	case err != nil:
		ginx.Bomb(http.StatusInternalServerError, "failed to get dashboard: %v", err)
	case board == nil:
		ginx.Bomb(http.StatusNotFound, "dashboard not found")
	}

	group := BusiGroup(ctx, board.GroupId)
	user := c.MustGet("user").(*models.User)

	allowed, err := user.CanDoBusiGroup(ctx, group, "rw")
	ginx.Dangerous(err)

	if !allowed {
		ginx.Bomb(http.StatusForbidden, "forbidden")
	}
}
// dashAnnotationAdd creates a dashboard annotation from the request body after
// checking that the user may write to the target dashboard, and responds with
// the annotation's ID.
func (rt *Router) dashAnnotationAdd(c *gin.Context) {
	var f models.DashAnnotation
	ginx.BindJSON(c, &f)

	username := c.MustGet("username").(string)
	now := time.Now().Unix()

	checkAnnotationPermission(c, rt.Ctx, f.DashboardId)

	f.CreateBy = username
	f.CreateAt = now
	f.UpdateBy = username
	f.UpdateAt = now

	// Run the insert before reading f.Id: within a single argument list Go does
	// not specify whether the field read or the Add call happens first.
	err := f.Add(rt.Ctx)
	ginx.NewRender(c).Data(f.Id, err)
}
// dashAnnotationGets lists annotations using the required "dashboard_id",
// "from" and "to" query parameters and an optional "limit" (default 100).
func (rt *Router) dashAnnotationGets(c *gin.Context) {
	var (
		dashboardID = ginx.QueryInt64(c, "dashboard_id")
		from        = ginx.QueryInt64(c, "from")
		to          = ginx.QueryInt64(c, "to")
		limit       = ginx.QueryInt(c, "limit", 100)
	)

	annotations, err := models.DashAnnotationGets(rt.Ctx, dashboardID, from, to, limit)
	ginx.NewRender(c).Data(annotations, err)
}
// dashAnnotationPut updates the annotation named by the ":id" URL parameter
// with the request body, after checking that the user may write to the
// dashboard the existing annotation belongs to.
func (rt *Router) dashAnnotationPut(c *gin.Context) {
	var form models.DashAnnotation
	ginx.BindJSON(c, &form)

	id := ginx.UrlParamInt64(c, "id")

	existing, err := getAnnotationById(rt.Ctx, id)
	ginx.Dangerous(err)

	// NOTE(review): permission is checked against the stored annotation's
	// dashboard; the DashboardId supplied in the body is not re-checked here.
	// Confirm whether Update can move an annotation to another dashboard.
	checkAnnotationPermission(c, rt.Ctx, existing.DashboardId)

	form.Id = id
	form.UpdateAt = time.Now().Unix()
	form.UpdateBy = c.MustGet("username").(string)

	ginx.NewRender(c).Message(form.Update(rt.Ctx))
}
// dashAnnotationDel deletes the annotation named by the ":id" URL parameter,
// after checking that the user may write to the dashboard it belongs to.
func (rt *Router) dashAnnotationDel(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")

	existing, err := getAnnotationById(rt.Ctx, id)
	ginx.Dangerous(err)

	checkAnnotationPermission(c, rt.Ctx, existing.DashboardId)

	err = models.DashAnnotationDel(rt.Ctx, id)
	ginx.NewRender(c).Message(err)
}
// getAnnotationById is the shared lookup for a dashboard annotation by ID. It
// returns the lookup error unchanged, or an "annotation not found" error when
// no record exists.
func getAnnotationById(ctx *ctx.Context, id int64) (*models.DashAnnotation, error) {
	found, err := models.DashAnnotationGet(ctx, "id=?", id)
	switch {
	case err != nil:
		return nil, err
	case found == nil:
		return nil, fmt.Errorf("annotation not found")
	default:
		return found, nil
	}
}
================================================
FILE: center/router/router_dashboard.go
================================================
package router
// ChartPure is the JSON shape of a single chart: its configs string and its
// weight.
type ChartPure struct {
Configs string `json:"configs"`
Weight int `json:"weight"`
}
// ChartGroupPure is the JSON shape of a named, weighted group of charts.
type ChartGroupPure struct {
Name string `json:"name"`
Weight int `json:"weight"`
Charts []ChartPure `json:"charts"`
}
// DashboardPure is the JSON shape of a dashboard together with its chart
// groups.
type DashboardPure struct {
Name string `json:"name"`
Tags string `json:"tags"`
Configs string `json:"configs"`
ChartGroups []ChartGroupPure `json:"chart_groups"`
}
================================================
FILE: center/router/router_datasource.go
================================================
package router
import (
"context"
"crypto/tls"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
"github.com/ccfos/nightingale/v6/datasource/opensearch"
"github.com/ccfos/nightingale/v6/dskit/clickhouse"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/logger"
)
// pluginList responds with the plugins configured on the center.
func (rt *Router) pluginList(c *gin.Context) {
	plugins := rt.Center.Plugins
	Render(c, plugins, nil)
}
// listReq is the JSON body bound by datasourceList; its fields are passed as
// the name, plugin type and category arguments of the datasource lookup.
type listReq struct {
Name string `json:"name"`
Type string `json:"plugin_type"`
Category string `json:"category"`
}
// datasourceList responds with the datasources matching the plugin type,
// category and name in the request body, passed through the datasource cache's
// filter for the current user. When the datasource check hook fires, an empty
// list is returned instead.
func (rt *Router) datasourceList(c *gin.Context) {
	if rt.DatasourceCache.DatasourceCheckHook(c) {
		Render(c, []int{}, nil)
		return
	}

	var req listReq
	ginx.BindJSON(c, &req)

	user := c.MustGet("user").(*models.User)

	list, err := models.GetDatasourcesGetsBy(rt.Ctx, req.Type, req.Category, req.Name, "")
	visible := rt.DatasourceCache.DatasourceFilter(list, user)

	Render(c, visible, err)
}
// datasourceGetsByService responds with the datasources of the plugin type
// given in the "typ" query parameter (service API). Each datasource is passed
// through Encrypt with the configured RSA settings first; an encryption
// failure is logged and the remaining items are still processed.
func (rt *Router) datasourceGetsByService(c *gin.Context) {
	typ := ginx.QueryStr(c, "typ", "")
	lst, err := models.GetDatasourcesGetsBy(rt.Ctx, typ, "", "", "")

	openRsa := rt.Center.RSA.OpenRSA
	for _, item := range lst {
		if encErr := item.Encrypt(openRsa, rt.HTTP.RSA.RSAPublicKey); encErr != nil {
			// Log only the identity: dumping the whole datasource would write its
			// credentials into the log.
			logger.Errorf("datasource id=%d name=%s encrypt failed: %v", item.Id, item.Name, encErr)
		}
	}

	ginx.NewRender(c).Data(lst, err)
}
// datasourceRsaConfigGet responds with the RSA configuration. When RSA is
// switched off only the OpenRSA flag is returned; otherwise the public and
// private keys are included base64-encoded (empty when not configured)
// together with the RSA password.
func (rt *Router) datasourceRsaConfigGet(c *gin.Context) {
	if !rt.Center.RSA.OpenRSA {
		ginx.NewRender(c).Data(models.RsaConfig{
			OpenRSA: rt.Center.RSA.OpenRSA,
		}, nil)
		return
	}

	var publicKey, privateKey string
	if raw := rt.HTTP.RSA.RSAPublicKey; len(raw) > 0 {
		publicKey = base64.StdEncoding.EncodeToString(raw)
	}
	if raw := rt.HTTP.RSA.RSAPrivateKey; len(raw) > 0 {
		privateKey = base64.StdEncoding.EncodeToString(raw)
	}

	logger.Debugf("OpenRSA=%v", rt.Center.RSA.OpenRSA)

	conf := models.RsaConfig{
		OpenRSA:       rt.Center.RSA.OpenRSA,
		RSAPublicKey:  publicKey,
		RSAPrivateKey: privateKey,
		RSAPassWord:   rt.HTTP.RSA.RSAPassWord,
	}
	ginx.NewRender(c).Data(conf, nil)
}
// datasourceBriefs responds with a trimmed view of every datasource. The basic
// auth password is blanked. Prometheus settings lose their "prometheus." key
// prefix, cloudwatch keeps only settings whose key contains "region", and all
// other plugin types have their settings dropped. Unless anonymous PromQuerier
// access is enabled, the list is passed through the datasource cache's filter
// for the current user.
func (rt *Router) datasourceBriefs(c *gin.Context) {
	var briefs []*models.Datasource

	all, err := models.GetDatasourcesGetsBy(rt.Ctx, "", "", "", "")
	ginx.Dangerous(err)

	for _, ds := range all {
		ds.AuthJson.BasicAuthPassword = ""

		switch ds.PluginType {
		case models.PROMETHEUS:
			for key, val := range ds.SettingsJson {
				if strings.HasPrefix(key, "prometheus.") {
					ds.SettingsJson[strings.TrimPrefix(key, "prometheus.")] = val
					delete(ds.SettingsJson, key)
				}
			}
		case "cloudwatch":
			for key := range ds.SettingsJson {
				if !strings.Contains(key, "region") {
					delete(ds.SettingsJson, key)
				}
			}
		default:
			ds.SettingsJson = nil
		}

		briefs = append(briefs, ds)
	}

	if !rt.Center.AnonymousAccess.PromQuerier {
		user := c.MustGet("user").(*models.User)
		briefs = rt.DatasourceCache.DatasourceFilter(briefs, user)
	}

	ginx.NewRender(c).Data(briefs, err)
}
// datasourceUpsert creates (Id == 0) or updates a datasource from the request
// body.
//
// Unless ForceSave is set, Prometheus, Loki and TDengine datasources are
// probed with DatasourceCheck first. OpenSearch settings are copied into the
// HTTP and auth sections, ClickHouse settings are validated by connecting and
// running SHOW DATABASES, and for Elasticsearch the version is detected when
// the settings do not already carry a non-blank one. Every validation failure
// is reported to the client. When the datasource check hook fires, an empty
// list is returned and nothing is saved.
func (rt *Router) datasourceUpsert(c *gin.Context) {
	if rt.DatasourceCache.DatasourceCheckHook(c) {
		Render(c, []int{}, nil)
		return
	}

	var req models.Datasource
	ginx.BindJSON(c, &req)
	username := Username(c)
	req.UpdatedBy = username

	var err error
	var count int64

	if !req.ForceSave {
		if req.PluginType == models.PROMETHEUS || req.PluginType == models.LOKI || req.PluginType == models.TDENGINE {
			err = DatasourceCheck(c, req)
			if err != nil {
				Dangerous(c, err)
				return
			}
		}
	}

	for k, v := range req.SettingsJson {
		if strings.Contains(k, "cluster_name") {
			// Settings are free-form JSON; ignore a non-string value instead of
			// panicking on the type assertion.
			if name, ok := v.(string); ok {
				req.ClusterName = name
			}
			break
		}
	}

	if req.PluginType == models.OPENSEARCH {
		b, err := json.Marshal(req.SettingsJson)
		if err != nil {
			logger.Warningf("marshal settings fail: %v", err)
			Dangerous(c, err)
			return
		}

		var osConf opensearch.OpenSearch
		err = json.Unmarshal(b, &osConf)
		if err != nil {
			logger.Warningf("unmarshal settings fail: %v", err)
			Dangerous(c, err)
			return
		}

		if len(osConf.Nodes) == 0 {
			logger.Warningf("nodes empty, %+v", req)
			Dangerous(c, fmt.Errorf("opensearch nodes empty"))
			return
		}

		req.HTTPJson = models.HTTP{
			Timeout: osConf.Timeout,
			Url:     osConf.Nodes[0],
			Headers: osConf.Headers,
			TLS: models.TLS{
				SkipTlsVerify: osConf.TLS.SkipTlsVerify,
			},
		}

		req.AuthJson = models.Auth{
			BasicAuth:         osConf.Basic.Enable,
			BasicAuthUser:     osConf.Basic.Username,
			BasicAuthPassword: osConf.Basic.Password,
		}
	}

	if req.PluginType == models.CLICKHOUSE {
		b, err := json.Marshal(req.SettingsJson)
		if err != nil {
			logger.Warningf("marshal clickhouse settings failed: %v", err)
			Dangerous(c, err)
			return
		}

		var ckConfig clickhouse.Clickhouse
		err = json.Unmarshal(b, &ckConfig)
		if err != nil {
			logger.Warningf("unmarshal clickhouse settings failed: %v", err)
			Dangerous(c, err)
			return
		}

		// Node addresses must not start with http:// or https://.
		for _, addr := range ckConfig.Nodes {
			if strings.HasPrefix(addr, "http://") || strings.HasPrefix(addr, "https://") {
				err = fmt.Errorf("clickhouse node address should not start with http:// or https:// : %s", addr)
				logger.Warningf("clickhouse node address invalid: %v", err)
				Dangerous(c, err)
				return
			}
		}

		// Per the original note, InitCli detects and selects the HTTP or Native protocol.
		err = ckConfig.InitCli()
		if err != nil {
			logger.Warningf("clickhouse connection failed: %v", err)
			Dangerous(c, err)
			return
		}

		// Run SHOW DATABASES as a connectivity test.
		_, err = ckConfig.ShowDatabases(context.Background())
		if err != nil {
			logger.Warningf("clickhouse test query failed: %v", err)
			Dangerous(c, err)
			return
		}
	}

	if req.PluginType == models.ELASTICSEARCH {
		skipAuto := false
		// A non-blank version supplied by the user disables auto-detection.
		if req.SettingsJson != nil {
			if v, ok := req.SettingsJson["version"]; ok {
				switch vv := v.(type) {
				case string:
					if strings.TrimSpace(vv) != "" {
						skipAuto = true
					}
				default:
					if strings.TrimSpace(fmt.Sprint(vv)) != "" {
						skipAuto = true
					}
				}
			}
		}

		if !skipAuto {
			version, err := getElasticsearchVersion(req, 10*time.Second)
			if err != nil {
				// Detection is best-effort; the datasource is saved without a version.
				logger.Warningf("failed to get elasticsearch version: %v", err)
			} else {
				if req.SettingsJson == nil {
					req.SettingsJson = make(map[string]interface{})
				}
				req.SettingsJson["version"] = version
			}
		}
	}

	if req.Id == 0 {
		req.CreatedBy = username
		req.Status = "enabled"
		count, err = models.GetDatasourcesCountBy(rt.Ctx, "", "", req.Name)
		if err != nil {
			Render(c, nil, err)
			return
		}

		if count > 0 {
			Render(c, nil, "name already exists")
			return
		}
		err = req.Add(rt.Ctx)
	} else {
		err = req.Update(rt.Ctx, "name", "identifier", "description", "cluster_name", "settings", "http", "auth", "updated_by", "updated_at", "is_default", "weight")
	}

	Render(c, nil, err)
}
// DatasourceCheck probes a datasource over HTTP and returns nil when it
// answers with status 200.
//
// Prometheus, Loki and TDengine datasources need a non-empty URL starting with
// "http". The probe is GET /api/v1/query?query=1+1 for Prometheus (or
// /api/v1/labels when HTTPJson.IsLoki reports true), GET /api/v1/labels for
// Loki and POST /rest/sql with "show databases" for TDengine. Any other plugin
// type uses the request built by HTTPJson.NewReq. Basic auth and the
// configured headers are applied to the probe. The X-Language header of c
// selects the language of the Loki "/loki suffix" hint.
func DatasourceCheck(c *gin.Context, ds models.Datasource) error {
	if ds.PluginType == models.PROMETHEUS || ds.PluginType == models.LOKI || ds.PluginType == models.TDENGINE {
		if ds.HTTPJson.Url == "" {
			return fmt.Errorf("url is empty")
		}

		if !strings.HasPrefix(ds.HTTPJson.Url, "http") {
			return fmt.Errorf("url must start with http or https")
		}
	}

	// Build the client from the datasource's TLS settings (per the original
	// note this supports mTLS).
	tlsConfig, err := ds.HTTPJson.TLS.TLSConfig()
	if err != nil {
		return fmt.Errorf("failed to create TLS config: %v", err)
	}

	client := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig: tlsConfig,
		},
	}
	// The transport is private to this call; release its idle connections on
	// return so they do not linger after the check.
	defer client.CloseIdleConnections()

	ds.HTTPJson.Url = strings.TrimRight(ds.HTTPJson.Url, "/")
	var fullURL string
	req, err := ds.HTTPJson.NewReq(&fullURL)
	if err != nil {
		logger.Errorf("Error creating request: %v", err)
		return fmt.Errorf("request urls:%v failed: %v", ds.HTTPJson.GetUrls(), err)
	}

	if ds.PluginType == models.PROMETHEUS {
		subPath := "/api/v1/query"
		query := url.Values{}
		if ds.HTTPJson.IsLoki() {
			subPath = "/api/v1/labels"
		} else {
			query.Add("query", "1+1")
		}
		fullURL = fmt.Sprintf("%s%s?%s", ds.HTTPJson.Url, subPath, query.Encode())

		req, err = http.NewRequest("GET", fullURL, nil)
		if err != nil {
			logger.Errorf("Error creating request: %v", err)
			return fmt.Errorf("request url:%s failed: %v", fullURL, err)
		}
	} else if ds.PluginType == models.TDENGINE {
		fullURL = fmt.Sprintf("%s/rest/sql", ds.HTTPJson.Url)
		req, err = http.NewRequest("POST", fullURL, strings.NewReader("show databases"))
		if err != nil {
			logger.Errorf("Error creating request: %v", err)
			return fmt.Errorf("request url:%s failed: %v", fullURL, err)
		}
	}

	if ds.PluginType == models.LOKI {
		subPath := "/api/v1/labels"

		fullURL = fmt.Sprintf("%s%s", ds.HTTPJson.Url, subPath)

		req, err = http.NewRequest("GET", fullURL, nil)
		if err != nil {
			logger.Errorf("Error creating request: %v", err)
			if !strings.Contains(ds.HTTPJson.Url, "/loki") {
				lang := c.GetHeader("X-Language")
				// The translated text embeds the URL, so it must not be used as a
				// format string: a '%' in the URL would be mangled.
				return fmt.Errorf("%s", i18n.Sprintf(lang, "/loki suffix is miss, please add /loki to the url: %s", ds.HTTPJson.Url+"/loki"))
			}
			return fmt.Errorf("request url:%s failed: %v", fullURL, err)
		}
	}

	if ds.AuthJson.BasicAuthUser != "" {
		req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword)
	}

	for k, v := range ds.HTTPJson.Headers {
		req.Header.Set(k, v)
	}

	resp, err := client.Do(req)
	if err != nil {
		logger.Errorf("Error making request: %v\n", err)
		return fmt.Errorf("request url:%s failed: %v", fullURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		logger.Errorf("Error making request: %v\n", resp.StatusCode)
		if resp.StatusCode == 404 && ds.PluginType == models.LOKI && !strings.Contains(ds.HTTPJson.Url, "/loki") {
			lang := c.GetHeader("X-Language")
			return fmt.Errorf("%s", i18n.Sprintf(lang, "/loki suffix is miss, please add /loki to the url: %s", ds.HTTPJson.Url+"/loki"))
		}
		// The body is only used to enrich the error; a read failure is ignored.
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("request url:%s failed code:%d body:%s", fullURL, resp.StatusCode, string(body))
	}

	return nil
}
// datasourceGet binds a models.Datasource from the JSON request body, loads it
// via its Get method and renders the result. When the datasource check hook
// returns true, an empty int list is rendered and nothing is loaded.
func (rt *Router) datasourceGet(c *gin.Context) {
	if hooked := rt.DatasourceCache.DatasourceCheckHook(c); hooked {
		Render(c, []int{}, nil)
		return
	}

	ds := models.Datasource{}
	ginx.BindJSON(c, &ds)

	getErr := ds.Get(rt.Ctx)
	Render(c, ds, getErr)
}
// datasourceUpdataStatus binds a models.Datasource from the JSON request body,
// stamps it with the current username and updates only the "status",
// "updated_by" and "updated_at" columns. When the datasource check hook returns
// true, an empty int list is rendered and no update happens.
func (rt *Router) datasourceUpdataStatus(c *gin.Context) {
	if hooked := rt.DatasourceCache.DatasourceCheckHook(c); hooked {
		Render(c, []int{}, nil)
		return
	}

	ds := models.Datasource{}
	ginx.BindJSON(c, &ds)

	ds.UpdatedBy = Username(c)

	updateErr := ds.Update(rt.Ctx, "status", "updated_by", "updated_at")
	Render(c, ds, updateErr)
}
// datasourceDel deletes the datasources whose ids are given as a JSON array in
// the request body. When the datasource check hook returns true, an empty int
// list is rendered and nothing is deleted.
func (rt *Router) datasourceDel(c *gin.Context) {
	if hooked := rt.DatasourceCache.DatasourceCheckHook(c); hooked {
		Render(c, []int{}, nil)
		return
	}

	var targetIDs []int64
	ginx.BindJSON(c, &targetIDs)

	delErr := models.DatasourceDel(rt.Ctx, targetIDs)
	Render(c, nil, delErr)
}
// getDatasourceIds renders the datasource ids returned by
// models.GetDatasourceIdsByEngineName for the "name" query parameter.
func (rt *Router) getDatasourceIds(c *gin.Context) {
	engineName := ginx.QueryStr(c, "name")

	ids, err := models.GetDatasourceIdsByEngineName(rt.Ctx, engineName)
	ginx.NewRender(c).Data(ids, err)
}
// datasourceQueryForm is the JSON request body of datasourceQuery.
type datasourceQueryForm struct {
// Cate is the datasource type used to select candidate datasources.
Cate string `json:"datasource_cate"`
// DatasourceQueries are the queries resolved against the candidates.
DatasourceQueries []models.DatasourceQuery `json:"datasource_queries"`
}
// datasourceQueryResp is one element of the datasourceQuery response: the id
// and name of a matched datasource.
type datasourceQueryResp struct {
ID int64 `json:"id"`
Name string `json:"name"`
}
// datasourceQuery resolves the datasource queries in the request body against
// all datasources of the requested type and renders the matching datasources
// as id/name pairs.
func (rt *Router) datasourceQuery(c *gin.Context) {
	var form datasourceQueryForm
	ginx.BindJSON(c, &form)

	candidates, err := models.GetDatasourcesGetsByTypes(rt.Ctx, []string{form.Cate})
	ginx.Dangerous(err)

	// Build both lookup directions between datasource name and id.
	idByName := make(map[string]int64, len(candidates))
	nameByID := make(map[int64]string, len(candidates))
	for _, ds := range candidates {
		idByName[ds.Name] = ds.Id
		nameByID[ds.Id] = ds.Name
	}

	matched := models.GetDatasourceIDsByDatasourceQueries(form.DatasourceQueries, nameByID, idByName)

	// Deliberately left nil when nothing matches, as in the original handler.
	var resp []datasourceQueryResp
	for _, id := range matched {
		resp = append(resp, datasourceQueryResp{ID: id, Name: nameByID[id]})
	}

	ginx.NewRender(c).Data(resp, err)
}
// getElasticsearchVersion tries to read the version number from the given
// Elasticsearch datasource. It probes every configured URL (the Urls list
// first, then the single Url) until one returns a version, and otherwise
// returns the error of the last failed attempt.
func getElasticsearchVersion(ds models.Datasource, timeout time.Duration) (string, error) {
	transport := &http.Transport{
		TLSClientConfig: &tls.Config{
			InsecureSkipVerify: ds.HTTPJson.TLS.SkipTlsVerify,
		},
	}
	client := &http.Client{Timeout: timeout, Transport: transport}

	candidates := make([]string, 0, len(ds.HTTPJson.Urls)+1)
	candidates = append(candidates, ds.HTTPJson.Urls...)
	if ds.HTTPJson.Url != "" {
		candidates = append(candidates, ds.HTTPJson.Url)
	}
	if len(candidates) == 0 {
		return "", fmt.Errorf("no url provided")
	}

	var lastErr error
	for _, raw := range candidates {
		// Normalise to exactly one trailing slash.
		endpoint := strings.TrimRight(raw, "/") + "/"

		req, err := http.NewRequest("GET", endpoint, nil)
		if err != nil {
			lastErr = err
			continue
		}
		if ds.AuthJson.BasicAuthUser != "" {
			req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword)
		}
		for name, value := range ds.HTTPJson.Headers {
			req.Header.Set(name, value)
		}

		resp, err := client.Do(req)
		if err != nil {
			lastErr = err
			continue
		}
		// Close right after reading instead of deferring inside the loop.
		payload, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			lastErr = err
			continue
		}
		if resp.StatusCode != 200 {
			lastErr = fmt.Errorf("request to %s failed with status: %d body:%s", endpoint, resp.StatusCode, string(payload))
			continue
		}

		var decoded map[string]interface{}
		if err := json.Unmarshal(payload, &decoded); err != nil {
			lastErr = err
			continue
		}

		// Expected shape: {"version": {"number": "<non-empty string>"}}.
		versionInfo, _ := decoded["version"].(map[string]interface{})
		if number, ok := versionInfo["number"].(string); ok && number != "" {
			return number, nil
		}
		lastErr = fmt.Errorf("version not found in response from %s", endpoint)
	}

	if lastErr != nil {
		return "", lastErr
	}
	return "", fmt.Errorf("failed to get elasticsearch version")
}
================================================
FILE: center/router/router_datasource_db.go
================================================
package router
import (
"context"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/gin-gonic/gin"
)
// ShowDatabases renders the database names reported by the cached datasource
// plugin selected by the request's cate and datasource id. The plugin must
// implement ShowDatabases(context.Context); an empty result is rendered as an
// empty, non-nil slice.
func (rt *Router) ShowDatabases(c *gin.Context) {
	var f models.QueryParam
	ginx.BindJSON(c, &f)

	plug, found := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !found {
		logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	type DatabaseShower interface {
		ShowDatabases(context.Context) ([]string, error)
	}

	shower, ok := plug.(DatabaseShower)
	if !ok {
		ginx.Bomb(200, "datasource not exists")
	}

	names, err := shower.ShowDatabases(c.Request.Context())
	ginx.Dangerous(err)

	if len(names) == 0 {
		names = make([]string, 0)
	}
	ginx.NewRender(c).Data(names, nil)
}
// ShowTables renders the tables of one database from the cached datasource
// plugin selected by the request's cate and datasource id. Only the first
// entry of Queries is used, and only if it is a string (the database name);
// otherwise an empty list is rendered.
func (rt *Router) ShowTables(c *gin.Context) {
	var f models.QueryParam
	ginx.BindJSON(c, &f)

	plug, found := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !found {
		logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	type TableShower interface {
		ShowTables(ctx context.Context, database string) ([]string, error)
	}

	shower, ok := plug.(TableShower)
	if !ok {
		ginx.Bomb(200, "datasource not exists")
	}

	// Only a single argument is accepted.
	names := make([]string, 0)
	var err error
	if len(f.Queries) > 0 {
		if database, isString := f.Queries[0].(string); isString {
			names, err = shower.ShowTables(c.Request.Context(), database)
		}
	}

	ginx.NewRender(c).Data(names, err)
}
// DescribeTable renders the column properties of a table from the cached
// datasource plugin selected by the request's cate and datasource id. Only the
// first entry of Queries is passed to the plugin; with no queries an empty
// list is rendered.
func (rt *Router) DescribeTable(c *gin.Context) {
	var f models.QueryParam
	ginx.BindJSON(c, &f)

	plug, found := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !found {
		logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	type TableDescriber interface {
		DescribeTable(context.Context, interface{}) ([]*types.ColumnProperty, error)
	}

	describer, ok := plug.(TableDescriber)
	if !ok {
		ginx.Bomb(200, "datasource not exists")
	}

	// Only a single argument is accepted.
	cols := make([]*types.ColumnProperty, 0)
	var err error
	if len(f.Queries) > 0 {
		cols, err = describer.DescribeTable(c.Request.Context(), f.Queries[0])
	}

	ginx.NewRender(c).Data(cols, err)
}
================================================
FILE: center/router/router_embedded.go
================================================
package router
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// embeddedProductGets lists embedded products. Admins receive every product;
// other users receive public products plus private ones shared with at least
// one of their groups.
func (rt *Router) embeddedProductGets(c *gin.Context) {
	products, err := models.EmbeddedProductGets(rt.Ctx)
	ginx.Dangerous(err)
	models.FillUpdateByNicknames(rt.Ctx, products)

	user := c.MustGet("user").(*models.User)
	if user.IsAdmin() {
		ginx.NewRender(c).Data(products, err)
		return
	}

	// Group ids the current user belongs to, as a set.
	groupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
	member := make(map[int64]struct{}, len(groupIDs))
	for _, gid := range groupIDs {
		member[gid] = struct{}{}
	}

	// canSee reports whether the product is public or shared with one of the
	// user's groups.
	canSee := func(p *models.EmbeddedProduct) bool {
		if !p.IsPrivate {
			return true
		}
		for _, tid := range p.TeamIDs {
			if _, ok := member[tid]; ok {
				return true
			}
		}
		return false
	}

	var visible []*models.EmbeddedProduct
	for _, p := range products {
		if canSee(p) {
			visible = append(visible, p)
		}
	}

	// The MyGroupIds error, if any, is passed to the renderer with the result.
	ginx.NewRender(c).Data(visible, err)
}
// embeddedProductGet renders one embedded product by its "id" URL parameter.
// It bombs with 400 for a non-positive id and with 403 when
// hasEmbeddedProductAccess denies the current user.
func (rt *Router) embeddedProductGet(c *gin.Context) {
	productID := ginx.UrlParamInt64(c, "id")
	if productID <= 0 {
		ginx.Bomb(400, "invalid id")
	}

	product, err := models.GetEmbeddedProductByID(rt.Ctx, productID)
	ginx.Dangerous(err)

	user := c.MustGet("user").(*models.User)
	allowed, err := hasEmbeddedProductAccess(rt.Ctx, user, product)
	ginx.Dangerous(err)
	if !allowed {
		ginx.Bomb(403, "forbidden")
	}

	ginx.NewRender(c).Data(product, nil)
}
// embeddedProductAdd creates the embedded products given as a JSON array in
// the request body, recording the current user's nickname as creator and
// updater of each.
func (rt *Router) embeddedProductAdd(c *gin.Context) {
	var incoming []models.EmbeddedProduct
	ginx.BindJSON(c, &incoming)

	user := c.MustGet("user").(*models.User)
	// Index-based loop so the slice elements themselves are modified.
	for idx := range incoming {
		item := &incoming[idx]
		item.CreateBy = user.Nickname
		item.UpdateBy = user.Nickname
	}

	addErr := models.AddEmbeddedProduct(rt.Ctx, incoming)
	ginx.NewRender(c).Message(addErr)
}
// embeddedProductPut updates the embedded product identified by the "id" URL
// parameter. Name, URL, IsPrivate and TeamIDs are copied from the request
// body; UpdateBy and UpdateAt are set from the current user and time.
func (rt *Router) embeddedProductPut(c *gin.Context) {
	var incoming models.EmbeddedProduct
	productID := ginx.UrlParamInt64(c, "id")
	ginx.BindJSON(c, &incoming)
	if productID <= 0 {
		ginx.Bomb(400, "invalid id")
	}

	stored, err := models.GetEmbeddedProductByID(rt.Ctx, productID)
	ginx.Dangerous(err)

	user := c.MustGet("user").(*models.User)
	updatedAt := time.Now().Unix()

	// Only these fields are editable; all others keep their stored values.
	stored.Name = incoming.Name
	stored.URL = incoming.URL
	stored.IsPrivate = incoming.IsPrivate
	stored.TeamIDs = incoming.TeamIDs
	stored.UpdateBy = user.Username
	stored.UpdateAt = updatedAt

	err = models.UpdateEmbeddedProduct(rt.Ctx, stored)
	ginx.NewRender(c).Message(err)
}
// embeddedProductDelete deletes the embedded product identified by the "id"
// URL parameter, bombing with 400 for a non-positive id.
func (rt *Router) embeddedProductDelete(c *gin.Context) {
	productID := ginx.UrlParamInt64(c, "id")
	if productID <= 0 {
		ginx.Bomb(400, "invalid id")
	}

	delErr := models.DeleteEmbeddedProduct(rt.Ctx, productID)
	ginx.NewRender(c).Message(delErr)
}
// hasEmbeddedProductAccess reports whether user may view ep. Admins and public
// products always pass; a private product requires the user to belong to at
// least one of the product's teams. An error is returned only when the user's
// group ids cannot be loaded.
func hasEmbeddedProductAccess(ctx *ctx.Context, user *models.User, ep *models.EmbeddedProduct) (bool, error) {
	needsTeamCheck := !user.IsAdmin() && ep.IsPrivate
	if !needsTeamCheck {
		return true, nil
	}

	myGroups, err := models.MyGroupIds(ctx, user.Id)
	if err != nil {
		return false, err
	}

	member := make(map[int64]struct{}, len(myGroups))
	for _, gid := range myGroups {
		member[gid] = struct{}{}
	}

	for _, teamID := range ep.TeamIDs {
		if _, shared := member[teamID]; shared {
			return true, nil
		}
	}
	return false, nil
}
================================================
FILE: center/router/router_es.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/datasource/es"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/gin-gonic/gin"
)
// IndexReq is the JSON request body of QueryIndices and QueryFields. Cate and
// DatasourceId select the cached datasource plugin; Index is the index whose
// fields QueryFields looks up.
type IndexReq struct {
Cate string `json:"cate"`
DatasourceId int64 `json:"datasource_id"`
Index string `json:"index"`
}
// FieldValueReq is the JSON request body of QueryESVariable. Cate and
// DatasourceId select the cached datasource plugin; Index and Query describe
// the field-value lookup.
type FieldValueReq struct {
Cate string `json:"cate"`
DatasourceId int64 `json:"datasource_id"`
Index string `json:"index"`
Query FieldObj `json:"query"`
}
// FieldObj is the "query" part of FieldValueReq. QueryESVariable passes Field
// and Query to the plugin's QueryFieldValue; Find is not read by the handlers
// in this file.
type FieldObj struct {
Find string `json:"find"`
Field string `json:"field"`
Query string `json:"query"`
}
// QueryIndices renders the indices reported by the cached Elasticsearch
// datasource selected by the request's cate and datasource id.
func (rt *Router) QueryIndices(c *gin.Context) {
	var f IndexReq
	ginx.BindJSON(c, &f)

	plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !exists {
		logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	// Checked assertion: an unchecked one panics when the cached plugin for the
	// requested cate is not an Elasticsearch client.
	client, ok := plug.(*es.Elasticsearch)
	if !ok {
		ginx.Bomb(200, "datasource is not elasticsearch")
	}

	indices, err := client.QueryIndices()
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(indices, nil)
}
// QueryFields renders the fields of the requested index from the cached
// Elasticsearch datasource selected by the request's cate and datasource id.
func (rt *Router) QueryFields(c *gin.Context) {
	var f IndexReq
	ginx.BindJSON(c, &f)

	plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !exists {
		logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	// Checked assertion: an unchecked one panics when the cached plugin for the
	// requested cate is not an Elasticsearch client.
	client, ok := plug.(*es.Elasticsearch)
	if !ok {
		ginx.Bomb(200, "datasource is not elasticsearch")
	}

	fields, err := client.QueryFields([]string{f.Index})
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(fields, nil)
}
// QueryESVariable renders the values of one field (Query.Field, filtered by
// Query.Query) in the requested index from the cached Elasticsearch datasource
// selected by the request's cate and datasource id.
func (rt *Router) QueryESVariable(c *gin.Context) {
	var f FieldValueReq
	ginx.BindJSON(c, &f)

	plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !exists {
		logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	// Checked assertion: an unchecked one panics when the cached plugin for the
	// requested cate is not an Elasticsearch client.
	client, ok := plug.(*es.Elasticsearch)
	if !ok {
		ginx.Bomb(200, "datasource is not elasticsearch")
	}

	fields, err := client.QueryFieldValue([]string{f.Index}, f.Query.Field, f.Query.Query)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(fields, nil)
}
================================================
FILE: center/router/router_es_index_pattern.go
================================================
package router
import (
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// esIndexPatternAdd creates an ES index pattern from the JSON request body,
// recording the current username and time as both creator and updater.
func (rt *Router) esIndexPatternAdd(c *gin.Context) {
	var pattern models.EsIndexPattern
	ginx.BindJSON(c, &pattern)

	operator := c.MustGet("username").(string)
	ts := time.Now().Unix()

	pattern.CreateAt, pattern.UpdateAt = ts, ts
	pattern.CreateBy, pattern.UpdateBy = operator, operator

	addErr := pattern.Add(rt.Ctx)
	ginx.NewRender(c).Message(addErr)
}
// esIndexPatternPut updates the ES index pattern identified by the "id" query
// parameter with the JSON request body. It responds with 404 when no such
// pattern exists.
func (rt *Router) esIndexPatternPut(c *gin.Context) {
	var changes models.EsIndexPattern
	ginx.BindJSON(c, &changes)

	patternID := ginx.QueryInt64(c, "id")

	existing, err := models.EsIndexPatternGetById(rt.Ctx, patternID)
	ginx.Dangerous(err)
	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such EsIndexPattern")
		return
	}

	changes.UpdateBy = c.MustGet("username").(string)

	render := ginx.NewRender(c)
	render.Message(existing.Update(rt.Ctx, changes))
}
// esIndexPatternDel deletes the ES index patterns whose ids are listed in the
// JSON request body, bombing with 400 when the list is empty.
func (rt *Router) esIndexPatternDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)

	if len(form.Ids) == 0 {
		ginx.Bomb(http.StatusBadRequest, "ids empty")
	}

	render := ginx.NewRender(c)
	render.Message(models.EsIndexPatternDel(rt.Ctx, form.Ids))
}
// esIndexPatternGetList lists ES index patterns, restricted to one datasource
// when the "datasource_id" query parameter is non-zero. On success the
// update-by nicknames are filled in before rendering.
func (rt *Router) esIndexPatternGetList(c *gin.Context) {
	dsID := ginx.QueryInt64(c, "datasource_id", 0)

	var (
		patterns []*models.EsIndexPattern
		err      error
	)
	switch dsID {
	case 0:
		patterns, err = models.EsIndexPatternGets(rt.Ctx, "")
	default:
		patterns, err = models.EsIndexPatternGets(rt.Ctx, "datasource_id = ?", dsID)
	}

	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, patterns)
	}

	ginx.NewRender(c).Data(patterns, err)
}
// esIndexPatternGet renders the single ES index pattern identified by the
// "id" query parameter.
func (rt *Router) esIndexPatternGet(c *gin.Context) {
	patternID := ginx.QueryInt64(c, "id")

	pattern, err := models.EsIndexPatternGet(rt.Ctx, "id=?", patternID)
	ginx.NewRender(c).Data(pattern, err)
}
================================================
FILE: center/router/router_event_detail.go
================================================
package router
import (
"encoding/json"
"fmt"
"io"
"net/http"
"strconv"
"time"
"github.com/ccfos/nightingale/v6/alert/naming"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/loggrep"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// eventDetailPage renders an HTML log viewer page (for pages group). The
// "hash" URL parameter must pass loggrep.IsValidHash; failures are reported as
// plain-text responses with an HTTP error status.
func (rt *Router) eventDetailPage(c *gin.Context) {
	eventHash := ginx.UrlParamStr(c, "hash")
	if valid := loggrep.IsValidHash(eventHash); !valid {
		c.String(http.StatusBadRequest, "invalid hash format")
		return
	}

	lines, instance, err := rt.getEventLogs(eventHash)
	if err != nil {
		c.String(http.StatusInternalServerError, "Error: %v", err)
		return
	}

	c.Header("Content-Type", "text/html; charset=utf-8")

	page := loggrep.PageData{
		Hash:     eventHash,
		Instance: instance,
		Logs:     lines,
		Total:    len(lines),
	}
	if renderErr := loggrep.RenderHTML(c.Writer, page); renderErr != nil {
		c.String(http.StatusInternalServerError, "render error: %v", renderErr)
	}
}
// eventDetailJSON returns the event logs and the serving instance as JSON (for
// service group). The "hash" URL parameter must pass loggrep.IsValidHash.
func (rt *Router) eventDetailJSON(c *gin.Context) {
	eventHash := ginx.UrlParamStr(c, "hash")
	if valid := loggrep.IsValidHash(eventHash); !valid {
		ginx.Bomb(200, "invalid hash format")
	}

	lines, instance, err := rt.getEventLogs(eventHash)
	ginx.Dangerous(err)

	detail := loggrep.EventDetailResp{
		Logs:     lines,
		Instance: instance,
	}
	ginx.NewRender(c).Data(detail, nil)
}
// getNodeForDatasource returns the alert engine instance responsible for the
// given datasource and primary key. The local hashring is consulted first; if
// it yields an error (e.g. the datasource belongs to another engine cluster),
// a temporary ring is built from the instances in the database whose clock is
// newer than 30 seconds ago.
func (rt *Router) getNodeForDatasource(datasourceId int64, pk string) (string, error) {
	ringKey := strconv.FormatInt(datasourceId, 10)
	if node, err := naming.DatasourceHashRing.GetNode(ringKey, pk); err == nil {
		return node, nil
	}

	// Local hashring has nothing for this datasource: ask the DB for instances
	// that reported within the last 30 seconds.
	cutoff := time.Now().Unix() - 30
	instances, err := models.AlertingEngineGetsInstances(rt.Ctx,
		"datasource_id = ? and clock > ?",
		datasourceId, cutoff)
	if err != nil {
		return "", err
	}
	if len(instances) == 0 {
		return "", fmt.Errorf("no active instances for datasource %d", datasourceId)
	}

	fallback := naming.NewConsistentHashRing(int32(naming.NodeReplicas), instances)
	return fallback.Get(pk)
}
// getEventLogs resolves the target instance and retrieves logs. It returns the
// log lines, the instance that produced them and an error. Logs are grepped
// from the local log directory when the owning node cannot be resolved or is
// this instance; otherwise the request is forwarded to the owning node.
func (rt *Router) getEventLogs(hash string) ([]string, string, error) {
	hisEvent, err := models.AlertHisEventGetByHash(rt.Ctx, hash)
	if err != nil {
		return nil, "", err
	}
	if hisEvent == nil {
		return nil, "", fmt.Errorf("no such alert event")
	}

	self := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port)
	pk := strconv.FormatInt(hisEvent.RuleId, 10)

	owner, err := rt.getNodeForDatasource(hisEvent.DatasourceId, pk)
	if err == nil && owner != self {
		// Another alert instance owns this rule: forward the request to it.
		return rt.forwardEventDetail(owner, hash)
	}

	// Hashring not ready or the target is this instance: handle locally.
	lines, grepErr := loggrep.GrepLogDir(rt.LogDir, hash)
	return lines, self, grepErr
}
// forwardEventDetail fetches the event logs for hash from another instance
// (node, "host:port") through its /v1/n9e/event-detail endpoint. It returns
// the logs and instance reported by the remote side; on failure the instance
// slot carries node. The response body is read up to 10MB and the request
// times out after 15 seconds.
func (rt *Router) forwardEventDetail(node, hash string) ([]string, string, error) {
	target := fmt.Sprintf("http://%s/v1/n9e/event-detail/%s", node, hash)

	req, err := http.NewRequest("GET", target, nil)
	if err != nil {
		return nil, node, err
	}

	// Use a single configured service credential: the loop stops after the
	// first entry it visits.
	for user, pass := range rt.HTTP.APIForService.BasicAuth {
		req.SetBasicAuth(user, pass)
		break
	}

	httpClient := &http.Client{Timeout: 15 * time.Second}
	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, node, fmt.Errorf("forward to %s failed: %v", node, err)
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit
	if err != nil {
		return nil, node, err
	}

	var envelope struct {
		Dat loggrep.EventDetailResp `json:"dat"`
		Err string                  `json:"err"`
	}
	if err = json.Unmarshal(raw, &envelope); err != nil {
		return nil, node, err
	}
	if envelope.Err != "" {
		return nil, node, fmt.Errorf("%s", envelope.Err)
	}

	return envelope.Dat.Logs, envelope.Dat.Instance, nil
}
================================================
FILE: center/router/router_event_pipeline.go
================================================
package router
import (
"encoding/json"
"fmt"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/alert/pipeline/engine"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/logger"
)
// eventPipelinesList lists event pipelines. Admins receive every pipeline;
// other users receive only pipelines that share at least one team with them.
// Each pipeline is enriched with team names, workflow fields, update-by
// nicknames and default TriggerMode/UseCase values.
func (rt *Router) eventPipelinesList(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	pipelines, err := models.ListEventPipelines(rt.Ctx)
	ginx.Dangerous(err)

	allTids := make([]int64, 0)
	for _, pipeline := range pipelines {
		allTids = append(allTids, pipeline.TeamIds...)
	}

	ugMap, err := models.UserGroupIdAndNameMap(rt.Ctx, allTids)
	ginx.Dangerous(err)

	for _, pipeline := range pipelines {
		for _, tid := range pipeline.TeamIds {
			pipeline.TeamNames = append(pipeline.TeamNames, ugMap[tid])
		}
		// Compatibility: auto-fill the workflow fields.
		pipeline.FillWorkflowFields()
	}

	models.FillUpdateByNicknames(rt.Ctx, pipelines)

	// Apply defaults once for both the admin and the filtered path.
	for _, pipeline := range pipelines {
		if pipeline.TriggerMode == "" {
			pipeline.TriggerMode = models.TriggerModeEvent
		}
		if pipeline.UseCase == "" {
			pipeline.UseCase = models.UseCaseEventPipeline
		}
	}

	if me.IsAdmin() {
		ginx.NewRender(c).Data(pipelines, nil)
		return
	}

	// Group membership is only needed to filter for non-admin users, so the
	// lookup (and its possible failure) is kept off the admin path.
	gids, err := models.MyGroupIdsMap(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	res := make([]*models.EventPipeline, 0)
	for _, pipeline := range pipelines {
		for _, tid := range pipeline.TeamIds {
			if _, ok := gids[tid]; ok {
				res = append(res, pipeline)
				break
			}
		}
	}

	ginx.NewRender(c).Data(res, nil)
}
// getEventPipeline renders the event pipeline identified by the "id" URL
// parameter after checking the current user's group permission on the
// pipeline's teams. Team names, workflow fields and default
// TriggerMode/UseCase values are filled in before rendering.
func (rt *Router) getEventPipeline(c *gin.Context) {
	user := c.MustGet("user").(*models.User)
	pipelineID := ginx.UrlParamInt64(c, "id")

	pl, err := models.GetEventPipeline(rt.Ctx, pipelineID)
	ginx.Dangerous(err)
	ginx.Dangerous(user.CheckGroupPermission(rt.Ctx, pl.TeamIds))

	err = pl.FillTeamNames(rt.Ctx)
	ginx.Dangerous(err)

	// Compatibility: auto-fill the workflow fields.
	pl.FillWorkflowFields()

	if pl.TriggerMode == "" {
		pl.TriggerMode = models.TriggerModeEvent
	}
	if pl.UseCase == "" {
		pl.UseCase = models.UseCaseEventPipeline
	}

	ginx.NewRender(c).Data(pl, nil)
}
// addEventPipeline creates an event pipeline from the JSON request body. The
// pipeline is stamped with the current user and time, validated with Verify
// (400 on failure), and the user's group permission on the pipeline's teams is
// checked before it is stored.
func (rt *Router) addEventPipeline(c *gin.Context) {
	var pipeline models.EventPipeline
	ginx.BindJSON(c, &pipeline)

	user := c.MustGet("user").(*models.User)
	now := time.Now().Unix()
	pipeline.CreateBy = user.Username
	pipeline.CreateAt = now
	pipeline.UpdateAt = now
	pipeline.UpdateBy = user.Username

	if err := pipeline.Verify(); err != nil {
		// Pass the error as an argument, not as the format string, so a '%' in
		// the message is not interpreted as a formatting verb.
		ginx.Bomb(http.StatusBadRequest, "%v", err)
	}

	ginx.Dangerous(user.CheckGroupPermission(rt.Ctx, pipeline.TeamIds))

	err := models.CreateEventPipeline(rt.Ctx, &pipeline)
	ginx.NewRender(c).Message(err)
}
// updateEventPipeline updates the event pipeline whose ID is given in the JSON
// request body. It bombs with 404 when the pipeline cannot be loaded and checks
// the current user's group permission against the stored pipeline's teams.
func (rt *Router) updateEventPipeline(c *gin.Context) {
	var form models.EventPipeline
	ginx.BindJSON(c, &form)

	user := c.MustGet("user").(*models.User)
	form.UpdateBy = user.Username
	form.UpdateAt = time.Now().Unix()

	stored, err := models.GetEventPipeline(rt.Ctx, form.ID)
	if err != nil {
		ginx.Bomb(http.StatusNotFound, "No such event pipeline")
	}

	ginx.Dangerous(user.CheckGroupPermission(rt.Ctx, stored.TeamIds))

	render := ginx.NewRender(c)
	render.Message(stored.Update(rt.Ctx, &form))
}
// deleteEventPipelines deletes the event pipelines whose ids are listed in the
// JSON request body. Every pipeline is loaded and permission-checked first, so
// nothing is deleted unless the user may manage all of them.
func (rt *Router) deleteEventPipelines(c *gin.Context) {
	var form struct {
		Ids []int64 `json:"ids"`
	}
	ginx.BindJSON(c, &form)

	if len(form.Ids) == 0 {
		ginx.Bomb(http.StatusBadRequest, "ids required")
	}

	user := c.MustGet("user").(*models.User)
	for i := range form.Ids {
		pl, err := models.GetEventPipeline(rt.Ctx, form.Ids[i])
		ginx.Dangerous(err)
		ginx.Dangerous(user.CheckGroupPermission(rt.Ctx, pl.TeamIds))
	}

	delErr := models.DeleteEventPipelines(rt.Ctx, form.Ids)
	ginx.NewRender(c).Message(delErr)
}
// tryRunEventPipeline test-runs a pipeline configuration from the request body
// against a historical alert event. The workflow engine is used for both
// linear and workflow mode. The response holds the resulting event, a
// localized result message, the status and the per-node results; the message
// becomes "event is dropped" when no event comes back.
func (rt *Router) tryRunEventPipeline(c *gin.Context) {
	var form struct {
		EventId        int64                `json:"event_id"`
		PipelineConfig models.EventPipeline `json:"pipeline_config"`
		InputVariables map[string]string    `json:"input_variables,omitempty"`
	}
	ginx.BindJSON(c, &form)

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, form.EventId)
	if hisEvent == nil || err != nil {
		ginx.Bomb(http.StatusBadRequest, "event not found")
	}
	curEvent := hisEvent.ToCur()

	lang := c.GetHeader("X-Language")
	user := c.MustGet("user").(*models.User)

	// Always execute through the workflow engine (linear and workflow mode).
	wfEngine := engine.NewWorkflowEngine(rt.Ctx)
	triggerCtx := &models.WorkflowTriggerContext{
		Mode:            models.TriggerModeAPI,
		TriggerBy:       user.Username,
		InputsOverrides: form.InputVariables,
	}

	outEvent, outcome, err := wfEngine.Execute(&form.PipelineConfig, curEvent, triggerCtx)
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, "pipeline execute error: %v", err)
	}

	message := i18n.Sprintf(lang, outcome.Message)
	if outEvent == nil {
		message = i18n.Sprintf(lang, "event is dropped")
	}

	ginx.NewRender(c).Data(map[string]interface{}{
		"event":        outEvent,
		"result":       message,
		"status":       outcome.Status,
		"node_results": outcome.NodeResults,
	}, nil)
}
// tryRunEventProcessor test-runs a single processor configuration from the
// request body against a historical alert event and renders the resulting
// event together with the localized processor result message.
func (rt *Router) tryRunEventProcessor(c *gin.Context) {
	var f struct {
		EventId         int64                  `json:"event_id"`
		ProcessorConfig models.ProcessorConfig `json:"processor_config"`
	}
	ginx.BindJSON(c, &f)

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId)
	if err != nil || hisEvent == nil {
		ginx.Bomb(http.StatusBadRequest, "event not found")
	}
	event := hisEvent.ToCur()

	processor, err := models.GetProcessorByType(f.ProcessorConfig.Typ, f.ProcessorConfig.Config)
	if err != nil {
		ginx.Bomb(200, "get processor err: %+v", err)
	}

	wfCtx := &models.WorkflowContext{
		Event: event,
		Vars:  make(map[string]interface{}),
	}
	wfCtx, res, err := processor.Process(rt.Ctx, wfCtx)
	if err != nil {
		ginx.Bomb(200, "processor err: %+v", err)
	}

	lang := c.GetHeader("X-Language")
	data := map[string]interface{}{
		"event":  nil,
		"result": i18n.Sprintf(lang, res),
	}
	// Guard against a nil context coming back from the processor, as the
	// notify-rule variant of this handler does, instead of dereferencing it
	// unconditionally.
	if wfCtx != nil {
		data["event"] = wfCtx.Event
	}

	ginx.NewRender(c).Data(data, nil)
}
// tryRunEventProcessorByNotifyRule test-runs, against a historical alert
// event, every processor of the enabled pipelines referenced in the request
// body, in order. If a processor leaves no context or no event, an "event is
// dropped" result is rendered immediately; otherwise the final event is
// rendered.
func (rt *Router) tryRunEventProcessorByNotifyRule(c *gin.Context) {
	var form struct {
		EventId         int64                   `json:"event_id"`
		PipelineConfigs []models.PipelineConfig `json:"pipeline_configs"`
	}
	ginx.BindJSON(c, &form)

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, form.EventId)
	if hisEvent == nil || err != nil {
		ginx.Bomb(http.StatusBadRequest, "event not found")
	}
	curEvent := hisEvent.ToCur()

	// Only enabled pipeline configs take part in the run.
	enabledIDs := make([]int64, 0)
	for _, cfg := range form.PipelineConfigs {
		if !cfg.Enable {
			continue
		}
		enabledIDs = append(enabledIDs, cfg.PipelineId)
	}

	pipelines, err := models.GetEventPipelinesByIds(rt.Ctx, enabledIDs)
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, "processors not found")
	}

	wfCtx := &models.WorkflowContext{
		Event: curEvent,
		Vars:  make(map[string]interface{}),
	}

	for _, pl := range pipelines {
		for _, pc := range pl.ProcessorConfigs {
			proc, err := models.GetProcessorByType(pc.Typ, pc.Config)
			if err != nil {
				ginx.Bomb(http.StatusBadRequest, "get processor: %+v err: %+v", pc, err)
			}

			wfCtx, _, err = proc.Process(rt.Ctx, wfCtx)
			if err != nil {
				ginx.Bomb(http.StatusBadRequest, "processor: %+v err: %+v", pc, err)
			}

			if wfCtx != nil && wfCtx.Event != nil {
				continue
			}

			// The event was dropped: report it and stop.
			lang := c.GetHeader("X-Language")
			ginx.NewRender(c).Data(map[string]interface{}{
				"event":  nil,
				"result": i18n.Sprintf(lang, "event is dropped"),
			}, nil)
			return
		}
	}

	ginx.NewRender(c).Data(wfCtx.Event, nil)
}
// eventPipelinesListByService renders all event pipelines returned by
// models.ListEventPipelines, without any per-user filtering.
func (rt *Router) eventPipelinesListByService(c *gin.Context) {
	all, listErr := models.ListEventPipelines(rt.Ctx)
	ginx.NewRender(c).Data(all, listErr)
}
// EventPipelineRequest is the JSON request body used by the pipeline trigger
// and stream handlers.
type EventPipelineRequest struct {
// Event data (optional; an empty event is used when it is not provided).
Event *models.AlertCurEvent `json:"event,omitempty"`
// Overrides for the input parameters.
InputsOverrides map[string]string `json:"inputs_overrides,omitempty"`
// Username is used as the trigger identity by the service-side handlers;
// the user-facing handlers use the authenticated user instead.
Username string `json:"username,omitempty"`
}
// executePipelineTrigger holds the shared logic for triggering a pipeline. It
// starts the workflow in a background goroutine and immediately returns a
// freshly generated execution id; execution errors are only logged. The
// returned error is always nil.
func (rt *Router) executePipelineTrigger(pipeline *models.EventPipeline, req *EventPipelineRequest, triggerBy string) (string, error) {
	// Use the supplied event, or fall back to an empty one stamped with now.
	event := req.Event
	if event == nil {
		event = &models.AlertCurEvent{TriggerTime: time.Now().Unix()}
	}

	// The execution id doubles as the request id of the trigger context.
	executionID := uuid.New().String()
	triggerCtx := &models.WorkflowTriggerContext{
		Mode:            models.TriggerModeAPI,
		TriggerBy:       triggerBy,
		InputsOverrides: req.InputsOverrides,
		RequestID:       executionID,
	}

	// Run the workflow asynchronously.
	go func() {
		wfEngine := engine.NewWorkflowEngine(rt.Ctx)
		if _, _, err := wfEngine.Execute(pipeline, event, triggerCtx); err != nil {
			logger.Errorf("async workflow execute error: pipeline_id=%d execution_id=%s err=%v",
				pipeline.ID, executionID, err)
		}
	}()

	return executionID, nil
}
// triggerEventPipelineByService triggers a workflow execution on behalf of a
// service caller. The pipeline comes from the "id" URL parameter and the
// trigger identity from the request body's username.
func (rt *Router) triggerEventPipelineByService(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")

	var body EventPipelineRequest
	ginx.BindJSON(c, &body)

	// Load the pipeline.
	pl, err := models.GetEventPipeline(rt.Ctx, id)
	if err != nil {
		ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err)
	}

	execID, err := rt.executePipelineTrigger(pl, &body, body.Username)
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, "%v", err)
	}

	payload := gin.H{
		"execution_id": execID,
		"message":      "workflow execution started",
	}
	ginx.NewRender(c).Data(payload, nil)
}
// triggerEventPipelineByAPI triggers a workflow execution for the
// authenticated user. The pipeline comes from the "id" URL parameter and the
// user's group permission on the pipeline's teams is checked first.
func (rt *Router) triggerEventPipelineByAPI(c *gin.Context) {
	pipelineID := ginx.UrlParamInt64(c, "id")

	var f EventPipelineRequest
	ginx.BindJSON(c, &f)

	// Load the pipeline.
	pipeline, err := models.GetEventPipeline(rt.Ctx, pipelineID)
	if err != nil {
		ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err)
	}

	// Check permission.
	me := c.MustGet("user").(*models.User)
	ginx.Dangerous(me.CheckGroupPermission(rt.Ctx, pipeline.TeamIds))

	executionID, err := rt.executePipelineTrigger(pipeline, &f, me.Username)
	if err != nil {
		// Pass the error as an argument, not as the format string, so a '%' in
		// the message is not interpreted as a formatting verb.
		ginx.Bomb(http.StatusBadRequest, "%v", err)
	}

	ginx.NewRender(c).Data(gin.H{
		"execution_id": executionID,
		"message":      "workflow execution started",
	}, nil)
}
// listAllEventPipelineExecutions renders one page of pipeline executions
// across all pipelines, filtered by the optional pipeline_id, pipeline_name,
// mode and status query parameters. "limit" falls back to 20 when outside
// 1..1000; "p" is the 1-based page number and falls back to 1 when
// non-positive.
func (rt *Router) listAllEventPipelineExecutions(c *gin.Context) {
	pipelineID := ginx.QueryInt64(c, "pipeline_id", 0)
	pipelineName := ginx.QueryStr(c, "pipeline_name", "")
	mode := ginx.QueryStr(c, "mode", "")
	status := ginx.QueryStr(c, "status", "")
	limit := ginx.QueryInt(c, "limit", 20)
	page := ginx.QueryInt(c, "p", 1)

	if limit < 1 || limit > 1000 {
		limit = 20
	}
	if page < 1 {
		page = 1
	}
	skip := (page - 1) * limit

	rows, total, err := models.ListAllEventPipelineExecutions(rt.Ctx, pipelineID, pipelineName, mode, status, limit, skip)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(gin.H{
		"list":  rows,
		"total": total,
	}, nil)
}
// listEventPipelineExecutions renders one page of executions of the pipeline
// identified by the "id" URL parameter, filtered by the optional mode and
// status query parameters. "limit" falls back to 20 when outside 1..1000; "p"
// is the 1-based page number and falls back to 1 when non-positive.
func (rt *Router) listEventPipelineExecutions(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")
	mode := ginx.QueryStr(c, "mode", "")
	status := ginx.QueryStr(c, "status", "")
	limit := ginx.QueryInt(c, "limit", 20)
	page := ginx.QueryInt(c, "p", 1)

	if limit < 1 || limit > 1000 {
		limit = 20
	}
	if page < 1 {
		page = 1
	}
	skip := (page - 1) * limit

	rows, total, err := models.ListEventPipelineExecutions(rt.Ctx, id, mode, status, limit, skip)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(gin.H{
		"list":  rows,
		"total": total,
	}, nil)
}
// getEventPipelineExecution renders the detail of the execution identified by
// the "exec_id" URL parameter, bombing with 404 when it cannot be loaded.
func (rt *Router) getEventPipelineExecution(c *gin.Context) {
	executionID := ginx.UrlParamStr(c, "exec_id")

	execDetail, lookupErr := models.GetEventPipelineExecutionDetail(rt.Ctx, executionID)
	if lookupErr != nil {
		ginx.Bomb(http.StatusNotFound, "execution not found: %v", lookupErr)
	}

	ginx.NewRender(c).Data(execDetail, nil)
}
// getEventPipelineExecutionStats renders the execution statistics of the
// pipeline identified by the "id" URL parameter.
func (rt *Router) getEventPipelineExecutionStats(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")

	summary, statsErr := models.GetEventPipelineExecutionStatistics(rt.Ctx, id)
	ginx.Dangerous(statsErr)

	ginx.NewRender(c).Data(summary, nil)
}
// cleanEventPipelineExecutions deletes pipeline executions with a cutoff of
// before_days days before now (30 when before_days is missing or
// non-positive) and renders the number of deleted records.
func (rt *Router) cleanEventPipelineExecutions(c *gin.Context) {
	var form struct {
		BeforeDays int `json:"before_days"`
	}
	ginx.BindJSON(c, &form)

	if form.BeforeDays <= 0 {
		form.BeforeDays = 30
	}

	cutoff := time.Now().AddDate(0, 0, -form.BeforeDays).Unix()

	removed, err := models.DeleteEventPipelineExecutions(rt.Ctx, cutoff)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(gin.H{"deleted": removed}, nil)
}
// streamEventPipeline runs the pipeline identified by the "id" URL parameter
// for the authenticated user with streaming enabled. When the result carries a
// stream channel it is relayed via handleStreamResponse; otherwise the result
// is rendered as plain data.
func (rt *Router) streamEventPipeline(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")

	var body EventPipelineRequest
	ginx.BindJSON(c, &body)

	pl, err := models.GetEventPipeline(rt.Ctx, id)
	if err != nil {
		ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err)
	}

	user := c.MustGet("user").(*models.User)
	ginx.Dangerous(user.CheckGroupPermission(rt.Ctx, pl.TeamIds))

	// Use the supplied event, or an empty one stamped with the current time.
	event := body.Event
	if event == nil {
		event = &models.AlertCurEvent{TriggerTime: time.Now().Unix()}
	}

	triggerCtx := &models.WorkflowTriggerContext{
		Mode:            models.TriggerModeAPI,
		TriggerBy:       user.Username,
		InputsOverrides: body.InputsOverrides,
		RequestID:       uuid.New().String(),
		Stream:          true, // the streaming endpoint always enables streaming output
	}

	wfEngine := engine.NewWorkflowEngine(rt.Ctx)
	_, outcome, err := wfEngine.Execute(pl, event, triggerCtx)
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, "execute failed: %v", err)
	}

	if !outcome.Stream || outcome.StreamChan == nil {
		ginx.NewRender(c).Data(outcome, nil)
		return
	}

	rt.handleStreamResponse(c, outcome, triggerCtx.RequestID)
}
// handleStreamResponse relays the chunks of result.StreamChan to the client as
// Server-Sent Events. It writes an initial "connected" message, then one
// "data:" frame per chunk, and stops when the channel is closed, a chunk is
// marked Done, the request context is cancelled, or 30 minutes have elapsed.
func (rt *Router) handleStreamResponse(c *gin.Context, result *models.WorkflowResult, requestID string) {
// Set the SSE response headers.
c.Header("Content-Type", "text/event-stream")
c.Header("Cache-Control", "no-cache")
c.Header("Connection", "keep-alive")
c.Header("X-Accel-Buffering", "no") // disable nginx buffering
c.Header("X-Request-ID", requestID)
flusher, ok := c.Writer.(http.Flusher)
if !ok {
ginx.Bomb(http.StatusInternalServerError, "streaming not supported")
return
}
// Send the initial "connection established" message.
initData := fmt.Sprintf(`{"type":"connected","request_id":"%s","timestamp":%d}`, requestID, time.Now().UnixMilli())
fmt.Fprintf(c.Writer, "data: %s\n\n", initData)
flusher.Flush()
// Read from the channel and emit SSE frames.
timeout := time.After(30 * time.Minute) // upper bound on the total streaming time
for {
select {
case chunk, ok := <-result.StreamChan:
if !ok {
// Channel closed: end the stream (no explicit end marker is written here).
return
}
data, err := json.Marshal(chunk)
if err != nil {
// Skip a chunk that cannot be serialized and keep streaming.
logger.Errorf("stream: failed to marshal chunk: %v", err)
continue
}
fmt.Fprintf(c.Writer, "data: %s\n\n", data)
flusher.Flush()
if chunk.Done {
return
}
case <-c.Request.Context().Done():
// The client disconnected.
logger.Infof("stream: client disconnected, request_id=%s", requestID)
return
case <-timeout:
logger.Errorf("stream: timeout, request_id=%s", requestID)
return
}
}
}
// streamEventPipelineByService runs the pipeline identified by the "id" URL
// parameter on behalf of a service caller with streaming enabled. The trigger
// identity comes from the request body's username. When the result carries a
// stream channel it is relayed via handleStreamResponse; otherwise the result
// is rendered as plain data.
func (rt *Router) streamEventPipelineByService(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")

	var body EventPipelineRequest
	ginx.BindJSON(c, &body)

	pl, err := models.GetEventPipeline(rt.Ctx, id)
	if err != nil {
		ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err)
	}

	// Use the supplied event, or an empty one stamped with the current time.
	event := body.Event
	if event == nil {
		event = &models.AlertCurEvent{TriggerTime: time.Now().Unix()}
	}

	triggerCtx := &models.WorkflowTriggerContext{
		Mode:            models.TriggerModeAPI,
		TriggerBy:       body.Username,
		InputsOverrides: body.InputsOverrides,
		RequestID:       uuid.New().String(),
		Stream:          true, // the streaming endpoint always enables streaming output
	}

	wfEngine := engine.NewWorkflowEngine(rt.Ctx)
	_, outcome, err := wfEngine.Execute(pl, event, triggerCtx)
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, "execute failed: %v", err)
	}

	// Fall back to a plain response when the result is not a stream.
	if !outcome.Stream || outcome.StreamChan == nil {
		ginx.NewRender(c).Data(outcome, nil)
		return
	}

	rt.handleStreamResponse(c, outcome, triggerCtx.RequestID)
}
// eventPipelineExecutionAdd stores a pipeline execution record synced from an
// edge node. The record must carry a non-empty ID and a positive PipelineID.
func (rt *Router) eventPipelineExecutionAdd(c *gin.Context) {
	var record models.EventPipelineExecution
	ginx.BindJSON(c, &record)

	if len(record.ID) == 0 {
		ginx.Bomb(http.StatusBadRequest, "id is required")
	}
	if record.PipelineID <= 0 {
		ginx.Bomb(http.StatusBadRequest, "pipeline_id is required")
	}

	createErr := models.DB(rt.Ctx).Create(&record).Error
	ginx.NewRender(c).Message(createErr)
}
================================================
FILE: center/router/router_funcs.go
================================================
package router
import (
"net/http"
"strconv"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// defaultLimit is a package-level default of 300. Its consumers are outside
// this section; presumably a default page size for list queries — confirm.
const defaultLimit = 300
// statistic renders statistics for the resource kind named by the "name"
// query parameter. Kinds with a dedicated statistics function are answered
// directly; the remaining kinds go through models.StatisticsGet with a zero
// value of the matching model. Unknown names are rejected with 400.
func (rt *Router) statistic(c *gin.Context) {
	var (
		stats *models.Statistics
		err   error
		model interface{}
	)

	// reply renders whatever stats/err currently hold.
	reply := func() {
		ginx.NewRender(c).Data(stats, err)
	}

	switch kind := ginx.QueryStr(c, "name"); kind {
	// Kinds that have their own statistics query.
	case "event_pipeline":
		stats, err = models.EventPipelineStatistics(rt.Ctx)
		reply()
		return
	case "datasource":
		// datasource update_at is different from others
		stats, err = models.DatasourceStatistics(rt.Ctx)
		reply()
		return
	case "user_variable":
		stats, err = models.ConfigsUserVariableStatistics(rt.Ctx)
		reply()
		return
	case "cval":
		stats, err = models.ConfigCvalStatistics(rt.Ctx)
		reply()
		return
	case "message_template":
		stats, err = models.MessageTemplateStatistics(rt.Ctx)
		reply()
		return

	// Kinds served by the generic statistics query.
	case "alert_mute":
		model = models.AlertMute{}
	case "alert_rule":
		model = models.AlertRule{}
	case "alert_subscribe":
		model = models.AlertSubscribe{}
	case "busi_group":
		model = models.BusiGroup{}
	case "recording_rule":
		model = models.RecordingRule{}
	case "target":
		model = models.Target{}
	case "user":
		model = models.User{}
	case "user_group":
		model = models.UserGroup{}
	case "notify_rule":
		model = models.NotifyRule{}
	case "notify_channel":
		model = models.NotifyChannel{}

	default:
		ginx.Bomb(http.StatusBadRequest, "invalid name")
	}

	stats, err = models.StatisticsGet(rt.Ctx, model)
	reply()
}
// queryDatasourceIds parses the "datasource_ids" query parameter, a list of
// integers separated by commas and/or whitespace. Entries that fail to parse
// are kept as 0. The result is always non-nil.
func queryDatasourceIds(c *gin.Context) []int64 {
	raw := ginx.QueryStr(c, "datasource_ids", "")

	// Treat commas as whitespace so a single Fields call splits on both.
	tokens := strings.Fields(strings.ReplaceAll(raw, ",", " "))

	ids := make([]int64, 0, len(tokens))
	for _, tok := range tokens {
		// A parse failure leaves n at 0, which is still recorded.
		n, _ := strconv.ParseInt(tok, 10, 64)
		ids = append(ids, n)
	}
	return ids
}
// queryStrListField reads the query parameter fieldName and splits it
// successively by every separator in sep, in order. It returns nil when the
// parameter is empty, and a single-element slice when no separator is given.
func queryStrListField(c *gin.Context, fieldName string, sep ...string) []string {
	raw := ginx.QueryStr(c, fieldName, "")
	if len(raw) == 0 {
		return nil
	}

	parts := []string{raw}
	for i := range sep {
		// Re-split every piece produced so far by the next separator.
		var next []string
		for j := range parts {
			next = append(next, strings.Split(parts[j], sep[i])...)
		}
		parts = next
	}
	return parts
}
// idsForm is a JSON request body carrying a list of ids ("ids") and a
// boolean flag ("is_sync_to_flashduty").
type idsForm struct {
Ids []int64 `json:"ids"`
IsSyncToFlashDuty bool `json:"is_sync_to_flashduty"`
}
// Verify aborts the request with 400 "ids empty" when the form has no ids.
func (f idsForm) Verify() {
	if len(f.Ids) > 0 {
		return
	}
	ginx.Bomb(http.StatusBadRequest, "ids empty")
}
// User loads the user with the given id. A lookup error is passed to
// ginx.Dangerous and a missing user is reported as 404 "No such user".
func User(ctx *ctx.Context, id int64) *models.User {
	user, err := models.UserGetById(ctx, id)
	ginx.Dangerous(err)

	if user != nil {
		return user
	}

	ginx.Bomb(http.StatusNotFound, "No such user")
	return nil
}
// UserGroup loads the user group with the given id and fills its BusiGroups
// field with the business groups resolved via models.BusiGroupIds for that
// group id. Errors are passed to ginx.Dangerous; a missing group is reported
// as 404 "No such UserGroup".
func UserGroup(ctx *ctx.Context, id int64) *models.UserGroup {
	group, err := models.UserGroupGetById(ctx, id)
	ginx.Dangerous(err)

	if group == nil {
		ginx.Bomb(http.StatusNotFound, "No such UserGroup")
	}

	busiGroupIDs, err := models.BusiGroupIds(ctx, []int64{id})
	ginx.Dangerous(err)

	busiGroups, err := models.BusiGroupGetByIds(ctx, busiGroupIDs)
	group.BusiGroups = busiGroups
	ginx.Dangerous(err)

	return group
}
// BusiGroup loads the business group with the given id. A lookup error is
// passed to ginx.Dangerous and a missing group is reported as 404
// "No such BusiGroup".
func BusiGroup(ctx *ctx.Context, id int64) *models.BusiGroup {
	group, err := models.BusiGroupGetById(ctx, id)
	ginx.Dangerous(err)

	if group != nil {
		return group
	}

	ginx.Bomb(http.StatusNotFound, "No such BusiGroup")
	return nil
}
// Dashboard loads the dashboard with the given id. A lookup error is passed
// to ginx.Dangerous and a missing dashboard is reported as 404
// "No such dashboard".
func Dashboard(ctx *ctx.Context, id int64) *models.Dashboard {
	dashboard, err := models.DashboardGet(ctx, "id=?", id)
	ginx.Dangerous(err)

	if dashboard != nil {
		return dashboard
	}

	ginx.Bomb(http.StatusNotFound, "No such dashboard")
	return nil
}
// DoneIdsReply is a JSON envelope with an error string ("err") and a data
// payload ("dat") that holds a list of int64 ids ("list").
// NOTE(review): the producer of this reply is not visible in this section.
type DoneIdsReply struct {
Err string `json:"err"`
Dat struct {
List []int64 `json:"list"`
} `json:"dat"`
}
// TaskCreateReply is a JSON envelope with an error string ("err") and the id
// of the created task ("dat").
type TaskCreateReply struct {
Err string `json:"err"`
Dat int64 `json:"dat"` // task.id
}
// Username returns the name of the caller of the current request. The value
// stored under gin.AuthUserKey wins; when it is empty the name is taken from
// the *models.User stored under the "user" context key (which must exist).
func Username(c *gin.Context) string {
	if name := c.GetString(gin.AuthUserKey); name != "" {
		return name
	}
	return c.MustGet("user").(*models.User).Username
}
// HasPermission reports whether the request may access the given source.
// Events are always accessible when anonymous access is enabled. Otherwise
// access requires a "__token" query parameter that passes
// ValidateSourceToken for the source.
func HasPermission(ctx *ctx.Context, c *gin.Context, sourceType, sourceId string, isAnonymousAccess bool) bool {
	if isAnonymousAccess && sourceType == "event" {
		return true
	}

	// Without a __token query parameter there is nothing left to check.
	token := ginx.QueryStr(c, "__token", "")
	if token == "" {
		return false
	}

	return ValidateSourceToken(ctx, sourceType, sourceId, token)
}
// ValidateSourceToken reports whether token is a known, unexpired token for
// the source identified by sourceType and sourceId. An empty token, a lookup
// error and an expired token all yield false.
func ValidateSourceToken(ctx *ctx.Context, sourceType, sourceId, token string) bool {
	if len(token) == 0 {
		return false
	}

	// Look the token record up by source type, source id and token value,
	// then make sure it has not expired.
	record, err := models.GetSourceTokenBySource(ctx, sourceType, sourceId, token)
	return err == nil && !record.IsExpired()
}
================================================
FILE: center/router/router_heartbeat.go
================================================
package router
import (
"compress/gzip"
"encoding/json"
"errors"
"io/ioutil"
"sort"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// HeartbeatHookFunc maps a host ident to the data returned in the heartbeat
// response.
type HeartbeatHookFunc func(ident string) map[string]interface{}
// heartbeat processes an agent heartbeat through HandleHeartbeat and answers
// with the data produced by the router's HeartbeatHook for the reporting
// hostname.
func (rt *Router) heartbeat(c *gin.Context) {
	meta, err := HandleHeartbeat(c, rt.Ctx, rt.Alert.Heartbeat.EngineName, rt.MetaSet, rt.IdentSet, rt.TargetCache)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(rt.HeartbeatHook(meta.Hostname), err)
}
// HandleHeartbeat decodes a host heartbeat from the request body and records
// it.
//
// The body is JSON, gzip-compressed when the Content-Encoding header is
// "gzip". After decoding, missing Offset, RemoteAddr and EngineName values
// are filled in, the meta is stored in metaSet and the hostname is registered
// in identSet. When the host is present in targetCache, its business-group
// bindings (driven by the "gid" and "overwrite_gids" query parameters), host
// IP, host tags, user tags, engine name, agent version and OS are synced to
// the database; failures of those updates are only logged.
//
// It returns the decoded meta, or an error when the body cannot be read or
// decoded or when the hostname is empty. Body read errors are returned on
// both the gzip and the plain path.
func HandleHeartbeat(c *gin.Context, ctx *ctx.Context, engineName string, metaSet *metas.Set, identSet *idents.Set, targetCache *memsto.TargetCacheType) (models.HostMeta, error) {
	var bs []byte
	var err error
	var r *gzip.Reader
	var req models.HostMeta

	if c.GetHeader("Content-Encoding") == "gzip" {
		r, err = gzip.NewReader(c.Request.Body)
		if err != nil {
			c.String(400, err.Error())
			return req, err
		}
		defer r.Close()

		bs, err = ioutil.ReadAll(r)
		if err != nil {
			// Report the failure to the caller, as the plain branch does,
			// instead of panicking from inside a function that returns error.
			return req, err
		}
	} else {
		defer c.Request.Body.Close()

		bs, err = ioutil.ReadAll(c.Request.Body)
		if err != nil {
			return req, err
		}
	}

	err = json.Unmarshal(bs, &req)
	if err != nil {
		return req, err
	}

	if req.Hostname == "" {
		return req, errors.New("hostname is required")
	}

	// maybe from pushgw
	if req.Offset == 0 {
		req.Offset = (time.Now().UnixMilli() - req.UnixTime)
	}

	if req.RemoteAddr == "" {
		req.RemoteAddr = c.ClientIP()
	}

	if req.EngineName == "" {
		req.EngineName = engineName
	}

	metaSet.Set(req.Hostname, req)

	var items = make(map[string]struct{})
	items[req.Hostname] = struct{}{}
	identSet.MSet(items)

	if target, has := targetCache.Get(req.Hostname); has && target != nil {
		gidsStr := ginx.QueryStr(c, "gid", "")
		overwriteGids := ginx.QueryBool(c, "overwrite_gids", false)
		hostIp := strings.TrimSpace(req.HostIp)
		gids := strings.Split(gidsStr, ",")

		if overwriteGids {
			// Replace the target's group bindings with exactly the given ids.
			groupIds := make([]int64, 0)
			for i := range gids {
				if gids[i] == "" {
					continue
				}
				groupId, err := strconv.ParseInt(gids[i], 10, 64)
				if err != nil {
					logger.Warningf("update target:%s group ids failed, err: %v", req.Hostname, err)
					continue
				}
				groupIds = append(groupIds, groupId)
			}

			err := models.TargetOverrideBgids(ctx, []string{target.Ident}, groupIds, nil)
			if err != nil {
				logger.Warningf("update target:%s group ids failed, err: %v", target.Ident, err)
			}
		} else if gidsStr != "" {
			// Only add bindings for groups the target does not match yet.
			for i := range gids {
				if gids[i] == "" {
					// Skip empty entries (e.g. "1,,2"), as the overwrite
					// branch does, rather than logging a parse failure.
					continue
				}
				groupId, err := strconv.ParseInt(gids[i], 10, 64)
				if err != nil {
					logger.Warningf("update target:%s group ids failed, err: %v", req.Hostname, err)
					continue
				}

				if !target.MatchGroupId(groupId) {
					err := models.TargetBindBgids(ctx, []string{target.Ident}, []int64{groupId}, nil)
					if err != nil {
						logger.Warningf("update target:%s group ids failed, err: %v", target.Ident, err)
					}
				}
			}
		}

		// newTarget collects only the fields that differ from the cached
		// target; it is written with a single Updates call below.
		newTarget := models.Target{}
		targetNeedUpdate := false

		if hostIp != "" && hostIp != target.HostIp {
			newTarget.HostIp = hostIp
			targetNeedUpdate = true
		}

		// Host tags follow the global labels reported by the agent.
		hostTagsMap := target.GetHostTagsMap()
		hostTagNeedUpdate := false
		if len(hostTagsMap) != len(req.GlobalLabels) {
			hostTagNeedUpdate = true
		} else {
			for k, v := range req.GlobalLabels {
				if v == "" {
					continue
				}

				if tagv, ok := hostTagsMap[k]; !ok || tagv != v {
					hostTagNeedUpdate = true
					break
				}
			}
		}

		if hostTagNeedUpdate {
			lst := []string{}
			for k, v := range req.GlobalLabels {
				lst = append(lst, k+"="+v)
			}
			sort.Strings(lst)
			newTarget.HostTags = lst
			targetNeedUpdate = true
		}

		// Drop user tags whose key is also reported as a global label.
		userTagsMap := target.GetTagsMap()
		userTagNeedUpdate := false
		userTags := []string{}
		for k, v := range userTagsMap {
			if v == "" {
				continue
			}

			if _, ok := req.GlobalLabels[k]; !ok {
				userTags = append(userTags, k+"="+v)
			} else { // this key already exists among the host tags
				userTagNeedUpdate = true
			}
		}

		if userTagNeedUpdate {
			newTarget.Tags = strings.Join(userTags, " ") + " "
			targetNeedUpdate = true
		}

		if req.EngineName != "" && req.EngineName != target.EngineName {
			newTarget.EngineName = req.EngineName
			targetNeedUpdate = true
		}

		if req.AgentVersion != "" && req.AgentVersion != target.AgentVersion {
			newTarget.AgentVersion = req.AgentVersion
			targetNeedUpdate = true
		}

		if req.OS != "" && req.OS != target.OS {
			newTarget.OS = req.OS
			targetNeedUpdate = true
		}

		if targetNeedUpdate {
			newTarget.UpdateAt = time.Now().Unix()
			err := models.DB(ctx).Model(&target).Updates(newTarget).Error
			if err != nil {
				logger.Errorf("update target fields failed, err: %v", err)
			}
		}

		logger.Debugf("heartbeat field:%+v target: %v", newTarget, *target)
	}

	return req, nil
}
================================================
FILE: center/router/router_login.go
================================================
package router
import (
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/cas"
"github.com/ccfos/nightingale/v6/pkg/dingtalk"
"github.com/ccfos/nightingale/v6/pkg/feishu"
"github.com/ccfos/nightingale/v6/pkg/ldapx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/oauth2x"
"github.com/ccfos/nightingale/v6/pkg/oidcx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/dgrijalva/jwt-go"
"github.com/gin-gonic/gin"
"github.com/pelletier/go-toml/v2"
"github.com/pkg/errors"
"gorm.io/gorm"
)
// loginForm is the JSON body of a password login request. Username and
// Password are required by the binding tags; Captchaid and Verifyvalue carry
// the captcha id and the answer checked by loginPost when captcha is enabled.
type loginForm struct {
Username string `json:"username" binding:"required"`
Password string `json:"password" binding:"required"`
Captchaid string `json:"captchaid"`
Verifyvalue string `json:"verifyvalue"`
}
// loginPost handles username/password login.
//
// Steps: verify the captcha when enabled, RSA-decrypt the password when RSA
// is enabled, authenticate (LDAP first when enabled, with the local password
// login as fallback), then issue and store an access/refresh token pair and
// return it together with the user.
func (rt *Router) loginPost(c *gin.Context) {
	var form loginForm
	ginx.BindJSON(c, &form)

	rctx := c.Request.Context()
	logx.Infof(rctx, "username:%s login from:%s", form.Username, c.ClientIP())

	if rt.HTTP.ShowCaptcha.Enable && !CaptchaVerify(form.Captchaid, form.Verifyvalue) {
		ginx.NewRender(c).Message("incorrect verification code")
		return
	}

	// The submitted password is RSA-encrypted when RSA is switched on.
	password := form.Password
	if rt.HTTP.RSA.OpenRSA {
		plain, err := secu.Decrypt(form.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
		if err != nil {
			logx.Errorf(rctx, "RSA Decrypt failed: %v username: %s", err, form.Username)
			ginx.NewRender(c).Message(err)
			return
		}
		password = plain
	}

	reqCtx := rt.Ctx.WithContext(rctx)

	var (
		user *models.User
		err  error
	)

	ldapConf := rt.Sso.LDAP.Copy()
	if !ldapConf.Enable {
		user, err = models.PassLogin(reqCtx, rt.Redis, form.Username, password)
		ginx.Dangerous(err)
	} else {
		user, err = ldapx.LdapLogin(reqCtx, form.Username, password, ldapConf.DefaultRoles, ldapConf.DefaultTeams, ldapConf)
		if err == nil {
			user.RolesLst = strings.Fields(user.Roles)
		} else {
			logx.Debugf(rctx, "ldap login failed: %v username: %s", err, form.Username)

			// to use n9e as the minimum guarantee for login
			var errLoginInN9e error
			user, errLoginInN9e = models.PassLogin(reqCtx, rt.Redis, form.Username, password)
			if errLoginInN9e != nil {
				ginx.NewRender(c).Message("ldap login failed: %v; n9e login failed: %v", err, errLoginInN9e)
				return
			}
		}
	}

	if user == nil {
		// Theoretically impossible
		ginx.NewRender(c).Message("Username or password invalid")
		return
	}

	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)

	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	ginx.NewRender(c).Data(gin.H{
		"user":          user,
		"access_token":  ts.AccessToken,
		"refresh_token": ts.RefreshToken,
	}, nil)
}
// logoutPost logs the current user out: it deletes the tokens described by
// the request's JWT, removes the stored id_token and answers with the SSO
// logout address that matches the user's origin (empty for other origins).
func (rt *Router) logoutPost(c *gin.Context) {
	rctx := c.Request.Context()
	logx.Infof(rctx, "username:%s logout from:%s", c.GetString("username"), c.ClientIP())

	metadata, err := rt.extractTokenMetadata(c.Request)
	if err != nil {
		ginx.NewRender(c, http.StatusBadRequest).Message("failed to parse jwt token")
		return
	}

	if delErr := rt.deleteTokens(rctx, metadata); delErr != nil {
		ginx.NewRender(c).Message(http.StatusText(http.StatusInternalServerError))
		return
	}

	user := c.MustGet("user").(*models.User)

	// Fetch the user's id_token; fall back to an empty one on failure.
	idToken, err := rt.fetchIdToken(rctx, user.Id)
	if err != nil {
		logx.Debugf(rctx, "fetch id_token failed: %v, user_id: %d", err, user.Id)
		idToken = ""
	}

	// The stored id_token is no longer needed.
	rt.deleteIdToken(rctx, user.Id)

	var logoutAddr string
	switch user.Belong {
	case "oidc":
		logoutAddr = rt.Sso.OIDC.GetSsoLogoutAddr(idToken)
	case "cas":
		logoutAddr = rt.Sso.CAS.GetSsoLogoutAddr()
	case "oauth2":
		logoutAddr = rt.Sso.OAuth2.GetSsoLogoutAddr()
	}

	ginx.NewRender(c).Data(logoutAddr, nil)
}
// refreshForm is the JSON body of a token refresh request; the refresh token
// is required by the binding tag.
type refreshForm struct {
RefreshToken string `json:"refresh_token" binding:"required"`
}
// refreshPost exchanges a valid refresh token for a new access/refresh token
// pair.
//
// The refresh token must be an HMAC-signed JWT that is still registered in
// the auth store and whose user still exists. On success the old refresh and
// access tokens are deleted, a new pair is issued and stored, and the TTL of
// an existing id_token is extended by saving it again.
func (rt *Router) refreshPost(c *gin.Context) {
	var form refreshForm
	ginx.BindJSON(c, &form)

	rctx := c.Request.Context()

	// Only HMAC-signed tokens are accepted.
	keyFunc := func(token *jwt.Token) (interface{}, error) {
		if _, isHMAC := token.Method.(*jwt.SigningMethodHMAC); !isHMAC {
			return nil, fmt.Errorf("unexpected jwt signing method: %v", token.Header["alg"])
		}
		return []byte(rt.HTTP.JWTAuth.SigningKey), nil
	}

	// verify the token
	token, err := jwt.Parse(form.RefreshToken, keyFunc)
	if err != nil {
		// if there is an error, the token must have expired: redirect to login page
		ginx.NewRender(c, http.StatusUnauthorized).Message("refresh token expired")
		return
	}

	// the token claims should conform to MapClaims
	claims, ok := token.Claims.(jwt.MapClaims)
	if !ok || !token.Valid {
		// redirect to login page
		ginx.NewRender(c, http.StatusUnauthorized).Message("refresh token expired")
		return
	}

	refreshUuid, ok := claims["refresh_uuid"].(string)
	if !ok {
		// Theoretically impossible
		ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse refresh_uuid from jwt")
		return
	}

	// The refresh token must still be registered in the auth store.
	val, err := rt.fetchAuth(rctx, refreshUuid)
	if err != nil || val == "" {
		ginx.NewRender(c, http.StatusUnauthorized).Message("refresh token expired")
		return
	}

	userIdentity, ok := claims["user_identity"].(string)
	if !ok {
		// Theoretically impossible
		ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse user_identity from jwt")
		return
	}

	// The identity has the form "<id>-<username>".
	userid, err := strconv.ParseInt(strings.Split(userIdentity, "-")[0], 10, 64)
	if err != nil {
		ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse user_identity from jwt")
		return
	}

	u, err := models.UserGetById(rt.Ctx, userid)
	if err != nil {
		ginx.NewRender(c, http.StatusInternalServerError).Message("failed to query user by id")
		return
	}
	if u == nil {
		// user already deleted
		ginx.NewRender(c, http.StatusUnauthorized).Message("user already deleted")
		return
	}

	// Delete the previous Refresh Token
	err = rt.deleteAuth(rctx, refreshUuid)
	if err != nil {
		ginx.NewRender(c, http.StatusUnauthorized).Message(http.StatusText(http.StatusInternalServerError))
		return
	}

	// Delete previous Access Token
	rt.deleteAuth(rctx, strings.Split(refreshUuid, "++")[0])

	// Create new pairs of refresh and access tokens
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	// Extend the id_token lifetime to match the new refresh token. No new
	// id_token is obtained: the existing one is saved again.
	if idToken, err := rt.fetchIdToken(rctx, userid); err == nil && idToken != "" {
		if err := rt.saveIdToken(rctx, userid, idToken); err != nil {
			logx.Debugf(rctx, "refresh id_token ttl failed: %v, user_id: %d", err, userid)
		}
	}

	ginx.NewRender(c).Data(gin.H{
		"access_token":  ts.AccessToken,
		"refresh_token": ts.RefreshToken,
	}, nil)
}
// loginRedirect answers with the URL the browser should go to for OIDC login.
//
// When the request already carries a logged-in user ("userid" in the gin
// context), the requested "redirect" target is returned as is. When OIDC is
// not configured or not enabled, an empty string is returned. Otherwise the
// OIDC authorization URL for the redirect target is returned.
func (rt *Router) loginRedirect(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	if v, exists := c.Get("userid"); exists {
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	// rt.Sso.OIDC is treated as nilable elsewhere in this file, so guard the
	// dereference instead of panicking when it is not configured.
	if rt.Sso.OIDC == nil || !rt.Sso.OIDC.Enable {
		ginx.NewRender(c).Data("", nil)
		return
	}

	redirect, err := rt.Sso.OIDC.Authorize(rt.Redis, redirect)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(redirect, err)
}
// CallbackOutput is the payload returned by the SSO login callbacks: the
// path to redirect to, the logged-in user and the issued token pair.
type CallbackOutput struct {
Redirect string `json:"redirect"`
User *models.User `json:"user"`
AccessToken string `json:"access_token"`
RefreshToken string `json:"refresh_token"`
}
// loginCallback completes an OIDC login.
//
// It exchanges the "code"/"state" query parameters for the user's profile,
// creates the local user on first login (adding it to the configured default
// teams) or refreshes its SSO attributes when CoverAttributes is set, issues
// a token pair, stores the id_token for later logout and answers with a
// CallbackOutput.
func (rt *Router) loginCallback(c *gin.Context) {
	rctx := c.Request.Context()
	code := ginx.QueryStr(c, "code", "")
	state := ginx.QueryStr(c, "state", "")

	ret, err := rt.Sso.OIDC.Callback(rt.Redis, rctx, code, state)
	if err != nil {
		logx.Errorf(rctx, "sso_callback fail. code:%s, state:%s, get ret: %+v. error: %v", code, state, ret, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	ginx.Dangerous(err)

	if user == nil {
		// First login: create the user from the OIDC profile.
		user = new(models.User)
		user.FullSsoFields("oidc", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.OIDC.DefaultRoles)
		ginx.Dangerous(user.Add(rt.Ctx))

		// Joining a default team is best effort; failures are only logged.
		for _, gid := range rt.Sso.OIDC.DefaultTeams {
			if addErr := models.UserGroupMemberAdd(rt.Ctx, gid, user.Id); addErr != nil {
				logx.Errorf(rctx, "user:%v UserGroupMemberAdd: %s", user, addErr)
			}
		}
	} else if rt.Sso.OIDC.CoverAttributes {
		updatedFields := user.UpdateSsoFields("oidc", ret.Nickname, ret.Phone, ret.Email)
		ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	// Keep the id_token in Redis so it can be used at logout.
	if ret.IdToken != "" {
		if saveErr := rt.saveIdToken(rctx, user.Id, ret.IdToken); saveErr != nil {
			logx.Errorf(rctx, "save id_token failed: %v, user_id: %d", saveErr, user.Id)
		}
	}

	// Never send the user back to the login page.
	redirect := ret.Redirect
	if redirect == "/login" {
		redirect = "/"
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}
// RedirectOutput is the payload returned by loginRedirectCas: the redirect
// URL and the state value produced by the CAS Authorize call.
type RedirectOutput struct {
Redirect string `json:"redirect"`
State string `json:"state"`
}
// loginRedirectCas answers with the URL and state for CAS login.
//
// When the request already carries a logged-in user ("userid" in the gin
// context), the requested "redirect" target is returned as is. When CAS is
// not configured or not enabled, an error is logged and an empty string is
// returned. Otherwise the CAS authorization URL and state are returned as a
// RedirectOutput.
func (rt *Router) loginRedirectCas(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	if v, exists := c.Get("userid"); exists {
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	// rt.Sso.CAS is treated as nilable elsewhere in this file, so guard the
	// dereference instead of panicking when it is not configured.
	if rt.Sso.CAS == nil || !rt.Sso.CAS.Enable {
		logx.Errorf(c.Request.Context(), "cas is not enable")
		ginx.NewRender(c).Data("", nil)
		return
	}

	redirect, state, err := rt.Sso.CAS.Authorize(rt.Redis, redirect)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(RedirectOutput{
		Redirect: redirect,
		State:    state,
	}, err)
}
// loginCallbackCas completes a CAS login.
//
// It validates the "ticket"/"state" query parameters, creates the local user
// on first login or refreshes its SSO attributes when CoverAttributes is
// set, issues a token pair and answers with a CallbackOutput.
func (rt *Router) loginCallbackCas(c *gin.Context) {
	rctx := c.Request.Context()
	ticket := ginx.QueryStr(c, "ticket", "")
	state := ginx.QueryStr(c, "state", "")

	ret, err := rt.Sso.CAS.ValidateServiceTicket(rctx, ticket, state, rt.Redis)
	if err != nil {
		logx.Errorf(rctx, "ValidateServiceTicket: %s", err)
		ginx.NewRender(c).Data("", err)
		return
	}

	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	if err != nil {
		logx.Errorf(rctx, "UserGet: %s", err)
	}
	ginx.Dangerous(err)

	if user == nil {
		// First login: create the user from the CAS profile.
		user = new(models.User)
		user.FullSsoFields("cas", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.CAS.DefaultRoles)
		ginx.Dangerous(user.Add(rt.Ctx))
	} else if rt.Sso.CAS.CoverAttributes {
		updatedFields := user.UpdateSsoFields("cas", ret.Nickname, ret.Phone, ret.Email)
		ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	if err != nil {
		logx.Errorf(rctx, "createTokens: %s", err)
	}
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	// Never send the user back to the login page.
	redirect := ret.Redirect
	if redirect == "/login" {
		redirect = "/"
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}
// loginRedirectOAuth answers with the URL the browser should go to for
// OAuth2 login.
//
// When the request already carries a logged-in user ("userid" in the gin
// context), the requested "redirect" target is returned as is. When OAuth2
// is not configured or not enabled, an empty string is returned. Otherwise
// the OAuth2 authorization URL for the redirect target is returned.
func (rt *Router) loginRedirectOAuth(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	if v, exists := c.Get("userid"); exists {
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	// rt.Sso.OAuth2 is treated as nilable elsewhere in this file, so guard
	// the dereference instead of panicking when it is not configured.
	if rt.Sso.OAuth2 == nil || !rt.Sso.OAuth2.Enable {
		ginx.NewRender(c).Data("", nil)
		return
	}

	redirect, err := rt.Sso.OAuth2.Authorize(rt.Redis, redirect)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(redirect, err)
}
// loginRedirectDingTalk answers with the URL the browser should go to for
// DingTalk login.
//
// When the request already carries a logged-in user ("userid" in the gin
// context), the requested "redirect" target is returned as is. When DingTalk
// SSO is not configured or not enabled, an empty string is returned.
// Otherwise the DingTalk authorization URL for the redirect target is
// returned.
func (rt *Router) loginRedirectDingTalk(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	if v, exists := c.Get("userid"); exists {
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	// rt.Sso.DingTalk is only created once a DingTalk config has been saved,
	// so it may still be nil here; treat that the same as "disabled".
	if rt.Sso.DingTalk == nil || !rt.Sso.DingTalk.Enable {
		ginx.NewRender(c).Data("", nil)
		return
	}

	redirect, err := rt.Sso.DingTalk.Authorize(rt.Redis, redirect)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(redirect, err)
}
// loginCallbackDingTalk completes a DingTalk login.
//
// It exchanges the "code"/"state" query parameters for the user's profile,
// creates the local user on first login or refreshes its SSO attributes when
// CoverAttributes is set, issues a token pair and answers with a
// CallbackOutput. When DingTalk SSO has not been configured, an empty
// CallbackOutput is rendered together with an error.
func (rt *Router) loginCallbackDingTalk(c *gin.Context) {
	rctx := c.Request.Context()
	code := ginx.QueryStr(c, "code", "")
	state := ginx.QueryStr(c, "state", "")

	// rt.Sso.DingTalk is only created once a DingTalk config has been saved;
	// fail cleanly instead of dereferencing a nil client.
	if rt.Sso.DingTalk == nil {
		err := fmt.Errorf("dingtalk sso is not configured")
		logx.Errorf(rctx, "sso_callback DingTalk fail. code:%s, state:%s, error: %v", code, state, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	ret, err := rt.Sso.DingTalk.Callback(rt.Redis, rctx, code, state)
	if err != nil {
		logx.Errorf(rctx, "sso_callback DingTalk fail. code:%s, state:%s, get ret: %+v. error: %v", code, state, ret, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	ginx.Dangerous(err)

	if user != nil {
		if rt.Sso.DingTalk.DingTalkConfig.CoverAttributes {
			updatedFields := user.UpdateSsoFields(dingtalk.SsoTypeName, ret.Nickname, ret.Phone, ret.Email)
			ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
		}
	} else {
		user = new(models.User)
		user.FullSsoFields(dingtalk.SsoTypeName, ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.DingTalk.DingTalkConfig.DefaultRoles)
		// create user from dingtalk
		ginx.Dangerous(user.Add(rt.Ctx))
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts))

	// Never send the user back to the login page.
	redirect := "/"
	if ret.Redirect != "/login" {
		redirect = ret.Redirect
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}
// loginRedirectFeiShu answers with the URL the browser should go to for
// FeiShu login.
//
// When the request already carries a logged-in user ("userid" in the gin
// context), the requested "redirect" target is returned as is. When FeiShu
// SSO is not configured or not enabled, an empty string is returned.
// Otherwise the FeiShu authorization URL for the redirect target is returned.
func (rt *Router) loginRedirectFeiShu(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	if v, exists := c.Get("userid"); exists {
		current, err := models.UserGetById(rt.Ctx, v.(int64))
		ginx.Dangerous(err)
		if current == nil {
			ginx.Bomb(200, "user not found")
		}

		if current.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	feiShu := rt.Sso.FeiShu
	if feiShu == nil || !feiShu.Enable {
		ginx.NewRender(c).Data("", nil)
		return
	}

	target, err := feiShu.Authorize(rt.Redis, redirect)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(target, err)
}
// loginCallbackFeiShu completes a FeiShu login.
//
// It exchanges the "code"/"state" query parameters for the user's profile,
// creates the local user on first login (adding it to the configured default
// user groups) or refreshes its SSO attributes when CoverAttributes is set,
// issues a token pair and answers with a CallbackOutput. When FeiShu SSO has
// not been configured, an empty CallbackOutput is rendered together with an
// error.
func (rt *Router) loginCallbackFeiShu(c *gin.Context) {
	rctx := c.Request.Context()
	code := ginx.QueryStr(c, "code", "")
	state := ginx.QueryStr(c, "state", "")

	// The nil check must come before the first use of rt.Sso.FeiShu: the
	// client only exists once a FeiShu config has been saved.
	if rt.Sso.FeiShu == nil {
		err := fmt.Errorf("feishu sso is not configured")
		logx.Errorf(rctx, "sso_callback FeiShu fail. code:%s, state:%s, error: %v", code, state, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	ret, err := rt.Sso.FeiShu.Callback(rt.Redis, rctx, code, state)
	if err != nil {
		logx.Errorf(rctx, "sso_callback FeiShu fail. code:%s, state:%s, get ret: %+v. error: %v", code, state, ret, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	ginx.Dangerous(err)

	if user != nil {
		if rt.Sso.FeiShu.FeiShuConfig != nil && rt.Sso.FeiShu.FeiShuConfig.CoverAttributes {
			updatedFields := user.UpdateSsoFields(feishu.SsoTypeName, ret.Nickname, ret.Phone, ret.Email)
			ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
		}
	} else {
		user = new(models.User)

		// Defaults stay empty when no FeiShu config is attached.
		defaultRoles := []string{}
		defaultUserGroups := []int64{}
		if rt.Sso.FeiShu.FeiShuConfig != nil {
			defaultRoles = rt.Sso.FeiShu.FeiShuConfig.DefaultRoles
			defaultUserGroups = rt.Sso.FeiShu.FeiShuConfig.DefaultUserGroups
		}

		user.FullSsoFields(feishu.SsoTypeName, ret.Username, ret.Nickname, ret.Phone, ret.Email, defaultRoles)
		ginx.Dangerous(user.Add(rt.Ctx))

		// Joining the default user groups is best effort; a failure is only
		// logged.
		if len(defaultUserGroups) > 0 {
			err = user.AddToUserGroups(rt.Ctx, defaultUserGroups)
			if err != nil {
				logx.Errorf(rctx, "sso feishu add user group error %v %v", ret, err)
			}
		}
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts))

	// Never send the user back to the login page.
	redirect := "/"
	if ret.Redirect != "/login" {
		redirect = ret.Redirect
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}
// loginCallbackOAuth completes an OAuth2 login.
//
// It exchanges the "code"/"state" query parameters for the user's profile,
// creates the local user on first login or refreshes its SSO attributes when
// CoverAttributes is set, issues a token pair and answers with a
// CallbackOutput.
func (rt *Router) loginCallbackOAuth(c *gin.Context) {
	rctx := c.Request.Context()
	code := ginx.QueryStr(c, "code", "")
	state := ginx.QueryStr(c, "state", "")

	ret, err := rt.Sso.OAuth2.Callback(rt.Redis, rctx, code, state)
	if err != nil {
		logx.Debugf(rctx, "sso.callback() get ret %+v error %v", ret, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	ginx.Dangerous(err)

	if user == nil {
		// First login: create the user from the OAuth2 profile.
		user = new(models.User)
		user.FullSsoFields("oauth2", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.OAuth2.DefaultRoles)
		ginx.Dangerous(user.Add(rt.Ctx))
	} else if rt.Sso.OAuth2.CoverAttributes {
		updatedFields := user.UpdateSsoFields("oauth2", ret.Nickname, ret.Phone, ret.Email)
		ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	// Never send the user back to the login page.
	redirect := ret.Redirect
	if redirect == "/login" {
		redirect = "/"
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}
// SsoConfigOutput carries the display name of each SSO provider as returned
// by ssoConfigNameGet; a name is empty when that provider is not set up.
type SsoConfigOutput struct {
OidcDisplayName string `json:"oidcDisplayName"`
CasDisplayName string `json:"casDisplayName"`
OauthDisplayName string `json:"oauthDisplayName"`
DingTalkDisplayName string `json:"dingTalkDisplayName"`
FeiShuDisplayName string `json:"feishuDisplayName"`
}
// ssoConfigNameGet returns the display names of the SSO providers. A
// provider that is not set up (nil) contributes an empty name.
func (rt *Router) ssoConfigNameGet(c *gin.Context) {
	var out SsoConfigOutput

	if oidc := rt.Sso.OIDC; oidc != nil {
		out.OidcDisplayName = oidc.GetDisplayName()
	}
	if casClient := rt.Sso.CAS; casClient != nil {
		out.CasDisplayName = casClient.GetDisplayName()
	}
	if oauth := rt.Sso.OAuth2; oauth != nil {
		out.OauthDisplayName = oauth.GetDisplayName()
	}
	if dingTalk := rt.Sso.DingTalk; dingTalk != nil {
		out.DingTalkDisplayName = dingTalk.GetDisplayName()
	}
	if feiShu := rt.Sso.FeiShu; feiShu != nil {
		out.FeiShuDisplayName = feiShu.GetDisplayName()
	}

	ginx.NewRender(c).Data(out, nil)
}
// ssoConfigGets lists the stored SSO configurations.
//
// DingTalk and FeiShu entries expose their content decoded into SettingJson;
// every other entry exposes its raw Content. When at least one configuration
// is stored but DingTalk or FeiShu is missing, a name-only placeholder is
// appended for it.
func (rt *Router) ssoConfigGets(c *gin.Context) {
	var ssoConfigs []models.SsoConfig

	lst, err := models.SsoConfigGets(rt.Ctx)
	ginx.Dangerous(err)
	if len(lst) == 0 {
		ginx.NewRender(c).Data(ssoConfigs, nil)
		return
	}

	// TODO: the placeholders exist for compatibility with the current
	// frontend; drop them once SSO configuration handling is unified.
	hasDingTalk, hasFeiShu := false, false

	for i := range lst {
		stored := lst[i]

		var item models.SsoConfig
		item.Id = stored.Id
		item.Name = stored.Name
		item.UpdateAt = stored.UpdateAt

		switch stored.Name {
		case dingtalk.SsoTypeName:
			hasDingTalk = true
			ginx.Dangerous(json.Unmarshal([]byte(stored.Content), &item.SettingJson))
		case feishu.SsoTypeName:
			hasFeiShu = true
			ginx.Dangerous(json.Unmarshal([]byte(stored.Content), &item.SettingJson))
		default:
			item.Content = stored.Content
		}

		ssoConfigs = append(ssoConfigs, item)
	}

	if !hasDingTalk {
		var placeholder models.SsoConfig
		placeholder.Name = dingtalk.SsoTypeName
		ssoConfigs = append(ssoConfigs, placeholder)
	}
	if !hasFeiShu {
		var placeholder models.SsoConfig
		placeholder.Name = feishu.SsoTypeName
		ssoConfigs = append(ssoConfigs, placeholder)
	}

	ginx.NewRender(c).Data(ssoConfigs, nil)
}
// ssoConfigUpdate persists an SSO configuration and reloads the matching
// in-memory provider.
//
// DingTalk and FeiShu configurations arrive as SettingJson, are stored as
// JSON and are created when no record with that name exists yet. All other
// configurations are updated by id with their raw Content. After persisting,
// the provider named by the configuration (LDAP, OIDC, CAS, OAuth2, DingTalk
// or FeiShu) is reloaded from the stored content.
//
// NOTE(review): the content is written to the database before it is parsed
// for the reload, so an unparsable configuration is still persisted.
func (rt *Router) ssoConfigUpdate(c *gin.Context) {
	var f models.SsoConfig
	var ssoConfig models.SsoConfig
	ginx.BindJSON(c, &ssoConfig)

	switch ssoConfig.Name {
	case dingtalk.SsoTypeName, feishu.SsoTypeName:
		f.Name = ssoConfig.Name
		setting, err := json.Marshal(ssoConfig.SettingJson)
		ginx.Dangerous(err)
		f.Content = string(setting)
		f.UpdateAt = time.Now().Unix()

		// Upsert by name: create when absent, update otherwise. Any other
		// lookup error is reported as is.
		sso, err := f.Query(rt.Ctx)
		switch {
		case err == nil:
			f.Id = sso.Id
			err = f.Update(rt.Ctx)
		case errors.Is(err, gorm.ErrRecordNotFound):
			err = f.Create(rt.Ctx)
		}
		ginx.Dangerous(err)
	default:
		f.Id = ssoConfig.Id
		f.Name = ssoConfig.Name
		f.Content = ssoConfig.Content
		err := f.Update(rt.Ctx)
		ginx.Dangerous(err)
	}

	// Reload the in-memory provider from the content that was just stored.
	switch f.Name {
	case "LDAP":
		var config ldapx.Config
		err := toml.Unmarshal([]byte(f.Content), &config)
		ginx.Dangerous(err)
		rt.Sso.LDAP.Reload(config)
	case "OIDC":
		var config oidcx.Config
		err := toml.Unmarshal([]byte(f.Content), &config)
		ginx.Dangerous(err)

		// Swap the client in only after it was built successfully, so a
		// failed reload cannot replace the working client.
		client, err := oidcx.New(config)
		ginx.Dangerous(err)
		rt.Sso.OIDC = client
	case "CAS":
		var config cas.Config
		err := toml.Unmarshal([]byte(f.Content), &config)
		ginx.Dangerous(err)
		rt.Sso.CAS.Reload(config)
	case "OAuth2":
		var config oauth2x.Config
		err := toml.Unmarshal([]byte(f.Content), &config)
		ginx.Dangerous(err)
		rt.Sso.OAuth2.Reload(config)
	case dingtalk.SsoTypeName:
		var config dingtalk.Config
		err := json.Unmarshal([]byte(f.Content), &config)
		ginx.Dangerous(err)
		if rt.Sso.DingTalk == nil {
			rt.Sso.DingTalk = dingtalk.New(config)
		}
		rt.Sso.DingTalk.Reload(config)
	case feishu.SsoTypeName:
		var config feishu.Config
		err := json.Unmarshal([]byte(f.Content), &config)
		ginx.Dangerous(err)
		if rt.Sso.FeiShu == nil {
			rt.Sso.FeiShu = feishu.New(config)
		}
		rt.Sso.FeiShu.Reload(config)
	}

	ginx.NewRender(c).Message(nil)
}
// RSAConfigOutput is the payload rendered by rsaConfigGet.
type RSAConfigOutput struct {
// OpenRSA is copied from rt.HTTP.RSA.OpenRSA.
OpenRSA bool
// RSAPublicKey is the base64 (standard encoding) form of the configured
// public key, or "" when no key is configured.
RSAPublicKey string
}
// rsaConfigGet renders the RSA settings: the OpenRSA switch and the public
// key encoded as standard base64 (empty when no key is configured).
func (rt *Router) rsaConfigGet(c *gin.Context) {
	out := RSAConfigOutput{OpenRSA: rt.HTTP.RSA.OpenRSA}
	if key := rt.HTTP.RSA.RSAPublicKey; len(key) > 0 {
		out.RSAPublicKey = base64.StdEncoding.EncodeToString(key)
	}
	ginx.NewRender(c).Data(out, nil)
}
================================================
FILE: center/router/router_message_template.go
================================================
package router
import (
"bytes"
"fmt"
"html/template"
"net/http"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/slice"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
)
// messageTemplatesAdd creates the message templates given as a JSON array in
// the request body and renders the ids of the inserted rows.
//
// Each template gets a freshly generated UUID as its Ident. Non-admin callers
// may only create templates whose UserGroupIds intersect with their own
// user groups.
func (rt *Router) messageTemplatesAdd(c *gin.Context) {
	var templates []*models.MessageTemplate
	ginx.BindJSON(c, &templates)
	if len(templates) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	user := c.MustGet("user").(*models.User)
	admin := user.IsAdmin()
	myGroupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
	ginx.Dangerous(err)

	ts := time.Now().Unix()
	newIdents := make([]string, 0, len(templates))
	for i := range templates {
		t := templates[i]
		// Generate a unique identifier that can never be modified afterwards;
		// the frontend does not need to pass this field.
		t.Ident = uuid.New().String()
		ginx.Dangerous(t.Verify())
		if !(admin || slice.HaveIntersection(myGroupIDs, t.UserGroupIds)) {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}
		newIdents = append(newIdents, t.Ident)
		t.CreateBy, t.UpdateBy = user.Username, user.Username
		t.CreateAt, t.UpdateAt = ts, ts
	}

	// Refuse the whole batch if any generated ident is already stored.
	clashes, err := models.MessageTemplatesGet(rt.Ctx, "ident IN ?", newIdents)
	ginx.Dangerous(err)
	if len(clashes) > 0 {
		ginx.Bomb(http.StatusBadRequest, "ident already exists")
	}

	insertedIDs := make([]int64, 0, len(templates))
	for i := range templates {
		ginx.Dangerous(models.Insert(rt.Ctx, templates[i]))
		insertedIDs = append(insertedIDs, templates[i].ID)
	}
	ginx.NewRender(c).Data(insertedIDs, nil)
}
// messageTemplatesDel deletes the message templates whose ids are listed in
// the request body.
//
// The request is rejected when any of the templates is still referenced by a
// notify rule, or when a non-admin caller shares no user group with one of
// the templates.
func (rt *Router) messageTemplatesDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	templates, err := models.MessageTemplatesGet(rt.Ctx, "id in (?)", form.Ids)
	ginx.Dangerous(err)

	usedBy, err := models.UsedByNotifyRule(rt.Ctx, models.MsgTplList(templates))
	ginx.Dangerous(err)
	if len(usedBy) > 0 {
		ginx.NewRender(c).Message(fmt.Errorf("used by notify rule: %v", usedBy))
		return
	}

	user := c.MustGet("user").(*models.User)
	if !user.IsAdmin() {
		myGroupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
		ginx.Dangerous(err)
		for i := range templates {
			if !slice.HaveIntersection(myGroupIDs, templates[i].UserGroupIds) {
				ginx.Bomb(http.StatusForbidden, "forbidden")
			}
		}
	}

	result := models.DB(rt.Ctx).Delete(&models.MessageTemplate{}, "id in (?)", form.Ids)
	ginx.NewRender(c).Message(result.Error)
}
// messageTemplatePut updates the message template addressed by the "id" URL
// parameter with the JSON body.
//
// It fails with 400 when another template already uses the submitted ident,
// with 404 when the target does not exist, and with 403 when a non-admin
// caller shares no user group with the target template.
func (rt *Router) messageTemplatePut(c *gin.Context) {
	var form models.MessageTemplate
	ginx.BindJSON(c, &form)
	id := ginx.UrlParamInt64(c, "id")

	dup, err := models.MessageTemplateGet(rt.Ctx, "id <> ? and ident = ?", id, form.Ident)
	ginx.Dangerous(err)
	if dup != nil {
		ginx.Bomb(http.StatusBadRequest, "message template ident already exists")
	}

	target, err := models.MessageTemplateGet(rt.Ctx, "id = ?", id)
	ginx.Dangerous(err)
	if target == nil {
		ginx.Bomb(http.StatusNotFound, "message template not found")
	}

	user := c.MustGet("user").(*models.User)
	if !user.IsAdmin() {
		myGroupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
		ginx.Dangerous(err)
		if !slice.HaveIntersection(myGroupIDs, target.UserGroupIds) {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}
	}

	form.UpdateBy = user.Username
	ginx.NewRender(c).Message(target.Update(rt.Ctx, form))
}
// messageTemplateGet renders the message template addressed by the "id" URL
// parameter. A template with Private == 1 is only returned to callers that
// share at least one user group with it.
func (rt *Router) messageTemplateGet(c *gin.Context) {
	user := c.MustGet("user").(*models.User)
	myGroupIDs, err := models.MyGroupIds(rt.Ctx, user.Id)
	ginx.Dangerous(err)

	tpl, err := models.MessageTemplateGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "id"))
	ginx.Dangerous(err)
	if tpl == nil {
		ginx.Bomb(http.StatusNotFound, "message template not found")
	}

	if tpl.Private == 1 {
		if !slice.HaveIntersection(myGroupIDs, tpl.UserGroupIds) {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}
	}
	ginx.NewRender(c).Data(tpl, nil)
}
func (rt *Router) messageTemplatesGet(c *gin.Context) {
var notifyChannelIdents []string
if tmp := ginx.QueryStr(c, "notify_channel_idents", ""); tmp != "" {
notifyChannelIdents = strings.Split(tmp, ",")
}
notifyChannelIds := strx.IdsInt64ForAPI(ginx.QueryStr(c, "notify_channel_ids", ""))
if len(notifyChannelIds) > 0 {
ginx.Dangerous(models.DB(rt.Ctx).Model(models.NotifyChannelConfig{}).
Where("id in (?)", notifyChannelIds).Pluck("ident", ¬ifyChannelIdents).Error)
}
me := c.MustGet("user").(*models.User)
gids, err := models.MyGroupIds(rt.Ctx, me.Id)
ginx.Dangerous(err)
lst, err := models.MessageTemplatesGetBy(rt.Ctx, notifyChannelIdents)
ginx.Dangerous(err)
models.FillUpdateByNicknames(rt.Ctx, lst)
if me.IsAdmin() {
ginx.NewRender(c).Data(lst, nil)
return
}
res := make([]*models.MessageTemplate, 0)
for _, t := range lst {
if slice.HaveIntersection[int64](gids, t.UserGroupIds) || t.Private == 0 {
res = append(res, t)
}
}
ginx.NewRender(c).Data(res, nil)
}
// evtMsgReq is the request body bound by eventsMessage.
type evtMsgReq struct {
// EventIds are the ids of the historical alert events to render.
EventIds []int64 `json:"event_ids"`
// Tpl.Content maps a caller-chosen key to the template text to render;
// the response reuses the same keys.
Tpl struct {
Content map[string]string `json:"content"`
} `json:"tpl"`
}
// eventsMessage renders the submitted templates against the historical alert
// events named in the request and returns one rendered string per template
// key.
//
// A template that fails to parse or execute does not abort the request: its
// entry in the result holds the error text instead of the rendered output.
func (rt *Router) eventsMessage(c *gin.Context) {
	var req evtMsgReq
	ginx.BindJSON(c, &req)

	hisEvents, err := models.AlertHisEventGetByIds(rt.Ctx, req.EventIds)
	ginx.Dangerous(err)
	if len(hisEvents) == 0 {
		ginx.Bomb(http.StatusBadRequest, "event not found")
	}

	events := make([]*models.AlertCurEvent, len(hisEvents))
	for i, he := range hisEvents {
		events[i] = he.ToCur()
	}

	renderData := make(map[string]interface{})
	renderData["events"] = events
	defs := models.GetDefs(renderData)

	ret := make(map[string]string, len(req.Tpl.Content))
	for k, v := range req.Tpl.Content {
		// Prefix the template body with the definitions from models.GetDefs.
		text := strings.Join(append(defs, v), "")
		tpl, err := template.New(k).Funcs(tplx.TemplateFuncMap).Parse(text)
		if err != nil {
			ret[k] = err.Error()
			continue
		}

		var buf bytes.Buffer
		if err = tpl.Execute(&buf, renderData); err != nil {
			ret[k] = err.Error()
			continue
		}
		ret[k] = buf.String()
	}
	ginx.NewRender(c).Data(ret, nil)
}
================================================
FILE: center/router/router_metric_desc.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// metricsDescGetFile writes rt.Center.MetricDesc as JSON with status 200.
func (rt *Router) metricsDescGetFile(c *gin.Context) {
	desc := rt.Center.MetricDesc
	c.JSON(200, desc)
}
// metricsDescGetMap takes a JSON array of metric names from the request body
// and renders a map from each name to the description returned by
// cconf.GetMetricDesc for the language in the X-Language header.
func (rt *Router) metricsDescGetMap(c *gin.Context) {
	var metrics []string
	ginx.BindJSON(c, &metrics)

	lang := c.GetHeader("X-Language")
	descriptions := make(map[string]string)
	for i := range metrics {
		descriptions[metrics[i]] = cconf.GetMetricDesc(lang, metrics[i])
	}
	ginx.NewRender(c).Data(descriptions, nil)
}
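// The page-based management feature is dropped for now; metric descriptions are maintained directly through the configuration file.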
// func metricDescriptionGets(c *gin.Context) {
// limit := ginx.QueryInt(c, "limit", 20)
// query := ginx.QueryStr(c, "query", "")
// total, err := models.MetricDescriptionTotal(query)
// ginx.Dangerous(err)
// list, err := models.MetricDescriptionGets(query, limit, ginx.Offset(c, limit))
// ginx.Dangerous(err)
// ginx.NewRender(c).Data(gin.H{
// "list": list,
// "total": total,
// }, nil)
// }
// type metricDescriptionAddForm struct {
// Data string `json:"data"`
// }
// func metricDescriptionAdd(c *gin.Context) {
// var f metricDescriptionAddForm
// ginx.BindJSON(c, &f)
// var metricDescriptions []models.MetricDescription
// lines := strings.Split(f.Data, "\n")
// for _, md := range lines {
// arr := strings.SplitN(md, ":", 2)
// if len(arr) != 2 {
// ginx.Bomb(200, "metric description %s is illegal", md)
// }
// m := models.MetricDescription{
// Metric: arr[0],
// Description: arr[1],
// }
// metricDescriptions = append(metricDescriptions, m)
// }
// if len(metricDescriptions) == 0 {
// ginx.Bomb(http.StatusBadRequest, "Decoded metric description empty")
// }
// ginx.NewRender(c).Message(models.MetricDescriptionUpdate(metricDescriptions))
// }
// func metricDescriptionDel(c *gin.Context) {
// var f idsForm
// ginx.BindJSON(c, &f)
// f.Verify()
// ginx.NewRender(c).Message(models.MetricDescriptionDel(f.Ids))
// }
// type metricDescriptionForm struct {
// Description string `json:"description"`
// }
// func metricDescriptionPut(c *gin.Context) {
// var f metricDescriptionForm
// ginx.BindJSON(c, &f)
// md, err := models.MetricDescriptionGet("id=?", ginx.UrlParamInt64(c, "id"))
// ginx.Dangerous(err)
// if md == nil {
// ginx.Bomb(200, "No such metric description")
// }
// ginx.NewRender(c).Message(md.Update(f.Description, time.Now().Unix()))
// }
================================================
FILE: center/router/router_metric_view.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// metricViewGets renders the metric views returned by models.MetricViewGets
// for the current user id. It takes no request parameters.
func (rt *Router) metricViewGets(c *gin.Context) {
	userID := c.MustGet("userid")
	views, err := models.MetricViewGets(rt.Ctx, userID)
	ginx.NewRender(c).Data(views, err)
}
// metricViewAdd creates a metric view from the request body (name, configs,
// cate) and renders the stored view.
func (rt *Router) metricViewAdd(c *gin.Context) {
	var view models.MetricView
	ginx.BindJSON(c, &view)

	user := c.MustGet("user").(*models.User)
	// Admins may choose whether the view is public or private; for regular
	// users the category is always forced to 1 (private).
	if isAdmin := user.IsAdmin(); !isAdmin {
		view.Cate = 1
	}

	// Ignore any id supplied by the client and record the creator.
	view.Id, view.CreateBy = 0, user.Id

	ginx.Dangerous(view.Add(rt.Ctx))
	ginx.NewRender(c).Data(view, nil)
}
// metricViewDel deletes the metric views whose ids are listed in the request
// body. For non-admin callers the caller's id is passed to
// models.MetricViewDel as an additional argument.
func (rt *Router) metricViewDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	user := c.MustGet("user").(*models.User)
	if !user.IsAdmin() {
		ginx.NewRender(c).Message(models.MetricViewDel(rt.Ctx, form.Ids, user.Id))
		return
	}
	ginx.NewRender(c).Message(models.MetricViewDel(rt.Ctx, form.Ids))
}
// metricViewPut updates the metric view identified by the id in the request
// body (id, name, configs, cate).
func (rt *Router) metricViewPut(c *gin.Context) {
	var form models.MetricView
	ginx.BindJSON(c, &form)

	existing, err := models.MetricViewGet(rt.Ctx, "id = ?", form.Id)
	ginx.Dangerous(err)
	if existing == nil {
		ginx.NewRender(c).Message("no such item(id: %d)", form.Id)
		return
	}

	user := c.MustGet("user").(*models.User)
	if admin := user.IsAdmin(); !admin {
		form.Cate = 1
		// Regular users may only modify views they created themselves.
		if user.Id != existing.CreateBy {
			ginx.NewRender(c, http.StatusForbidden).Message("forbidden")
			return
		}
	}

	updateErr := existing.Update(rt.Ctx, form.Name, form.Configs, form.Cate, user.Id)
	ginx.NewRender(c).Message(updateErr)
}
================================================
FILE: center/router/router_mute.go
================================================
package router
import (
"net/http"
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/common"
"github.com/ccfos/nightingale/v6/alert/mute"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// alertMuteGetsByBG renders all alert mutes of the business group given by
// the "id" URL parameter; searching and paging are left to the frontend.
// Optional query parameters: prods (whitespace-separated), query, expired.
func (rt *Router) alertMuteGetsByBG(c *gin.Context) {
	groupID := ginx.UrlParamInt64(c, "id")
	prods := strings.Fields(ginx.QueryStr(c, "prods", ""))
	keyword := ginx.QueryStr(c, "query", "")
	expired := ginx.QueryInt(c, "expired", -1)

	mutes, err := models.AlertMuteGets(rt.Ctx, prods, groupID, -1, expired, keyword)
	if err != nil {
		ginx.NewRender(c).Data(mutes, err)
		return
	}
	models.FillUpdateByNicknames(rt.Ctx, mutes)
	ginx.NewRender(c).Data(mutes, err)
}
// alertMuteGetsByGids renders the alert mutes of several business groups.
//
// When the "gids" query parameter is given, read access is checked for every
// listed group. When it is empty, non-admin callers are limited to their own
// business groups (an empty list is rendered if they have none), while admins
// query with an empty group list.
func (rt *Router) alertMuteGetsByGids(c *gin.Context) {
	gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")

	switch {
	case len(gids) > 0:
		for i := range gids {
			rt.bgroCheck(c, gids[i])
		}
	default:
		if user := c.MustGet("user").(*models.User); !user.IsAdmin() {
			mine, err := models.MyBusiGroupIds(rt.Ctx, user.Id)
			gids = mine
			ginx.Dangerous(err)
			if len(gids) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	}

	mutes, err := models.AlertMuteGetsByBGIds(rt.Ctx, gids)
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, mutes)
	}
	ginx.NewRender(c).Data(mutes, err)
}
// alertMuteGets renders alert mutes filtered by the query parameters prods
// (whitespace-separated), bgid, query, disabled and expired; the numeric
// filters default to -1 when absent.
func (rt *Router) alertMuteGets(c *gin.Context) {
	var (
		prods    = strings.Fields(ginx.QueryStr(c, "prods", ""))
		groupID  = ginx.QueryInt64(c, "bgid", -1)
		keyword  = ginx.QueryStr(c, "query", "")
		disabled = ginx.QueryInt(c, "disabled", -1)
		expired  = ginx.QueryInt(c, "expired", -1)
	)

	mutes, err := models.AlertMuteGets(rt.Ctx, prods, groupID, disabled, expired, keyword)
	if err != nil {
		ginx.NewRender(c).Data(mutes, err)
		return
	}
	models.FillUpdateByNicknames(rt.Ctx, mutes)
	ginx.NewRender(c).Data(mutes, err)
}
// activeAlertMuteGets renders the result of models.AlertMuteGetsAll.
func (rt *Router) activeAlertMuteGets(c *gin.Context) {
	mutes, err := models.AlertMuteGetsAll(rt.Ctx)
	render := ginx.NewRender(c)
	render.Data(mutes, err)
}
// alertMuteAdd creates an alert mute in the business group given by the "id"
// URL parameter, stamps the current username as creator and updater, and
// renders the id of the new mute.
func (rt *Router) alertMuteAdd(c *gin.Context) {
	var mute models.AlertMute
	ginx.BindJSON(c, &mute)

	operator := c.MustGet("username").(string)
	mute.CreateBy, mute.UpdateBy = operator, operator
	mute.GroupId = ginx.UrlParamInt64(c, "id")

	if err := mute.Add(rt.Ctx); err != nil {
		ginx.Dangerous(err)
	}
	ginx.NewRender(c).Data(mute.Id, nil)
}
// MuteTestForm is the request body bound by alertMuteTryRun.
type MuteTestForm struct {
// EventId is the id of the historical alert event to test against.
EventId int64 `json:"event_id" binding:"required"`
// AlertMute is the mute configuration under test.
AlertMute models.AlertMute `json:"config" binding:"required"`
// PassTimeCheck, when true, makes alertMuteTryRun replace the mute's time
// settings with a periodic window covering every weekday from 00:00 to 00:00.
PassTimeCheck bool `json:"pass_time_check"`
}
// alertMuteTryRun checks whether a historical alert event would be matched by
// the submitted mute configuration and renders a text verdict.
//
// With pass_time_check set, the mute's time settings are overwritten with a
// periodic window covering all weekdays before matching.
func (rt *Router) alertMuteTryRun(c *gin.Context) {
	var form MuteTestForm
	ginx.BindJSON(c, &form)
	ginx.Dangerous(form.AlertMute.Verify())

	his, err := models.AlertHisEventGetById(rt.Ctx, form.EventId)
	ginx.Dangerous(err)
	if his == nil {
		ginx.Bomb(http.StatusNotFound, "event not found")
	}

	event := *his.ToCur()
	event.SetTagsMap()

	if form.PassTimeCheck {
		allWeek := models.PeriodicMute{
			EnableDaysOfWeek: "0 1 2 3 4 5 6",
			EnableStime:      "00:00",
			EnableEtime:      "00:00",
		}
		form.AlertMute.MuteTimeType = models.Periodic
		form.AlertMute.PeriodicMutesJson = []models.PeriodicMute{allWeek}
	}

	matched, err := mute.MatchMute(&event, &form.AlertMute)
	if err != nil {
		// Translate the error message according to the request language.
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(c.GetHeader("X-Language"), err.Error()))
	}

	verdict := "event match mute"
	if !matched {
		verdict = "event not match mute"
	}
	ginx.NewRender(c).Data(verdict, nil)
}
// alertMutePreview renders the current alert events (alert_cur_event) that
// the submitted mute strategy would match.
//
// Candidate events are loaded with models.AlertCurEventGetsFromAlertMute for
// the business group in the "id" URL parameter and then filtered in memory by
// the mute's tags. The original author's notes state that the candidate
// selection is based on business group, product, severities and, for
// non-host products, category and datasource ids.
func (rt *Router) alertMutePreview(c *gin.Context) {
	// Generally only a few events are expected to match.
	var form models.AlertMute
	ginx.BindJSON(c, &form)
	form.GroupId = ginx.UrlParamInt64(c, "id")
	// Verify also parses the tags JSON into ITags.
	ginx.Dangerous(form.Verify())

	candidates, err := models.AlertCurEventGetsFromAlertMute(rt.Ctx, &form)
	ginx.Dangerous(err)

	matched := make([]*models.AlertCurEvent, 0, len(candidates))
	for _, ev := range candidates {
		ev.DB2Mem()
		if !common.MatchTags(ev.TagsMap, form.ITags) {
			continue
		}
		matched = append(matched, ev)
	}
	ginx.NewRender(c).Data(matched, nil)
}
// alertMuteAddByService creates an alert mute exactly as given in the request
// body and renders its id together with any error from Add.
func (rt *Router) alertMuteAddByService(c *gin.Context) {
	var mute models.AlertMute
	ginx.BindJSON(c, &mute)

	addErr := mute.Add(rt.Ctx)
	ginx.NewRender(c).Data(mute.Id, addErr)
}
// alertMuteDel deletes the alert mutes whose ids are listed in the request
// body.
func (rt *Router) alertMuteDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	delErr := models.AlertMuteDel(rt.Ctx, form.Ids)
	ginx.NewRender(c).Message(delErr)
}
// alertMuteGet renders the alert mute identified by the "amid" URL parameter,
// converted to its frontend representation with DB2FE.
//
// The lookup error and a missing record are handled before the record is
// touched, so DB2FE is never called on a nil mute. A missing record yields a
// 404 with the same message alertMutePutByFE uses.
func (rt *Router) alertMuteGet(c *gin.Context) {
	amid := ginx.UrlParamInt64(c, "amid")
	am, err := models.AlertMuteGetById(rt.Ctx, amid)
	ginx.Dangerous(err)
	if am == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertMute")
		return
	}

	am.DB2FE()
	ginx.NewRender(c).Data(am, nil)
}
// alertMutePutByFE updates the alert mute identified by the "amid" URL
// parameter with the request body. The caller needs read-write permission on
// the business group that owns the mute.
func (rt *Router) alertMutePutByFE(c *gin.Context) {
	var form models.AlertMute
	ginx.BindJSON(c, &form)

	existing, err := models.AlertMuteGetById(rt.Ctx, ginx.UrlParamInt64(c, "amid"))
	ginx.Dangerous(err)
	if existing == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such AlertMute")
		return
	}

	rt.bgrwCheck(c, existing.GroupId)

	form.UpdateBy = c.MustGet("username").(string)
	updateErr := existing.Update(rt.Ctx, form)
	ginx.NewRender(c).Message(updateErr)
}
// alertMuteFieldForm is the request body bound by alertMutePutFields.
type alertMuteFieldForm struct {
// Ids lists the alert mutes to update.
Ids []int64 `json:"ids"`
// Fields holds the field values passed to UpdateFieldsMap for each mute.
Fields map[string]interface{} `json:"fields"`
}
// alertMutePutFields applies the same set of field values to every alert mute
// listed in the request. update_by and update_at are always overwritten with
// the current username and time; ids that do not resolve to a mute are
// skipped.
func (rt *Router) alertMutePutFields(c *gin.Context) {
	var form alertMuteFieldForm
	ginx.BindJSON(c, &form)
	if len(form.Fields) == 0 {
		ginx.Bomb(http.StatusBadRequest, "fields empty")
	}

	form.Fields["update_by"] = c.MustGet("username").(string)
	form.Fields["update_at"] = time.Now().Unix()

	for _, id := range form.Ids {
		mute, err := models.AlertMuteGetById(rt.Ctx, id)
		ginx.Dangerous(err)
		if mute != nil {
			mute.FE2DB()
			ginx.Dangerous(mute.UpdateFieldsMap(rt.Ctx, form.Fields))
		}
	}
	ginx.NewRender(c).Message(nil)
}
================================================
FILE: center/router/router_mw.go
================================================
package router
import (
"context"
"errors"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/center/cstats"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/golang-jwt/jwt"
"github.com/google/uuid"
)
const (
// DefaultTokenKey is the request header read for the fixed user token when
// rt.HTTP.TokenAuth.HeaderUserTokenKey is empty.
DefaultTokenKey = "X-User-Token"
)
// AccessDetails carries the claims extracted from a verified JWT access token.
type AccessDetails struct {
// AccessUuid is the "access_uuid" claim; it is the key under which the
// session is stored in Redis.
AccessUuid string
// UserIdentity is the "user_identity" claim.
UserIdentity string
}
// handleProxyUser resolves the user named in the proxy-auth header configured
// by rt.HTTP.ProxyAuth.HeaderUserNameKey.
//
// A missing header aborts with 401. If no user with that name exists yet, one
// is created with the configured default roles and "system" as creator.
func (rt *Router) handleProxyUser(c *gin.Context) *models.User {
	name := c.GetHeader(rt.HTTP.ProxyAuth.HeaderUserNameKey)
	if name == "" {
		ginx.Bomb(http.StatusUnauthorized, "unauthorized")
	}

	found, err := models.UserGetByUsername(rt.Ctx, name)
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, err.Error())
	}
	if found != nil {
		return found
	}

	ts := time.Now().Unix()
	created := &models.User{
		Username: name,
		Nickname: name,
		Roles:    strings.Join(rt.HTTP.ProxyAuth.DefaultRoles, " "),
		CreateAt: ts,
		UpdateAt: ts,
		CreateBy: "system",
		UpdateBy: "system",
	}
	if err = created.Add(rt.Ctx); err != nil {
		ginx.Bomb(http.StatusInternalServerError, err.Error())
	}
	return created
}
// proxyAuth returns a middleware that authenticates via the proxy-auth header
// and stores "userid" and "username" in the gin context.
func (rt *Router) proxyAuth() gin.HandlerFunc {
	handler := func(c *gin.Context) {
		u := rt.handleProxyUser(c)
		c.Set("userid", u.Id)
		c.Set("username", u.Username)
		c.Next()
	}
	return handler
}
// tokenAuth returns a middleware supporting two authentication schemes: a
// fixed user token and a JWT. Because it is hard to tell which scheme a
// client uses, both are handled by the same middleware.
//
// On success "userid" and "username" are stored in the gin context; any JWT
// failure aborts with 401.
func (rt *Router) tokenAuth() gin.HandlerFunc {
	return func(c *gin.Context) {
		// First try the fixed token.
		if rt.HTTP.TokenAuth.Enable {
			header := rt.HTTP.TokenAuth.HeaderUserTokenKey
			if header == "" {
				header = DefaultTokenKey
			}
			if token := c.GetHeader(header); token != "" {
				if u := rt.UserTokenCache.GetByToken(token); u != nil && u.Username != "" {
					c.Set("userid", u.Id)
					c.Set("username", u.Username)
					c.Next()
					return
				}
			}
		}

		// Then fall back to the JWT.
		metadata, err := rt.extractTokenMetadata(c.Request)
		if err != nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		identity, err := rt.fetchAuth(c.Request.Context(), metadata.AccessUuid)
		if err != nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		// The identity has the form ${userid}-${username}; only the first
		// dash separates the two parts.
		rawID, name, hasDash := strings.Cut(identity, "-")
		if !hasDash {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		id, err := strconv.ParseInt(rawID, 10, 64)
		if err != nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		c.Set("userid", id)
		c.Set("username", name)
		c.Next()
	}
}
// Auth is the exported form of auth.
func (rt *Router) Auth() gin.HandlerFunc {
	handler := rt.auth()
	return handler
}
// auth returns the proxy-auth middleware when proxy auth is enabled and the
// token/JWT middleware otherwise.
func (rt *Router) auth() gin.HandlerFunc {
	if !rt.HTTP.ProxyAuth.Enable {
		return rt.tokenAuth()
	}
	return rt.proxyAuth()
}
// jwtMock returns a middleware that, when proxy auth is enabled, answers JWT
// login/logout/refresh requests itself: logout routes are rejected with 400,
// all others get the proxy user with empty tokens and the chain is aborted.
// When proxy auth is disabled the request is passed through untouched.
func (rt *Router) jwtMock() gin.HandlerFunc {
	return func(c *gin.Context) {
		if proxyEnabled := rt.HTTP.ProxyAuth.Enable; !proxyEnabled {
			c.Next()
			return
		}

		if route := c.FullPath(); strings.Contains(route, "logout") {
			ginx.Bomb(http.StatusBadRequest, "logout is not supported when proxy auth is enabled")
		}

		payload := gin.H{
			"user":          rt.handleProxyUser(c),
			"access_token":  "",
			"refresh_token": "",
		}
		ginx.NewRender(c).Data(payload, nil)
		c.Abort()
	}
}
// User is the exported form of user.
func (rt *Router) User() gin.HandlerFunc {
	handler := rt.user()
	return handler
}
// user returns a middleware that loads the user named by the "username"
// context value, stores it under "user" together with an "isadmin" flag, and
// records the user's last active time in the user cache. A lookup error or an
// unknown user aborts with 401.
func (rt *Router) user() gin.HandlerFunc {
	return func(c *gin.Context) {
		name := c.MustGet("username").(string)

		u, err := models.UserGetByUsername(rt.Ctx, name)
		if err != nil || u == nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		c.Set("user", u)
		c.Set("isadmin", u.IsAdmin())

		// Update the user's last active time.
		rt.UserCache.SetLastActiveTime(u.Id, time.Now().Unix())

		c.Next()
	}
}
// userGroupWrite returns a middleware that requires the current user to be
// allowed to modify the user group in the "id" URL parameter; the group is
// stored in the context under "user_group".
func (rt *Router) userGroupWrite() gin.HandlerFunc {
	return func(c *gin.Context) {
		user := c.MustGet("user").(*models.User)
		group := UserGroup(rt.Ctx, ginx.UrlParamInt64(c, "id"))

		allowed, err := user.CanModifyUserGroup(rt.Ctx, group)
		ginx.Dangerous(err)
		if !allowed {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Set("user_group", group)
		c.Next()
	}
}
// bgro returns a middleware that checks the current user's access to the
// business group in the "id" URL parameter (CanDoBusiGroup without a
// permission flag) and stores the group in the context under "busi_group".
func (rt *Router) bgro() gin.HandlerFunc {
	return func(c *gin.Context) {
		user := c.MustGet("user").(*models.User)
		group := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id"))

		allowed, err := user.CanDoBusiGroup(rt.Ctx, group)
		ginx.Dangerous(err)
		if !allowed {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Set("busi_group", group)
		c.Next()
	}
}
// Bgrw is the exported form of bgrw. Per the original author's note, bgrw is
// considered unsafe and is to be phased out gradually.
func (rt *Router) Bgrw() gin.HandlerFunc {
	handler := rt.bgrw()
	return handler
}
// bgrw returns a middleware that requires "rw" permission on the business
// group in the "id" URL parameter and stores the group in the context under
// "busi_group".
func (rt *Router) bgrw() gin.HandlerFunc {
	return func(c *gin.Context) {
		user := c.MustGet("user").(*models.User)
		group := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id"))

		allowed, err := user.CanDoBusiGroup(rt.Ctx, group, "rw")
		ginx.Dangerous(err)
		if !allowed {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Set("busi_group", group)
		c.Next()
	}
}
// bgrwCheck requires "rw" permission on the business group bgid for the
// current user, aborting with 403 otherwise, and stores the group in the
// context under "busi_group". Per the original author's note it is meant to
// gradually replace the bgrw middleware as the safer option.
func (rt *Router) bgrwCheck(c *gin.Context, bgid int64) {
	user := c.MustGet("user").(*models.User)
	group := BusiGroup(rt.Ctx, bgid)

	allowed, err := user.CanDoBusiGroup(rt.Ctx, group, "rw")
	ginx.Dangerous(err)
	if !allowed {
		ginx.Bomb(http.StatusForbidden, "forbidden")
	}

	c.Set("busi_group", group)
}
// bgrwChecks runs bgrwCheck once for every distinct id in bgids.
func (rt *Router) bgrwChecks(c *gin.Context, bgids []int64) {
	checked := make(map[int64]struct{})
	for _, id := range bgids {
		if _, done := checked[id]; !done {
			rt.bgrwCheck(c, id)
			checked[id] = struct{}{}
		}
	}
}
// bgroCheck checks the current user's access to the business group bgid
// (CanDoBusiGroup without a permission flag), aborting with 403 otherwise,
// and stores the group in the context under "busi_group".
func (rt *Router) bgroCheck(c *gin.Context, bgid int64) {
	user := c.MustGet("user").(*models.User)
	group := BusiGroup(rt.Ctx, bgid)

	allowed, err := user.CanDoBusiGroup(rt.Ctx, group)
	ginx.Dangerous(err)
	if !allowed {
		ginx.Bomb(http.StatusForbidden, "forbidden")
	}

	c.Set("busi_group", group)
}
// Perm is the exported form of perm.
func (rt *Router) Perm(operation string) gin.HandlerFunc {
	handler := rt.perm(operation)
	return handler
}
// perm returns a middleware that requires the current user to pass
// CheckPerm for operation, aborting with 403 otherwise.
func (rt *Router) perm(operation string) gin.HandlerFunc {
	return func(c *gin.Context) {
		user := c.MustGet("user").(*models.User)

		allowed, err := user.CheckPerm(rt.Ctx, operation)
		ginx.Dangerous(err)
		if !allowed {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Next()
	}
}
// admin returns a middleware that loads the user identified by the "userid"
// context value and requires models.AdminRole among the whitespace-separated
// roles. Lookup failures abort with 401, a missing admin role with 403. The
// user is stored in the context under "user".
func (rt *Router) admin() gin.HandlerFunc {
	return func(c *gin.Context) {
		id := c.MustGet("userid").(int64)

		u, err := models.UserGetById(rt.Ctx, id)
		if err != nil || u == nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		isAdmin := false
		for _, role := range strings.Fields(u.Roles) {
			if role == models.AdminRole {
				isAdmin = true
				break
			}
		}
		if !isAdmin {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Set("user", u)
		c.Next()
	}
}
// extractTokenMetadata verifies the bearer token of r and returns its
// access_uuid and user_identity claims.
//
// It returns an error when the token is missing or fails verification, when
// either claim is absent or not a string, or when the access_uuid is no
// longer present in Redis. It never returns a nil result together with a nil
// error.
func (rt *Router) extractTokenMetadata(r *http.Request) (*AccessDetails, error) {
	token, err := rt.verifyToken(rt.HTTP.JWTAuth.SigningKey, rt.extractToken(r))
	if err != nil {
		return nil, err
	}

	claims, ok := token.Claims.(jwt.MapClaims)
	if !ok || !token.Valid {
		return nil, errors.New("invalid jwt token")
	}

	accessUuid, ok := claims["access_uuid"].(string)
	if !ok {
		return nil, errors.New("failed to parse access_uuid from jwt")
	}

	// Checked assertion: a token without a string user_identity claim must
	// produce an error rather than a panic.
	userIdentity, ok := claims["user_identity"].(string)
	if !ok {
		return nil, errors.New("failed to parse user_identity from jwt")
	}

	// Only let the request through if the access uuid still exists in Redis.
	val, err := rt.fetchAuth(r.Context(), accessUuid)
	if err != nil || val == "" {
		return nil, errors.New("unauthorized")
	}

	return &AccessDetails{
		AccessUuid:   accessUuid,
		UserIdentity: userIdentity,
	}, nil
}
// extractToken returns the token from an "Authorization: Bearer <token>"
// header (scheme matched case-insensitively), or "" when the header is
// missing or uses another scheme.
func (rt *Router) extractToken(r *http.Request) string {
	const scheme = "BEARER "

	header := r.Header.Get("Authorization")
	if len(header) < len(scheme) {
		return ""
	}
	if strings.ToUpper(header[:len(scheme)]) != scheme {
		return ""
	}
	return header[len(scheme):]
}
// createAuth stores the access and refresh uuids of td in Redis, each mapped
// to userIdentity and expiring at the corresponding token expiry.
//
// userIdentity must have the form ${userid}-${username}; it is split on the
// first dash only, so usernames containing dashes are kept intact (matching
// how tokenAuth parses the same string). A malformed identity yields an error.
//
// With single-login enabled, all tokens previously recorded for the username
// are deleted first and the new token keys are recorded in the per-user set.
func (rt *Router) createAuth(ctx context.Context, userIdentity string, td *TokenDetails) error {
	parts := strings.SplitN(userIdentity, "-", 2)
	if len(parts) != 2 {
		return fmt.Errorf("invalid user identity %q", userIdentity)
	}
	username := parts[1]

	// If only one login per account is allowed, delete the earlier tokens.
	if rt.HTTP.JWTAuth.SingleLogin {
		delKeys, err := rt.Redis.SMembers(ctx, rt.wrapJwtKey(username)).Result()
		if err != nil {
			return err
		}

		if len(delKeys) > 0 {
			if err := rt.Redis.Del(ctx, delKeys...).Err(); err != nil {
				return err
			}
		}

		if err := rt.Redis.Del(ctx, rt.wrapJwtKey(username)).Err(); err != nil {
			return err
		}
	}

	at := time.Unix(td.AtExpires, 0)
	rte := time.Unix(td.RtExpires, 0)
	now := time.Now()

	if err := rt.Redis.Set(ctx, rt.wrapJwtKey(td.AccessUuid), userIdentity, at.Sub(now)).Err(); err != nil {
		cstats.RedisOperationLatency.WithLabelValues("set_token", "fail").Observe(time.Since(now).Seconds())
		return err
	}

	if err := rt.Redis.Set(ctx, rt.wrapJwtKey(td.RefreshUuid), userIdentity, rte.Sub(now)).Err(); err != nil {
		cstats.RedisOperationLatency.WithLabelValues("set_token", "fail").Observe(time.Since(now).Seconds())
		return err
	}

	cstats.RedisOperationLatency.WithLabelValues("set_token", "success").Observe(time.Since(now).Seconds())

	if rt.HTTP.JWTAuth.SingleLogin {
		if err := rt.Redis.SAdd(ctx, rt.wrapJwtKey(username), rt.wrapJwtKey(td.AccessUuid), rt.wrapJwtKey(td.RefreshUuid)).Err(); err != nil {
			return err
		}
	}

	return nil
}
// fetchAuth reads the value stored in Redis under the JWT key for givenUuid
// and records the lookup latency under "get_token" with a success/fail label.
func (rt *Router) fetchAuth(ctx context.Context, givenUuid string) (string, error) {
	start := time.Now()
	value, err := rt.Redis.Get(ctx, rt.wrapJwtKey(givenUuid)).Result()

	outcome := "success"
	if err != nil {
		outcome = "fail"
	}
	cstats.RedisOperationLatency.WithLabelValues("get_token", outcome).Observe(time.Since(start).Seconds())

	return value, err
}
// deleteAuth removes the JWT key for givenUuid from Redis and records the
// latency of the delete under "del_token" with a success/fail label.
func (rt *Router) deleteAuth(ctx context.Context, givenUuid string) error {
	// Take the start time before the Redis call so the observed duration
	// covers the delete itself.
	start := time.Now()
	err := rt.Redis.Del(ctx, rt.wrapJwtKey(givenUuid)).Err()

	status := "success"
	if err != nil {
		status = "fail"
	}
	cstats.RedisOperationLatency.WithLabelValues("del_token", status).Observe(time.Since(start).Seconds())

	return err
}
// deleteTokens removes both the access token key and the derived refresh
// token key of authD from Redis. The refresh uuid is rebuilt as
// "<access uuid>++<user identity>".
func (rt *Router) deleteTokens(ctx context.Context, authD *AccessDetails) error {
	refreshUuid := authD.AccessUuid + "++" + authD.UserIdentity

	// Delete the access token first, then the refresh token; stop at the
	// first failure.
	if err := rt.Redis.Del(ctx, rt.wrapJwtKey(authD.AccessUuid)).Err(); err != nil {
		return err
	}
	if err := rt.Redis.Del(ctx, rt.wrapJwtKey(refreshUuid)).Err(); err != nil {
		return err
	}
	return nil
}
// wrapJwtKey prepends the configured JWT Redis key prefix to key.
func (rt *Router) wrapJwtKey(key string) string {
	prefix := rt.HTTP.JWTAuth.RedisKeyPrefix
	return prefix + key
}
// wrapIdTokenKey builds the Redis key "n9e_id_token_<userId>".
func (rt *Router) wrapIdTokenKey(userId int64) string {
	const format = "n9e_id_token_%d"
	return fmt.Sprintf(format, userId)
}
// saveIdToken stores the user's id_token in Redis. An empty token is ignored
// and nil is returned.
func (rt *Router) saveIdToken(ctx context.Context, userId int64, idToken string) error {
	if len(idToken) == 0 {
		return nil
	}

	// The id_token expires together with the refresh token so that it stays
	// available for logout during the whole session.
	ttl := time.Minute * time.Duration(rt.HTTP.JWTAuth.RefreshExpired)
	key := rt.wrapIdTokenKey(userId)
	return rt.Redis.Set(ctx, key, idToken, ttl).Err()
}
// fetchIdToken reads the user's id_token from Redis.
func (rt *Router) fetchIdToken(ctx context.Context, userId int64) (string, error) {
	key := rt.wrapIdTokenKey(userId)
	return rt.Redis.Get(ctx, key).Result()
}
// deleteIdToken removes the user's id_token from Redis.
func (rt *Router) deleteIdToken(ctx context.Context, userId int64) error {
	key := rt.wrapIdTokenKey(userId)
	return rt.Redis.Del(ctx, key).Err()
}
// TokenDetails bundles a freshly created access/refresh token pair.
type TokenDetails struct {
// AccessToken and RefreshToken are the signed JWT strings.
AccessToken string
RefreshToken string
// AccessUuid and RefreshUuid identify the tokens; createAuth uses them as
// Redis keys.
AccessUuid string
RefreshUuid string
// AtExpires and RtExpires are the expiry times of the access and refresh
// tokens as Unix seconds.
AtExpires int64
RtExpires int64
}
// createTokens mints an HS256-signed access token and refresh token for
// userIdentity using signingKey.
//
// Lifetimes come from the configured AccessExpired / RefreshExpired values,
// interpreted as minutes. The refresh uuid is derived from the access uuid as
// "<access uuid>++<user identity>".
func (rt *Router) createTokens(signingKey, userIdentity string) (*TokenDetails, error) {
	accessTTL := time.Minute * time.Duration(rt.HTTP.JWTAuth.AccessExpired)
	refreshTTL := time.Minute * time.Duration(rt.HTTP.JWTAuth.RefreshExpired)
	key := []byte(signingKey)

	td := &TokenDetails{}
	td.AtExpires = time.Now().Add(accessTTL).Unix()
	td.AccessUuid = uuid.NewString()
	td.RtExpires = time.Now().Add(refreshTTL).Unix()
	td.RefreshUuid = td.AccessUuid + "++" + userIdentity

	// Access token.
	accessClaims := jwt.MapClaims{
		"authorized":    true,
		"access_uuid":   td.AccessUuid,
		"user_identity": userIdentity,
		"exp":           td.AtExpires,
	}
	signedAccess, err := jwt.NewWithClaims(jwt.SigningMethodHS256, accessClaims).SignedString(key)
	td.AccessToken = signedAccess
	if err != nil {
		return nil, err
	}

	// Refresh token.
	refreshClaims := jwt.MapClaims{
		"refresh_uuid":  td.RefreshUuid,
		"user_identity": userIdentity,
		"exp":           td.RtExpires,
	}
	signedRefresh, err := jwt.NewWithClaims(jwt.SigningMethodHS256, refreshClaims).SignedString(key)
	td.RefreshToken = signedRefresh
	if err != nil {
		return nil, err
	}

	return td, nil
}
// verifyToken parses tokenString with signingKey, accepting only HMAC signing
// methods. An empty token string or any parse error yields a nil token and an
// error.
func (rt *Router) verifyToken(signingKey, tokenString string) (*jwt.Token, error) {
	if len(tokenString) == 0 {
		return nil, fmt.Errorf("bearer token not found")
	}

	// keyFunc supplies the HMAC key and rejects every other signing method.
	keyFunc := func(token *jwt.Token) (interface{}, error) {
		if _, isHMAC := token.Method.(*jwt.SigningMethodHMAC); isHMAC {
			return []byte(signingKey), nil
		}
		return nil, fmt.Errorf("unexpected jwt signing method: %v", token.Header["alg"])
	}

	parsed, err := jwt.Parse(tokenString, keyFunc)
	if err != nil {
		return nil, err
	}
	return parsed, nil
}
================================================
FILE: center/router/router_notification_record.go
================================================
package router
import (
"strings"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// NotificationResponse is the payload built by buildNotificationResponse.
type NotificationResponse struct {
// SubRules holds the records whose SubId is greater than zero, one entry
// per SubId.
SubRules []SubRule `json:"sub_rules"`
// Notifies holds the remaining records, keyed by channel.
Notifies map[string][]Record `json:"notifies"`
}
// SubRule groups the notification records that share one SubId.
type SubRule struct {
SubID int64 `json:"sub_id"`
// NotifyRuleId is taken from the first record seen for this SubId.
NotifyRuleId int64 `json:"notify_rule_id"`
// Notifies holds the records of this SubId, keyed by channel.
Notifies map[string][]Record `json:"notifies"`
}
// Record is one notification entry as rendered to the client.
type Record struct {
NotifyRuleId int64 `json:"notify_rule_id"`
// Target is the notification target; buildNotificationResponse masks its
// last eight characters for channels not accepted by checkChannel.
Target string `json:"target"`
// Username is a comma-joined list of usernames associated with Target.
Username string `json:"username"`
Status int `json:"status"`
Detail string `json:"detail"`
}
// notificationRecordAdd hands the notification records from the request body
// to sender.PushNotifyRecords; a push failure is reported with code 429.
func (rt *Router) notificationRecordAdd(c *gin.Context) {
	var records []*models.NotificationRecord
	ginx.BindJSON(c, &records)

	pushErr := sender.PushNotifyRecords(records)
	ginx.Dangerous(pushErr, 429)

	ginx.NewRender(c).Data(nil, pushErr)
}
// notificationRecordList renders the notification records of the event given
// by the "eid" URL parameter, grouped by buildNotificationResponse.
func (rt *Router) notificationRecordList(c *gin.Context) {
	eventID := ginx.UrlParamInt64(c, "eid")

	records, err := models.NotificationRecordsGetByEventId(rt.Ctx, eventID)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(buildNotificationResponse(rt.Ctx, records), nil)
}
// buildNotificationResponse groups notification records for display.
//
// Records sharing the same (SubId, Channel, Target) are merged into the first
// such record: the highest Status wins and Details are joined with ", ". The
// merged duplicates are set to nil in nl, and surviving records may have their
// Target masked in place, so the input slice is modified.
//
// Records with SubId > 0 end up in SubRules (one entry per SubId), all others
// in Notifies keyed by channel.
func buildNotificationResponse(ctx *ctx.Context, nl []*models.NotificationRecord) NotificationResponse {
	resp := NotificationResponse{
		SubRules: []SubRule{},
		Notifies: make(map[string][]Record),
	}

	// dedupKey identifies records that must be merged into one.
	type dedupKey struct {
		subID   int64
		channel string
		target  string
	}
	firstIndex := make(map[dedupKey]int)
	// groupIDs collects every group id referenced by any record.
	groupIDs := make(map[int64]struct{})

	for i, n := range nl {
		for _, gid := range n.GetGroupIds(ctx) {
			groupIDs[gid] = struct{}{}
		}

		key := dedupKey{subID: n.SubId, channel: n.Channel, target: n.Target}
		first, seen := firstIndex[key]
		if !seen {
			firstIndex[key] = i
			continue
		}

		// Merge this duplicate into the first record with the same key.
		kept := nl[first]
		if kept.Status < n.Status {
			kept.Status = n.Status
		}
		kept.Details = kept.Details + ", " + n.Details
		nl[i] = nil
	}

	// Resolve usernames once for all collected groups.
	namesByTarget := fillUserNames(ctx, groupIDs)
	subRules := make(map[int64]*SubRule)

	for _, n := range nl {
		if n == nil {
			continue
		}

		// Look the names up with the unmasked target.
		nameSet := namesByTarget[n.Target]
		names := make([]string, 0, len(nameSet))
		for name := range nameSet {
			names = append(names, name)
		}

		if !checkChannel(n.Channel) {
			// Hide sensitive information.
			n.Target = replaceLastEightChars(n.Target)
		}

		rec := Record{
			NotifyRuleId: n.NotifyRuleID,
			Target:       n.Target,
			Username:     strings.Join(names, ","),
			Status:       n.Status,
			Detail:       n.Details,
		}

		if n.SubId <= 0 {
			if resp.Notifies == nil {
				resp.Notifies = make(map[string][]Record)
			}
			resp.Notifies[n.Channel] = append(resp.Notifies[n.Channel], rec)
			continue
		}

		// Records with a positive SubId are grouped under their sub rule.
		rule, known := subRules[n.SubId]
		if !known {
			rule = &SubRule{
				SubID:        n.SubId,
				NotifyRuleId: n.NotifyRuleID,
				Notifies:     make(map[string][]Record),
			}
			subRules[n.SubId] = rule
		}
		rule.Notifies[n.Channel] = append(rule.Notifies[n.Channel], rec)
	}

	for _, rule := range subRules {
		resp.SubRules = append(resp.SubRules, *rule)
	}
	return resp
}
// checkChannel reports whether channel is one of tx-sms, tx-voice, ali-sms,
// ali-voice, email or script. The match is exact and case-sensitive.
func checkChannel(channel string) bool {
	for _, known := range [...]string{"tx-sms", "tx-voice", "ali-sms", "ali-voice", "email", "script"} {
		if channel == known {
			return true
		}
	}
	return false
}
// replaceLastEightChars masks the last eight characters of s with '*'.
// Strings of eight characters or fewer are masked entirely, one '*' per
// character. Characters are counted as UTF-8 code points rather than bytes, so
// a multi-byte character is never cut in half and the mask length does not
// reveal the byte length of non-ASCII input. For ASCII input the result is the
// same as byte-based masking.
func replaceLastEightChars(s string) string {
	const masked = 8

	// Record the byte offset at which every character starts.
	starts := make([]int, 0, len(s))
	for i := range s {
		starts = append(starts, i)
	}

	if len(starts) <= masked {
		return strings.Repeat("*", len(starts))
	}
	cut := starts[len(starts)-masked]
	return s[:cut] + strings.Repeat("*", masked)
}
// fillUserNames maps every contact token of the users in the given groups to
// the set of usernames owning that token. For each user it asks
// user.ExtractToken for every channel in models.DefaultChannels and indexes
// the tokens that exist.
//
// If the user lookup fails the error is logged and an empty map is returned.
func fillUserNames(ctx *ctx.Context, groupIdSet map[int64]struct{}) map[string]map[string]struct{} {
	userNameByTarget := make(map[string]map[string]struct{})

	gids := make([]int64, 0, len(groupIdSet))
	for gid := range groupIdSet {
		gids = append(gids, gid)
	}

	users, err := models.UsersGetByGroupIds(ctx, gids)
	if err != nil {
		logger.Errorf("UsersGetByGroupIds failed, err: %v", err)
		return userNameByTarget
	}

	for _, user := range users {
		for _, ch := range models.DefaultChannels {
			target, exist := user.ExtractToken(ch)
			if !exist {
				continue
			}
			if _, ok := userNameByTarget[target]; !ok {
				userNameByTarget[target] = make(map[string]struct{})
			}
			userNameByTarget[target][user.Username] = struct{}{}
		}
	}
	return userNameByTarget
}
================================================
FILE: center/router/router_notify_channel.go
================================================
package router
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"sort"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// notifyChannelsAdd creates the notify channels given as a JSON array and
// renders the IDs of the inserted rows.
//
// Every item is verified and stamped with the current user and one shared
// timestamp, so CreateAt and UpdateAt are always equal. The request is
// rejected with 400 when the array is empty, when two items in the payload
// share a name, or when any name already exists in storage.
func (rt *Router) notifyChannelsAdd(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	var lst []*models.NotifyChannelConfig
	ginx.BindJSON(c, &lst)
	if len(lst) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	now := time.Now().Unix()
	names := make([]string, 0, len(lst))
	seen := make(map[string]struct{}, len(lst))
	for _, item := range lst {
		ginx.Dangerous(item.Verify())

		// Names must be unique inside the payload as well as against storage.
		if _, dup := seen[item.Name]; dup {
			ginx.Bomb(http.StatusBadRequest, "name already exists")
		}
		seen[item.Name] = struct{}{}
		names = append(names, item.Name)

		item.CreateBy = me.Username
		item.CreateAt = now
		item.UpdateBy = me.Username
		item.UpdateAt = now
	}

	lstWithSameName, err := models.NotifyChannelsGet(rt.Ctx, "name IN ?", names)
	ginx.Dangerous(err)
	if len(lstWithSameName) > 0 {
		ginx.Bomb(http.StatusBadRequest, "name already exists")
	}

	ids := make([]int64, 0, len(lst))
	for _, item := range lst {
		ginx.Dangerous(models.Insert(rt.Ctx, item))
		ids = append(ids, item.ID)
	}
	ginx.NewRender(c).Data(ids, nil)
}
// notifyChannelsDel deletes the notify channels whose IDs are given in the
// request body. If models.UsedByNotifyRule reports that any of them is still
// referenced, the referencing rule IDs are rendered as an error and nothing is
// deleted.
func (rt *Router) notifyChannelsDel(c *gin.Context) {
	var req idsForm
	ginx.BindJSON(c, &req)
	req.Verify()

	channels, err := models.NotifyChannelsGet(rt.Ctx, "id in (?)", req.Ids)
	ginx.Dangerous(err)

	usedBy, err := models.UsedByNotifyRule(rt.Ctx, models.NotiChList(channels))
	ginx.Dangerous(err)

	if len(usedBy) > 0 {
		ginx.NewRender(c).Message(fmt.Errorf("used by notify rule: %v", usedBy))
		return
	}

	deleteErr := models.DB(rt.Ctx).Delete(&models.NotifyChannelConfig{}, "id in (?)", req.Ids).Error
	ginx.NewRender(c).Message(deleteErr)
}
// notifyChannelPut updates the notify channel identified by the "id" URL
// parameter with the JSON body.
//
// The name-uniqueness check excludes the channel being updated by its URL id,
// not by the ID field of the body. This way an unchanged name is accepted even
// when the body omits or misstates the ID. Responds 400 when another channel
// already uses the name and 404 when the channel does not exist.
func (rt *Router) notifyChannelPut(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	var f models.NotifyChannelConfig
	ginx.BindJSON(c, &f)

	id := ginx.UrlParamInt64(c, "id")

	lstWithSameName, err := models.NotifyChannelsGet(rt.Ctx, "name = ? and id <> ?", f.Name, id)
	ginx.Dangerous(err)
	if len(lstWithSameName) > 0 {
		ginx.Bomb(http.StatusBadRequest, "name already exists")
	}

	nc, err := models.NotifyChannelGet(rt.Ctx, "id = ?", id)
	ginx.Dangerous(err)
	if nc == nil {
		ginx.Bomb(http.StatusNotFound, "notify channel not found")
	}

	f.UpdateBy = me.Username
	ginx.NewRender(c).Message(nc.Update(rt.Ctx, f))
}
// notifyChannelGet renders the notify channel identified by the "id" URL
// parameter, or responds 404 when no such channel exists.
func (rt *Router) notifyChannelGet(c *gin.Context) {
	channel, err := models.NotifyChannelGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "id"))
	ginx.Dangerous(err)

	if channel == nil {
		ginx.Bomb(http.StatusNotFound, "notify channel not found")
	}
	ginx.NewRender(c).Data(channel, nil)
}
// notifyChannelGetBy renders the notify channel whose ident equals the "ident"
// query parameter, or responds 404 when none matches. ParamConfig and
// RequestConfig are replaced with empty values before rendering, so their
// stored contents are not returned by this endpoint.
func (rt *Router) notifyChannelGetBy(c *gin.Context) {
	channel, err := models.NotifyChannelGet(rt.Ctx, "ident = ?", ginx.QueryStr(c, "ident"))
	ginx.Dangerous(err)

	if channel == nil {
		ginx.Bomb(http.StatusNotFound, "notify channel not found")
	}

	channel.ParamConfig = &models.NotifyParamConfig{}
	channel.RequestConfig = &models.RequestConfig{}
	ginx.NewRender(c).Data(channel, nil)
}
// notifyChannelsGet renders all notify channels. When the query succeeds,
// models.FillUpdateByNicknames is applied to the list before rendering;
// otherwise the query error is rendered together with the result.
func (rt *Router) notifyChannelsGet(c *gin.Context) {
	channels, err := models.NotifyChannelsGet(rt.Ctx, "", nil)
	if err != nil {
		ginx.NewRender(c).Data(channels, err)
		return
	}

	models.FillUpdateByNicknames(rt.Ctx, channels)
	ginx.NewRender(c).Data(channels, err)
}
// notifyChannelsGetForNormalUser renders all notify channels reduced to the
// fields ID, Name, Ident, Enable, RequestType and ParamConfig; every other
// field of the stored channel is left at its zero value.
func (rt *Router) notifyChannelsGetForNormalUser(c *gin.Context) {
	channels, err := models.NotifyChannelsGet(rt.Ctx, "")
	ginx.Dangerous(err)

	reduced := make([]*models.NotifyChannelConfig, 0, len(channels))
	for _, ch := range channels {
		view := &models.NotifyChannelConfig{
			ID:          ch.ID,
			Name:        ch.Name,
			Ident:       ch.Ident,
			Enable:      ch.Enable,
			RequestType: ch.RequestType,
			ParamConfig: ch.ParamConfig,
		}
		reduced = append(reduced, view)
	}

	ginx.NewRender(c).Data(reduced, nil)
}
// notifyChannelIdentsGet renders the distinct, non-empty idents of all notify
// channels as an alphabetically sorted list.
func (rt *Router) notifyChannelIdentsGet(c *gin.Context) {
	// Load every notify channel.
	channels, err := models.NotifyChannelsGet(rt.Ctx, "", nil)
	ginx.Dangerous(err)

	// De-duplicate idents, skipping empty ones.
	unique := make(map[string]struct{})
	for _, ch := range channels {
		if ch.Ident == "" {
			continue
		}
		unique[ch.Ident] = struct{}{}
	}

	result := make([]string, 0, len(unique))
	for ident := range unique {
		result = append(result, ident)
	}
	sort.Strings(result)

	ginx.NewRender(c).Data(result, nil)
}
// flashDutyNotifyChannelsGet lists the FlashDuty channels reachable through
// the integration URL of the notify channel given by the "id" URL parameter.
//
// When a "flashduty_app_key" config entry exists, the current user's username,
// email and phone are sent as the request body; otherwise an empty JSON object
// is sent. The body is produced with json.Marshal so that quotes or
// backslashes in user fields cannot break or alter the JSON document.
//
// Responds 404 when the channel does not exist and 400 when it carries no
// FlashDuty request config.
func (rt *Router) flashDutyNotifyChannelsGet(c *gin.Context) {
	cid := ginx.UrlParamInt64(c, "id")
	nc, err := models.NotifyChannelGet(rt.Ctx, "id = ?", cid)
	ginx.Dangerous(err)
	if nc == nil {
		ginx.Bomb(http.StatusNotFound, "notify channel not found")
	}
	if nc.RequestConfig == nil || nc.RequestConfig.FlashDutyRequestConfig == nil {
		ginx.Bomb(http.StatusBadRequest, "notify channel has no flashduty request config")
	}

	configs, err := models.ConfigsSelectByCkey(rt.Ctx, "flashduty_app_key")
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, "failed to get flashduty app key")
	}

	jsonData := []byte("{}")
	if len(configs) > 0 {
		me := c.MustGet("user").(*models.User)
		payload := struct {
			MemberName string `json:"member_name"`
			Email      string `json:"email"`
			Phone      string `json:"phone"`
		}{
			MemberName: me.Username,
			Email:      me.Email,
			Phone:      me.Phone,
		}
		jsonData, err = json.Marshal(payload)
		ginx.Dangerous(err)
	}

	fdConfig := nc.RequestConfig.FlashDutyRequestConfig
	items, err := getFlashDutyChannels(fdConfig.IntegrationUrl, jsonData, time.Duration(fdConfig.Timeout)*time.Millisecond)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(items, nil)
}
// flushDutyChannelsResponse is the JSON response body decoded by
// getFlashDutyChannels. That function treats a non-empty Error.Message as a
// failure and otherwise returns Data.Items.
type flushDutyChannelsResponse struct {
Error struct {
Code string `json:"code"`
Message string `json:"message"`
} `json:"error"`
Data struct {
Items []FlashDutyChannel `json:"items"`
Total int `json:"total"`
} `json:"data"`
}
// FlashDutyChannel is one entry of the "items" list in the response decoded by
// getFlashDutyChannels.
type FlashDutyChannel struct {
ChannelID int `json:"channel_id"`
ChannelName string `json:"channel_name"`
Status string `json:"status"`
}
// getFlashDutyChannels fetches the channel list from the FlashDuty API.
//
// integrationUrl must carry an integration_key query parameter. The request is
// POSTed, with jsonData as body, to
// <scheme>://<host>/channel/list-by-integration on the host of integrationUrl;
// the path of integrationUrl is not reused. timeout bounds the whole HTTP
// exchange (zero means no timeout, as defined by http.Client).
//
// An error is returned when the URL cannot be parsed or lacks the key, when
// the request fails, when the response reports an error message, when the
// body is not valid JSON, or when the HTTP status is not 2xx.
func getFlashDutyChannels(integrationUrl string, jsonData []byte, timeout time.Duration) ([]FlashDutyChannel, error) {
	// Split the integration URL into its base URL and integration key.
	baseUrl, integrationKey, err := parseIntegrationUrl(integrationUrl)
	if err != nil {
		return nil, err
	}
	if integrationKey == "" {
		return nil, fmt.Errorf("integration_key not found in URL")
	}

	// The key was decoded while parsing, so it must be re-escaped here.
	apiURL := fmt.Sprintf("%s/channel/list-by-integration?integration_key=%s", baseUrl, url.QueryEscape(integrationKey))
	req, err := http.NewRequest("POST", apiURL, bytes.NewBuffer(jsonData))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	httpResp, err := (&http.Client{Timeout: timeout}).Do(req)
	if err != nil {
		return nil, err
	}
	defer httpResp.Body.Close()

	body, err := io.ReadAll(httpResp.Body)
	if err != nil {
		return nil, err
	}

	statusOK := httpResp.StatusCode >= 200 && httpResp.StatusCode < 300

	var res flushDutyChannelsResponse
	if err := json.Unmarshal(body, &res); err != nil {
		if !statusOK {
			return nil, fmt.Errorf("flashduty api returned status %s", httpResp.Status)
		}
		return nil, err
	}
	if res.Error.Message != "" {
		// Pass the message as an argument: it is remote text, not a format string.
		return nil, fmt.Errorf("%s", res.Error.Message)
	}
	if !statusOK {
		return nil, fmt.Errorf("flashduty api returned status %s", httpResp.Status)
	}
	return res.Data.Items, nil
}
// parseIntegrationUrl splits urlStr into its base URL ("<scheme>://<host>",
// without path or query) and the value of its integration_key query parameter.
// integrationKey is empty when the parameter is absent; err is non-nil only
// when urlStr cannot be parsed.
func parseIntegrationUrl(urlStr string) (baseUrl string, integrationKey string, err error) {
	u, parseErr := url.Parse(urlStr)
	if parseErr != nil {
		return "", "", parseErr
	}

	// Read the key from the decoded query string.
	integrationKey = u.Query().Get("integration_key")
	baseUrl = fmt.Sprintf("%s://%s", u.Scheme, u.Host)
	return baseUrl, integrationKey, nil
}
// pagerDutyNotifyServicesGet lists the PagerDuty services visible to the API
// key of the notify channel given by the "id" URL parameter, flattened to one
// entry per (service, integration) pair.
//
// The result is always a JSON array, empty when there are no integrations.
// Responds 404 when the channel does not exist, 400 when it carries no
// PagerDuty request config and 500 when the PagerDuty call fails.
func (rt *Router) pagerDutyNotifyServicesGet(c *gin.Context) {
	cid := ginx.UrlParamInt64(c, "id")
	nc, err := models.NotifyChannelGet(rt.Ctx, "id = ?", cid)
	ginx.Dangerous(err)
	if nc == nil {
		ginx.Bomb(http.StatusNotFound, "notify channel not found")
	}
	if nc.RequestConfig == nil || nc.RequestConfig.PagerDutyRequestConfig == nil {
		ginx.Bomb(http.StatusBadRequest, "notify channel has no pagerduty request config")
	}

	pdConfig := nc.RequestConfig.PagerDutyRequestConfig
	items, err := getPagerDutyServices(pdConfig.ApiKey, time.Duration(pdConfig.Timeout)*time.Millisecond)
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, fmt.Sprintf("failed to get pagerduty services: %v", err))
	}

	// Flatten service -> []integration into service-integration pairs.
	flattenedItems := make([]map[string]string, 0, len(items))
	for _, svc := range items {
		for _, integ := range svc.Integrations {
			flattenedItems = append(flattenedItems, map[string]string{
				"service_id":          svc.ID,
				"service_name":        svc.Name,
				"integration_summary": integ.Summary,
				"integration_id":      integ.ID,
				"integration_url":     integ.Self,
			})
		}
	}

	ginx.NewRender(c).Data(flattenedItems, nil)
}
// pagerDutyIntegrationKeyGet fetches the integration key of one PagerDuty
// integration, addressed by the "service_id" and "integration_id" URL
// parameters, using the API key of the notify channel given by "id".
//
// Both identifiers are path-escaped before being placed into the PagerDuty
// API URL, so they cannot introduce extra path segments or a query string.
// Responds 404 when the channel does not exist, 400 when it carries no
// PagerDuty request config and 500 when the PagerDuty call fails.
func (rt *Router) pagerDutyIntegrationKeyGet(c *gin.Context) {
	serviceId := ginx.UrlParamStr(c, "service_id")
	integrationId := ginx.UrlParamStr(c, "integration_id")
	cid := ginx.UrlParamInt64(c, "id")

	nc, err := models.NotifyChannelGet(rt.Ctx, "id = ?", cid)
	ginx.Dangerous(err)
	if nc == nil {
		ginx.Bomb(http.StatusNotFound, "notify channel not found")
	}
	if nc.RequestConfig == nil || nc.RequestConfig.PagerDutyRequestConfig == nil {
		ginx.Bomb(http.StatusBadRequest, "notify channel has no pagerduty request config")
	}

	integrationUrl := fmt.Sprintf("https://api.pagerduty.com/services/%s/integrations/%s",
		url.PathEscape(serviceId), url.PathEscape(integrationId))

	pdConfig := nc.RequestConfig.PagerDutyRequestConfig
	integrationKey, err := getPagerDutyIntegrationKey(integrationUrl, pdConfig.ApiKey, time.Duration(pdConfig.Timeout)*time.Millisecond)
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, fmt.Sprintf("failed to get pagerduty integration key: %v", err))
	}

	ginx.NewRender(c).Data(map[string]string{
		"integration_key": integrationKey,
	}, nil)
}
// PagerDutyIntegration is one integration entry of a PagerDutyService as
// decoded from the PagerDuty services response.
type PagerDutyIntegration struct {
ID string `json:"id"`
IntegrationKey string `json:"integration_key"`
Self string `json:"self"` // API URL of the integration
Summary string `json:"summary"`
}
// PagerDutyService is one service, together with its integrations, as decoded
// by getPagerDutyServices.
type PagerDutyService struct {
Name string `json:"name"`
ID string `json:"id"`
Integrations []PagerDutyIntegration `json:"integrations"`
}
// getPagerDutyServices pages through the PagerDuty services API and returns
// every service together with its integration entries.
//
// apiKey is sent as "Token token=<apiKey>"; timeout bounds each page request.
// Paging stops when the response reports no more data or a page comes back
// short. A non-2xx response is returned as an error instead of being decoded,
// so a rejected API key no longer looks like an empty service list.
func getPagerDutyServices(apiKey string, timeout time.Duration) ([]PagerDutyService, error) {
	const limit = 100 // maximum page size

	// One client serves all pages.
	client := &http.Client{Timeout: timeout}

	var (
		offset      uint // paging offset
		allServices []PagerDutyService
	)
	for {
		// Build the URL carrying the paging parameters.
		pageURL := fmt.Sprintf("https://api.pagerduty.com/services?limit=%d&offset=%d", limit, offset)
		req, err := http.NewRequest("GET", pageURL, nil)
		if err != nil {
			return nil, err
		}
		req.Header.Set("Authorization", fmt.Sprintf("Token token=%s", apiKey))
		req.Header.Set("Accept", "application/vnd.pagerduty+json;version=2")

		httpResp, err := client.Do(req)
		if err != nil {
			return nil, err
		}
		// Close right after reading; a defer would pile up across pages.
		body, err := io.ReadAll(httpResp.Body)
		httpResp.Body.Close()
		if err != nil {
			return nil, err
		}
		if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
			return nil, fmt.Errorf("pagerduty services api returned status %s", httpResp.Status)
		}

		// Response envelope including the paging information.
		var serviceRes struct {
			Services []PagerDutyService `json:"services"`
			More     bool               `json:"more"` // whether more data is available
			Limit    uint               `json:"limit"`
			Offset   uint               `json:"offset"`
		}
		if err := json.Unmarshal(body, &serviceRes); err != nil {
			return nil, err
		}
		allServices = append(allServices, serviceRes.Services...)

		// Stop when there is nothing more to fetch.
		if !serviceRes.More || len(serviceRes.Services) < int(limit) {
			break
		}
		offset += limit // move on to the next page
	}
	return allServices, nil
}
// getPagerDutyIntegrationKey fetches the integration key from the API URL of a
// PagerDuty integration.
//
// apiKey is sent as "Token token=<apiKey>" and the request carries the same
// Accept header as getPagerDutyServices; timeout bounds the HTTP exchange. A
// non-2xx response is returned as an error instead of being decoded into an
// empty key.
func getPagerDutyIntegrationKey(integrationUrl, apiKey string, timeout time.Duration) (string, error) {
	req, err := http.NewRequest("GET", integrationUrl, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("Authorization", fmt.Sprintf("Token token=%s", apiKey))
	req.Header.Set("Accept", "application/vnd.pagerduty+json;version=2")

	httpResp, err := (&http.Client{Timeout: timeout}).Do(req)
	if err != nil {
		return "", err
	}
	defer httpResp.Body.Close()

	body, err := io.ReadAll(httpResp.Body)
	if err != nil {
		return "", err
	}
	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
		return "", fmt.Errorf("pagerduty integration api returned status %s", httpResp.Status)
	}

	var integRes struct {
		Integration struct {
			IntegrationKey string `json:"integration_key"`
		} `json:"integration"`
	}
	if err := json.Unmarshal(body, &integRes); err != nil {
		return "", err
	}
	return integRes.Integration.IntegrationKey, nil
}
================================================
FILE: center/router/router_notify_channel_test.go
================================================
package router
import (
"fmt"
"testing"
)
// TestGetFlashDutyChannels calls getFlashDutyChannels with a placeholder
// integration key and prints the result. It makes no assertions, so it passes
// whatever the call returns.
func TestGetFlashDutyChannels(t *testing.T) {
// Build the test input.
integrationUrl := "https://api.flashcat.cloud/event/push/alert/n9e?integration_key=xxx"
jsonData := []byte(`{}`)
// Call the function under test.
// NOTE(review): the timeout parameter is a time.Duration, so the untyped
// constant 5000 means 5000 nanoseconds - confirm whether 5 seconds was meant.
channels, err := getFlashDutyChannels(integrationUrl, jsonData, 5000)
fmt.Println(channels, err)
}
================================================
FILE: center/router/router_notify_config.go
================================================
package router
import (
"encoding/json"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/pelletier/go-toml/v2"
"github.com/toolkits/pkg/str"
)
// webhookGets renders the webhooks stored as JSON under models.WEBHOOKKEY.
// When nothing is stored the nil list is rendered; a decode error is rendered
// alongside whatever was decoded.
func (rt *Router) webhookGets(c *gin.Context) {
	raw, err := models.ConfigsGet(rt.Ctx, models.WEBHOOKKEY)
	ginx.Dangerous(err)

	var webhooks []models.Webhook
	if raw != "" {
		err = json.Unmarshal([]byte(raw), &webhooks)
		ginx.NewRender(c).Data(webhooks, err)
		return
	}
	ginx.NewRender(c).Data(webhooks, nil)
}
// webhookPuts stores the webhooks from the request body as JSON under
// models.WEBHOOKKEY on behalf of the current username. Before saving, each
// webhook's Headers is rebuilt from its HeaderMap as a flat list of
// alternating key and value entries (an empty, non-nil list when the map is
// empty).
func (rt *Router) webhookPuts(c *gin.Context) {
	var webhooks []models.Webhook
	ginx.BindJSON(c, &webhooks)

	for i := range webhooks {
		hook := &webhooks[i]
		hook.Headers = []string{}
		for name, value := range hook.HeaderMap {
			hook.Headers = append(hook.Headers, name)
			hook.Headers = append(hook.Headers, value)
		}
	}

	encoded, err := json.Marshal(webhooks)
	ginx.Dangerous(err)

	username := c.MustGet("username").(string)
	saveErr := models.ConfigsSetWithUname(rt.Ctx, models.WEBHOOKKEY, string(encoded), username)
	ginx.NewRender(c).Message(saveErr)
}
func (rt *Router) notifyScriptGet(c *gin.Context) {
var notifyScript models.NotifyScript
cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYSCRIPT)
ginx.Dangerous(err)
if cval == "" {
ginx.NewRender(c).Data(notifyScript, nil)
return
}
err = json.Unmarshal([]byte(cval), ¬ifyScript)
ginx.NewRender(c).Data(notifyScript, err)
}
func (rt *Router) notifyScriptPut(c *gin.Context) {
var notifyScript models.NotifyScript
ginx.BindJSON(c, ¬ifyScript)
data, err := json.Marshal(notifyScript)
ginx.Dangerous(err)
username := c.MustGet("username").(string)
ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.NOTIFYSCRIPT, string(data), username))
}
func (rt *Router) notifyChannelGets(c *gin.Context) {
var notifyChannels []models.NotifyChannel
cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCHANNEL)
ginx.Dangerous(err)
if cval == "" {
ginx.NewRender(c).Data(notifyChannels, nil)
return
}
err = json.Unmarshal([]byte(cval), ¬ifyChannels)
ginx.NewRender(c).Data(notifyChannels, err)
}
func (rt *Router) notifyChannelPuts(c *gin.Context) {
var notifyChannels []models.NotifyChannel
ginx.BindJSON(c, ¬ifyChannels)
channels := []string{models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram,
models.Email, models.Lark, models.LarkCard}
m := make(map[string]struct{})
for _, v := range notifyChannels {
m[v.Ident] = struct{}{}
}
for _, v := range channels {
if _, ok := m[v]; !ok {
ginx.Bomb(200, "channel %s ident can not modify", v)
}
}
data, err := json.Marshal(notifyChannels)
ginx.Dangerous(err)
username := c.MustGet("username").(string)
ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.NOTIFYCHANNEL, string(data), username))
}
func (rt *Router) notifyContactGets(c *gin.Context) {
notifyContacts := []models.NotifyContact{}
cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCONTACT)
ginx.Dangerous(err)
if cval == "" {
ginx.NewRender(c).Data(notifyContacts, nil)
return
}
err = json.Unmarshal([]byte(cval), ¬ifyContacts)
ginx.NewRender(c).Data(notifyContacts, err)
}
func (rt *Router) notifyContactPuts(c *gin.Context) {
var notifyContacts []models.NotifyContact
ginx.BindJSON(c, ¬ifyContacts)
data, err := json.Marshal(notifyContacts)
ginx.Dangerous(err)
username := c.MustGet("username").(string)
ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.NOTIFYCONTACT, string(data), username))
}
// notifyConfigGet renders the config value stored under the "ckey" query
// parameter. When the stored value is empty, models.IBEX falls back to
// memsto.DefaultIbex and models.SMTP to memsto.DefaultSMTP. The lookup error,
// if any, is rendered together with the value.
func (rt *Router) notifyConfigGet(c *gin.Context) {
	ckey := ginx.QueryStr(c, "ckey")
	value, err := models.ConfigsGet(rt.Ctx, ckey)

	if value == "" {
		if ckey == models.IBEX {
			value = memsto.DefaultIbex
		} else if ckey == models.SMTP {
			value = memsto.DefaultSMTP
		}
	}

	ginx.NewRender(c).Data(value, err)
}
// notifyConfigPut updates a built-in notify config. Only models.SMTP may be
// modified; any other key is rejected.
//
// The submitted value is expanded with the cached user variables and fully
// validated by SmtpValidate BEFORE it is persisted, so a config with an empty
// host or port is rejected without being written to storage. After a
// successful save the email sender is restarted in the background with the
// validated config. The raw (unexpanded) value is what gets stored.
func (rt *Router) notifyConfigPut(c *gin.Context) {
	var f models.Configs
	ginx.BindJSON(c, &f)

	if f.Ckey != models.SMTP {
		ginx.Bomb(200, "key %s can not modify", f.Ckey)
	}

	userVariableMap := rt.NotifyConfigCache.ConfigCache.Get()
	text := tplx.ReplaceTemplateUseText(f.Ckey, f.Cval, userVariableMap)

	// Validate first so an invalid config never reaches storage.
	smtp, err := SmtpValidate(text)
	ginx.Dangerous(err)

	username := c.MustGet("username").(string)
	// Insert or update the built-in config.
	ginx.Dangerous(models.ConfigsSetWithUname(rt.Ctx, f.Ckey, f.Cval, username))

	// Reset the email sender.
	go sender.RestartEmailSender(rt.Ctx, smtp)

	ginx.NewRender(c).Message(nil)
}
// SmtpValidate decodes text as TOML into an aconf.SMTPConfig and checks that
// both Host and Port are set. The decoded config is returned even when an
// error is reported.
func SmtpValidate(text string) (aconf.SMTPConfig, error) {
	var cfg aconf.SMTPConfig

	if decodeErr := toml.Unmarshal([]byte(text), &cfg); decodeErr != nil {
		return cfg, decodeErr
	}

	hostMissing := cfg.Host == ""
	portMissing := cfg.Port == 0
	if hostMissing || portMissing {
		return cfg, fmt.Errorf("smtp host or port can not be empty")
	}
	return cfg, nil
}
// form is the request body of attemptSendEmail: a config entry plus the
// address the test email is sent to.
type form struct {
models.Configs
Email string `json:"email"`
}
// attemptSendEmail lets a user test an SMTP configuration: it validates the
// submitted config and tries to send a fixed test email to the given address.
// The trimmed address must be a valid email and the config key must be
// models.SMTP; the send result is rendered as the response message.
func (rt *Router) attemptSendEmail(c *gin.Context) {
	var req form
	ginx.BindJSON(c, &req)

	req.Email = strings.TrimSpace(req.Email)
	if req.Email == "" || !str.IsMail(req.Email) {
		ginx.Bomb(200, "email(%s) invalid", req.Email)
	}

	if req.Ckey != models.SMTP {
		ginx.Bomb(200, "config(%v) invalid", req)
	}

	variables := rt.NotifyConfigCache.ConfigCache.Get()
	expanded := tplx.ReplaceTemplateUseText(req.Ckey, req.Cval, variables)

	smtpConfig, err := SmtpValidate(expanded)
	ginx.Dangerous(err)

	sendErr := sender.SendEmail("Email test", "email content", []string{req.Email}, smtpConfig)
	ginx.NewRender(c).Message(sendErr)
}
// notifyChannelConfigGets renders the notify channels selected by the optional
// query parameters id (default 0), name (default ""), ident (default "") and
// enabled (default -1), as interpreted by models.NotifyChannelGets.
func (rt *Router) notifyChannelConfigGets(c *gin.Context) {
	var (
		id      = ginx.QueryInt64(c, "id", 0)
		name    = ginx.QueryStr(c, "name", "")
		ident   = ginx.QueryStr(c, "ident", "")
		enabled = ginx.QueryInt(c, "enabled", -1)
	)

	channels, err := models.NotifyChannelGets(rt.Ctx, id, name, ident, enabled)
	ginx.NewRender(c).Data(channels, err)
}
================================================
FILE: center/router/router_notify_rule.go
================================================
package router
import (
"fmt"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/alert/dispatch"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/slice"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// notifyRulesAdd creates the notify rules given as a JSON array and renders
// the stored rules.
//
// All rules are verified and authorized before the first insert, so a rule
// that fails validation or permission checks cannot leave earlier rules of the
// same request behind. A non-admin user may only create rules whose
// UserGroupIds intersect the groups they belong to. Every rule is stamped with
// the current user and one shared timestamp.
func (rt *Router) notifyRulesAdd(c *gin.Context) {
	var lst []*models.NotifyRule
	ginx.BindJSON(c, &lst)
	if len(lst) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	me := c.MustGet("user").(*models.User)
	isAdmin := me.IsAdmin()
	gids, err := models.MyGroupIds(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	now := time.Now().Unix()

	// First pass: validate, authorize and stamp everything.
	for _, nr := range lst {
		ginx.Dangerous(nr.Verify())
		if !isAdmin && !slice.HaveIntersection(gids, nr.UserGroupIds) {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		nr.CreateBy = me.Username
		nr.CreateAt = now
		nr.UpdateBy = me.Username
		nr.UpdateAt = now
	}

	// Second pass: persist only once the whole request is known to be valid.
	for _, nr := range lst {
		ginx.Dangerous(models.Insert(rt.Ctx, nr))
	}

	ginx.NewRender(c).Data(lst, nil)
}
// notifyRulesDel deletes the notify rules whose IDs are given in the request
// body. A non-admin user must share at least one group with every targeted
// rule; otherwise the request is rejected with 403 and nothing is deleted.
func (rt *Router) notifyRulesDel(c *gin.Context) {
	var req idsForm
	ginx.BindJSON(c, &req)
	req.Verify()

	me := c.MustGet("user").(*models.User)
	if !me.IsAdmin() {
		rules, err := models.NotifyRulesGet(rt.Ctx, "id in (?)", req.Ids)
		ginx.Dangerous(err)

		myGroups, err := models.MyGroupIds(rt.Ctx, me.Id)
		ginx.Dangerous(err)

		for _, rule := range rules {
			if slice.HaveIntersection(myGroups, rule.UserGroupIds) {
				continue
			}
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}
	}

	deleteErr := models.DB(rt.Ctx).Delete(&models.NotifyRule{}, "id in (?)", req.Ids).Error
	ginx.NewRender(c).Message(deleteErr)
}
// notifyRulePut updates the notify rule identified by the "id" URL parameter
// with the JSON body. Responds 404 when the rule does not exist and 403 when
// the user is neither an admin nor a member of one of the stored rule's groups.
func (rt *Router) notifyRulePut(c *gin.Context) {
	var update models.NotifyRule
	ginx.BindJSON(c, &update)

	existing, err := models.NotifyRuleGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "id"))
	ginx.Dangerous(err)
	if existing == nil {
		ginx.Bomb(http.StatusNotFound, "notify rule not found")
	}

	me := c.MustGet("user").(*models.User)
	myGroups, err := models.MyGroupIds(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	allowed := slice.HaveIntersection(myGroups, existing.UserGroupIds) || me.IsAdmin()
	if !allowed {
		ginx.Bomb(http.StatusForbidden, "forbidden")
	}

	update.UpdateBy = me.Username
	ginx.NewRender(c).Message(existing.Update(rt.Ctx, update))
}
// notifyRuleGet renders the notify rule identified by the "id" URL parameter.
// Responds 404 when the rule does not exist and 403 when the user is neither
// an admin nor a member of one of the rule's groups.
func (rt *Router) notifyRuleGet(c *gin.Context) {
	me := c.MustGet("user").(*models.User)
	myGroups, err := models.MyGroupIds(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	ruleID := ginx.UrlParamInt64(c, "id")
	rule, err := models.NotifyRuleGet(rt.Ctx, "id = ?", ruleID)
	ginx.Dangerous(err)
	if rule == nil {
		ginx.Bomb(http.StatusNotFound, "notify rule not found")
	}

	allowed := slice.HaveIntersection(myGroups, rule.UserGroupIds) || me.IsAdmin()
	if !allowed {
		ginx.Bomb(http.StatusForbidden, "forbidden")
	}

	ginx.NewRender(c).Data(rule, nil)
}
// notifyRulesGetByService renders every notify rule matching "enable = true",
// together with the query error if there is one. No per-user filtering is
// applied here.
func (rt *Router) notifyRulesGetByService(c *gin.Context) {
	rules, err := models.NotifyRulesGet(rt.Ctx, "enable = ?", true)
	ginx.NewRender(c).Data(rules, err)
}
// notifyRulesGet renders the notify rules visible to the current user: all
// rules for an admin, otherwise only those sharing at least one group with the
// user. models.FillUpdateByNicknames is applied to the full list first.
func (rt *Router) notifyRulesGet(c *gin.Context) {
	me := c.MustGet("user").(*models.User)
	myGroups, err := models.MyGroupIds(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	all, err := models.NotifyRulesGet(rt.Ctx, "", nil)
	ginx.Dangerous(err)
	models.FillUpdateByNicknames(rt.Ctx, all)

	if me.IsAdmin() {
		ginx.NewRender(c).Data(all, nil)
		return
	}

	visible := make([]*models.NotifyRule, 0)
	for _, rule := range all {
		if !slice.HaveIntersection[int64](myGroups, rule.UserGroupIds) {
			continue
		}
		visible = append(visible, rule)
	}
	ginx.NewRender(c).Data(visible, nil)
}
// NotifyTestForm is the request body of notifyTest: the IDs of the historical
// alert events to send and the notify config to send them through. Both fields
// carry the binding:"required" tag.
type NotifyTestForm struct {
EventIDs []int64 `json:"event_ids" binding:"required"`
NotifyConfig models.NotifyConfig `json:"notify_config" binding:"required"`
}
// notifyTest sends the given historical alert events through the submitted
// notify config as a trial run. Each event is converted to a current event and
// must pass dispatch.NotifyRuleMatchCheck, otherwise the request is rejected
// with 400. The channel's response is rendered, with "success" substituted
// when the response is empty.
func (rt *Router) notifyTest(c *gin.Context) {
	var req NotifyTestForm
	ginx.BindJSON(c, &req)

	history, err := models.AlertHisEventGetByIds(rt.Ctx, req.EventIDs)
	ginx.Dangerous(err)
	if len(history) == 0 {
		ginx.Bomb(http.StatusBadRequest, "event not found")
	}

	events := []*models.AlertCurEvent{}
	for _, past := range history {
		cur := past.ToCur()
		cur.SetTagsMap()

		matchErr := dispatch.NotifyRuleMatchCheck(&req.NotifyConfig, cur)
		if matchErr != nil {
			ginx.Bomb(http.StatusBadRequest, matchErr.Error())
		}
		events = append(events, cur)
	}

	resp, err := SendNotifyChannelMessage(rt.Ctx, rt.UserCache, rt.UserGroupCache, req.NotifyConfig, events)
	if resp == "" {
		resp = "success"
	}
	ginx.NewRender(c).Data(resp, err)
}
func SendNotifyChannelMessage(ctx *ctx.Context, userCache *memsto.UserCacheType, userGroup *memsto.UserGroupCacheType, notifyConfig models.NotifyConfig, events []*models.AlertCurEvent) (string, error) {
notifyChannels, err := models.NotifyChannelGets(ctx, notifyConfig.ChannelID, "", "", -1)
if err != nil {
return "", fmt.Errorf("failed to get notify channels: %v", err)
}
if len(notifyChannels) == 0 {
return "", fmt.Errorf("notify channel not found")
}
notifyChannel := notifyChannels[0]
if !notifyChannel.Enable {
return "", fmt.Errorf("notify channel not enabled, please enable it first")
}
// 获取站点URL用于模板渲染
siteUrl, _ := models.ConfigsGetSiteUrl(ctx)
if siteUrl == "" {
siteUrl = "http://127.0.0.1:17000"
}
tplContent := make(map[string]interface{})
if notifyChannel.RequestType != "flashduty" {
messageTemplates, err := models.MessageTemplateGets(ctx, notifyConfig.TemplateID, "", "")
if err != nil {
return "", fmt.Errorf("failed to get message templates: %v", err)
}
if len(messageTemplates) == 0 {
return "", fmt.Errorf("message template not found")
}
tplContent = messageTemplates[0].RenderEvent(events, siteUrl)
}
var contactKey string
if notifyChannel.ParamConfig != nil && notifyChannel.ParamConfig.UserInfo != nil {
contactKey = notifyChannel.ParamConfig.UserInfo.ContactKey
}
sendtos, flashDutyChannelIDs, pagerDutyRoutingKeys, customParams := dispatch.GetNotifyConfigParams(¬ifyConfig, contactKey, userCache, userGroup)
var resp string
switch notifyChannel.RequestType {
case "flashduty":
client, err := models.GetHTTPClient(notifyChannel)
if err != nil {
return "", fmt.Errorf("failed to get http client: %v", err)
}
for i := range flashDutyChannelIDs {
resp, err = notifyChannel.SendFlashDuty(events, flashDutyChannelIDs[i], client)
if err != nil {
return "", fmt.Errorf("failed to send flashduty notify: %v", err)
}
}
logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, resp, err)
return resp, nil
case "pagerduty":
client, err := models.GetHTTPClient(notifyChannel)
if err != nil {
return "", fmt.Errorf("failed to get http client: %v", err)
}
for _, routingKey := range pagerDutyRoutingKeys {
resp, err = notifyChannel.SendPagerDuty(events, routingKey, siteUrl, client)
if err != nil {
return "", fmt.Errorf("failed to send pagerduty notify: %v", err)
}
}
logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, resp, err)
return resp, nil
case "http":
client, err := models.GetHTTPClient(notifyChannel)
if err != nil {
return "", fmt.Errorf("failed to get http client: %v", err)
}
if notifyChannel.RequestConfig == nil {
return "", fmt.Errorf("request config is nil")
}
if notifyChannel.RequestConfig.HTTPRequestConfig == nil {
return "", fmt.Errorf("http request config is nil")
}
if dispatch.NeedBatchContacts(notifyChannel.RequestConfig.HTTPRequestConfig) || len(sendtos) == 0 {
resp, err = notifyChannel.SendHTTP(events, tplContent, customParams, sendtos, client)
logger.Infof("channel_name: %v, event:%s, sendtos:%+v, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, sendtos, tplContent, customParams, resp, err)
if err != nil {
return "", fmt.Errorf("failed to send http notify: %v", err)
}
return resp, nil
} else {
for i := range sendtos {
resp, err = notifyChannel.SendHTTP(events, tplContent, customParams, []string{sendtos[i]}, client)
logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, sendto:%+v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, sendtos[i], resp, err)
if err != nil {
return "", fmt.Errorf("failed to send http notify: %v", err)
}
}
return resp, nil
}
case "smtp":
if len(sendtos) == 0 {
return "", fmt.Errorf("no valid email address in the user and team")
}
err := notifyChannel.SendEmailNow(events, tplContent, sendtos)
if err != nil {
return "", fmt.Errorf("failed to send email notify: %v", err)
}
return resp, nil
case "script":
resp, _, err := notifyChannel.SendScript(events, tplContent, customParams, sendtos)
logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, resp, err)
return resp, err
default:
logger.Errorf("unsupported request type: %v", notifyChannel.RequestType)
return "", fmt.Errorf("unsupported request type")
}
}
// paramList is one custom parameter entry rendered by
// notifyRuleCustomParamsGet: the parameter key, its display name taken from
// the notify channel's custom param config, and the configured value.
type paramList struct {
Name string `json:"name"`
CName string `json:"cname"`
Value interface{} `json:"value"`
}
// notifyRuleCustomParamsGet renders the distinct custom-parameter sets that
// notify rules of the current user's groups use with the notify channel given
// by the "notify_channel_id" query parameter.
//
// Each set lists only the parameters declared in the channel's custom param
// config, in the order they are declared there, with the declared display name
// as CName. Two notify configs with identical Params are reported once. The
// de-duplication key is the fmt rendering of the Params map, which prints keys
// in sorted order, so it does not depend on map iteration order.
//
// An empty list is rendered when the channel does not exist or has no
// ParamConfig.
func (rt *Router) notifyRuleCustomParamsGet(c *gin.Context) {
	notifyChannelID := ginx.QueryInt64(c, "notify_channel_id")

	me := c.MustGet("user").(*models.User)
	gids, err := models.MyGroupIds(rt.Ctx, me.Id)
	ginx.Dangerous(err)

	notifyChannel, err := models.NotifyChannelGet(rt.Ctx, "id=?", notifyChannelID)
	ginx.Dangerous(err)

	if notifyChannel == nil || notifyChannel.ParamConfig == nil {
		ginx.NewRender(c).Data([][]paramList{}, nil)
		return
	}

	// keyMap resolves a parameter key to its display name; orderedKeys keeps
	// the declaration order so the output is stable.
	keyMap := make(map[string]string)
	orderedKeys := make([]string, 0, len(notifyChannel.ParamConfig.Custom.Params))
	for _, param := range notifyChannel.ParamConfig.Custom.Params {
		if _, dup := keyMap[param.Key]; !dup {
			orderedKeys = append(orderedKeys, param.Key)
		}
		keyMap[param.Key] = param.CName
	}

	lst, err := models.NotifyRulesGet(rt.Ctx, "", nil)
	ginx.Dangerous(err)

	res := make([][]paramList, 0)
	filter := make(map[string]struct{})
	for _, nr := range lst {
		if !slice.HaveIntersection[int64](gids, nr.UserGroupIds) {
			continue
		}

		for _, nc := range nr.NotifyConfigs {
			if nc.ChannelID != notifyChannelID {
				continue
			}

			// Keep only the parameters declared on the notify channel and
			// attach their display names.
			list := make([]paramList, 0, len(orderedKeys))
			for _, key := range orderedKeys {
				value, ok := nc.Params[key]
				if !ok {
					continue
				}
				list = append(list, paramList{
					Name:  key,
					CName: keyMap[key],
					Value: value,
				})
			}

			// fmt prints maps with sorted keys, giving a deterministic key.
			filterKey := fmt.Sprint(nc.Params)
			if _, ok := filter[filterKey]; ok {
				continue
			}
			filter[filterKey] = struct{}{}
			res = append(res, list)
		}
	}

	ginx.NewRender(c).Data(res, nil)
}
================================================
FILE: center/router/router_notify_tpl.go
================================================
package router
import (
"bytes"
"encoding/json"
"fmt"
"html/template"
"strings"
"time"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/str"
)
// notifyTplGets renders all notify templates. A template is flagged BuiltIn
// when its channel is one of models.DefaultChannels or models.EmailSubject;
// models.FillUpdateByNicknames is applied before rendering.
func (rt *Router) notifyTplGets(c *gin.Context) {
	builtIn := map[string]struct{}{}
	for _, ch := range models.DefaultChannels {
		builtIn[ch] = struct{}{}
	}
	builtIn[models.EmailSubject] = struct{}{}

	templates, err := models.NotifyTplGets(rt.Ctx)
	ginx.Dangerous(err)

	for i := range templates {
		if _, ok := builtIn[templates[i].Channel]; ok {
			templates[i].BuiltIn = true
		}
	}

	models.FillUpdateByNicknames(rt.Ctx, templates)
	ginx.NewRender(c).Data(templates, err)
}
// notifyTplUpdateContent updates the content of the notify template whose Id
// is given in the request body. The body must pass templateValidate, and only
// the template's creator or an admin may update it (403 otherwise).
func (rt *Router) notifyTplUpdateContent(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	var req models.NotifyTpl
	ginx.BindJSON(c, &req)
	ginx.Dangerous(templateValidate(req))

	stored, err := models.NotifyTplGet(rt.Ctx, req.Id)
	ginx.Dangerous(err)

	isOwner := stored.CreateBy == me.Username
	if !isOwner && !me.IsAdmin() {
		ginx.Bomb(403, "forbidden")
	}

	req.UpdateAt = time.Now().Unix()
	req.UpdateBy = me.Username
	ginx.NewRender(c).Message(req.UpdateContent(rt.Ctx))
}
// notifyTplUpdate renames the notify template whose Id is given in the request
// body. The body must pass templateValidate, only the template's creator or an
// admin may update it (403 otherwise), and the request is refused when another
// template already uses the same channel or name. Only Name, UpdateAt and
// UpdateBy of the stored template are changed before it is saved.
func (rt *Router) notifyTplUpdate(c *gin.Context) {
	var req models.NotifyTpl
	ginx.BindJSON(c, &req)
	ginx.Dangerous(templateValidate(req))

	me := c.MustGet("user").(*models.User)

	stored, err := models.NotifyTplGet(rt.Ctx, req.Id)
	ginx.Dangerous(err)

	isOwner := stored.CreateBy == me.Username
	if !isOwner && !me.IsAdmin() {
		ginx.Bomb(403, "forbidden")
	}

	// Count templates with the same channel or name but a different id.
	conflicts := models.DB(rt.Ctx).Model(&models.NotifyTpl{}).Where("(channel = ? or name = ?) and id <> ?", req.Channel, req.Name, req.Id)
	count, err := models.Count(conflicts)
	ginx.Dangerous(err)
	if count != 0 {
		ginx.Bomb(200, "Refuse to create duplicate channel or name")
	}

	stored.UpdateAt = time.Now().Unix()
	stored.UpdateBy = me.Username
	stored.Name = req.Name
	ginx.NewRender(c).Message(stored.Update(rt.Ctx))
}
// templateValidate checks a notify template before it is stored:
//   - the channel must be at most 32 bytes and pass str.Dangerous;
//   - the name must be at most 255 bytes and pass str.Dangerous;
//   - non-empty content must parse as an html/template once the $labels and
//     $value helper definitions are prepended.
//
// It returns nil when the template is acceptable (empty content is accepted
// without parsing).
func templateValidate(f models.NotifyTpl) error {
	switch {
	case len(f.Channel) > 32:
		return fmt.Errorf("channel length should not exceed 32")
	case str.Dangerous(f.Channel):
		return fmt.Errorf("channel should not contain dangerous characters")
	case len(f.Name) > 255:
		return fmt.Errorf("name length should not exceed 255")
	case str.Dangerous(f.Name):
		return fmt.Errorf("name should not contain dangerous characters")
	case f.Content == "":
		return nil
	}

	// Same variable prelude that is prepended when previewing a template.
	parts := []string{
		"{{$labels := .TagsMap}}",
		"{{$value := .TriggerValue}}",
		f.Content,
	}
	source := strings.Join(parts, "")

	_, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(source)
	if err != nil {
		return fmt.Errorf("notify template verify illegal:%s", err.Error())
	}
	return nil
}
// notifyTplPreview renders the posted template content against the sample
// event cconf.EVENT_EXAMPLE and returns the rendered text. If executing the
// template fails, the error message is returned as the data instead.
func (rt *Router) notifyTplPreview(c *gin.Context) {
	var sample models.AlertCurEvent
	ginx.Dangerous(json.Unmarshal([]byte(cconf.EVENT_EXAMPLE), &sample))

	var form models.NotifyTpl
	ginx.BindJSON(c, &form)

	// Prepend the $labels / $value helper variables to the user content.
	source := strings.Join([]string{
		"{{$labels := .TagsMap}}",
		"{{$value := .TriggerValue}}",
		form.Content,
	}, "")
	tpl, err := template.New(form.Channel).Funcs(tplx.TemplateFuncMap).Parse(source)
	ginx.Dangerous(err)

	// Build TagsMap from the "key=value" entries in TagsJSON; entries that are
	// blank or have no "=" are skipped.
	sample.TagsMap = make(map[string]string)
	for _, raw := range sample.TagsJSON {
		entry := strings.TrimSpace(raw)
		if entry == "" {
			continue
		}
		key, val, found := strings.Cut(entry, "=")
		if !found {
			continue
		}
		sample.TagsMap[key] = val
	}

	var out bytes.Buffer
	rendered := ""
	if execErr := tpl.Execute(&out, sample); execErr != nil {
		rendered = execErr.Error()
	} else {
		rendered = out.String()
	}

	ginx.NewRender(c).Data(rendered, nil)
}
// notifyTplAdd creates a new notify template owned by the current user. The
// channel is trimmed of surrounding whitespace, the template is validated,
// and creation is refused when an existing template already uses the same
// channel or name.
func (rt *Router) notifyTplAdd(c *gin.Context) {
	var form models.NotifyTpl
	ginx.BindJSON(c, &form)

	me := c.MustGet("user").(*models.User)
	form.CreateBy = me.Username
	form.Channel = strings.TrimSpace(form.Channel)
	ginx.Dangerous(templateValidate(form))

	dupQuery := models.DB(rt.Ctx).Model(&models.NotifyTpl{}).
		Where("channel = ? or name = ?", form.Channel, form.Name)
	dupCount, err := models.Count(dupQuery)
	ginx.Dangerous(err)
	if dupCount != 0 {
		ginx.Bomb(200, "Refuse to create duplicate channel(unique)")
	}

	form.CreateAt = time.Now().Unix()
	ginx.NewRender(c).Message(form.Create(rt.Ctx))
}
// notifyTplDel deletes the notify template identified by the ":id" URL
// parameter. Only the template's creator or an admin may delete it.
// NOTE(review): the original comment says system defaults
// (models.DefaultChannels) cannot be deleted; that check is not visible in
// this handler and presumably lives in NotifyTplDelete — confirm.
func (rt *Router) notifyTplDel(c *gin.Context) {
	deleter := new(models.NotifyTpl)
	tplID := ginx.UrlParamInt64(c, "id")
	me := c.MustGet("user").(*models.User)

	stored, err := models.NotifyTplGet(rt.Ctx, tplID)
	ginx.Dangerous(err)

	if !(stored.CreateBy == me.Username || me.IsAdmin()) {
		ginx.Bomb(403, "forbidden")
	}

	ginx.NewRender(c).Message(deleter.NotifyTplDelete(rt.Ctx, tplID))
}
// messageTemplateGets lists message templates filtered by the optional "id",
// "name" and "ident" query parameters. Updater nicknames are filled in only
// when the lookup succeeded.
func (rt *Router) messageTemplateGets(c *gin.Context) {
	var (
		id    = ginx.QueryInt64(c, "id", 0)
		name  = ginx.QueryStr(c, "name", "")
		ident = ginx.QueryStr(c, "ident", "")
	)

	list, err := models.MessageTemplateGets(rt.Ctx, id, name, ident)
	if err != nil {
		ginx.NewRender(c).Data(list, err)
		return
	}

	models.FillUpdateByNicknames(rt.Ctx, list)
	ginx.NewRender(c).Data(list, err)
}
================================================
FILE: center/router/router_opensearch.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/datasource/opensearch"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// QueryOSIndices returns the indices of the OpenSearch datasource named by
// the request's cate / datasource id.
//
// The datasource category comes from the request body, so the cached plugin
// is not guaranteed to be an OpenSearch client; the type is checked
// explicitly instead of letting a failed assertion panic.
func (rt *Router) QueryOSIndices(c *gin.Context) {
	var f IndexReq
	ginx.BindJSON(c, &f)

	plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !exists {
		logger.Warningf("cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	client, ok := plug.(*opensearch.OpenSearch)
	if !ok {
		logger.Warningf("cluster:%d is not an opensearch datasource", f.DatasourceId)
		ginx.Bomb(200, "datasource is not opensearch")
	}

	indices, err := client.QueryIndices()
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(indices, nil)
}
// QueryOSFields returns the fields of the requested index on the OpenSearch
// datasource named by the request's cate / datasource id.
//
// The datasource category comes from the request body, so the cached plugin
// is not guaranteed to be an OpenSearch client; the type is checked
// explicitly instead of letting a failed assertion panic.
func (rt *Router) QueryOSFields(c *gin.Context) {
	var f IndexReq
	ginx.BindJSON(c, &f)

	plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !exists {
		logger.Warningf("cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	client, ok := plug.(*opensearch.OpenSearch)
	if !ok {
		logger.Warningf("cluster:%d is not an opensearch datasource", f.DatasourceId)
		ginx.Bomb(200, "datasource is not opensearch")
	}

	fields, err := client.QueryFields([]string{f.Index})
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(fields, nil)
}
// QueryOSVariable returns the values of a field (f.Query.Field, filtered by
// f.Query.Query) in the requested index on the OpenSearch datasource named by
// the request's cate / datasource id.
//
// The datasource category comes from the request body, so the cached plugin
// is not guaranteed to be an OpenSearch client; the type is checked
// explicitly instead of letting a failed assertion panic.
func (rt *Router) QueryOSVariable(c *gin.Context) {
	var f FieldValueReq
	ginx.BindJSON(c, &f)

	plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId)
	if !exists {
		logger.Warningf("cluster:%d not exists", f.DatasourceId)
		ginx.Bomb(200, "cluster not exists")
	}

	client, ok := plug.(*opensearch.OpenSearch)
	if !ok {
		logger.Warningf("cluster:%d is not an opensearch datasource", f.DatasourceId)
		ginx.Bomb(200, "datasource is not opensearch")
	}

	fields, err := client.QueryFieldValue([]string{f.Index}, f.Query.Field, f.Query.Query)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(fields, nil)
}
================================================
FILE: center/router/router_proxy.go
================================================
package router
import (
	"context"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/http/httputil"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/ccfos/nightingale/v6/pkg/ginx"
	"github.com/ccfos/nightingale/v6/pkg/logx"
	"github.com/ccfos/nightingale/v6/pkg/poster"
	pkgprom "github.com/ccfos/nightingale/v6/pkg/prom"
	"github.com/ccfos/nightingale/v6/prom"

	"github.com/gin-gonic/gin"
	"github.com/prometheus/common/model"
	"github.com/toolkits/pkg/logger"
	"github.com/toolkits/pkg/net/httplib"
)
// QueryFormItem is one range query inside a BatchQueryForm.
type QueryFormItem struct {
	Start int64  `json:"start" binding:"required"` // range start, Unix seconds
	End   int64  `json:"end" binding:"required"`   // range end, Unix seconds
	Step  int64  `json:"step" binding:"required"`  // resolution step, in seconds
	Query string `json:"query" binding:"required"` // query expression handed to the datasource client
}
// BatchQueryForm is the request body for a batch of range queries that are
// all run against the same datasource.
type BatchQueryForm struct {
	DatasourceId int64           `json:"datasource_id" binding:"required"` // datasource whose client runs the queries
	Queries      []QueryFormItem `json:"queries" binding:"required"`       // queries, executed in order
}
// promBatchQueryRange is the HTTP handler for batch range queries: it binds
// the JSON body into a BatchQueryForm and renders the result of
// PromBatchQueryRange.
func (rt *Router) promBatchQueryRange(c *gin.Context) {
	var form BatchQueryForm
	ginx.Dangerous(c.BindJSON(&form))

	values, err := PromBatchQueryRange(c.Request.Context(), rt.PromClients, form)
	ginx.NewRender(c).Data(values, err)
}
// PromBatchQueryRange runs every range query in f, in order, against the
// client registered for f.DatasourceId and returns the collected values.
//
// It fails when no client exists for the datasource, and stops at the first
// query that errors; in that case the values gathered so far are returned
// together with the error.
func PromBatchQueryRange(ctx context.Context, pc *prom.PromClientMap, f BatchQueryForm) ([]model.Value, error) {
	var values []model.Value

	client := pc.GetCli(f.DatasourceId)
	if client == nil {
		logx.Warningf(ctx, "no such datasource id: %d", f.DatasourceId)
		return values, fmt.Errorf("no such datasource id: %d", f.DatasourceId)
	}

	for _, q := range f.Queries {
		// Form fields are plain seconds; convert them to time values.
		window := pkgprom.Range{
			Start: time.Unix(q.Start, 0),
			End:   time.Unix(q.End, 0),
			Step:  time.Duration(q.Step) * time.Second,
		}

		value, _, err := client.QueryRange(ctx, q.Query, window)
		if err != nil {
			logx.Warningf(ctx, "query range error: query:%s err:%v", q.Query, err)
			return values, err
		}
		values = append(values, value)
	}

	return values, nil
}
// BatchInstantForm is the request body for a batch of instant queries that
// are all run against the same datasource.
type BatchInstantForm struct {
	DatasourceId int64             `json:"datasource_id" binding:"required"` // datasource whose client runs the queries
	Queries      []InstantFormItem `json:"queries" binding:"required"`       // queries, executed in order
}
// InstantFormItem is one instant query inside a BatchInstantForm.
type InstantFormItem struct {
	Time  int64  `json:"time" binding:"required"`  // evaluation time, Unix seconds
	Query string `json:"query" binding:"required"` // query expression handed to the datasource client
}
// promBatchQueryInstant is the HTTP handler for batch instant queries: it
// binds the JSON body into a BatchInstantForm and renders the result of
// PromBatchQueryInstant.
func (rt *Router) promBatchQueryInstant(c *gin.Context) {
	var form BatchInstantForm
	ginx.Dangerous(c.BindJSON(&form))

	values, err := PromBatchQueryInstant(c.Request.Context(), rt.PromClients, form)
	ginx.NewRender(c).Data(values, err)
}
// PromBatchQueryInstant runs every instant query in f, in order, against the
// client registered for f.DatasourceId and returns the collected values.
//
// It fails when no client exists for the datasource, and stops at the first
// query that errors; in that case the values gathered so far are returned
// together with the error.
func PromBatchQueryInstant(ctx context.Context, pc *prom.PromClientMap, f BatchInstantForm) ([]model.Value, error) {
	var values []model.Value

	client := pc.GetCli(f.DatasourceId)
	if client == nil {
		logx.Warningf(ctx, "no such datasource id: %d", f.DatasourceId)
		return values, fmt.Errorf("no such datasource id: %d", f.DatasourceId)
	}

	for _, q := range f.Queries {
		at := time.Unix(q.Time, 0)
		value, _, err := client.Query(ctx, q.Query, at)
		if err != nil {
			logx.Warningf(ctx, "query instant error: query:%s err:%v", q.Query, err)
			return values, err
		}
		values = append(values, value)
	}

	return values, nil
}
// dsProxy reverse-proxies the incoming request to the datasource identified
// by the ":id" URL parameter.
//
// The outgoing request is rewritten to the datasource's scheme/host/path,
// merged with the datasource's own query string, and given the datasource's
// basic-auth credentials and custom headers. A per-datasource transport is
// taken from the transport cache, or built from the datasource's TLS and
// timeout settings when the cache has none for the current UpdatedAt.
// Upstream 401 responses are turned into proxy errors (502 via errFunc).
func (rt *Router) dsProxy(c *gin.Context) {
	dsId := ginx.UrlParamInt64(c, "id")
	ds := rt.DatasourceCache.GetById(dsId)
	if ds == nil {
		c.String(http.StatusBadRequest, "no such datasource")
		return
	}
	target, err := ds.HTTPJson.ParseUrl()
	if err != nil {
		c.String(http.StatusInternalServerError, "invalid urls: %s", ds.HTTPJson.GetUrls())
		return
	}
	// director rewrites the outgoing request so it targets the datasource.
	director := func(req *http.Request) {
		req.URL.Scheme = target.Scheme
		req.URL.Host = target.Host
		req.Host = target.Host
		req.Header.Set("Host", target.Host)
		// fe request e.g. /api/n9e/proxy/:id/*
		arr := strings.Split(req.URL.Path, "/")
		if len(arr) < 6 {
			// NOTE(review): returning here only leaves the director; the
			// reverse proxy presumably still forwards the request after this
			// 400 has been written — confirm whether that is intended.
			c.String(http.StatusBadRequest, "invalid url path")
			return
		}
		// Drop the first five path segments (the proxy prefix and datasource
		// id) and append the remainder to the datasource's base path.
		req.URL.Path = strings.TrimRight(target.Path, "/") + "/" + strings.Join(arr[5:], "/")
		// Merge the datasource's query string with the caller's; "&" is only
		// needed when both are non-empty.
		if target.RawQuery == "" || req.URL.RawQuery == "" {
			req.URL.RawQuery = target.RawQuery + req.URL.RawQuery
		} else {
			req.URL.RawQuery = target.RawQuery + "&" + req.URL.RawQuery
		}
		// Set an explicit empty User-Agent when the client sent none.
		if _, ok := req.Header["User-Agent"]; !ok {
			req.Header.Set("User-Agent", "")
		}
		// Use the datasource's credentials; never forward the caller's own
		// Authorization header upstream.
		if ds.AuthJson.BasicAuthUser != "" {
			req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword)
		} else {
			req.Header.Del("Authorization")
		}
		headerCount := len(ds.HTTPJson.Headers)
		if headerCount > 0 {
			for key, value := range ds.HTTPJson.Headers {
				req.Header.Set(key, value)
				// A configured Host header must also override req.Host.
				if key == "Host" {
					req.Host = value
				}
			}
		}
	}
	// errFunc reports proxy failures to the client as 502 Bad Gateway.
	errFunc := func(w http.ResponseWriter, r *http.Request, err error) {
		http.Error(w, err.Error(), http.StatusBadGateway)
	}
	transport, has := transportGet(dsId, ds.UpdatedAt)
	if !has {
		// Use the datasource's TLS configuration (supports mTLS).
		tlsConfig, err := ds.HTTPJson.TLS.TLSConfig()
		if err != nil {
			c.String(http.StatusInternalServerError, "failed to create TLS config: %s", err.Error())
			return
		}
		// Timeouts in the datasource settings are in milliseconds.
		transport = &http.Transport{
			TLSClientConfig: tlsConfig,
			Proxy:           http.ProxyFromEnvironment,
			DialContext: (&net.Dialer{
				Timeout: time.Duration(ds.HTTPJson.DialTimeout) * time.Millisecond,
			}).DialContext,
			ResponseHeaderTimeout: time.Duration(ds.HTTPJson.Timeout) * time.Millisecond,
			MaxIdleConnsPerHost:   ds.HTTPJson.MaxIdleConnsPerHost,
		}
		transportPut(dsId, ds.UpdatedAt, transport)
	}
	// modifyResponse turns an upstream 401 into an error, which is then
	// handled by errFunc instead of being relayed to the client.
	modifyResponse := func(r *http.Response) error {
		if r.StatusCode == http.StatusUnauthorized {
			logx.Warningf(c.Request.Context(), "proxy path:%s unauthorized access ", c.Request.URL.Path)
			return fmt.Errorf("unauthorized access")
		}
		return nil
	}
	proxy := &httputil.ReverseProxy{
		Director:       director,
		Transport:      transport,
		ErrorHandler:   errFunc,
		ModifyResponse: modifyResponse,
	}
	proxy.ServeHTTP(c.Writer, c.Request)
}
// Cache of HTTP transports keyed by datasource id, together with the
// datasource UpdatedAt value each transport was stored under. Both maps are
// guarded by transportsLock.
var (
	transports     = map[int64]http.RoundTripper{}
	updatedAts     = map[int64]int64{}
	transportsLock = &sync.Mutex{}
)

// transportGet returns the cached transport for dsid when one exists and was
// stored under the same newUpdatedAt value.
//
// A cached transport with a missing or different update stamp is considered
// stale: its idle connections are closed, it is removed from the cache, and
// (nil, false) is returned so the caller can build a fresh one.
func transportGet(dsid, newUpdatedAt int64) (http.RoundTripper, bool) {
	transportsLock.Lock()
	defer transportsLock.Unlock()

	cached, ok := transports[dsid]
	if !ok {
		return nil, false
	}

	if stamp, known := updatedAts[dsid]; known && stamp == newUpdatedAt {
		return cached, true
	}

	// Stale entry: release its connections and forget it.
	cached.(*http.Transport).CloseIdleConnections()
	delete(transports, dsid)
	delete(updatedAts, dsid)
	return nil, false
}

// transportPut stores tran as the transport for dsid, remembering the
// updatedat value it belongs to.
func transportPut(dsid, updatedat int64, tran http.RoundTripper) {
	transportsLock.Lock()
	defer transportsLock.Unlock()

	transports[dsid] = tran
	updatedAts[dsid] = updatedat
}
// Values of the "prometheus.tsdb_type" datasource setting for which series
// deletion is implemented.
const (
	DatasourceTypePrometheus      = "Prometheus"
	DatasourceTypeVictoriaMetrics = "VictoriaMetrics"
)
// deleteDatasourceSeriesForm is the request body of the series deletion API.
type deleteDatasourceSeriesForm struct {
	DatasourceID int64    `json:"datasource_id"` // datasource to delete series from
	Match        []string `json:"match"`         // series selectors, each sent as a match[] query parameter
	Start        string   `json:"start"`         // passed through as the "start" query parameter (Prometheus only)
	End          string   `json:"end"`           // passed through as the "end" query parameter (Prometheus only)
}
// vmClusterSelectPathRe matches the "/select/<accountID>/prometheus" path
// segment; when a VictoriaMetrics datasource URL contains it, the handler
// below sends the deletion to the "/delete/<accountID>/prometheus" prefix.
// Compiled once instead of on every request.
var vmClusterSelectPathRe = regexp.MustCompile(`/select/(\d+)/prometheus`)

// deleteDatasourceSeries asks the TSDB behind a datasource to delete the
// series selected by the request's match expressions.
//
// Only datasources whose "prometheus.tsdb_type" setting is Prometheus or
// VictoriaMetrics are supported. The deletion request itself is sent from a
// background goroutine, so the HTTP response only confirms that the request
// was accepted; the outcome is written to the log.
//
// NOTE(review): the upstream URL is always built with the "http" scheme and
// the match/start/end values are inserted without URL-escaping — confirm
// whether callers pre-escape them and whether https datasources are expected.
func (rt *Router) deleteDatasourceSeries(c *gin.Context) {
	var ddsf deleteDatasourceSeriesForm
	ginx.BindJSON(c, &ddsf)

	ds := rt.DatasourceCache.GetById(ddsf.DatasourceID)
	if ds == nil {
		ginx.Bomb(http.StatusBadRequest, "no such datasource")
		return
	}

	// Get datasource type, now only support prometheus and victoriametrics
	datasourceType, ok := ds.SettingsJson["prometheus.tsdb_type"]
	if !ok {
		ginx.Bomb(http.StatusBadRequest, "datasource type not found, please check your datasource settings")
		return
	}

	target, err := ds.HTTPJson.ParseUrl()
	if err != nil {
		ginx.Bomb(http.StatusInternalServerError, "invalid urls: %s", ds.HTTPJson.GetUrls())
		return
	}

	timeout := time.Duration(ds.HTTPJson.DialTimeout) * time.Millisecond

	matchQueries := make([]string, 0, len(ddsf.Match))
	for _, match := range ddsf.Match {
		matchQueries = append(matchQueries, fmt.Sprintf("match[]=%s", match))
	}
	matchQuery := strings.Join(matchQueries, "&")

	switch datasourceType {
	case DatasourceTypePrometheus:
		// Prometheus delete api need POST method
		// https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series
		endpoint := fmt.Sprintf("http://%s/api/v1/admin/tsdb/delete_series?%s&start=%s&end=%s", target.Host, matchQuery, ddsf.Start, ddsf.End)

		go func() {
			resp, _, err := poster.PostJSON(endpoint, timeout, nil)
			if err != nil {
				logger.Errorf("delete series error datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, err: %v",
					ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, err)
				return
			}
			logger.Infof("delete datasource series datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, respBody: %s",
				ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, string(resp))
		}()

	case DatasourceTypeVictoriaMetrics:
		// Delete API doesn't support the deletion of specific time ranges.
		// Refer: https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#how-to-delete-time-series
		var endpoint string

		// Check VictoriaMetrics is single node or cluster
		// Cluster will have /select/<accountID>/prometheus pattern
		matches := vmClusterSelectPathRe.FindStringSubmatch(ds.HTTPJson.Url)
		if len(matches) > 0 && matches[1] != "" {
			accountID, err := strconv.Atoi(matches[1])
			if err != nil {
				ginx.Bomb(http.StatusInternalServerError, "invalid accountID: %s", matches[1])
			}
			endpoint = fmt.Sprintf("http://%s/delete/%d/prometheus/api/v1/admin/tsdb/delete_series?%s", target.Host, accountID, matchQuery)
		} else {
			endpoint = fmt.Sprintf("http://%s/api/v1/admin/tsdb/delete_series?%s", target.Host, matchQuery)
		}

		go func() {
			resp, err := httplib.Get(endpoint).SetTimeout(timeout).Response()
			if err != nil {
				logger.Errorf("delete series failed | datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, err: %v",
					ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, err)
				return
			}
			// The body must be closed or the underlying connection leaks.
			defer resp.Body.Close()

			// Read the body so the log shows the server's answer rather than
			// the formatted reader value.
			body, err := io.ReadAll(resp.Body)
			if err != nil {
				logger.Errorf("delete series failed | datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, err: %v",
					ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, err)
				return
			}
			logger.Infof("sending delete series request | datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, respBody: %s",
				ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, string(body))
		}()

	default:
		ginx.Bomb(http.StatusBadRequest, "not support delete series yet")
	}

	ginx.NewRender(c).Data(nil, nil)
}
================================================
FILE: center/router/router_query.go
================================================
package router
import (
"fmt"
"sort"
"sync"
"github.com/ccfos/nightingale/v6/alert/eval"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// CheckDsPermFunc decides whether the caller behind c may run query q
// against datasource dsId of category cate.
type CheckDsPermFunc func(c *gin.Context, dsId int64, cate string, q interface{}) bool

// CheckDsPerm is the permission hook consulted by the query handlers in this
// file. It is a package-level variable, so it can be reassigned; the default
// implementation allows everything.
var CheckDsPerm CheckDsPermFunc = func(c *gin.Context, dsId int64, cate string, q interface{}) bool {
	// TODO: decide, based on cate, whether a permission check is required.
	return true
}
// QueryFrom is the request body of the batch log query API.
type QueryFrom struct {
	Queries []Query `json:"queries"` // queries to run, each against its own datasource
	Exps    []Exp   `json:"exps"`    // expressions; not read by the handlers visible in this file
}
// Query is a single query of a QueryFrom request, addressed to one datasource.
type Query struct {
	Ref    string      `json:"ref"`     // caller-chosen reference, echoed back with the result
	Did    int64       `json:"ds_id"`   // datasource id
	DsCate string      `json:"ds_cate"` // datasource category, used to look up the plugin
	Query  interface{} `json:"query"`   // datasource-specific query payload
}
// Exp is an expression entry of a QueryFrom request.
// NOTE(review): presumably Exp is evaluated over the results named by query
// refs — not established by the code in this file; confirm against callers.
type Exp struct {
	Exp string `json:"exp"`
	Ref string `json:"ref"`
}
// LogResp is the aggregated result of one or more log queries.
type LogResp struct {
	Total int64         `json:"total"` // sum of the totals reported by each query
	List  []interface{} `json:"list"`  // collected results, in completion order
}
// QueryLogBatchConcurrently runs every query of f concurrently, each against
// the datasource it names, and merges the results into one LogResp.
//
// For each query the permission hook is consulted (unless anonymousAccess is
// set), the datasource plugin is looked up and the query template is
// rendered; any failure at this stage returns immediately. Each list entry
// of the result is a map carrying the query's ref, ds_id, ds_cate and data.
// When at least one query fails, the first recorded error is returned; an
// empty result is reported as "no data".
func QueryLogBatchConcurrently(anonymousAccess bool, ctx *gin.Context, f QueryFrom) (LogResp, error) {
	var (
		merged   LogResp
		failures []error
		guard    sync.Mutex
		pending  sync.WaitGroup
	)
	reqCtx := ctx.Request.Context()

	for _, q := range f.Queries {
		if !(anonymousAccess || CheckDsPerm(ctx, q.Did, q.DsCate, q)) {
			return LogResp{}, fmt.Errorf("forbidden")
		}

		plug, found := dscache.DsCache.Get(q.DsCate, q.Did)
		if !found {
			logx.Warningf(reqCtx, "cluster:%d not exists query:%+v", q.Did, q)
			return LogResp{}, fmt.Errorf("cluster not exists")
		}

		// Render the query as a template according to the datasource type.
		if err := eval.ExecuteQueryTemplate(q.DsCate, q.Query, nil); err != nil {
			logx.Warningf(reqCtx, "query template execute error: %v", err)
			return LogResp{}, fmt.Errorf("query template execute error: %v", err)
		}

		pending.Add(1)
		go func(query Query) {
			defer pending.Done()

			data, total, err := plug.QueryLog(reqCtx, query.Query)

			// Everything below touches shared state.
			guard.Lock()
			defer guard.Unlock()

			if err != nil {
				logx.Warningf(reqCtx, "%s", fmt.Sprintf("query data error: %v query:%v\n ", err, query))
				failures = append(failures, err)
				return
			}

			entry := map[string]interface{}{
				"ref":     query.Ref,
				"ds_id":   query.Did,
				"ds_cate": query.DsCate,
				"data":    data,
			}
			merged.List = append(merged.List, entry)
			merged.Total += total
		}(q)
	}
	pending.Wait()

	switch {
	case len(failures) > 0:
		return LogResp{}, failures[0]
	case len(merged.List) == 0:
		return LogResp{}, fmt.Errorf("no data")
	}
	return merged, nil
}
// QueryLogBatch is the HTTP handler for batch log queries. It binds the JSON
// body into a QueryFrom, runs it through QueryLogBatchConcurrently and
// renders the merged result; failures are reported through ginx.Bomb.
func (rt *Router) QueryLogBatch(c *gin.Context) {
	var form QueryFrom
	ginx.BindJSON(c, &form)

	anonymous := rt.Center.AnonymousAccess.PromQuerier
	result, err := QueryLogBatchConcurrently(anonymous, c, form)
	if err != nil {
		ginx.Bomb(200, "err:%v", err)
	}

	ginx.NewRender(c).Data(result, nil)
}
// QueryDataConcurrently runs every query of f concurrently against the
// datasource f.DatasourceId / f.Cate and returns the combined series.
//
// The permission hook is consulted per query unless anonymousAccess is set.
// When at least one query fails, the first recorded error is returned. With
// more than one series the result is sorted by the string form of .Metric.
func QueryDataConcurrently(anonymousAccess bool, ctx *gin.Context, f models.QueryParam) ([]models.DataResp, error) {
	var (
		series   []models.DataResp
		failures []error
		guard    sync.Mutex
		pending  sync.WaitGroup
	)
	reqCtx := ctx.Request.Context()

	for _, q := range f.Queries {
		if !(anonymousAccess || CheckDsPerm(ctx, f.DatasourceId, f.Cate, q)) {
			return nil, fmt.Errorf("forbidden")
		}

		plug, found := dscache.DsCache.Get(f.Cate, f.DatasourceId)
		if !found {
			logx.Warningf(reqCtx, "cluster:%d not exists", f.DatasourceId)
			return nil, fmt.Errorf("cluster not exists")
		}

		pending.Add(1)
		go func(query interface{}) {
			defer pending.Done()

			data, err := plug.QueryData(reqCtx, query)
			if err != nil {
				logx.Warningf(reqCtx, "query data error: req:%+v err:%v", query, err)
				guard.Lock()
				failures = append(failures, err)
				guard.Unlock()
				return
			}

			logx.Debugf(reqCtx, "query data: req:%+v resp:%+v", query, data)
			guard.Lock()
			series = append(series, data...)
			guard.Unlock()
		}(q)
	}
	pending.Wait()

	if len(failures) > 0 {
		return nil, failures[0]
	}

	// Uniform post-processing for API consumers: sort by .Metric so that
	// curves with the same legend keep the same color on dashboards.
	if len(series) > 1 {
		sort.Slice(series, func(i, j int) bool {
			left, right := series[i].Metric, series[j].Metric
			if left == nil || right == nil {
				return false
			}
			return left.String() < right.String()
		})
	}

	return series, nil
}
// QueryData is the HTTP handler for data queries. It binds the JSON body
// into a models.QueryParam, runs it through QueryDataConcurrently and renders
// the result; failures are reported through ginx.Bomb.
func (rt *Router) QueryData(c *gin.Context) {
	var param models.QueryParam
	ginx.BindJSON(c, &param)

	anonymous := rt.Center.AnonymousAccess.PromQuerier
	result, err := QueryDataConcurrently(anonymous, c, param)
	if err != nil {
		ginx.Bomb(200, "err:%v", err)
	}

	ginx.NewRender(c).Data(result, nil)
}
// QueryLogConcurrently runs every query of f concurrently against the
// datasource f.DatasourceId / f.Cate and merges the returned log entries and
// totals into one LogResp.
//
// The permission hook is consulted per query unless anonymousAccess is set.
// When at least one query fails, the first recorded error is returned; an
// empty result is reported as "no data".
func QueryLogConcurrently(anonymousAccess bool, ctx *gin.Context, f models.QueryParam) (LogResp, error) {
	var (
		merged   LogResp
		failures []error
		guard    sync.Mutex
		pending  sync.WaitGroup
	)
	reqCtx := ctx.Request.Context()

	for _, q := range f.Queries {
		if !(anonymousAccess || CheckDsPerm(ctx, f.DatasourceId, f.Cate, q)) {
			return LogResp{}, fmt.Errorf("forbidden")
		}

		plug, found := dscache.DsCache.Get(f.Cate, f.DatasourceId)
		if !found {
			logx.Warningf(reqCtx, "cluster:%d not exists query:%+v", f.DatasourceId, f)
			return LogResp{}, fmt.Errorf("cluster not exists")
		}

		pending.Add(1)
		go func(query interface{}) {
			defer pending.Done()

			data, total, err := plug.QueryLog(reqCtx, query)
			logx.Debugf(reqCtx, "query log: req:%+v resp:%+v", query, data)
			if err != nil {
				logx.Warningf(reqCtx, "%s", fmt.Sprintf("query data error: %v query:%v\n ", err, query))
				guard.Lock()
				failures = append(failures, err)
				guard.Unlock()
				return
			}

			guard.Lock()
			merged.List = append(merged.List, data...)
			merged.Total += total
			guard.Unlock()
		}(q)
	}
	pending.Wait()

	switch {
	case len(failures) > 0:
		return LogResp{}, failures[0]
	case len(merged.List) == 0:
		return LogResp{}, fmt.Errorf("no data")
	}
	return merged, nil
}
// QueryLogV2 is the HTTP handler for concurrent log queries. It binds the
// JSON body into a models.QueryParam and renders whatever
// QueryLogConcurrently returns, result and error alike.
func (rt *Router) QueryLogV2(c *gin.Context) {
	var param models.QueryParam
	ginx.BindJSON(c, &param)

	anonymous := rt.Center.AnonymousAccess.PromQuerier
	result, err := QueryLogConcurrently(anonymous, c, param)
	ginx.NewRender(c).Data(result, err)
}
// QueryLog is the sequential log query handler. Every query of the request
// is run, one after another, against the "elasticsearch" plugin registered
// for f.DatasourceId, and the returned entries are concatenated.
//
// The permission hook is consulted per query unless anonymous access is
// enabled; any failure is reported through ginx.Bomb.
func (rt *Router) QueryLog(c *gin.Context) {
	var param models.QueryParam
	ginx.BindJSON(c, &param)

	reqCtx := c.Request.Context()
	var entries []interface{}

	for _, q := range param.Queries {
		if !(rt.Center.AnonymousAccess.PromQuerier || CheckDsPerm(c, param.DatasourceId, param.Cate, q)) {
			ginx.Bomb(200, "forbidden")
		}

		// The category is fixed here, regardless of param.Cate.
		plug, found := dscache.DsCache.Get("elasticsearch", param.DatasourceId)
		if !found {
			logx.Warningf(reqCtx, "cluster:%d not exists", param.DatasourceId)
			ginx.Bomb(200, "cluster not exists")
		}

		data, _, err := plug.QueryLog(reqCtx, q)
		if err != nil {
			logx.Warningf(reqCtx, "query data error: %v", err)
			ginx.Bomb(200, "err:%v", err)
			continue
		}
		entries = append(entries, data...)
	}

	ginx.NewRender(c).Data(entries, nil)
}
================================================
FILE: center/router/router_recording_rule.go
================================================
package router
import (
"encoding/json"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// recordingRuleGets lists the recording rules of the business group given by
// the ":id" URL parameter. Updater nicknames are filled in only when the
// lookup succeeded.
func (rt *Router) recordingRuleGets(c *gin.Context) {
	groupID := ginx.UrlParamInt64(c, "id")

	rules, err := models.RecordingRuleGets(rt.Ctx, groupID)
	if err != nil {
		ginx.NewRender(c).Data(rules, err)
		return
	}

	models.FillUpdateByNicknames(rt.Ctx, rules)
	ginx.NewRender(c).Data(rules, err)
}
// recordingRuleGetsByGids lists recording rules for several business groups.
//
// When the "gids" query parameter names groups, each one is checked with
// bgroCheck. When it is empty, a non-admin user is restricted to the groups
// returned by models.MyBusiGroupIds (and gets an empty list if there are
// none), while an admin queries with an empty group list.
func (rt *Router) recordingRuleGetsByGids(c *gin.Context) {
	gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")

	if len(gids) == 0 {
		me := c.MustGet("user").(*models.User)
		if !me.IsAdmin() {
			var err error
			gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id)
			ginx.Dangerous(err)
			if len(gids) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	} else {
		for _, gid := range gids {
			rt.bgroCheck(c, gid)
		}
	}

	rules, err := models.RecordingRuleGetsByBGIds(rt.Ctx, gids)
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, rules)
	}
	ginx.NewRender(c).Data(rules, err)
}
// recordingRuleGetsByService renders the result of
// models.RecordingRuleEnabledGets, result and error alike.
func (rt *Router) recordingRuleGetsByService(c *gin.Context) {
	enabled, err := models.RecordingRuleEnabledGets(rt.Ctx)
	ginx.NewRender(c).Data(enabled, err)
}
// recordingRuleGet returns the recording rule identified by the ":rrid" URL
// parameter, or a 404 message when no such rule exists.
func (rt *Router) recordingRuleGet(c *gin.Context) {
	ruleID := ginx.UrlParamInt64(c, "rrid")

	rule, err := models.RecordingRuleGetById(rt.Ctx, ruleID)
	ginx.Dangerous(err)

	if rule != nil {
		ginx.NewRender(c).Data(rule, err)
		return
	}
	ginx.NewRender(c, http.StatusNotFound).Message("No such recording rule")
}
// recordingRuleAddByFE creates the posted recording rules in the business
// group given by the ":id" URL parameter.
//
// Rules without datasource queries default to models.DataSourceQueryAll.
// Each rule is added independently; the response maps every rule name to its
// error message, or to an empty string on success.
func (rt *Router) recordingRuleAddByFE(c *gin.Context) {
	username := c.MustGet("username").(string)

	var rules []models.RecordingRule
	ginx.BindJSON(c, &rules)
	if len(rules) == 0 {
		ginx.Bomb(http.StatusBadRequest, "input json is empty")
	}

	groupID := ginx.UrlParamInt64(c, "id")
	outcome := make(map[string]string)

	for idx := range rules {
		rule := &rules[idx]

		if len(rule.DatasourceQueries) == 0 {
			rule.DatasourceQueries = []models.DatasourceQuery{
				models.DataSourceQueryAll,
			}
		}

		// Ignore any client-supplied identity/ownership fields.
		rule.Id = 0
		rule.GroupId = groupID
		rule.CreateBy = username
		rule.UpdateBy = username
		rule.FE2DB()

		msg := ""
		if err := rule.Add(rt.Ctx); err != nil {
			msg = err.Error()
		}
		outcome[rule.Name] = msg
	}

	ginx.NewRender(c).Data(outcome, nil)
}
// recordingRulePutByFE updates the recording rule identified by the ":rrid"
// URL parameter with the posted rule. The stored rule's group is checked
// with bgrwCheck and the posted group with bgroCheck. A missing rule yields
// a 404 message.
func (rt *Router) recordingRulePutByFE(c *gin.Context) {
	var form models.RecordingRule
	ginx.BindJSON(c, &form)

	ruleID := ginx.UrlParamInt64(c, "rrid")
	stored, err := models.RecordingRuleGetById(rt.Ctx, ruleID)
	ginx.Dangerous(err)
	if stored == nil {
		ginx.NewRender(c, http.StatusNotFound).Message("No such recording rule")
		return
	}

	rt.bgrwCheck(c, stored.GroupId)
	rt.bgroCheck(c, form.GroupId)

	form.UpdateBy = c.MustGet("username").(string)
	updateErr := stored.Update(rt.Ctx, form)
	ginx.NewRender(c).Message(updateErr)
}
// recordingRuleDel deletes the recording rules whose ids are posted in the
// body; the ":id" URL parameter is passed to models.RecordingRuleDels
// alongside them.
func (rt *Router) recordingRuleDel(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	render := ginx.NewRender(c)
	groupID := ginx.UrlParamInt64(c, "id")
	render.Message(models.RecordingRuleDels(rt.Ctx, form.Ids, groupID))
}
// recordRuleFieldForm is the request body for updating selected fields on
// several recording rules at once.
type recordRuleFieldForm struct {
	Ids    []int64                `json:"ids"`    // ids of the rules to update
	Fields map[string]interface{} `json:"fields"` // column name -> new value, applied to every rule
}
// recordingRulePutFields applies the posted field map to every recording
// rule listed in the body.
//
// update_by / update_at are always set. The "datasource_queries" and
// "datasource_ids" fields, when present, are replaced by their JSON string
// encoding before being written. Ids that match no rule are skipped; the
// first failing update aborts the request.
func (rt *Router) recordingRulePutFields(c *gin.Context) {
	var form recordRuleFieldForm
	ginx.BindJSON(c, &form)
	if len(form.Fields) == 0 {
		ginx.Bomb(http.StatusBadRequest, "fields empty")
	}

	form.Fields["update_by"] = c.MustGet("username").(string)
	form.Fields["update_at"] = time.Now().Unix()

	// These fields are written as JSON text.
	for _, key := range []string{"datasource_queries", "datasource_ids"} {
		raw, present := form.Fields[key]
		if !present {
			continue
		}
		encoded, err := json.Marshal(raw)
		ginx.Dangerous(err)
		form.Fields[key] = string(encoded)
	}

	for _, id := range form.Ids {
		rule, err := models.RecordingRuleGetById(rt.Ctx, id)
		ginx.Dangerous(err)
		if rule == nil {
			continue
		}
		ginx.Dangerous(rule.UpdateFieldsMap(rt.Ctx, form.Fields))
	}

	ginx.NewRender(c).Message(nil)
}
================================================
FILE: center/router/router_role.go
================================================
package router
import (
"net/http"
"strings"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// rolesGets renders all roles as returned by models.RoleGetsAll.
func (rt *Router) rolesGets(c *gin.Context) {
	roles, err := models.RoleGetsAll(rt.Ctx)
	ginx.NewRender(c).Data(roles, err)
}
// permsGets returns the operation names the current user may perform. An
// admin gets every operation defined in cconf.Operations; anyone else gets
// the operations of their roles (user.Roles split on whitespace).
func (rt *Router) permsGets(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	if !me.IsAdmin() {
		perms, err := models.OperationsOfRole(rt.Ctx, strings.Fields(me.Roles))
		ginx.NewRender(c).Data(perms, err)
		return
	}

	var all []string
	for _, group := range cconf.Operations.Ops {
		for _, single := range group.Ops {
			all = append(all, single.Name)
		}
	}
	ginx.NewRender(c).Data(all, nil)
}
// roleAdd creates the role posted in the request body.
func (rt *Router) roleAdd(c *gin.Context) {
	var role models.Role
	ginx.BindJSON(c, &role)

	addErr := role.Add(rt.Ctx)
	ginx.NewRender(c).Message(addErr)
}
// rolePut updates the name and note of the role whose id is posted in the
// body. The "Admin" role cannot be modified, and a changed name must not
// clash with another role's name.
func (rt *Router) rolePut(c *gin.Context) {
	var form models.Role
	ginx.BindJSON(c, &form)

	stored, err := models.RoleGet(rt.Ctx, "id=?", form.Id)
	ginx.Dangerous(err)

	switch {
	case stored == nil:
		ginx.Bomb(http.StatusOK, "role not found")
	case stored.Name == "Admin":
		ginx.Bomb(http.StatusOK, "admin role can not be modified")
	}

	if renamed := stored.Name != form.Name; renamed {
		// name changed, check duplication
		clashes, err := models.RoleCount(rt.Ctx, "name=? and id<>?", form.Name, stored.Id)
		ginx.Dangerous(err)
		if clashes > 0 {
			ginx.Bomb(http.StatusOK, "role name already exists")
		}
	}

	stored.Name, stored.Note = form.Name, form.Note
	ginx.NewRender(c).Message(stored.Update(rt.Ctx, "name", "note"))
}
// roleDel deletes the role identified by the ":id" URL parameter.
//
// Deleting a role that does not exist is treated as success. The "Admin"
// role can never be deleted.
func (rt *Router) roleDel(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")

	target, err := models.RoleGet(rt.Ctx, "id=?", id)
	ginx.Dangerous(err)

	// The nil check must come before any field access: RoleGet yields a nil
	// role when nothing matches, and reading target.Name first would panic.
	if target == nil {
		ginx.NewRender(c).Message(nil)
		return
	}

	if target.Name == "Admin" {
		ginx.Bomb(http.StatusOK, "admin role can not be modified")
	}

	ginx.NewRender(c).Message(target.Del(rt.Ctx))
}
// roleGets renders the list of all roles as returned by models.RoleGetsAll.
func (rt *Router) roleGets(c *gin.Context) {
	roles, err := models.RoleGetsAll(rt.Ctx)
	ginx.NewRender(c).Data(roles, err)
}
// allPerms returns a map from role name to the operations of that role,
// looked up by the whitespace-separated fields of the role name. Roles whose
// operations cannot be loaded are left out of the map.
func (rt *Router) allPerms(c *gin.Context) {
	roles, err := models.RoleGetsAll(rt.Ctx)
	ginx.Dangerous(err)

	permsByRole := make(map[string][]string)
	for _, role := range roles {
		perms, lookupErr := models.OperationsOfRole(rt.Ctx, strings.Fields(role.Name))
		if lookupErr == nil {
			permsByRole[role.Name] = perms
		}
	}

	ginx.NewRender(c).Data(permsByRole, err)
}
================================================
FILE: center/router/router_role_operation.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// operationOfRole returns the operation names bound to the role identified
// by the ":id" URL parameter. The "Admin" role implicitly owns every
// operation defined in cconf.Operations; other roles are looked up through
// models.OperationsOfRole.
func (rt *Router) operationOfRole(c *gin.Context) {
	roleID := ginx.UrlParamInt64(c, "id")

	role, err := models.RoleGet(rt.Ctx, "id=?", roleID)
	ginx.Dangerous(err)
	if role == nil {
		ginx.Bomb(http.StatusOK, "role not found")
	}

	var names []string
	if role.Name != "Admin" {
		names, err = models.OperationsOfRole(rt.Ctx, []string{role.Name})
	} else {
		for _, group := range cconf.Operations.Ops {
			for _, single := range group.Ops {
				names = append(names, single.Name)
			}
		}
	}

	ginx.NewRender(c).Data(names, err)
}
// roleBindOperation binds the posted list of operation names to the role
// identified by the ":id" URL parameter. The "Admin" role cannot be
// modified.
func (rt *Router) roleBindOperation(c *gin.Context) {
	roleID := ginx.UrlParamInt64(c, "id")

	role, err := models.RoleGet(rt.Ctx, "id=?", roleID)
	ginx.Dangerous(err)

	switch {
	case role == nil:
		ginx.Bomb(http.StatusOK, "role not found")
	case role.Name == "Admin":
		ginx.Bomb(http.StatusOK, "admin role can not be modified")
	}

	var opNames []string
	ginx.BindJSON(c, &opNames)

	ginx.NewRender(c).Message(models.RoleOperationBind(rt.Ctx, role.Name, opNames))
}
// operations returns the operation tree of rt.Operations with every display
// name (Cname) translated via i18n for the language given in the request's
// "X-Language" header.
func (rt *Router) operations(c *gin.Context) {
	lang := c.GetHeader("X-Language")

	var localized []cconf.Ops
	for _, group := range rt.Operations.Ops {
		singles := []cconf.SingleOp{}
		for _, single := range group.Ops {
			singles = append(singles, cconf.SingleOp{
				Name:  single.Name,
				Cname: i18n.Sprintf(lang, single.Cname),
			})
		}

		localized = append(localized, cconf.Ops{
			Name:  group.Name,
			Cname: i18n.Sprintf(lang, group.Cname),
			Ops:   singles,
		})
	}

	ginx.NewRender(c).Data(localized, nil)
}
================================================
FILE: center/router/router_saved_view.go
================================================
package router
import (
"net/http"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/slice"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// savedViewGets lists the saved views of a page (the "page" query parameter)
// that the current user is allowed to see, with the user's favorites first.
//
// A view is visible when the user created it, when PublicCate is 2, or when
// PublicCate is 1 and the view's Gids intersect the user's group ids.
func (rt *Router) savedViewGets(c *gin.Context) {
	page := ginx.QueryStr(c, "page", "")
	me := c.MustGet("user").(*models.User)

	// fail renders a lookup error and is used by every early exit below.
	fail := func(err error) {
		ginx.NewRender(c).Data(nil, err)
	}

	views, err := models.SavedViewGets(rt.Ctx, page)
	if err != nil {
		fail(err)
		return
	}
	models.FillUpdateByNicknames(rt.Ctx, views)

	myGids, err := models.MyGroupIds(rt.Ctx, me.Id)
	if err != nil {
		fail(err)
		return
	}

	favorites, err := models.SavedViewFavoriteGetByUserId(rt.Ctx, me.Id)
	if err != nil {
		fail(err)
		return
	}

	starred := make([]models.SavedView, 0)
	others := make([]models.SavedView, 0)
	for _, view := range views {
		switch {
		case view.CreateBy == me.Username:
		case view.PublicCate == 2:
		case view.PublicCate == 1 && slice.HaveIntersection[int64](myGids, view.Gids):
		default:
			// Not visible to this user.
			continue
		}

		view.IsFavorite = favorites[view.Id]
		// Favorites are listed first.
		if view.IsFavorite {
			starred = append(starred, view)
			continue
		}
		others = append(others, view)
	}

	ginx.NewRender(c).Data(append(starred, others...), nil)
}
// savedViewAdd creates a saved view owned by the current user and returns
// its id. Any id supplied by the client is discarded.
func (rt *Router) savedViewAdd(c *gin.Context) {
	var view models.SavedView
	ginx.BindJSON(c, &view)

	me := c.MustGet("user").(*models.User)
	view.Id = 0
	view.CreateBy, view.UpdateBy = me.Username, me.Username

	addErr := models.SavedViewAdd(rt.Ctx, &view)
	ginx.NewRender(c).Data(view.Id, addErr)
}
// savedViewPut updates the name, filter, visibility and group ids of the
// saved view identified by the ":id" URL parameter. A missing view yields a
// 404 message; callers other than the creator or an admin get a 403.
func (rt *Router) savedViewPut(c *gin.Context) {
	viewID := ginx.UrlParamInt64(c, "id")

	stored, err := models.SavedViewGetById(rt.Ctx, viewID)
	switch {
	case err != nil:
		ginx.NewRender(c).Data(nil, err)
		return
	case stored == nil:
		ginx.NewRender(c, http.StatusNotFound).Message("saved view not found")
		return
	}

	// Only the creator or an admin may update the view.
	me := c.MustGet("user").(*models.User)
	if !(stored.CreateBy == me.Username || me.IsAdmin()) {
		ginx.NewRender(c, http.StatusForbidden).Message("forbidden")
		return
	}

	var form models.SavedView
	ginx.BindJSON(c, &form)

	stored.Name = form.Name
	stored.Filter = form.Filter
	stored.PublicCate = form.PublicCate
	stored.Gids = form.Gids

	ginx.NewRender(c).Message(models.SavedViewUpdate(rt.Ctx, stored, me.Username))
}
// savedViewDel deletes the saved view identified by the ":id" URL parameter.
// A missing view yields a 404 message; callers other than the creator or an
// admin get a 403.
func (rt *Router) savedViewDel(c *gin.Context) {
	viewID := ginx.UrlParamInt64(c, "id")

	stored, err := models.SavedViewGetById(rt.Ctx, viewID)
	switch {
	case err != nil:
		ginx.NewRender(c).Data(nil, err)
		return
	case stored == nil:
		ginx.NewRender(c, http.StatusNotFound).Message("saved view not found")
		return
	}

	// Only the creator or an admin may delete the view.
	me := c.MustGet("user").(*models.User)
	if !(stored.CreateBy == me.Username || me.IsAdmin()) {
		ginx.NewRender(c, http.StatusForbidden).Message("forbidden")
		return
	}

	ginx.NewRender(c).Message(models.SavedViewDel(rt.Ctx, viewID))
}
// savedViewFavoriteAdd marks the saved view identified by the ":id" URL
// parameter as a favorite of the current user.
func (rt *Router) savedViewFavoriteAdd(c *gin.Context) {
	viewID := ginx.UrlParamInt64(c, "id")
	me := c.MustGet("user").(*models.User)

	addErr := models.UserViewFavoriteAdd(rt.Ctx, viewID, me.Id)
	ginx.NewRender(c).Message(addErr)
}
// savedViewFavoriteDel removes the saved view given by the "id" URL parameter
// from the current user's favorites.
func (rt *Router) savedViewFavoriteDel(c *gin.Context) {
	viewID := ginx.UrlParamInt64(c, "id")
	userID := c.MustGet("user").(*models.User).Id
	ginx.NewRender(c).Message(models.UserViewFavoriteDel(rt.Ctx, viewID, userID))
}
================================================
FILE: center/router/router_self.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/google/uuid"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// selfProfileGet renders the current user's record, setting its Admin field
// to true first when IsAdmin reports the user as an admin.
func (rt *Router) selfProfileGet(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	if me.IsAdmin() {
		me.Admin = true
	}

	ginx.NewRender(c).Data(me, nil)
}
// selfProfileForm is the JSON request body bound by selfProfilePut. Each field
// overwrites the field of the same name on the current user.
type selfProfileForm struct {
Nickname string `json:"nickname"`
Phone string `json:"phone"`
Email string `json:"email"`
Portrait string `json:"portrait"`
Contacts ormx.JSONObj `json:"contacts"`
}
// selfProfilePut overwrites the current user's nickname, phone, email,
// portrait and contacts with the values from the JSON body and persists the
// record. When flashduty user sync is enabled, the previous username, phone
// and email are passed to flashduty together with the new email and phone.
func (rt *Router) selfProfilePut(c *gin.Context) {
	var form selfProfileForm
	ginx.BindJSON(c, &form)

	me := c.MustGet("user").(*models.User)

	// Snapshot the identifying fields before they are overwritten below.
	previous := models.User{
		Username: me.Username,
		Phone:    me.Phone,
		Email:    me.Email,
	}

	me.Nickname, me.Phone, me.Email = form.Nickname, form.Phone, form.Email
	me.Portrait, me.Contacts = form.Portrait, form.Contacts
	me.UpdateBy = me.Username

	if flashduty.NeedSyncUser(rt.Ctx) {
		flashduty.UpdateUser(rt.Ctx, previous, form.Email, form.Phone)
	}

	ginx.NewRender(c).Message(me.UpdateAllFields(rt.Ctx))
}
// selfPasswordForm is the JSON request body bound by selfPasswordPut. Both
// fields are mandatory (binding:"required"); selfPasswordPut RSA-decrypts them
// first when rt.HTTP.RSA.OpenRSA is set.
type selfPasswordForm struct {
OldPass string `json:"oldpass" binding:"required"`
NewPass string `json:"newpass" binding:"required"`
}
// selfPasswordPut changes the current user's password. When RSA is enabled in
// the HTTP configuration, the new and then the old password are decrypted
// with the configured private key; a decryption failure is logged and
// rendered as the response without changing the password.
func (rt *Router) selfPasswordPut(c *gin.Context) {
	var form selfPasswordForm
	ginx.BindJSON(c, &form)

	me := c.MustGet("user").(*models.User)
	oldPlain, newPlain := form.OldPass, form.NewPass

	if rt.HTTP.RSA.OpenRSA {
		// decrypt decodes one ciphertext and logs the failure, if any.
		decrypt := func(cipher string) (string, error) {
			plain, err := secu.Decrypt(cipher, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
			if err != nil {
				logger.Errorf("RSA Decrypt failed: %v username: %s", err, me.Username)
			}
			return plain, err
		}

		var err error
		if newPlain, err = decrypt(form.NewPass); err != nil {
			ginx.NewRender(c).Message(err)
			return
		}
		if oldPlain, err = decrypt(form.OldPass); err != nil {
			ginx.NewRender(c).Message(err)
			return
		}
	}

	ginx.NewRender(c).Message(me.ChangePassword(rt.Ctx, oldPlain, newPlain))
}
// tokenForm is the JSON request body bound by addToken. Of its fields, the
// handlers visible in this file only read TokenName.
type tokenForm struct {
TokenName string `json:"token_name"`
Token string `json:"token"`
}
// getToken renders the tokens stored for the username found in the request
// context.
func (rt *Router) getToken(c *gin.Context) {
	owner := c.MustGet("username").(string)
	list, err := models.GetTokensByUsername(rt.Ctx, owner)
	ginx.NewRender(c).Data(list, err)
}
// addToken creates a new token, valued with a fresh UUID, for the username in
// the request context. The request is rejected when the user already owns a
// token with the same name.
func (rt *Router) addToken(c *gin.Context) {
	var form tokenForm
	ginx.BindJSON(c, &form)

	owner := c.MustGet("username").(string)
	existing, err := models.GetTokensByUsername(rt.Ctx, owner)
	ginx.Dangerous(err)

	// Token names must be unique per user.
	for _, tk := range existing {
		if tk.TokenName != form.TokenName {
			continue
		}
		ginx.NewRender(c).Message("token name already exists")
		return
	}

	created, err := models.AddToken(rt.Ctx, owner, uuid.New().String(), form.TokenName)
	ginx.NewRender(c).Data(created, err)
}
// deleteToken deletes the token given by the "id" URL parameter, refusing to
// do so when the current user has one token or fewer.
//
// NOTE(review): the token is deleted by id alone; nothing visible in this
// handler verifies that the id belongs to the current username. Confirm that
// models.DeleteToken or the route middleware enforces ownership.
func (rt *Router) deleteToken(c *gin.Context) {
	tokenID := ginx.UrlParamInt64(c, "id")
	owner := c.MustGet("username").(string)

	remaining, err := models.CountToken(rt.Ctx, owner)
	ginx.Dangerous(err)

	// Always leave the user with at least one token.
	if remaining <= 1 {
		ginx.NewRender(c).Message("cannot delete the last token")
		return
	}

	ginx.NewRender(c).Message(models.DeleteToken(rt.Ctx, tokenID))
}
================================================
FILE: center/router/router_server.go
================================================
package router
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// serversGet renders the alerting engines returned by
// models.AlertingEngineGets with an empty where clause.
func (rt *Router) serversGet(c *gin.Context) {
	engines, err := models.AlertingEngineGets(rt.Ctx, "")
	ginx.NewRender(c).Data(engines, err)
}
// serverClustersGet renders the result of models.AlertingEngineGetsClusters
// called with an empty where clause.
func (rt *Router) serverClustersGet(c *gin.Context) {
	clusters, err := models.AlertingEngineGetsClusters(rt.Ctx, "")
	ginx.NewRender(c).Data(clusters, err)
}
// serverHeartbeat records a heartbeat for the alerting engine instance
// described by the JSON body (instance, engine cluster and datasource id).
func (rt *Router) serverHeartbeat(c *gin.Context) {
	var beat models.HeartbeatInfo
	ginx.BindJSON(c, &beat)

	ginx.NewRender(c).Message(
		models.AlertingEngineHeartbeatWithCluster(rt.Ctx, beat.Instance, beat.EngineCluster, beat.DatasourceId),
	)
}
// serversActive renders the alerting engine instances whose clock is within
// the last 30 seconds. Instances are selected by the "engine_name" query
// parameter when it is set, otherwise by the "dsid" datasource id, which is
// then mandatory.
func (rt *Router) serversActive(c *gin.Context) {
	datasourceID := ginx.QueryInt64(c, "dsid", 0)
	engineName := ginx.QueryStr(c, "engine_name", "")

	var (
		where string
		key   interface{}
	)
	switch {
	case engineName != "":
		where, key = "engine_cluster = ? and clock > ?", engineName
	case datasourceID == 0:
		ginx.NewRender(c).Message("dsid is required")
		return
	default:
		where, key = "datasource_id = ? and clock > ?", datasourceID
	}

	servers, err := models.AlertingEngineGetsInstances(rt.Ctx, where, key, time.Now().Unix()-30)
	ginx.NewRender(c).Data(servers, err)
}
================================================
FILE: center/router/router_source_token.go
================================================
package router
import (
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/google/uuid"
"github.com/gin-gonic/gin"
)
// sourceTokenAdd generates a new source token. The token value is a fresh
// UUID; the creator and creation time are filled in from the request context
// and the clock. A positive ExpireAt that is not in the future is rejected
// with 400. After a successful insert, expired tokens are cleaned up in a
// background goroutine and the token value is rendered.
func (rt *Router) sourceTokenAdd(c *gin.Context) {
	var st models.SourceToken
	ginx.BindJSON(c, &st)

	// ExpireAt <= 0 skips the expiry check entirely.
	if st.ExpireAt > 0 && st.ExpireAt <= time.Now().Unix() {
		ginx.Bomb(http.StatusBadRequest, "expire time must be in the future")
	}

	value := uuid.New().String()
	creator := c.MustGet("username").(string)

	st.Token = value
	st.CreateBy = creator
	st.CreateAt = time.Now().Unix()

	ginx.Dangerous(st.Add(rt.Ctx))

	go models.CleanupExpiredTokens(rt.Ctx)

	ginx.NewRender(c).Data(value, nil)
}
================================================
FILE: center/router/router_target.go
================================================
package router
import (
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/storage"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/logger"
)
// TargetQuery is the JSON request body bound by targetGetsByHostFilter.
// Filters are turned into a host query via models.GetHostsQuery; P and Limit
// drive pagination, the handler using (P-1)*Limit as the offset.
type TargetQuery struct {
Filters []models.HostQuery `json:"queries"`
P int `json:"p"`
Limit int `json:"limit"`
}
// targetGetsByHostFilter renders one page of targets matching the host
// filters in the JSON body, together with the total match count. Beat times
// are filled in via models.FillTargetsBeatTime, and TargetUp is set to 2 when
// the last beat is less than 60s old, or 1 when it is less than 180s old.
func (rt *Router) targetGetsByHostFilter(c *gin.Context) {
	var req TargetQuery
	ginx.BindJSON(c, &req)

	filter := models.GetHostsQuery(req.Filters)
	offset := (req.P - 1) * req.Limit

	hosts, err := models.TargetGetsByFilter(rt.Ctx, filter, req.Limit, offset)
	ginx.Dangerous(err)

	total, err := models.TargetCountByFilter(rt.Ctx, filter)
	ginx.Dangerous(err)

	models.FillTargetsBeatTime(rt.Redis, hosts)

	now := time.Now().Unix()
	for i := range hosts {
		switch age := now - hosts[i].BeatTime; {
		case age < 60:
			hosts[i].TargetUp = 2
		case age < 180:
			hosts[i].TargetUp = 1
		}
	}

	ginx.NewRender(c).Data(gin.H{
		"list":  hosts,
		"total": total,
	}, nil)
}
// targetGets renders one page of targets plus the total match count.
//
// Query parameters: "gids" (business group ids), "query", "limit" (default
// 30), "downtime", datasource ids, "order" (default "ident"), "desc" and
// "hosts". When gids are given, every positive gid is permission-checked via
// rt.bgroCheck. When none are given and the caller is not an admin, the
// listing is restricted to the caller's business groups plus group 0.
//
// Each returned target is enriched with its group ids, business groups, beat
// time, TargetUp (2 when the last beat is <60s old, 1 when <180s old) and the
// host metadata stored in Redis; targets without metadata get CpuNum = -1.
func (rt *Router) targetGets(c *gin.Context) {
	bgids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")
	query := ginx.QueryStr(c, "query", "")
	limit := ginx.QueryInt(c, "limit", 30)
	downtime := ginx.QueryInt64(c, "downtime", 0)
	dsIds := queryDatasourceIds(c)

	order := ginx.QueryStr(c, "order", "ident")
	desc := ginx.QueryBool(c, "desc", false)

	hosts := queryStrListField(c, "hosts", ",", " ", "\n")

	var err error
	if len(bgids) > 0 {
		for _, gid := range bgids {
			if gid > 0 {
				rt.bgroCheck(c, gid)
			}
		}
	} else {
		user := c.MustGet("user").(*models.User)
		if !user.IsAdmin() {
			// A non-admin asking for "all" targets only sees the business
			// groups they have access to.
			bgids, err = models.MyBusiGroupIds(rt.Ctx, user.Id)
			ginx.Dangerous(err)
			// Also include targets that are not assigned to any business group.
			bgids = append(bgids, 0)
		}
	}

	options := []models.BuildTargetWhereOption{
		models.BuildTargetWhereWithBgids(bgids),
		models.BuildTargetWhereWithDsIds(dsIds),
		models.BuildTargetWhereWithQuery(query),
		models.BuildTargetWhereWithHosts(hosts),
	}

	// Downtime filter: derived from cached beat times; downtimeFilter picks the
	// smaller of the matching / non-matching sets for an IN or NOT IN clause.
	if downtime != 0 {
		downtimeOpt, hasMatch := rt.downtimeFilter(downtime)
		if !hasMatch {
			ginx.NewRender(c).Data(gin.H{
				"list":  []*models.Target{},
				"total": 0,
			}, nil)
			return
		}
		if downtimeOpt != nil {
			options = append(options, downtimeOpt)
		}
	}

	total, err := models.TargetTotal(rt.Ctx, options...)
	ginx.Dangerous(err)

	list, err := models.TargetGets(rt.Ctx, limit,
		ginx.Offset(c, limit), order, desc, options...)
	ginx.Dangerous(err)

	tgs, err := models.TargetBusiGroupsGetAll(rt.Ctx)
	ginx.Dangerous(err)

	for _, t := range list {
		t.GroupIds = tgs[t.Ident]
	}

	// ginx.Dangerous has already aborted on any error above, so the enrichment
	// below runs unconditionally.
	now := time.Now()
	cache := make(map[int64]*models.BusiGroup)

	// Fill in BeatTime from Redis.
	models.FillTargetsBeatTime(rt.Redis, list)

	keys := make([]string, 0, len(list))
	for i := 0; i < len(list); i++ {
		ginx.Dangerous(list[i].FillGroup(rt.Ctx, cache))
		keys = append(keys, models.WrapIdent(list[i].Ident))

		if now.Unix()-list[i].BeatTime < 60 {
			list[i].TargetUp = 2
		} else if now.Unix()-list[i].BeatTime < 180 {
			list[i].TargetUp = 1
		}
	}

	if len(keys) > 0 {
		metaMap := make(map[string]*models.HostMeta)
		vals := storage.MGet(context.Background(), rt.Redis, keys)
		for _, value := range vals {
			var meta models.HostMeta
			if value == nil {
				continue
			}
			err := json.Unmarshal(value, &meta)
			if err != nil {
				// value is raw bytes: print it as text rather than as a list
				// of decimal byte values.
				logger.Warningf("unmarshal %s host meta failed: %v", value, err)
				continue
			}
			metaMap[meta.Hostname] = &meta
		}

		for i := 0; i < len(list); i++ {
			if meta, ok := metaMap[list[i].Ident]; ok {
				list[i].FillMeta(meta)
			} else {
				// Hosts that never reported metadata get CpuNum = -1 so the
				// frontend can display "unknown".
				list[i].CpuNum = -1
			}
		}
	}

	ginx.NewRender(c).Data(gin.H{
		"list":  list,
		"total": total,
	}, nil)
}
// downtimeFilter builds the downtime filter condition from the beat times held
// in the target cache.
//
// A positive downtime matches targets whose last beat is older than that many
// seconds; a negative downtime matches targets whose last beat is newer than
// |downtime| seconds. To keep the number of SQL parameters small, the smaller
// of the matching and non-matching ident sets is used, as an IN or NOT IN
// condition respectively.
//
// Returns:
//   - option: the filter condition; nil means every target matches (no
//     filtering needed) or, together with hasMatch == false, that none does.
//   - hasMatch: whether any target matches; false means the caller should
//     return an empty result.
func (rt *Router) downtimeFilter(downtime int64) (option models.BuildTargetWhereOption, hasMatch bool) {
	now := time.Now().Unix()

	var hit, miss []string
	for _, tgt := range rt.TargetCache.GetAll() {
		var matched bool
		switch {
		case downtime > 0:
			matched = tgt.BeatTime < now-downtime
		case downtime < 0:
			matched = tgt.BeatTime > now+downtime
		}

		if matched {
			hit = append(hit, tgt.Ident)
			continue
		}
		miss = append(miss, tgt.Ident)
	}

	switch {
	case len(hit) == 0:
		return nil, false
	case len(miss) == 0:
		return nil, true
	case len(hit) <= len(miss):
		return models.BuildTargetWhereWithIdents(hit), true
	default:
		return models.BuildTargetWhereExcludeIdents(miss), true
	}
}
// targetExtendInfoByIdent renders the extended info stored in Redis for the
// target named by the "ident" query parameter. A missing value, or the literal
// string "null", is reported as an empty string.
func (rt *Router) targetExtendInfoByIdent(c *gin.Context) {
	ident := ginx.QueryStr(c, "ident", "")
	redisKey := models.WrapExtendIdent(ident)

	info := ""
	if vals := storage.MGet(context.Background(), rt.Redis, []string{redisKey}); len(vals) > 0 {
		if raw := string(vals[0]); raw != "null" {
			info = raw
		}
	}

	ginx.NewRender(c).Data(gin.H{
		"extend_info": info,
		"ident":       ident,
	}, nil)
}
// targetGetsByService renders every target returned by models.TargetGetsAll.
func (rt *Router) targetGetsByService(c *gin.Context) {
	all, err := models.TargetGetsAll(rt.Ctx)
	ginx.NewRender(c).Data(all, err)
}
// targetGetTags renders the tags of the targets listed in the "idents" query
// parameter, which may be separated by commas and/or whitespace. The
// "ignore_host_tag" flag is forwarded to models.TargetGetTags.
func (rt *Router) targetGetTags(c *gin.Context) {
	raw := ginx.QueryStr(c, "idents", "")
	ignoreHostTag := ginx.QueryBool(c, "ignore_host_tag", false)

	// Treat commas like whitespace so a single Fields call splits everything.
	identList := strings.Fields(strings.ReplaceAll(raw, ",", " "))

	tags, err := models.TargetGetTags(rt.Ctx, identList, ignoreHostTag, "")
	ginx.NewRender(c).Data(tags, err)
}
// targetTagsForm is the JSON request body of the tag bind/unbind handlers.
// Per the binding tags, Idents and HostIps are each required only when the
// other one is absent, and Tags is always required.
type targetTagsForm struct {
Idents []string `json:"idents" binding:"required_without=HostIps"`
HostIps []string `json:"host_ips" binding:"required_without=Idents"`
Tags []string `json:"tags" binding:"required"`
}
// targetBindTagsByFE binds tags to targets on behalf of a logged-in user. The
// targets are resolved from the supplied idents and host IPs, the caller's
// permission on them is checked, and the per-ident failures are rendered.
func (rt *Router) targetBindTagsByFE(c *gin.Context) {
	var form targetTagsForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	rt.checkTargetPerm(c, form.Idents)

	ginx.NewRender(c).Data(rt.targetBindTags(form, failed))
}
// targetBindTagsByService binds tags to targets for service callers. It
// mirrors targetBindTagsByFE but performs no per-user permission check.
func (rt *Router) targetBindTagsByService(c *gin.Context) {
	var form targetTagsForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	ginx.NewRender(c).Data(rt.targetBindTags(form, failed))
}
// targetBindTags validates f.Tags and adds them to every target in f.Idents.
// A target whose tags cannot be added is recorded in failedIdents (ident ->
// error text) and does not abort the remaining targets. It returns the
// updated failedIdents map, or an error when validation or the target lookup
// fails.
func (rt *Router) targetBindTags(f targetTagsForm, failedIdents map[string]string) (map[string]string, error) {
	// Step 1: reject the whole request on any malformed tag.
	if err := rt.validateTags(f.Tags); err != nil {
		return nil, err
	}

	// Step 2: load the targets to modify.
	targets, err := models.TargetsGetByIdents(rt.Ctx, f.Idents)
	if err != nil {
		return nil, err
	}

	// Step 3: tag each target, collecting individual failures.
	for _, tgt := range targets {
		if addErr := rt.addTagsToTarget(tgt, f.Tags); addErr != nil {
			failedIdents[tgt.Ident] = addErr.Error()
		}
	}

	return failedIdents, nil
}
// validateTags checks that every tag has the form key=value with exactly one
// '=', a non-blank key and value, and a key that contains neither '.' nor '-'
// and matches model.LabelNameRE. It returns an error describing the first
// offending tag, or nil when all tags are valid.
func (rt *Router) validateTags(tags []string) error {
	for _, tag := range tags {
		// Exactly one '=' is allowed, i.e. the tag splits into two parts.
		if strings.Count(tag, "=") != 1 {
			return fmt.Errorf("invalid tag format: %s (expected format: key=value)", tag)
		}

		rawKey, rawValue, _ := strings.Cut(tag, "=")
		key, value := strings.TrimSpace(rawKey), strings.TrimSpace(rawValue)

		switch {
		case key == "":
			return fmt.Errorf("invalid tag: key is empty in tag %s", tag)
		case value == "":
			return fmt.Errorf("invalid tag: value is empty in tag %s", tag)
		case strings.Contains(key, "."):
			return fmt.Errorf("invalid tag key: %s (key cannot contain '.')", key)
		case strings.Contains(key, "-"):
			return fmt.Errorf("invalid tag key: %s (key cannot contain '-')", key)
		case !model.LabelNameRE.MatchString(key):
			return fmt.Errorf("invalid tag key: %s (key must start with a letter or underscore, followed by letters, digits, or underscores)", key)
		}
	}

	return nil
}
// addTagsToTarget adds tags to target unless one of the tag keys (the text
// before the first '=') is already present in target.TagsMap, in which case a
// duplicate-key error is returned and nothing is added.
func (rt *Router) addTagsToTarget(target *models.Target, tags []string) error {
	for i := range tags {
		key, _, _ := strings.Cut(tags[i], "=")
		if _, taken := target.TagsMap[key]; taken {
			return fmt.Errorf("duplicate tagkey(%s)", key)
		}
	}

	return target.AddTags(rt.Ctx, tags)
}
// targetUnbindTagsByFE removes tags from targets on behalf of a logged-in
// user. The targets are resolved from the supplied idents and host IPs, the
// caller's permission on them is checked, and the per-ident failures are
// rendered.
func (rt *Router) targetUnbindTagsByFE(c *gin.Context) {
	var form targetTagsForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	rt.checkTargetPerm(c, form.Idents)

	ginx.NewRender(c).Data(rt.targetUnbindTags(form, failed))
}
// targetUnbindTagsByService removes tags from targets for service callers. It
// mirrors targetUnbindTagsByFE but performs no per-user permission check.
func (rt *Router) targetUnbindTagsByService(c *gin.Context) {
	var form targetTagsForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	ginx.NewRender(c).Data(rt.targetUnbindTags(form, failed))
}
// targetUnbindTags removes f.Tags from every target in f.Idents. A target
// whose tags cannot be removed is recorded in failedIdents (ident -> error
// text) and does not abort the remaining targets. It returns the updated
// failedIdents map, or an error when the target lookup fails.
func (rt *Router) targetUnbindTags(f targetTagsForm, failedIdents map[string]string) (map[string]string, error) {
	// Step 1: load the targets to modify.
	targets, err := models.TargetsGetByIdents(rt.Ctx, f.Idents)
	if err != nil {
		return nil, err
	}

	// Step 2: strip the tags, collecting individual failures.
	for _, tgt := range targets {
		if delErr := tgt.DelTags(rt.Ctx, f.Tags); delErr != nil {
			failedIdents[tgt.Ident] = delErr.Error()
		}
	}

	return failedIdents, nil
}
// targetNoteForm is the JSON request body of the note update handlers. Per
// the binding tags, Idents and HostIps are each required only when the other
// one is absent. Note is the text passed to models.TargetUpdateNote.
type targetNoteForm struct {
Idents []string `json:"idents" binding:"required_without=HostIps"`
HostIps []string `json:"host_ips" binding:"required_without=Idents"`
Note string `json:"note"`
}
// targetUpdateNote sets the note of the targets resolved from the supplied
// idents and host IPs, after checking the caller's permission on them. The
// idents that could not be resolved are rendered as the response data.
func (rt *Router) targetUpdateNote(c *gin.Context) {
	var form targetNoteForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	rt.checkTargetPerm(c, form.Idents)

	ginx.NewRender(c).Data(failed, models.TargetUpdateNote(rt.Ctx, form.Idents, form.Note))
}
// targetUpdateNoteByService sets the note of targets for service callers. It
// mirrors targetUpdateNote but performs no per-user permission check.
func (rt *Router) targetUpdateNoteByService(c *gin.Context) {
	var form targetNoteForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	ginx.NewRender(c).Data(failed, models.TargetUpdateNote(rt.Ctx, form.Idents, form.Note))
}
// targetBgidForm is the JSON request body bound by targetUpdateBgidByService.
// Per the binding tags, Idents and HostIps are each required only when the
// other one is absent. Bgid is the single business group the targets are
// overridden to.
type targetBgidForm struct {
Idents []string `json:"idents" binding:"required_without=HostIps"`
HostIps []string `json:"host_ips" binding:"required_without=Idents"`
Bgid int64 `json:"bgid"`
}
// targetBgidsForm is the JSON request body bound by targetBindBgids. Per the
// binding tags, Idents and HostIps are each required only when the other one
// is absent. Bgids and Tags are forwarded to the bind/override model calls.
type targetBgidsForm struct {
Idents []string `json:"idents" binding:"required_without=HostIps"`
HostIps []string `json:"host_ips" binding:"required_without=Idents"`
Bgids []int64 `json:"bgids"`
Tags []string `json:"tags"`
Action string `json:"action"` // one of "add", "del" or "reset"; anything else is rejected by targetBindBgids
}
// haveNeverGroupedIdent reports whether at least one of idents currently
// belongs to no business group. It stops at the first such ident, or at the
// first lookup error, which is returned.
func haveNeverGroupedIdent(ctx *ctx.Context, idents []string) (bool, error) {
	for i := range idents {
		groupIDs, err := models.TargetGroupIdsGetByIdent(ctx, idents[i])
		switch {
		case err != nil:
			return false, err
		case len(groupIDs) == 0:
			return true, nil
		}
	}

	return false, nil
}
// targetBindBgids adds targets to, removes them from, or resets their
// business groups, depending on the "action" field ("add", "del", "reset").
//
// Non-admin callers must pass three checks: permission on the targets that
// already exist, "rw" permission on every requested business group (plus, for
// "reset", every group the targets currently belong to), and the
// "/targets/bind" permission when any target has never been grouped.
func (rt *Router) targetBindBgids(c *gin.Context) {
	var form targetBgidsForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	me := c.MustGet("user").(*models.User)
	if !me.IsAdmin() {
		// Regular users must be allowed to operate on the targets and on
		// every business group involved in the request.
		existing, _, err := models.SeparateTargetIdents(rt.Ctx, form.Idents)
		ginx.Dangerous(err)
		rt.checkTargetPerm(c, existing)

		var groupsToCheck []int64
		if form.Action == "reset" {
			// An override also affects the groups the targets are currently
			// in, so those need to be permission-checked too.
			current, err := models.TargetGroupIdsGetByIdents(rt.Ctx, form.Idents)
			ginx.Dangerous(err)
			groupsToCheck = append(groupsToCheck, current...)
		}
		groupsToCheck = append(groupsToCheck, form.Bgids...)

		for _, gid := range groupsToCheck {
			group := BusiGroup(rt.Ctx, gid)
			allowed, err := me.CanDoBusiGroup(rt.Ctx, group, "rw")
			ginx.Dangerous(err)
			if !allowed {
				ginx.Bomb(http.StatusForbidden, "forbidden")
			}
		}

		neverGrouped, checkErr := haveNeverGroupedIdent(rt.Ctx, form.Idents)
		ginx.Dangerous(checkErr)
		if neverGrouped {
			allowed, err := me.CheckPerm(rt.Ctx, "/targets/bind")
			ginx.Dangerous(err)
			if !allowed {
				ginx.Bomb(http.StatusForbidden, "forbidden")
			}
		}
	}

	render := ginx.NewRender(c)
	switch form.Action {
	case "add":
		render.Data(failed, models.TargetBindBgids(rt.Ctx, form.Idents, form.Bgids, form.Tags))
	case "del":
		render.Data(failed, models.TargetUnbindBgids(rt.Ctx, form.Idents, form.Bgids))
	case "reset":
		render.Data(failed, models.TargetOverrideBgids(rt.Ctx, form.Idents, form.Bgids, form.Tags))
	default:
		ginx.Bomb(http.StatusBadRequest, "invalid action")
	}
}
// targetUpdateBgidByService overrides the business groups of the targets
// resolved from the supplied idents and host IPs with the single group in the
// request body. No per-user permission check is performed.
func (rt *Router) targetUpdateBgidByService(c *gin.Context) {
	var form targetBgidForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	groups := []int64{form.Bgid}
	ginx.NewRender(c).Data(failed, models.TargetOverrideBgids(rt.Ctx, form.Idents, groups, nil))
}
// identsForm is the JSON request body of the target delete handlers. Per the
// binding tags, Idents and HostIps are each required only when the other one
// is absent.
type identsForm struct {
Idents []string `json:"idents" binding:"required_without=HostIps"`
HostIps []string `json:"host_ips" binding:"required_without=Idents"`
}
// targetDel deletes targets on behalf of a logged-in user. The targets are
// resolved from the supplied idents and host IPs; the idents that could not
// be resolved are rendered as the response data.
//
// Like the other user-facing target handlers in this file (tag bind/unbind,
// note update), it verifies via rt.checkTargetPerm that the caller may operate
// on every resolved target before deleting; targetDelByService is the
// variant without that check.
func (rt *Router) targetDel(c *gin.Context) {
	var f identsForm
	var err error
	var failedResults = make(map[string]string)
	ginx.BindJSON(c, &f)

	if len(f.Idents) == 0 && len(f.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps)
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	// Abort with 403 when the caller lacks permission on any target.
	rt.checkTargetPerm(c, f.Idents)

	ginx.NewRender(c).Data(failedResults, models.TargetDel(rt.Ctx, f.Idents, rt.TargetDeleteHook))
}
// targetDelByService deletes the targets resolved from the supplied idents
// and host IPs for service callers, without a per-user permission check. The
// idents that could not be resolved are rendered as the response data.
func (rt *Router) targetDelByService(c *gin.Context) {
	var form identsForm
	ginx.BindJSON(c, &form)

	if len(form.Idents)+len(form.HostIps) == 0 {
		ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided")
	}

	// Resolve the final ident list from both idents and host IPs.
	failed, resolved, err := models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, form.Idents, form.HostIps)
	form.Idents = resolved
	if err != nil {
		ginx.Bomb(http.StatusBadRequest, err.Error())
	}

	ginx.NewRender(c).Data(failed, models.TargetDel(rt.Ctx, form.Idents, rt.TargetDeleteHook))
}
// checkTargetPerm aborts the request with 403 when user.NopriIdents reports
// any of idents as not permitted for the current user.
func (rt *Router) checkTargetPerm(c *gin.Context, idents []string) {
	me := c.MustGet("user").(*models.User)

	denied, err := me.NopriIdents(rt.Ctx, idents)
	ginx.Dangerous(err)

	if len(denied) == 0 {
		return
	}
	ginx.Bomb(http.StatusForbidden, "forbidden")
}
// targetsOfAlertRule renders, for the engine named by the "engine_name" query
// parameter, a map of rule id -> target idents taken from
// models.GetTargetsOfHostAlertRule. Entries for other engines are dropped.
func (rt *Router) targetsOfAlertRule(c *gin.Context) {
	engineName := ginx.QueryStr(c, "engine_name", "")
	byEngine, err := models.GetTargetsOfHostAlertRule(rt.Ctx, engineName)

	result := make(map[string]map[int64][]string)
	for engine, byRule := range byEngine {
		if engine != engineName {
			continue
		}

		rules := make(map[int64][]string)
		result[engine] = rules
		for ruleID, identList := range byRule {
			rules[ruleID] = identList
		}
	}

	ginx.NewRender(c).Data(result, err)
}
// checkTargetsExistByIndent aborts the request with 400, listing the missing
// idents, when models.TargetNoExistIdents reports any of idents as missing.
func (rt *Router) checkTargetsExistByIndent(idents []string) {
	missing, err := models.TargetNoExistIdents(rt.Ctx, idents)
	ginx.Dangerous(err)

	if len(missing) == 0 {
		return
	}
	ginx.Bomb(http.StatusBadRequest, "targets not exist: %s", strings.Join(missing, ", "))
}
// targetsOfHostQuery renders every target matching the host queries given as
// a JSON array in the request body (limit and offset are both passed as 0).
// A database error aborts the request with 500.
func (rt *Router) targetsOfHostQuery(c *gin.Context) {
	var queries []models.HostQuery
	ginx.BindJSON(c, &queries)

	session := models.TargetFilterQueryBuild(rt.Ctx, models.GetHostsQuery(queries), 0, 0)

	var targets []*models.Target
	if err := session.Find(&targets).Error; err != nil {
		ginx.Bomb(http.StatusInternalServerError, err.Error())
	}

	ginx.NewRender(c).Data(targets, nil)
}
// targetStats renders aggregate statistics over the cached targets visible to
// the caller: total count, alive/dead counts (alive = last beat less than 180s
// ago), CPU and memory usage histograms, and a count per agent version.
//
// Scope follows the "gids" query parameter: explicit gids are
// permission-checked (positive ones only); with no gids a non-admin is
// restricted to their own business groups plus ungrouped targets (group 0),
// and an admin sees everything.
func (rt *Router) targetStats(c *gin.Context) {
	bgids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")

	var err error
	if len(bgids) == 0 {
		me := c.MustGet("user").(*models.User)
		if !me.IsAdmin() {
			bgids, err = models.MyBusiGroupIds(rt.Ctx, me.Id)
			ginx.Dangerous(err)
			bgids = append(bgids, 0)
		}
	} else {
		for _, gid := range bgids {
			if gid > 0 {
				rt.bgroCheck(c, gid)
			}
		}
	}

	targets := rt.TargetCache.GetAll()
	now := time.Now().Unix()

	var total, alive, dead int64
	// Histogram keys are bucket upper bounds; "-1" collects targets whose
	// CpuNum is not positive.
	memUsage := map[string]int64{"-1": 0, "20": 0, "40": 0, "60": 0, "80": 0, "100": 0}
	cpuUsage := map[string]int64{"-1": 0, "20": 0, "40": 0, "60": 0, "80": 0, "100": 0}
	versions := make(map[string]int64)

	wanted := make(map[int64]struct{}, len(bgids))
	for _, gid := range bgids {
		wanted[gid] = struct{}{}
	}
	filterByGroup := len(bgids) > 0
	_, wantUngrouped := wanted[0]

	for _, tgt := range targets {
		if filterByGroup {
			// Group 0 in the filter selects targets without any group.
			inScope := wantUngrouped && len(tgt.GroupIds) == 0
			for i := 0; !inScope && i < len(tgt.GroupIds); i++ {
				_, inScope = wanted[tgt.GroupIds[i]]
			}
			if !inScope {
				continue
			}
		}

		total++

		if now-tgt.BeatTime < 180 {
			alive++
		} else {
			dead++
		}

		if tgt.CpuNum > 0 {
			cpuUsage[usageBucket(tgt.CpuUtil)]++
			memUsage[usageBucket(tgt.MemUtil)]++
		} else {
			cpuUsage["-1"]++
			memUsage["-1"]++
		}

		version := tgt.AgentVersion
		if version == "" {
			version = "unknown"
		}
		versions[version]++
	}

	ginx.NewRender(c).Data(gin.H{
		"count":       total,
		"alive_count": alive,
		"dead_count":  dead,
		"mem_usage":   memUsage,
		"cpu_usage":   cpuUsage,
		"versions":    versions,
	}, nil)
}
// usageBucket maps a utilisation value to the label of its histogram bucket:
// "20" for values below 20, "40" below 40, "60" below 60, "80" below 80 and
// "100" for everything else.
func usageBucket(val float64) string {
	buckets := [...]struct {
		upper float64
		label string
	}{
		{20, "20"},
		{40, "40"},
		{60, "60"},
		{80, "80"},
	}

	for _, b := range buckets {
		if val < b.upper {
			return b.label
		}
	}
	return "100"
}
// targetUpdate forwards the target list and timestamp from the JSON body to
// rt.IdentSet.UpdateTargets and renders the outcome.
func (rt *Router) targetUpdate(c *gin.Context) {
	var req idents.TargetUpdate
	ginx.BindJSON(c, &req)

	ginx.NewRender(c).Message(rt.IdentSet.UpdateTargets(req.Lst, req.Now))
}
================================================
FILE: center/router/router_task.go
================================================
package router
import (
"strings"
"time"
"github.com/ccfos/nightingale/v6/alert/sender"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// taskGets renders one page of task records of the business group given by
// the "id" URL parameter, together with the total count. Records are limited
// to the last "days" days (default 7), optionally to those created by the
// caller ("mine"), and optionally filtered by "query".
func (rt *Router) taskGets(c *gin.Context) {
	groupID := ginx.UrlParamInt64(c, "id")
	onlyMine := ginx.QueryBool(c, "mine", false)
	days := ginx.QueryInt64(c, "days", 7)
	limit := ginx.QueryInt(c, "limit", 20)
	query := ginx.QueryStr(c, "query", "")
	me := c.MustGet("user").(*models.User)

	var creator string
	if onlyMine {
		creator = me.Username
	}

	since := time.Now().Unix() - days*24*3600
	groupIDs := []int64{groupID}

	total, err := models.TaskRecordTotal(rt.Ctx, groupIDs, since, creator, query)
	ginx.Dangerous(err)

	records, err := models.TaskRecordGets(rt.Ctx, []int64{groupID}, since, creator, query, limit, ginx.Offset(c, limit))
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(gin.H{
		"total": total,
		"list":  records,
	}, nil)
}
// taskGetsByGids renders one page of task records across several business
// groups, together with the total count. Explicit "gids" are each
// permission-checked; with no gids a non-admin is restricted to their own
// business groups (an empty list is rendered when they have none) and an admin
// is not restricted. The "mine", "days", "limit" and "query" parameters
// behave as in taskGets.
func (rt *Router) taskGetsByGids(c *gin.Context) {
	gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")
	if len(gids) == 0 {
		me := c.MustGet("user").(*models.User)
		if !me.IsAdmin() {
			var err error
			gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id)
			ginx.Dangerous(err)

			if len(gids) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	} else {
		for _, gid := range gids {
			rt.bgroCheck(c, gid)
		}
	}

	onlyMine := ginx.QueryBool(c, "mine", false)
	days := ginx.QueryInt64(c, "days", 7)
	limit := ginx.QueryInt(c, "limit", 20)
	query := ginx.QueryStr(c, "query", "")
	user := c.MustGet("user").(*models.User)

	var creator string
	if onlyMine {
		creator = user.Username
	}

	since := time.Now().Unix() - days*24*3600

	total, err := models.TaskRecordTotal(rt.Ctx, gids, since, creator, query)
	ginx.Dangerous(err)

	records, err := models.TaskRecordGets(rt.Ctx, gids, since, creator, query, limit, ginx.Offset(c, limit))
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(gin.H{
		"total": total,
		"list":  records,
	}, nil)
}
// taskRecordAdd stores the task record decoded from the JSON request body and
// renders the outcome of the insert.
func (rt *Router) taskRecordAdd(c *gin.Context) {
	var record *models.TaskRecord
	ginx.BindJSON(c, &record)

	err := record.Add(rt.Ctx)
	ginx.NewRender(c).Message(err)
}
// taskAdd creates a task through ibex for the business group given by the
// "id" URL parameter and stores a matching task record. The request is
// rejected when ibex is disabled, when any host does not exist, when the form
// fails verification, or when the caller lacks permission on a host. The new
// task id is rendered.
func (rt *Router) taskAdd(c *gin.Context) {
	if !rt.Ibex.Enable {
		ginx.Bomb(400, i18n.Sprintf(c.GetHeader("X-Language"), "This functionality has not been enabled. Please contact the system administrator to activate it."))
		return
	}

	var form models.TaskForm
	ginx.BindJSON(c, &form)

	// Drop blank entries from the host list and trim the rest.
	cleaned := make([]string, 0, len(form.Hosts))
	for _, raw := range form.Hosts {
		if host := strings.TrimSpace(raw); host != "" {
			cleaned = append(cleaned, host)
		}
	}
	form.Hosts = cleaned

	groupID := ginx.UrlParamInt64(c, "id")
	me := c.MustGet("user").(*models.User)
	form.Creator = me.Username

	rt.checkTargetsExistByIndent(form.Hosts)

	ginx.Dangerous(form.Verify())

	form.HandleFH(form.Hosts[0])

	// The caller must be allowed to operate on every host.
	rt.checkTargetPerm(c, form.Hosts)

	// Hand the task over to ibex.
	taskID, err := sender.TaskAdd(form, me.Username, rt.Ctx.IsCenter)
	ginx.Dangerous(err)

	if taskID <= 0 {
		ginx.Dangerous("created task.id is zero")
	}

	// Persist the record of the created task.
	record := models.TaskRecord{
		Id:        taskID,
		GroupId:   groupID,
		Title:     form.Title,
		Account:   form.Account,
		Batch:     form.Batch,
		Tolerance: form.Tolerance,
		Timeout:   form.Timeout,
		Pause:     form.Pause,
		Script:    form.Script,
		Args:      form.Args,
		CreateAt:  time.Now().Unix(),
		CreateBy:  form.Creator,
	}

	err = record.Add(rt.Ctx)
	ginx.NewRender(c).Data(taskID, err)
}
================================================
FILE: center/router/router_task_tpl.go
================================================
package router
import (
"net/http"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/str"
)
// taskTplGets renders one page of task templates of the business group given
// by the "id" URL parameter, optionally filtered by "query", together with
// the total count. models.FillUpdateByNicknames is applied to the page before
// rendering.
func (rt *Router) taskTplGets(c *gin.Context) {
	query := ginx.QueryStr(c, "query", "")
	limit := ginx.QueryInt(c, "limit", 20)
	groupID := ginx.UrlParamInt64(c, "id")

	total, err := models.TaskTplTotal(rt.Ctx, []int64{groupID}, query)
	ginx.Dangerous(err)

	tpls, err := models.TaskTplGets(rt.Ctx, []int64{groupID}, query, limit, ginx.Offset(c, limit))
	ginx.Dangerous(err)

	models.FillUpdateByNicknames(rt.Ctx, tpls)

	ginx.NewRender(c).Data(gin.H{
		"total": total,
		"list":  tpls,
	}, nil)
}
// taskTplGetsByGids renders one page of task templates across several
// business groups, together with the total count. Explicit "gids" are each
// permission-checked; with no gids a non-admin is restricted to their own
// business groups (an empty list is rendered when they have none) and an admin
// is not restricted.
func (rt *Router) taskTplGetsByGids(c *gin.Context) {
	query := ginx.QueryStr(c, "query", "")
	limit := ginx.QueryInt(c, "limit", 20)

	gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",")
	if len(gids) == 0 {
		me := c.MustGet("user").(*models.User)
		if !me.IsAdmin() {
			var err error
			gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id)
			ginx.Dangerous(err)

			if len(gids) == 0 {
				ginx.NewRender(c).Data([]int{}, nil)
				return
			}
		}
	} else {
		for _, gid := range gids {
			rt.bgroCheck(c, gid)
		}
	}

	total, err := models.TaskTplTotal(rt.Ctx, gids, query)
	ginx.Dangerous(err)

	tpls, err := models.TaskTplGets(rt.Ctx, gids, query, limit, ginx.Offset(c, limit))
	ginx.Dangerous(err)

	models.FillUpdateByNicknames(rt.Ctx, tpls)

	ginx.NewRender(c).Data(gin.H{
		"total": total,
		"list":  tpls,
	}, nil)
}
// taskTplGet renders the task template given by the "tid" URL parameter along
// with its hosts, or aborts with 404 when the template does not exist.
func (rt *Router) taskTplGet(c *gin.Context) {
	tplID := ginx.UrlParamInt64(c, "tid")

	tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", tplID)
	ginx.Dangerous(err)

	if tpl == nil {
		ginx.Bomb(404, "no such task template")
	}

	hostList, err := tpl.Hosts(rt.Ctx)

	payload := gin.H{
		"tpl":   tpl,
		"hosts": hostList,
	}
	ginx.NewRender(c).Data(payload, err)
}
// taskTplGetByService renders the task template given by the "tid" URL
// parameter for service callers, or aborts with 404 when it does not exist.
func (rt *Router) taskTplGetByService(c *gin.Context) {
	tplID := ginx.UrlParamInt64(c, "tid")

	tpl, err := models.TaskTplGetById(rt.Ctx, tplID)
	ginx.Dangerous(err)

	if tpl == nil {
		ginx.Bomb(404, "no such task template")
	}

	ginx.NewRender(c).Data(tpl, err)
}
// taskTplGetsByService renders the result of models.TaskTplGetAll.
func (rt *Router) taskTplGetsByService(c *gin.Context) {
	all, err := models.TaskTplGetAll(rt.Ctx)
	ginx.NewRender(c).Data(all, err)
}
// taskTplStatistics renders the result of models.TaskTplStatistics.
func (rt *Router) taskTplStatistics(c *gin.Context) {
	stats, err := models.TaskTplStatistics(rt.Ctx)
	ginx.NewRender(c).Data(stats, err)
}
// taskTplForm is the JSON request body of the task template add/update
// handlers. Only Title is mandatory (binding:"required").
type taskTplForm struct {
	Title     string   `json:"title" binding:"required"`
	Batch     int      `json:"batch"`
	Tolerance int      `json:"tolerance"`
	Timeout   int      `json:"timeout"`
	Pause     string   `json:"pause"`
	Script    string   `json:"script"`
	Args      string   `json:"args"`
	Tags      []string `json:"tags"`
	Account   string   `json:"account"`
	Hosts     []string `json:"hosts"`
}

// Verify normalises f.Hosts in place: every entry is trimmed of surrounding
// whitespace and entries that end up empty are dropped, so an input such as
// []string{"", "a", "b"} becomes []string{"a", "b"}.
func (f *taskTplForm) Verify() {
	cleaned := make([]string, 0, len(f.Hosts))
	for i := range f.Hosts {
		host := strings.TrimSpace(f.Hosts[i])
		if host == "" {
			continue
		}
		cleaned = append(cleaned, host)
	}
	f.Hosts = cleaned
}
// taskTplAdd creates a task template in the business group given by the "id"
// URL parameter. The request is rejected when ibex is disabled or when any
// listed host does not exist. Tags are sorted and stored space-separated with
// a trailing space.
func (rt *Router) taskTplAdd(c *gin.Context) {
	if !rt.Ibex.Enable {
		ginx.Bomb(400, i18n.Sprintf(c.GetHeader("X-Language"), "This functionality has not been enabled. Please contact the system administrator to activate it."))
		return
	}

	var form taskTplForm
	ginx.BindJSON(c, &form)
	form.Verify()

	me := c.MustGet("user").(*models.User)
	createdAt := time.Now().Unix()

	rt.checkTargetsExistByIndent(form.Hosts)

	sort.Strings(form.Tags)
	groupID := ginx.UrlParamInt64(c, "id")

	tpl := &models.TaskTpl{
		GroupId:   groupID,
		Title:     form.Title,
		Batch:     form.Batch,
		Tolerance: form.Tolerance,
		Timeout:   form.Timeout,
		Pause:     form.Pause,
		Script:    form.Script,
		Args:      form.Args,
		Tags:      strings.Join(form.Tags, " ") + " ",
		Account:   form.Account,
		CreateBy:  me.Username,
		UpdateBy:  me.Username,
		CreateAt:  createdAt,
		UpdateAt:  createdAt,
	}

	ginx.NewRender(c).Message(tpl.Save(rt.Ctx, form.Hosts))
}
// taskTplPut overwrites the editable fields of the task template selected by
// the "tid" URL parameter with the values from the JSON body, then calls
// tpl.Update with the submitted host list. A missing template is reported
// through the rendered message.
func (rt *Router) taskTplPut(c *gin.Context) {
	tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "tid"))
	ginx.Dangerous(err)

	if tpl == nil {
		ginx.NewRender(c).Message("no such task template")
		return
	}

	editor := c.MustGet("user").(*models.User)

	var form taskTplForm
	ginx.BindJSON(c, &form)
	form.Verify()

	rt.checkTargetsExistByIndent(form.Hosts)
	sort.Strings(form.Tags)

	tpl.Title, tpl.Batch, tpl.Tolerance, tpl.Timeout = form.Title, form.Batch, form.Tolerance, form.Timeout
	tpl.Pause, tpl.Script, tpl.Args = form.Pause, form.Script, form.Args
	// Tags are stored as one space-separated string with a trailing space.
	tpl.Tags = strings.Join(form.Tags, " ") + " "
	tpl.Account = form.Account
	tpl.UpdateBy = editor.Username
	tpl.UpdateAt = time.Now().Unix()

	ginx.NewRender(c).Message(tpl.Update(rt.Ctx, form.Hosts))
}
// taskTplDel deletes the task template selected by the "tid" URL parameter.
// A template that does not exist is treated as success. Deletion is refused
// while models.GetAlertRuleIdsByTaskId still returns ids for the template.
func (rt *Router) taskTplDel(c *gin.Context) {
	tid := ginx.UrlParamInt64(c, "tid")

	tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", tid)
	ginx.Dangerous(err)
	if tpl == nil {
		ginx.NewRender(c).Message(nil)
		return
	}

	ruleIds, err := models.GetAlertRuleIdsByTaskId(rt.Ctx, tid)
	ginx.Dangerous(err)

	if len(ruleIds) == 0 {
		ginx.NewRender(c).Message(tpl.Del(rt.Ctx))
		return
	}

	ginx.NewRender(c).Message("can't del this task tpl, used by alert rule ids(%v) ", ruleIds)
}
// tplTagsForm is the JSON body bound by the handlers that bind tags to, or
// unbind tags from, task templates.
type tplTagsForm struct {
// Ids are the task template ids to modify.
Ids []int64 `json:"ids" binding:"required"`
// Tags are the tags to add or remove.
Tags []string `json:"tags" binding:"required"`
}
// Verify validates and normalizes the form in place. It bombs with 400 when
// Ids or Tags is empty, when a tag is rejected by str.Dangerous, or when no
// tag remains after trimming whitespace and dropping blank entries.
func (f *tplTagsForm) Verify() {
	if len(f.Ids) == 0 {
		ginx.Bomb(http.StatusBadRequest, "arg(ids) empty")
	}

	if len(f.Tags) == 0 {
		ginx.Bomb(http.StatusBadRequest, "arg(tags) empty")
	}

	kept := make([]string, 0, len(f.Tags))
	for _, raw := range f.Tags {
		tag := strings.TrimSpace(raw)
		if tag == "" {
			continue
		}

		if str.Dangerous(tag) {
			ginx.Bomb(http.StatusBadRequest, "arg(tags) invalid")
		}

		kept = append(kept, tag)
	}
	f.Tags = kept

	// Every submitted tag may have been blank, so check again.
	if len(f.Tags) == 0 {
		ginx.Bomb(http.StatusBadRequest, "arg(tags) empty")
	}
}
// taskTplBindTags adds the submitted tags to every task template listed in
// the form. Ids that match no template are skipped.
func (rt *Router) taskTplBindTags(c *gin.Context) {
	var form tplTagsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	operator := c.MustGet("username").(string)

	for _, id := range form.Ids {
		tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", id)
		ginx.Dangerous(err)

		if tpl != nil {
			ginx.Dangerous(tpl.AddTags(rt.Ctx, form.Tags, operator))
		}
	}

	ginx.NewRender(c).Message(nil)
}
// taskTplUnbindTags removes the submitted tags from every task template
// listed in the form. Ids that match no template are skipped.
func (rt *Router) taskTplUnbindTags(c *gin.Context) {
	var form tplTagsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	operator := c.MustGet("username").(string)

	for _, id := range form.Ids {
		tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", id)
		ginx.Dangerous(err)

		if tpl != nil {
			ginx.Dangerous(tpl.DelTags(rt.Ctx, form.Tags, operator))
		}
	}

	ginx.NewRender(c).Message(nil)
}
================================================
FILE: center/router/router_tdengine.go
================================================
package router
import (
"fmt"
"net/http"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/datasource/tdengine"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// databasesQueryForm is the request body for listing TDengine databases.
// Cate and DatasourceId are the pair used to look the datasource up in
// dscache.DsCache.
type databasesQueryForm struct {
Cate string `json:"cate" form:"cate"`
DatasourceId int64 `json:"datasource_id" form:"datasource_id"`
}
// tdengineDatabases renders the databases of the TDengine datasource named in
// the JSON body. It answers 404 when the datasource is not cached or is not a
// *tdengine.TDengine.
func (rt *Router) tdengineDatabases(c *gin.Context) {
	var form databasesQueryForm
	ginx.BindJSON(c, &form)

	ds, found := dscache.DsCache.Get(form.Cate, form.DatasourceId)
	td, isTD := ds.(*tdengine.TDengine)
	if !found || !isTD {
		ginx.NewRender(c, http.StatusNotFound).Message("No such datasource")
		return
	}

	dbs, err := td.ShowDatabases(rt.Ctx.Ctx)
	ginx.NewRender(c).Data(dbs, err)
}
// tablesQueryForm is the request body for listing the tables of a TDengine
// database.
type tablesQueryForm struct {
Cate string `json:"cate"`
DatasourceId int64 `json:"datasource_id" `
// Database is the database name; the handler appends ".tables" or
// ".stables" to it.
Database string `json:"db"`
// IsStable selects the ".stables" suffix instead of ".tables".
IsStable bool `json:"is_stable"`
}
// Column is the column description returned to the front end by
// tdengineColumns, which fills in Name and Type only.
type Column struct {
Name string `json:"name"`
Type string `json:"type"`
Size int `json:"size"`
}
// tdengineTables renders the tables of one database on a TDengine datasource.
// The name handed to ShowTables is "<db>.stables" when is_stable is set and
// "<db>.tables" otherwise. It answers 404 when the datasource is not cached
// or is not a *tdengine.TDengine.
func (rt *Router) tdengineTables(c *gin.Context) {
	var form tablesQueryForm
	ginx.BindJSON(c, &form)

	ds, found := dscache.DsCache.Get(form.Cate, form.DatasourceId)
	td, isTD := ds.(*tdengine.TDengine)
	if !found || !isTD {
		ginx.NewRender(c, http.StatusNotFound).Message("No such datasource")
		return
	}

	suffix := "tables"
	if form.IsStable {
		suffix = "stables"
	}
	target := fmt.Sprintf("%s.%s", form.Database, suffix)

	tables, err := td.ShowTables(rt.Ctx.Ctx, target)
	ginx.NewRender(c).Data(tables, err)
}
// columnsQueryForm is the request body for describing one table of a TDengine
// database. Database and Table are passed to DescribeTable under the keys
// "database" and "table".
type columnsQueryForm struct {
Cate string `json:"cate"`
DatasourceId int64 `json:"datasource_id" `
Database string `json:"db"`
Table string `json:"table"`
}
// tdengineColumns describes one table of a TDengine datasource and renders
// its columns as a list of Column values. It answers 404 when the datasource
// is not cached or is not a *tdengine.TDengine.
func (rt *Router) tdengineColumns(c *gin.Context) {
	var form columnsQueryForm
	ginx.BindJSON(c, &form)

	ds, found := dscache.DsCache.Get(form.Cate, form.DatasourceId)
	td, isTD := ds.(*tdengine.TDengine)
	if !found || !isTD {
		ginx.NewRender(c, http.StatusNotFound).Message("No such datasource")
		return
	}

	query := map[string]string{
		"database": form.Database,
		"table":    form.Table,
	}
	fields, err := td.DescribeTable(rt.Ctx.Ctx, query)

	// Reshape the result into what the front end expects; later on the
	// TDengine query endpoints could be unified.
	out := make([]Column, 0, len(fields))
	for _, field := range fields {
		out = append(out, Column{
			Name: field.Field,
			Type: field.Type,
		})
	}

	ginx.NewRender(c).Data(out, err)
}
// QuerySqlTemplate renders the SQL template map for the datasource category
// given in the "cate" query parameter. Only models.TDENGINE has templates;
// every other category gets an empty map.
func (rt *Router) QuerySqlTemplate(c *gin.Context) {
	tpl := make(map[string]string)
	if ginx.QueryStr(c, "cate") == models.TDENGINE {
		tpl = cconf.TDengineSQLTpl
	}

	ginx.NewRender(c).Data(tpl, nil)
}
================================================
FILE: center/router/router_trace_logs.go
================================================
package router
import (
"encoding/json"
"fmt"
"io"
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/ccfos/nightingale/v6/pkg/loggrep"
"github.com/toolkits/pkg/logger"
"github.com/gin-gonic/gin"
)
// traceLogsPage serves the HTML log viewer for the trace id in the "traceid"
// URL parameter. An id rejected by loggrep.IsValidTraceID yields a plain-text
// 400; a lookup or render failure yields a plain-text 500.
func (rt *Router) traceLogsPage(c *gin.Context) {
	traceId := ginx.UrlParamStr(c, "traceid")
	if !loggrep.IsValidTraceID(traceId) {
		c.String(http.StatusBadRequest, "invalid trace id format")
		return
	}

	logs, instance, err := rt.getTraceLogs(traceId)
	if err != nil {
		c.String(http.StatusInternalServerError, "Error: %v", err)
		return
	}

	page := loggrep.TraceLogsPageData{
		TraceID:  traceId,
		Instance: instance,
		Logs:     logs,
		Total:    len(logs),
	}

	c.Header("Content-Type", "text/html; charset=utf-8")
	if renderErr := loggrep.RenderTraceLogsHTML(c.Writer, page); renderErr != nil {
		c.String(http.StatusInternalServerError, "render error: %v", renderErr)
	}
}
// traceLogsJSON is the JSON counterpart of traceLogsPage: it renders the log
// lines for the "traceid" URL parameter together with the instance they came
// from.
func (rt *Router) traceLogsJSON(c *gin.Context) {
	traceId := ginx.UrlParamStr(c, "traceid")
	if valid := loggrep.IsValidTraceID(traceId); !valid {
		ginx.Bomb(200, "invalid trace id format")
	}

	logs, instance, err := rt.getTraceLogs(traceId)
	ginx.Dangerous(err)

	resp := loggrep.EventDetailResp{
		Logs:     logs,
		Instance: instance,
	}
	ginx.NewRender(c).Data(resp, nil)
}
// getTraceLogs collects the log lines containing "trace_id=<traceId>".
//
// The local log directory is searched first. When it yields nothing (or the
// search fails), every instance of the same engine cluster whose clock is
// newer than 30 seconds ago is queried in turn, and the first non-empty
// answer wins. Trace logs belong to a single instance, so one hit is enough.
//
// It returns the log lines and the instance that produced them. When nothing
// is found, the logs are nil and the instance is the local one.
func (rt *Router) getTraceLogs(traceId string) ([]string, string, error) {
	self := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port)

	// Local files first: no network round trip is needed.
	if local, err := loggrep.GrepLatestLogFiles(rt.LogDir, "trace_id="+traceId); err == nil && len(local) > 0 {
		return local, self, nil
	}

	peers, err := models.AlertingEngineGetsInstances(rt.Ctx,
		"engine_cluster = ? and clock > ?",
		rt.Alert.Heartbeat.EngineName, time.Now().Unix()-30)
	if err != nil {
		return nil, "", err
	}

	for _, peer := range peers {
		if peer == self {
			// The local instance was already searched above.
			continue
		}

		remote, addr, fwdErr := rt.forwardTraceLogs(peer, traceId)
		if fwdErr != nil {
			logger.Errorf("forwardTraceLogs failed: %v", fwdErr)
			continue
		}

		if len(remote) > 0 {
			return remote, addr, nil
		}
	}

	return nil, self, nil
}
// forwardTraceLogs asks the peer instance at node (host:port) for the logs of
// traceId through its /v1/n9e/trace-logs/ endpoint.
//
// On success it returns the log lines and the instance name reported by the
// peer. On any failure the logs are nil, the second result is node, and the
// error describes what went wrong: a transport failure, an undecodable
// response, an error reported in the response body, or a non-200 status.
func (rt *Router) forwardTraceLogs(node, traceId string) ([]string, string, error) {
	url := fmt.Sprintf("http://%s/v1/n9e/trace-logs/%s", node, traceId)
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, node, err
	}

	// Authenticate with the first configured basic-auth pair the iteration
	// yields.
	for user, pass := range rt.HTTP.APIForService.BasicAuth {
		req.SetBasicAuth(user, pass)
		break
	}

	client := &http.Client{Timeout: 15 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, node, fmt.Errorf("forward to %s failed: %w", node, err)
	}
	defer resp.Body.Close()

	// Read at most 10 MiB of the response body.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024))
	if err != nil {
		return nil, node, err
	}

	var result struct {
		Dat loggrep.EventDetailResp `json:"dat"`
		Err string                  `json:"err"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		// Include the status so that e.g. an auth failure with a non-JSON
		// body is recognizable in the log.
		return nil, node, fmt.Errorf("decode response from %s (status %d): %w", node, resp.StatusCode, err)
	}

	if result.Err != "" {
		return nil, node, fmt.Errorf("%s", result.Err)
	}

	// A body without an error message is only trustworthy on a 200 response.
	if resp.StatusCode != http.StatusOK {
		return nil, node, fmt.Errorf("forward to %s failed: unexpected status %d", node, resp.StatusCode)
	}

	return result.Dat.Logs, result.Dat.Instance, nil
}
================================================
FILE: center/router/router_user.go
================================================
package router
import (
"fmt"
"net/http"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
"gorm.io/gorm"
)
// userBusiGroupsGets renders the business groups of one user. The user is
// selected by the "userid" query parameter or, when that is absent or zero,
// by "username".
//
// It bombs with 400 when neither parameter is given and with 404 when the
// lookup finds no user.
func (rt *Router) userBusiGroupsGets(c *gin.Context) {
	userid := ginx.QueryInt64(c, "userid", 0)
	username := ginx.QueryStr(c, "username", "")

	if userid == 0 && username == "" {
		ginx.Bomb(http.StatusBadRequest, "userid or username required")
	}

	var user *models.User
	var err error
	if userid > 0 {
		user, err = models.UserGetById(rt.Ctx, userid)
	} else {
		user, err = models.UserGetByUsername(rt.Ctx, username)
	}
	ginx.Dangerous(err)

	// The lookups can return a nil user without an error (other handlers in
	// this file check for that), so guard before calling a method on it.
	if user == nil {
		ginx.Bomb(http.StatusNotFound, "no such user")
	}

	groups, err := user.BusiGroups(rt.Ctx, 10000, "")
	ginx.NewRender(c).Data(groups, err)
}
// userFindAll renders the result of models.UserGetAll.
func (rt *Router) userFindAll(c *gin.Context) {
	users, findErr := models.UserGetAll(rt.Ctx)
	ginx.NewRender(c).Data(users, findErr)
}
// userGets renders one page of users plus the total count and whether the
// caller is an admin. Filtering, ordering and paging come from the query
// string; "usernames", "phones" and "emails" are comma-separated lists.
func (rt *Router) userGets(c *gin.Context) {
	stime, etime := getTimeRange(c)
	limit := ginx.QueryInt(c, "limit", 20)
	query := ginx.QueryStr(c, "query", "")
	order := ginx.QueryStr(c, "order", "username")
	desc := ginx.QueryBool(c, "desc", false)

	// listParam splits a comma-separated query parameter; an absent or empty
	// parameter becomes an empty list rather than a list holding "".
	listParam := func(key string) []string {
		items := strings.Split(ginx.QueryStr(c, key, ""), ",")
		if len(items) == 1 && items[0] == "" {
			return []string{}
		}
		return items
	}
	usernames := listParam("usernames")
	phones := listParam("phones")
	emails := listParam("emails")

	go rt.UserCache.UpdateUsersLastActiveTime()

	total, err := models.UserTotal(rt.Ctx, query, stime, etime)
	ginx.Dangerous(err)

	list, err := models.UserGets(rt.Ctx, query, limit, ginx.Offset(c, limit), stime, etime, order, desc, usernames, phones, emails)
	ginx.Dangerous(err)

	me := c.MustGet("user").(*models.User)
	payload := gin.H{
		"list":  list,
		"total": total,
		"admin": me.IsAdmin(),
	}
	ginx.NewRender(c).Data(payload, nil)
}
// userAddForm is the JSON body bound by userAddPost. Username, Password and
// Roles are marked as required.
type userAddForm struct {
Username string `json:"username" binding:"required"`
// Password is RSA-decrypted by the handler first when rt.HTTP.RSA.OpenRSA
// is set.
Password string `json:"password" binding:"required"`
Nickname string `json:"nickname"`
Phone string `json:"phone"`
Email string `json:"email"`
Portrait string `json:"portrait"`
// Roles are joined with single spaces before being stored.
Roles []string `json:"roles" binding:"required"`
Contacts ormx.JSONObj `json:"contacts"`
}
// userAddPost creates a user from the JSON body. When rt.HTTP.RSA.OpenRSA is
// set, the submitted password is RSA-decrypted first; the result is passed
// through models.CryptoPass before being stored. An empty role list bombs
// with 400.
func (rt *Router) userAddPost(c *gin.Context) {
	var form userAddForm
	ginx.BindJSON(c, &form)

	plain := form.Password
	if rt.HTTP.RSA.OpenRSA {
		decrypted, err := secu.Decrypt(form.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
		if err != nil {
			logger.Errorf("RSA Decrypt failed: %v username: %s", err, form.Username)
			ginx.NewRender(c).Message(err)
			return
		}
		plain = decrypted
	}

	cryptoPass, err := models.CryptoPass(rt.Ctx, plain)
	ginx.Dangerous(err)

	if len(form.Roles) == 0 {
		ginx.Bomb(http.StatusBadRequest, "roles empty")
	}

	operator := Username(c)
	newUser := models.User{
		Username: form.Username,
		Password: cryptoPass,
		Nickname: form.Nickname,
		Phone:    form.Phone,
		Email:    form.Email,
		Portrait: form.Portrait,
		Roles:    strings.Join(form.Roles, " "),
		Contacts: form.Contacts,
		CreateBy: operator,
		UpdateBy: operator,
	}

	ginx.Dangerous(newUser.Verify())
	ginx.NewRender(c).Message(newUser.Add(rt.Ctx))
}
// userProfileGet renders the user selected by the "id" URL parameter.
func (rt *Router) userProfileGet(c *gin.Context) {
	id := ginx.UrlParamInt64(c, "id")
	profile := User(rt.Ctx, id)
	ginx.NewRender(c).Data(profile, nil)
}
// userProfileForm is the JSON body bound by userProfilePut.
type userProfileForm struct {
Nickname string `json:"nickname"`
Phone string `json:"phone"`
Email string `json:"email"`
// Roles must not be empty; they are joined with single spaces before
// being stored.
Roles []string `json:"roles"`
Contacts ormx.JSONObj `json:"contacts"`
}
// userProfilePutByService overwrites the profile of the user selected by the
// "id" URL parameter with the models.User sent in the JSON body, including
// the password (passed through models.CryptoPass). An empty RolesLst bombs
// with 400.
func (rt *Router) userProfilePutByService(c *gin.Context) {
	var incoming models.User
	ginx.BindJSON(c, &incoming)

	if len(incoming.RolesLst) == 0 {
		ginx.Bomb(http.StatusBadRequest, "roles empty")
	}

	cryptoPass, err := models.CryptoPass(rt.Ctx, incoming.Password)
	ginx.Dangerous(err)

	target := User(rt.Ctx, ginx.UrlParamInt64(c, "id"))

	target.Password = cryptoPass
	target.Nickname, target.Phone, target.Email = incoming.Nickname, incoming.Phone, incoming.Email
	target.Portrait = incoming.Portrait
	target.Contacts = incoming.Contacts
	target.Roles = strings.Join(incoming.RolesLst, " ")
	target.UpdateBy = Username(c)

	ginx.NewRender(c).Message(target.UpdateAllFields(rt.Ctx))
}
// userProfilePut updates the nickname, phone, email, roles and contacts of
// the user selected by the "id" URL parameter. When flashduty.NeedSyncUser
// reports true, the change is also passed to flashduty.UpdateUser along with
// the previous username, phone and email. An empty role list bombs with 400.
func (rt *Router) userProfilePut(c *gin.Context) {
	var form userProfileForm
	ginx.BindJSON(c, &form)

	if len(form.Roles) == 0 {
		ginx.Bomb(http.StatusBadRequest, "roles empty")
	}

	target := User(rt.Ctx, ginx.UrlParamInt64(c, "id"))

	// Snapshot the identifying fields before they are overwritten.
	before := models.User{
		Username: target.Username,
		Phone:    target.Phone,
		Email:    target.Email,
	}

	target.Nickname, target.Phone, target.Email = form.Nickname, form.Phone, form.Email
	target.Roles = strings.Join(form.Roles, " ")
	target.Contacts = form.Contacts
	target.UpdateBy = c.MustGet("username").(string)

	if flashduty.NeedSyncUser(rt.Ctx) {
		flashduty.UpdateUser(rt.Ctx, before, form.Email, form.Phone)
	}

	ginx.NewRender(c).Message(target.UpdateAllFields(rt.Ctx))
}
// userPasswordForm is the JSON body bound by userPasswordPut. The password is
// RSA-decrypted by the handler first when rt.HTTP.RSA.OpenRSA is set.
type userPasswordForm struct {
Password string `json:"password" binding:"required"`
}
// userPasswordPut sets a new password for the user selected by the "id" URL
// parameter. When rt.HTTP.RSA.OpenRSA is set, the submitted value is
// RSA-decrypted first; the result is passed through models.CryptoPass before
// being stored.
func (rt *Router) userPasswordPut(c *gin.Context) {
	var form userPasswordForm
	ginx.BindJSON(c, &form)

	target := User(rt.Ctx, ginx.UrlParamInt64(c, "id"))

	plain := form.Password
	if rt.HTTP.RSA.OpenRSA {
		decrypted, err := secu.Decrypt(form.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord)
		if err != nil {
			logger.Errorf("RSA Decrypt failed: %v username: %s", err, target.Username)
			ginx.NewRender(c).Message(err)
			return
		}
		plain = decrypted
	}

	cryptoPass, err := models.CryptoPass(rt.Ctx, plain)
	ginx.Dangerous(err)

	operator := c.MustGet("username").(string)
	ginx.NewRender(c).Message(target.UpdatePassword(rt.Ctx, cryptoPass, operator))
}
// userDel deletes the user selected by the "id" URL parameter. A user that
// does not exist is treated as success. Deleting an admin is refused with 400
// when models.CountAdminUsers reports one admin or fewer.
func (rt *Router) userDel(c *gin.Context) {
	target, err := models.UserGetById(rt.Ctx, ginx.UrlParamInt64(c, "id"))
	ginx.Dangerous(err)

	if target == nil {
		ginx.NewRender(c).Message(nil)
		return
	}

	// When the user to delete has the admin role, make sure it is not the
	// last remaining admin.
	if target.IsAdmin() {
		admins, err := models.CountAdminUsers(rt.Ctx)
		ginx.Dangerous(err)

		if admins <= 1 {
			ginx.Bomb(http.StatusBadRequest, "Cannot delete the last admin user")
		}
	}

	ginx.NewRender(c).Message(target.Del(rt.Ctx))
}
// installDateGet renders the CreateAt value of the "root" user. When the
// lookup fails or the user does not exist, the problem is logged and 0 is
// rendered instead.
func (rt *Router) installDateGet(c *gin.Context) {
	rootUser, err := models.UserGetByUsername(rt.Ctx, "root")

	switch {
	case err != nil:
		logger.Errorf("get root user failed: %v", err)
	case rootUser == nil:
		logger.Errorf("root user not found")
	default:
		ginx.NewRender(c).Data(rootUser.CreateAt, nil)
		return
	}

	ginx.NewRender(c).Data(0, nil)
}
// usersPhoneEncrypt encrypts the phone numbers of all users in one pass.
//
// It first turns the phone-encryption setting on and refreshes the cached
// setting, then encrypts every non-empty phone number that does not yet carry
// the encrypted marker, updating the rows inside a single transaction. If any
// step fails, the setting is switched back off and the error is rendered. On
// success the success and failure counts are rendered.
func (rt *Router) usersPhoneEncrypt(c *gin.Context) {
users, err := models.UserGetAll(rt.Ctx)
if err != nil {
ginx.NewRender(c).Message(fmt.Errorf("get users failed: %v", err))
return
}
// Fetch the RSA keys; only the public key is used here.
_, publicKey, _, err := models.GetRSAKeys(rt.Ctx)
if err != nil {
ginx.NewRender(c).Message(fmt.Errorf("get RSA keys failed: %v", err))
return
}
// Enable the phone-encryption feature first.
err = models.SetPhoneEncryptionEnabled(rt.Ctx, true)
if err != nil {
ginx.NewRender(c).Message(fmt.Errorf("enable phone encryption failed: %v", err))
return
}
// Refresh the cached configuration.
err = models.RefreshPhoneEncryptionCache(rt.Ctx)
if err != nil {
logger.Errorf("Failed to refresh phone encryption cache: %v", err)
// Roll the setting back.
models.SetPhoneEncryptionEnabled(rt.Ctx, false)
ginx.NewRender(c).Message(fmt.Errorf("refresh cache failed: %v", err))
return
}
successCount := 0
failCount := 0
var failedUsers []string
// Encrypt all users' phone numbers inside one transaction.
err = models.DB(rt.Ctx).Transaction(func(tx *gorm.DB) error {
// Encrypt each user's phone number in turn.
for _, user := range users {
// Nothing to encrypt.
if user.Phone == "" {
continue
}
// Already encrypted; leave it alone.
if isPhoneEncrypted(user.Phone) {
continue
}
encryptedPhone, err := secu.EncryptValue(user.Phone, publicKey)
if err != nil {
logger.Errorf("Failed to encrypt phone for user %s: %v", user.Username, err)
failCount++
failedUsers = append(failedUsers, user.Username)
continue
}
err = tx.Model(&models.User{}).Where("id = ?", user.Id).Update("phone", encryptedPhone).Error
if err != nil {
logger.Errorf("Failed to update phone for user %s: %v", user.Username, err)
failCount++
failedUsers = append(failedUsers, user.Username)
continue
}
successCount++
logger.Debugf("Successfully encrypted phone for user %s", user.Username)
}
// If any user failed, return an error so the transaction is rolled back.
if failCount > 0 {
return fmt.Errorf("encrypt failed users: %d, failed users: %v", failCount, failedUsers)
}
return nil
})
if err != nil {
// Encryption failed: roll the setting back and refresh the cache.
models.SetPhoneEncryptionEnabled(rt.Ctx, false)
models.RefreshPhoneEncryptionCache(rt.Ctx)
ginx.NewRender(c).Message(fmt.Errorf("encrypt phone failed: %v", err))
return
}
ginx.NewRender(c).Data(gin.H{
"success_count": successCount,
"fail_count": failCount,
}, nil)
}
// usersPhoneDecryptRefresh refreshes the cached phone-encryption setting and
// renders the outcome.
func (rt *Router) usersPhoneDecryptRefresh(c *gin.Context) {
	if err := models.RefreshPhoneEncryptionCache(rt.Ctx); err != nil {
		ginx.NewRender(c).Message(fmt.Errorf("refresh phone encryption cache failed: %v", err))
		return
	}

	ginx.NewRender(c).Message(nil)
}
// usersPhoneDecrypt decrypts the phone numbers of all users in one pass.
//
// It first turns the phone-encryption setting off and refreshes the cached
// setting, then decrypts every phone number that carries the encrypted
// marker, updating the rows inside a single transaction. If any step fails,
// the setting is switched back on and the error is rendered. On success the
// success and failure counts are rendered.
func (rt *Router) usersPhoneDecrypt(c *gin.Context) {
// Disable the phone-encryption feature first.
err := models.SetPhoneEncryptionEnabled(rt.Ctx, false)
if err != nil {
ginx.NewRender(c).Message(fmt.Errorf("disable phone encryption failed: %v", err))
return
}
// Refresh the cached configuration.
err = models.RefreshPhoneEncryptionCache(rt.Ctx)
if err != nil {
logger.Errorf("Failed to refresh phone encryption cache: %v", err)
// Roll the setting back.
models.SetPhoneEncryptionEnabled(rt.Ctx, true)
ginx.NewRender(c).Message(fmt.Errorf("refresh cache failed: %v", err))
return
}
// Load all users (the encryption switch is off now, so the raw database
// values are read directly).
var users []*models.User
err = models.DB(rt.Ctx).Find(&users).Error
if err != nil {
// Roll the setting back.
models.SetPhoneEncryptionEnabled(rt.Ctx, true)
models.RefreshPhoneEncryptionCache(rt.Ctx)
ginx.NewRender(c).Message(fmt.Errorf("get users failed: %v", err))
return
}
// Fetch the RSA keys; the private key and its password are used here.
privateKey, _, password, err := models.GetRSAKeys(rt.Ctx)
if err != nil {
// Roll the setting back.
models.SetPhoneEncryptionEnabled(rt.Ctx, true)
models.RefreshPhoneEncryptionCache(rt.Ctx)
ginx.NewRender(c).Message(fmt.Errorf("get RSA keys failed: %v", err))
return
}
successCount := 0
failCount := 0
var failedUsers []string
// Decrypt all users' phone numbers inside one transaction.
err = models.DB(rt.Ctx).Transaction(func(tx *gorm.DB) error {
// Decrypt each user's phone number in turn.
for _, user := range users {
if user.Phone == "" {
continue
}
// Skip phone numbers that are not encrypted.
if !isPhoneEncrypted(user.Phone) {
continue
}
// Decrypt the phone number.
decryptedPhone, err := secu.Decrypt(user.Phone, privateKey, password)
if err != nil {
logger.Errorf("Failed to decrypt phone for user %s: %v", user.Username, err)
failCount++
failedUsers = append(failedUsers, user.Username)
continue
}
// Update the phone column in the database directly (bypassing the GORM hooks).
err = tx.Model(&models.User{}).Where("id = ?", user.Id).Update("phone", decryptedPhone).Error
if err != nil {
logger.Errorf("Failed to update phone for user %s: %v", user.Username, err)
failCount++
failedUsers = append(failedUsers, user.Username)
continue
}
successCount++
logger.Debugf("Successfully decrypted phone for user %s", user.Username)
}
// If any user failed, return an error so the transaction is rolled back.
if failCount > 0 {
return fmt.Errorf("decrypt failed users: %d, failed users: %v", failCount, failedUsers)
}
return nil
})
if err != nil {
// Decryption failed: roll the setting back and refresh the cache.
models.SetPhoneEncryptionEnabled(rt.Ctx, true)
models.RefreshPhoneEncryptionCache(rt.Ctx)
ginx.NewRender(c).Message(fmt.Errorf("decrypt phone failed: %v", err))
return
}
ginx.NewRender(c).Data(gin.H{
"success_count": successCount,
"fail_count": failCount,
}, nil)
}
// isPhoneEncrypted reports whether phone carries the "enc:" marker followed
// by at least one more character. The bare marker on its own does not count.
func isPhoneEncrypted(phone string) bool {
	const marker = "enc:"

	if len(phone) <= len(marker) {
		return false
	}
	return phone[:len(marker)] == marker
}
================================================
FILE: center/router/router_user_group.go
================================================
package router
import (
"net/http"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// checkBusiGroupPerm renders whether the current user may perform the
// permission named by the "perm" URL parameter on the business group selected
// by the "id" URL parameter.
func (rt *Router) checkBusiGroupPerm(c *gin.Context) {
	me := c.MustGet("user").(*models.User)

	group := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id"))
	perm := ginx.UrlParamStr(c, "perm")

	allowed, err := me.CanDoBusiGroup(rt.Ctx, group, perm)
	ginx.NewRender(c).Data(allowed, err)
}
// userGroupGets renders the user groups returned by me.UserGroups for the
// current user, filtered by the "query" parameter and capped by "limit"
// (default 1500).
func (rt *Router) userGroupGets(c *gin.Context) {
	limit := ginx.QueryInt(c, "limit", 1500)
	query := ginx.QueryStr(c, "query", "")

	me := c.MustGet("user").(*models.User)
	groups, err := me.UserGroups(rt.Ctx, limit, query)

	// Only fill in the nicknames when the list was loaded successfully.
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, groups)
	}

	ginx.NewRender(c).Data(groups, err)
}
// userGroupGetsByService renders user groups with their members filled in.
// With a non-empty "ids" query parameter only those groups are returned;
// otherwise every group is returned.
func (rt *Router) userGroupGetsByService(c *gin.Context) {
	wanted := strx.IdsInt64ForAPI(ginx.QueryStr(c, "ids", ""))

	if len(wanted) > 0 {
		groups := make([]models.UserGroup, 0)
		for _, gid := range wanted {
			group := UserGroup(rt.Ctx, gid)

			memberIds, err := models.MemberIds(rt.Ctx, group.Id)
			ginx.Dangerous(err)

			group.Users, err = models.UserGetsByIds(rt.Ctx, memberIds)
			ginx.Dangerous(err)

			groups = append(groups, *group)
		}

		ginx.NewRender(c).Data(groups, nil)
		return
	}

	all, err := models.UserGroupGetAll(rt.Ctx)
	ginx.Dangerous(err)

	for i := range all {
		memberIds, err := models.MemberIds(rt.Ctx, all[i].Id)
		ginx.Dangerous(err)

		all[i].Users, err = models.UserGetsByIds(rt.Ctx, memberIds)
		ginx.Dangerous(err)
	}

	ginx.NewRender(c).Data(all, err)
}
// userGroupMemberGetsByService renders the result of
// models.UserGroupMemberGetAll.
func (rt *Router) userGroupMemberGetsByService(c *gin.Context) {
	all, loadErr := models.UserGroupMemberGetAll(rt.Ctx)
	ginx.NewRender(c).Data(all, loadErr)
}
// userGroupForm is the JSON body bound when a user group is created or
// updated.
type userGroupForm struct {
Name string `json:"name" binding:"required"`
Note string `json:"note"`
// IsSyncToFlashDuty requests a FlashDuty sync for this call; the handlers
// also sync when flashduty.NeedSyncTeam reports true.
IsSyncToFlashDuty bool `json:"is_sync_to_flashduty"`
}
// userGroupAdd creates a user group from the JSON body, adds the current user
// as a member, optionally syncs the new group to FlashDuty, and renders the
// new group's id.
func (rt *Router) userGroupAdd(c *gin.Context) {
	var form userGroupForm
	ginx.BindJSON(c, &form)

	me := c.MustGet("user").(*models.User)

	group := models.UserGroup{
		Name:     form.Name,
		Note:     form.Note,
		CreateBy: me.Username,
		UpdateBy: me.Username,
	}

	err := group.Add(rt.Ctx)
	ginx.Dangerous(err)

	// Adding the creator as a member is best effort; a failure here is not a
	// big deal.
	models.UserGroupMemberAdd(rt.Ctx, group.Id, me.Id)

	if form.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) {
		syncer, syncErr := flashduty.NewUserGroupSyncer(rt.Ctx, &group)
		ginx.Dangerous(syncErr)

		syncErr = syncer.SyncUGAdd()
		ginx.Dangerous(syncErr)
	}

	ginx.NewRender(c).Data(group.Id, err)
}
// userGroupPut renames and/or re-annotates the user group stored in the gin
// context under "user_group". A new name that already belongs to another
// group is rejected. The FlashDuty sync, when requested, runs before the
// database update.
func (rt *Router) userGroupPut(c *gin.Context) {
	var form userGroupForm
	ginx.BindJSON(c, &form)

	me := c.MustGet("user").(*models.User)
	group := c.MustGet("user_group").(*models.UserGroup)

	// The name changed, so check that no other group already uses it.
	if renamed := group.Name != form.Name; renamed {
		dup, err := models.UserGroupCount(rt.Ctx, "name=? and id<>?", form.Name, group.Id)
		ginx.Dangerous(err)

		if dup > 0 {
			ginx.Bomb(http.StatusOK, "UserGroup already exists")
		}
	}

	group.Name, group.Note = form.Name, form.Note
	group.UpdateBy = me.Username
	group.UpdateAt = time.Now().Unix()

	if form.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) {
		syncer, syncErr := flashduty.NewUserGroupSyncer(rt.Ctx, group)
		ginx.Dangerous(syncErr)

		syncErr = syncer.SyncUGPut()
		ginx.Dangerous(syncErr)
	}

	ginx.NewRender(c).Message(group.Update(rt.Ctx, "Name", "Note", "UpdateAt", "UpdateBy"))
}
// userGroupGet renders the user group selected by the "id" URL parameter
// together with all of its members. Searching and paging are left to the
// front end.
func (rt *Router) userGroupGet(c *gin.Context) {
	group := UserGroup(rt.Ctx, ginx.UrlParamInt64(c, "id"))

	memberIds, err := models.MemberIds(rt.Ctx, group.Id)
	ginx.Dangerous(err)

	logger.Info("userGroupGet", memberIds)

	members, err := models.UserGetsByIds(rt.Ctx, memberIds)
	payload := gin.H{
		"users":      members,
		"user_group": group,
	}
	ginx.NewRender(c).Data(payload, err)
}
// userGroupDel deletes the user group stored in the gin context under
// "user_group". When a FlashDuty sync is requested, the team deletion is
// attempted first; a sync failure is only logged and does not stop the local
// deletion.
func (rt *Router) userGroupDel(c *gin.Context) {
	syncRequested := ginx.QueryBool(c, "is_sync_to_flashduty", false)
	group := c.MustGet("user_group").(*models.UserGroup)

	if syncRequested || flashduty.NeedSyncTeam(rt.Ctx) {
		syncer, err := flashduty.NewUserGroupSyncer(rt.Ctx, group)
		ginx.Dangerous(err)

		// The sync fails when the team is still referenced in duty or has
		// already been deleted; that error can be ignored.
		if syncErr := syncer.SyncUGDel(); syncErr != nil {
			logger.Warningf("failed to sync user group %s to flashduty's team: %v", group.Name, syncErr)
		}
	}

	ginx.NewRender(c).Message(group.Del(rt.Ctx))
}
// userGroupMemberAdd adds the submitted user ids to the user group stored in
// the gin context under "user_group", stamps the group's update time and
// author, and optionally syncs the new members to FlashDuty.
func (rt *Router) userGroupMemberAdd(c *gin.Context) {
	var form idsForm
	ginx.BindJSON(c, &form)
	form.Verify()

	me := c.MustGet("user").(*models.User)
	group := c.MustGet("user_group").(*models.UserGroup)

	err := group.AddMembers(rt.Ctx, form.Ids)
	ginx.Dangerous(err)

	if err == nil {
		group.UpdateAt = time.Now().Unix()
		group.UpdateBy = me.Username
		group.Update(rt.Ctx, "UpdateAt", "UpdateBy")
	}

	if form.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) {
		syncer, syncErr := flashduty.NewUserGroupSyncer(rt.Ctx, group)
		ginx.Dangerous(syncErr)

		syncErr = syncer.SyncMembersAdd()
		ginx.Dangerous(syncErr)
	}

	ginx.NewRender(c).Message(err)
}
// userGroupMemberDel removes the submitted user ids from the user group
// stored in the gin context under "user_group", stamps the group's update
// time and author, and optionally syncs the removal to FlashDuty.
//
// A failure to remove the members locally aborts the request, as in
// userGroupMemberAdd, so FlashDuty is never told about a removal that did not
// happen.
func (rt *Router) userGroupMemberDel(c *gin.Context) {
	var f idsForm
	ginx.BindJSON(c, &f)
	f.Verify()

	me := c.MustGet("user").(*models.User)
	ug := c.MustGet("user_group").(*models.UserGroup)

	// Stop here when the local removal fails; nothing below should run.
	ginx.Dangerous(ug.DelMembers(rt.Ctx, f.Ids))

	// Stamping the group is best effort; its result is intentionally not
	// reported to the caller.
	ug.UpdateAt = time.Now().Unix()
	ug.UpdateBy = me.Username
	ug.Update(rt.Ctx, "UpdateAt", "UpdateBy")

	if f.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) {
		ugs, err := flashduty.NewUserGroupSyncer(rt.Ctx, ug)
		ginx.Dangerous(err)

		err = ugs.SyncMembersDel()
		ginx.Dangerous(err)
	}

	ginx.NewRender(c).Message(nil)
}
================================================
FILE: center/router/router_user_variable_config.go
================================================
package router
import (
"strings"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// userVariableConfigGets renders the result of models.ConfigsGetUserVariable.
func (rt *Router) userVariableConfigGets(context *gin.Context) {
	vars, loadErr := models.ConfigsGetUserVariable(rt.Ctx)
	ginx.NewRender(context).Data(vars, loadErr)
}
// userVariableConfigAdd inserts a user variable from the JSON body. The key
// is trimmed, and the creator, updater and both timestamps are set from the
// current user and time.
func (rt *Router) userVariableConfigAdd(context *gin.Context) {
	var cfg models.Configs
	ginx.BindJSON(context, &cfg)
	cfg.Ckey = strings.TrimSpace(cfg.Ckey)

	// Insert as an external config. For an encrypted-type config the value
	// must not be stored as plaintext.
	operator := context.MustGet("username").(string)
	ts := time.Now().Unix()

	cfg.CreateBy, cfg.UpdateBy = operator, operator
	cfg.CreateAt, cfg.UpdateAt = ts, ts

	ginx.NewRender(context).Message(models.ConfigsUserVariableInsert(rt.Ctx, cfg))
}
// userVariableConfigPut updates the user variable selected by the "id" URL
// parameter with the values from the JSON body.
//
// Only an admin or the creator of the variable may update it. The creator is
// taken from the stored record rather than from the request body, because a
// client-supplied create_by would let any user pass the check by sending
// their own username. A non-admin updating a variable that does not exist is
// rejected with 403 as well.
func (rt *Router) userVariableConfigPut(context *gin.Context) {
	var f models.Configs
	ginx.BindJSON(context, &f)
	f.Id = ginx.UrlParamInt64(context, "id")
	f.Ckey = strings.TrimSpace(f.Ckey)
	f.UpdateBy = context.MustGet("username").(string)
	f.UpdateAt = time.Now().Unix()

	user := context.MustGet("user").(*models.User)
	if !user.IsAdmin() {
		stored, err := models.ConfigGet(rt.Ctx, f.Id)
		ginx.Dangerous(err)

		if stored == nil || stored.CreateBy != user.Username {
			// only admin or creator can update
			ginx.Bomb(403, "forbidden")
		}
	}

	ginx.NewRender(context).Message(models.ConfigsUserVariableUpdate(rt.Ctx, f))
}
// userVariableConfigDel deletes the user variable selected by the "id" URL
// parameter.
//
// A variable that does not exist is treated as success. Only an admin or the
// creator may delete; anyone else gets 403. The row is removed only when its
// External field equals models.ConfigExternal; otherwise nothing is deleted
// and success is rendered.
func (rt *Router) userVariableConfigDel(context *gin.Context) {
	id := ginx.UrlParamInt64(context, "id")
	configs, err := models.ConfigGet(rt.Ctx, id)
	ginx.Dangerous(err)

	user := context.MustGet("user").(*models.User)

	// Check for a missing record before reading its fields; the permission
	// check below would otherwise dereference a nil pointer.
	if configs == nil {
		ginx.NewRender(context).Message(nil)
		return
	}

	if !user.IsAdmin() && configs.CreateBy != user.Username {
		// only admin or creator can delete
		ginx.Bomb(403, "forbidden")
	}

	if configs.External != models.ConfigExternal {
		ginx.NewRender(context).Message(nil)
		return
	}

	ginx.NewRender(context).Message(models.ConfigsDel(rt.Ctx, []int64{id}))
}
// userVariableGetDecryptByService renders the map returned by
// models.ConfigUserVariableGetDecryptMap, which is given the configured RSA
// private key and its password.
func (rt *Router) userVariableGetDecryptByService(context *gin.Context) {
	rsa := rt.HTTP.RSA
	vars, err := models.ConfigUserVariableGetDecryptMap(rt.Ctx, rsa.RSAPrivateKey, rsa.RSAPassWord)
	ginx.NewRender(context).Data(vars, err)
}
================================================
FILE: center/sso/init.go
================================================
package sso
import (
"encoding/json"
"fmt"
"log"
"time"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/cas"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/dingtalk"
"github.com/ccfos/nightingale/v6/pkg/feishu"
"github.com/ccfos/nightingale/v6/pkg/ldapx"
"github.com/ccfos/nightingale/v6/pkg/oauth2x"
"github.com/ccfos/nightingale/v6/pkg/oidcx"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/BurntSushi/toml"
"github.com/toolkits/pkg/logger"
)
// SsoClient bundles one client per supported SSO provider together with the
// bookkeeping used to decide when the configuration has to be reloaded.
type SsoClient struct {
OIDC *oidcx.SsoClient
LDAP *ldapx.SsoClient
CAS *cas.SsoClient
OAuth2 *oauth2x.SsoClient
DingTalk *dingtalk.SsoClient
FeiShu *feishu.SsoClient
// LastUpdateTime is compared with models.SsoConfigLastUpdateTime on reload.
LastUpdateTime int64
// configCache supplies the user variables substituted into the SSO configs.
configCache *memsto.ConfigCache
// configLastUpdateTime is compared with configCache.GetLastUpdateTime on
// reload.
configLastUpdateTime int64
}
// LDAP is the default LDAP SSO configuration, in TOML. Init stores it under
// the name "LDAP" when no SSO config with that name exists yet.
const LDAP = `
Enable = false
Host = 'ldap.example.org'
Port = 389
BaseDn = 'dc=example,dc=org'
BindUser = 'cn=manager,dc=example,dc=org'
BindPass = '*******'
SyncAddUsers = false
SyncDelUsers = false
# unit: s
SyncInterval = 86400
# openldap format e.g. (&(uid=%s))
# AD format e.g. (&(sAMAccountName=%s))
AuthFilter = '(&(uid=%s))'
UserFilter = '(&(uid=*))'
CoverAttributes = true
TLS = false
StartTLS = true
DefaultRoles = ['Standard']
[Attributes]
Username = 'uid'
Nickname = 'cn'
Phone = 'mobile'
Email = 'mail'
`
// OAuth2 is the default OAuth2 SSO configuration, in TOML. Init stores it
// under the name "OAuth2" when no SSO config with that name exists yet.
const OAuth2 = `
Enable = false
DisplayName = 'OAuth2登录'
RedirectURL = 'http://n9e.com/callback/oauth'
SsoAddr = 'https://sso.example.com/oauth2/authorize'
SsoLogoutAddr = 'https://sso.example.com/oauth2/authorize/session/end'
TokenAddr = 'https://sso.example.com/oauth2/token'
UserInfoAddr = 'https://api.example.com/api/v1/user/info'
TranTokenMethod = 'header'
ClientId = ''
ClientSecret = ''
CoverAttributes = true
DefaultRoles = ['Standard']
UserinfoIsArray = false
UserinfoPrefix = 'data'
Scopes = ['profile', 'email', 'phone']
[Attributes]
Username = 'sub'
Nickname = 'nickname'
Phone = 'phone_number'
Email = 'email'
`
// CAS is the default CAS SSO configuration, in TOML. Init stores it under the
// name "CAS" when no SSO config with that name exists yet.
const CAS = `
Enable = false
DisplayName = 'CAS登录'
RedirectURL = 'http://n9e.com/callback/cas'
SsoAddr = 'https://cas.example.com/cas/'
SsoLogoutAddr = 'https://cas.example.com/cas/session/end'
# LoginPath = ''
CoverAttributes = true
DefaultRoles = ['Standard']
[Attributes]
Username = 'sub'
Nickname = 'nickname'
Phone = 'phone_number'
Email = 'email'
`
// OIDC is the default OIDC SSO configuration, in TOML. Init stores it under
// the name "OIDC" when no SSO config with that name exists yet.
const OIDC = `
Enable = false
DisplayName = 'OIDC登录'
RedirectURL = 'http://n9e.com/callback'
SsoAddr = 'http://sso.example.org'
SsoLogoutAddr = 'http://sso.example.org/session/end'
ClientId = ''
ClientSecret = ''
CoverAttributes = true
DefaultRoles = ['Standard']
Scopes = ['openid', 'profile', 'email', 'phone']
[Attributes]
Username = 'sub'
Nickname = 'nickname'
Phone = 'phone_number'
Email = 'email'
`
// Init builds the SsoClient.
//
// It first stores the built-in default config for LDAP, CAS, OIDC and OAuth2
// when no config with that name exists, then loads every stored SSO config,
// substitutes user variables from configCache into its content, and creates
// the matching provider client. Finally it starts SyncSsoUsers and Reload as
// goroutines.
//
// The process exits via log.Fatal* when configCache is nil, when a default
// config cannot be created, when the configs cannot be loaded, or when a
// config's content cannot be parsed. A failure to construct the OIDC client
// is only logged and leaves ssoClient.OIDC unset.
func Init(center cconf.Center, ctx *ctx.Context, configCache *memsto.ConfigCache) *SsoClient {
ssoClient := new(SsoClient)
// Built-in default configs, keyed by the SSO config name.
m := make(map[string]string)
m["LDAP"] = LDAP
m["CAS"] = CAS
m["OIDC"] = OIDC
m["OAuth2"] = OAuth2
for name, config := range m {
count, err := models.SsoConfigCountByName(ctx, name)
if err != nil {
logger.Error(err)
continue
}
// A config with this name already exists; keep it.
if count > 0 {
continue
}
ssoConfig := models.SsoConfig{
Name: name,
Content: config,
}
err = ssoConfig.Create(ctx)
if err != nil {
log.Fatalln(err)
}
}
if configCache == nil {
log.Fatalln(fmt.Errorf("configCache is nil, sso initialization failed"))
}
ssoClient.configCache = configCache
userVariableMap := configCache.Get()
configs, err := models.SsoConfigGets(ctx)
if err != nil {
log.Fatalln(err)
}
for _, cfg := range configs {
// Substitute user variables into the stored content before parsing it.
cfg.Content = tplx.ReplaceTemplateUseText(cfg.Name, cfg.Content, userVariableMap)
switch cfg.Name {
case "LDAP":
var config ldapx.Config
err := toml.Unmarshal([]byte(cfg.Content), &config)
if err != nil {
log.Fatalln("init ldap failed", err)
}
ssoClient.LDAP = ldapx.New(config)
case "OIDC":
var config oidcx.Config
err := toml.Unmarshal([]byte(cfg.Content), &config)
if err != nil {
log.Fatalln("init oidc failed:", err)
}
logger.Info("init oidc..")
// Unlike a parse error, a client construction error is not fatal.
oidcClient, err := oidcx.New(config)
if err != nil {
logger.Error("init oidc failed:", err)
} else {
ssoClient.OIDC = oidcClient
}
case "CAS":
var config cas.Config
err := toml.Unmarshal([]byte(cfg.Content), &config)
if err != nil {
log.Fatalln("init cas failed:", err)
}
ssoClient.CAS = cas.New(config)
case "OAuth2":
var config oauth2x.Config
err := toml.Unmarshal([]byte(cfg.Content), &config)
if err != nil {
log.Fatalln("init oauth2 failed:", err)
}
ssoClient.OAuth2 = oauth2x.New(config)
// The DingTalk and FeiShu configs are parsed as JSON, not TOML.
case dingtalk.SsoTypeName:
var config dingtalk.Config
err := json.Unmarshal([]byte(cfg.Content), &config)
if err != nil {
log.Fatalf("init %s failed: %s", dingtalk.SsoTypeName, err)
}
ssoClient.DingTalk = dingtalk.New(config)
case feishu.SsoTypeName:
var config feishu.Config
err := json.Unmarshal([]byte(cfg.Content), &config)
if err != nil {
log.Fatalf("init %s failed: %s", feishu.SsoTypeName, err)
}
ssoClient.FeiShu = feishu.New(config)
}
}
go ssoClient.SyncSsoUsers(ctx)
go ssoClient.Reload(ctx)
return ssoClient
}
// reload refreshes every SSO provider client from the stored configuration.
//
// It is a no-op when neither the stored SSO configuration nor the user
// variable cache has changed since the last successful pass. Otherwise every
// configuration is re-read, user variables are substituted into its content
// (for all providers, matching what Init does) and the corresponding client
// is reloaded. A provider whose content cannot be decoded is logged and
// skipped; the remaining providers are still processed.
//
// DingTalk and FeiShu are optional: they are created on first appearance and
// dropped when their configuration row disappears. OIDC is created here too
// when Init could not construct it, so that a later configuration fix takes
// effect without calling Reload on a nil client.
//
// It returns an error only when the configuration cannot be read.
func (s *SsoClient) reload(ctx *ctx.Context) error {
	lastUpdateTime, err := models.SsoConfigLastUpdateTime(ctx)
	if err != nil {
		return err
	}

	lastCacheUpdateTime := s.configCache.GetLastUpdateTime()
	if lastUpdateTime == s.LastUpdateTime && lastCacheUpdateTime == s.configLastUpdateTime {
		return nil
	}

	configs, err := models.SsoConfigGets(ctx)
	if err != nil {
		return err
	}

	userVariableMap := s.configCache.Get()
	ssoConfigMap := make(map[string]models.SsoConfig, len(configs))
	for _, cfg := range configs {
		// Substitute variables before recording the entry so the DingTalk and
		// FeiShu handling below sees the same rendered content as Init does.
		cfg.Content = tplx.ReplaceTemplateUseText(cfg.Name, cfg.Content, userVariableMap)
		ssoConfigMap[cfg.Name] = cfg

		switch cfg.Name {
		case "LDAP":
			var config ldapx.Config
			err := toml.Unmarshal([]byte(cfg.Content), &config)
			if err != nil {
				logger.Warning("reload ldap failed", err)
				continue
			}
			s.LDAP.Reload(config)
		case "OIDC":
			var config oidcx.Config
			err := toml.Unmarshal([]byte(cfg.Content), &config)
			if err != nil {
				logger.Warning("reload oidc failed:", err)
				continue
			}
			logger.Info("reload oidc..")
			if s.OIDC == nil {
				// Init leaves OIDC unset when construction fails; build it
				// now instead of reloading a nil client.
				oidcClient, err := oidcx.New(config)
				if err != nil {
					logger.Error("reload oidc failed:", err)
					continue
				}
				s.OIDC = oidcClient
				continue
			}
			err = s.OIDC.Reload(config)
			if err != nil {
				logger.Error("reload oidc failed:", err)
				continue
			}
		case "CAS":
			var config cas.Config
			err := toml.Unmarshal([]byte(cfg.Content), &config)
			if err != nil {
				logger.Warning("reload cas failed:", err)
				continue
			}
			s.CAS.Reload(config)
		case "OAuth2":
			var config oauth2x.Config
			err := toml.Unmarshal([]byte(cfg.Content), &config)
			if err != nil {
				logger.Warning("reload oauth2 failed:", err)
				continue
			}
			s.OAuth2.Reload(config)
		}
	}

	if dingTalkConfig, ok := ssoConfigMap[dingtalk.SsoTypeName]; ok {
		var config dingtalk.Config
		err := json.Unmarshal([]byte(dingTalkConfig.Content), &config)
		if err != nil {
			logger.Warningf("reload %s failed: %s", dingtalk.SsoTypeName, err)
		} else if s.DingTalk != nil {
			s.DingTalk.Reload(config)
		} else {
			s.DingTalk = dingtalk.New(config)
		}
	} else {
		// The configuration row was removed: drop the client.
		s.DingTalk = nil
	}

	if feiShuConfig, ok := ssoConfigMap[feishu.SsoTypeName]; ok {
		var config feishu.Config
		err := json.Unmarshal([]byte(feiShuConfig.Content), &config)
		if err != nil {
			logger.Warningf("reload %s failed: %s", feishu.SsoTypeName, err)
		} else if s.FeiShu != nil {
			s.FeiShu.Reload(config)
		} else {
			s.FeiShu = feishu.New(config)
		}
	} else {
		// The configuration row was removed: drop the client.
		s.FeiShu = nil
	}

	// Remember both timestamps so an unchanged state short-circuits next time.
	s.LastUpdateTime = lastUpdateTime
	s.configLastUpdateTime = lastCacheUpdateTime
	return nil
}
// Reload runs forever, refreshing the SSO clients every nine seconds.
// A failed refresh is logged and retried on the next round. It never returns,
// so it is meant to be started in its own goroutine.
func (s *SsoClient) Reload(ctx *ctx.Context) {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := s.reload(ctx)
		if err == nil {
			continue
		}
		logger.Warning("reload sso client err:", err)
	}
}
================================================
FILE: center/sso/sync.go
================================================
package sso
import (
"fmt"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// SyncSsoUsers runs both LDAP user synchronisation steps once, printing any
// failure to standard output, and then starts the periodic synchronisation
// loop in a new goroutine.
func (s *SsoClient) SyncSsoUsers(ctx *ctx.Context) {
	addDelErr := s.LDAP.SyncAddAndDelUsers(ctx)
	if addDelErr != nil {
		fmt.Println("failed to sync the addition and deletion of ldap users:", addDelErr)
	}

	delErr := s.LDAP.SyncDelUsers(ctx)
	if delErr != nil {
		fmt.Println("failed to sync deletion of ldap users:", delErr)
	}

	go s.loopSyncSsoUsers(ctx)
}
// loopSyncSsoUsers repeats both LDAP user synchronisation steps on every tick
// of the LDAP client's ticker. Each round works on a copy of the LDAP client
// and logs failures as warnings. It never returns.
func (s *SsoClient) loopSyncSsoUsers(ctx *ctx.Context) {
	for {
		// The ticker is looked up again on every pass rather than captured once.
		<-s.LDAP.Ticker.C

		snapshot := s.LDAP.Copy()

		if addDelErr := snapshot.SyncAddAndDelUsers(ctx); addDelErr != nil {
			logger.Warningf("failed to sync the addition and deletion of ldap users: %v", addDelErr)
		}

		if delErr := snapshot.SyncDelUsers(ctx); delErr != nil {
			logger.Warningf("failed to sync deletion of ldap users: %v", delErr)
		}
	}
}
================================================
FILE: cli/cli.go
================================================
package cli
import (
"github.com/ccfos/nightingale/v6/cli/upgrade"
)
// Upgrade delegates to upgrade.Upgrade with the given configuration file path
// and returns its error unchanged.
func Upgrade(configFile string) error {
return upgrade.Upgrade(configFile)
}
================================================
FILE: cli/upgrade/config.go
================================================
package upgrade
import (
"bytes"
"path"
"github.com/ccfos/nightingale/v6/pkg/cfg"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/tlsx"
"github.com/koding/multiconfig"
)
// Config is the subset of the configuration file that the upgrade tool reads.
type Config struct {
// DB holds the database settings passed to storage.New.
DB ormx.DBConfig
// Clusters lists the clusters that Upgrade turns into datasources.
Clusters []ClusterOptions
}
// ClusterOptions describes one cluster entry of the configuration file.
// Upgrade copies these fields into a "prometheus" datasource.
type ClusterOptions struct {
// Name becomes the datasource name.
Name string
// Prom becomes the datasource URL.
Prom string
BasicAuthUser string
BasicAuthPass string
// Headers is a flat list read by Upgrade as alternating key, value entries.
Headers []string
// Timeout and DialTimeout are copied as-is; units are not visible here.
Timeout int64
DialTimeout int64
// UseTLS is copied into the datasource's SkipTlsVerify setting by Upgrade.
UseTLS bool
tlsx.ClientConfig
MaxIdleConnsPerHost int
}
// Parse reads the TOML file at fpath and fills configPtr from it.
//
// Values are resolved by three loaders in order: struct tags, environment
// variables, then the file content. A read failure reported by the file
// scanner is returned as-is; otherwise the result of loading and validating
// the configuration is returned.
func Parse(fpath string, configPtr *Config) error {
	scanner := cfg.NewFileScanner()
	scanner.Read(path.Join(fpath))

	// The file content is always followed by a newline, so the buffer handed
	// to the TOML loader is never empty.
	var content []byte
	content = append(content, scanner.Data()...)
	content = append(content, '\n')

	if readErr := scanner.Err(); readErr != nil {
		return readErr
	}

	chain := []multiconfig.Loader{
		&multiconfig.TagLoader{},
		&multiconfig.EnvironmentLoader{},
		&multiconfig.TOMLLoader{Reader: bytes.NewReader(content)},
	}

	loader := multiconfig.DefaultLoader{
		Loader:    multiconfig.MultiLoader(chain...),
		Validator: multiconfig.MultiValidator(&multiconfig.RequiredValidator{}),
	}
	return loader.Load(configPtr)
}
================================================
FILE: cli/upgrade/readme.md
================================================
# v5 升级 v6 手册
0. 操作之前,记得备份下数据库!
1. 需要先将你正在使用的夜莺数据源表结构更新到和 v5.15.0 一致,[release](https://github.com/ccfos/nightingale/releases) 页面有每个版本表结构的更新说明,可以根据你正在使用的版本,按照说明,逐个执行更新表结构的语句
2. 解压 n9e 安装包,导入 upgrade.sql 到 n9e_v5 数据库
```
mysql -h 127.0.0.1 -u root -p1234 < cli/upgrade/upgrade.sql
```
3. 执行 n9e-cli 完成数据库表结构升级, webapi.conf 为 v5 版本 n9e-webapi 正在使用的配置文件
```
./n9e-cli --upgrade --config webapi.conf
```
4. 修改 n9e 配置文件中的数据库为 n9e_v5,启动 n9e 进程
```
nohup ./n9e &> n9e.log &
```
5. n9e 监听的端口为 17000,需要将之前的 web 端口和数据上报的端口,都调整为 17000
================================================
FILE: cli/upgrade/upgrade.go
================================================
package upgrade
import (
"context"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/storage"
"github.com/toolkits/pkg/logger"
)
// Upgrade migrates data to the v6 layout using the given configuration file.
//
// It parses configFile, connects to the configured database, creates one
// "prometheus" datasource per configured cluster that does not already exist
// by name, and then runs the v6 upgrade routines for alert rules, alert
// mutes, alert subscriptions, recording rules and current/historical alert
// events, passing them a name-to-datasource map.
//
// A failure to parse the configuration, connect to the database, list
// datasources or run any upgrade routine is returned. A failure to count or
// add a single datasource is only logged, and processing continues with the
// next cluster.
func Upgrade(configFile string) error {
	var config Config
	// Without a valid configuration there is nothing meaningful to connect
	// to, so a parse failure is reported instead of being ignored.
	if err := Parse(configFile, &config); err != nil {
		return err
	}

	db, err := storage.New(config.DB)
	if err != nil {
		return err
	}

	ctx := ctx.NewContext(context.Background(), db, true)
	for _, cluster := range config.Clusters {
		count, err := models.GetDatasourcesCountByName(ctx, cluster.Name)
		if err != nil {
			logger.Errorf("get datasource %s count error: %v", cluster.Name, err)
			continue
		}

		// A datasource with this name already exists: leave it alone.
		if count > 0 {
			continue
		}

		// Headers is a flat key, value, key, value... list; it is only usable
		// when it holds an even number of entries.
		header := make(map[string]string)
		headerCount := len(cluster.Headers)
		if headerCount%2 == 0 {
			for i := 0; i < headerCount; i += 2 {
				header[cluster.Headers[i]] = cluster.Headers[i+1]
			}
		} else {
			logger.Warningf("datasource %s has an odd number of header entries (%d), headers ignored", cluster.Name, headerCount)
		}

		authJson := models.Auth{
			BasicAuthUser:     cluster.BasicAuthUser,
			BasicAuthPassword: cluster.BasicAuthPass,
		}

		httpJson := models.HTTP{
			Timeout:     cluster.Timeout,
			DialTimeout: cluster.DialTimeout,
			TLS: models.TLS{
				// NOTE(review): UseTLS is mapped onto SkipTlsVerify; confirm
				// this matches the v5 meaning of UseTLS.
				SkipTlsVerify: cluster.UseTLS,
			},
			MaxIdleConnsPerHost: cluster.MaxIdleConnsPerHost,
			Url:                 cluster.Prom,
			Headers:             header,
		}

		datasource := models.Datasource{
			PluginId:       1,
			PluginType:     "prometheus",
			PluginTypeName: "Prometheus Like",
			Name:           cluster.Name,
			HTTPJson:       httpJson,
			AuthJson:       authJson,
			ClusterName:    "default",
			Status:         "enabled",
		}

		err = datasource.Add(ctx)
		if err != nil {
			logger.Errorf("add datasource %s error: %v", cluster.Name, err)
		}
	}

	datasources, err := models.GetDatasources(ctx)
	if err != nil {
		return err
	}

	// Index datasources by name for the upgrade routines below.
	m := make(map[string]models.Datasource, len(datasources))
	for i := 0; i < len(datasources); i++ {
		m[datasources[i].Name] = datasources[i]
	}

	// alert rule
	err = models.AlertRuleUpgradeToV6(ctx, m)
	if err != nil {
		return err
	}

	// alert mute
	err = models.AlertMuteUpgradeToV6(ctx, m)
	if err != nil {
		return err
	}

	// alert subscribe
	err = models.AlertSubscribeUpgradeToV6(ctx, m)
	if err != nil {
		return err
	}

	// recording rule
	err = models.RecordingRuleUpgradeToV6(ctx, m)
	if err != nil {
		return err
	}

	// alert cur event
	err = models.AlertCurEventUpgradeToV6(ctx, m)
	if err != nil {
		return err
	}

	// alert his event
	err = models.AlertHisEventUpgradeToV6(ctx, m)
	if err != nil {
		return err
	}

	return nil
}
================================================
FILE: cli/upgrade/upgrade.sql
================================================
-- Schema changes applied to the n9e_v5 database.
use n9e_v5;
-- Additional role/operation pairs for the Guest, Standard and Admin roles.
insert into `role_operation`(role_name, operation) values('Guest', '/log/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/trace/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/log/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies');
insert into `role_operation`(role_name, operation) values('Standard', '/help/servers');
insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate');
insert into `role_operation`(role_name, operation) values('Admin', '/help/source');
insert into `role_operation`(role_name, operation) values('Admin', '/help/sso');
insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-tpls');
insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-settings');
-- board: new flag columns.
alter table `board` add built_in tinyint(1) not null default 0 comment '0:false 1:true';
alter table `board` add hide tinyint(1) not null default 0 comment '0:false 1:true';
-- chart_share: datasource reference.
alter table `chart_share` add datasource_id bigint unsigned not null default 0;
-- alert_rule: datasource references, rule config and annotations.
alter table `alert_rule` add datasource_ids varchar(255) not null default '';
alter table `alert_rule` add rule_config text not null comment 'rule_config';
alter table `alert_rule` add annotations text not null comment 'annotations';
-- alert_mute: datasource references and periodic mute settings.
alter table `alert_mute` add datasource_ids varchar(255) not null default '';
alter table `alert_mute` add periodic_mutes varchar(4096) not null default '[]';
alter table `alert_mute` add mute_time_type tinyint(1) not null default 0;
-- alert_subscribe: datasource references, prod, webhooks and for_duration.
alter table `alert_subscribe` add datasource_ids varchar(255) not null default '';
alter table `alert_subscribe` add prod varchar(255) not null default '';
alter table `alert_subscribe` add webhooks text;
alter table `alert_subscribe` add redefine_webhooks tinyint(1) default 0;
alter table `alert_subscribe` add for_duration bigint not null default 0;
-- recording_rule: datasource references.
alter table `recording_rule` add datasource_ids varchar(255) default '';
-- target: cluster becomes varchar(128) with an empty default.
alter table `target` modify cluster varchar(128) not null default '';
-- alert_cur_event / alert_his_event: datasource reference, annotations and rule config.
alter table `alert_cur_event` add datasource_id bigint unsigned not null default 0;
alter table `alert_cur_event` add annotations text not null comment 'annotations';
alter table `alert_cur_event` add rule_config text not null comment 'rule_config';
alter table `alert_his_event` add datasource_id bigint unsigned not null default 0;
alter table `alert_his_event` add annotations text not null comment 'annotations';
alter table `alert_his_event` add rule_config text not null comment 'rule_config';
-- alerting_engines: datasource reference; cluster is renamed to engine_cluster.
alter table `alerting_engines` add datasource_id bigint unsigned not null default 0;
alter table `alerting_engines` change cluster engine_cluster varchar(128) not null default '' comment 'n9e engine cluster';
-- task_record: event reference.
alter table `task_record` add event_id bigint not null comment 'event id' default 0;
-- datasource: one row per configured datasource, keyed by an auto-increment id.
-- The http and auth columns are varchar fields; the upgrade tool fills the
-- corresponding model fields (HTTPJson, AuthJson) for each migrated cluster.
CREATE TABLE `datasource`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(255) not null default '',
`description` varchar(255) not null default '',
`category` varchar(255) not null default '',
`plugin_id` int unsigned not null default 0,
`plugin_type` varchar(255) not null default '',
`plugin_type_name` varchar(255) not null default '',
`cluster_name` varchar(255) not null default '',
`settings` text not null,
`status` varchar(255) not null default '',
`http` varchar(4096) not null default '',
`auth` varchar(8192) not null default '',
`created_at` bigint not null default 0,
`created_by` varchar(64) not null default '',
`updated_at` bigint not null default 0,
`updated_by` varchar(64) not null default '',
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- datasource add weight field
alter table `datasource` add `weight` int not null default 0;
-- builtin_cate: a name associated with a user id, keyed by an auto-increment id.
CREATE TABLE `builtin_cate` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null,
`user_id` bigint not null default 0,
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- notify_tpl: template content per channel; the unique key allows at most one
-- row per channel.
CREATE TABLE `notify_tpl` (
`id` bigint unsigned not null auto_increment,
`channel` varchar(32) not null,
`name` varchar(255) not null,
`content` text not null,
PRIMARY KEY (`id`),
UNIQUE KEY (`channel`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- sso_config: configuration content per SSO provider name; the unique key
-- allows at most one row per name.
CREATE TABLE `sso_config` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null,
`content` text not null,
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
================================================
FILE: cmd/alert/main.go
================================================
package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"github.com/ccfos/nightingale/v6/alert"
"github.com/ccfos/nightingale/v6/pkg/osx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/toolkits/pkg/runner"
)
// Command-line flags of the alert binary.
var (
// showVersion makes main print the version and exit.
showVersion = flag.Bool("version", false, "Show version.")
// configDir defaults to the N9E_ALERT_CONFIGS environment variable, or "etc".
configDir = flag.String("configs", osx.GetEnv("N9E_ALERT_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_ALERT_CONFIGS)")
// cryptoKey is passed to alert.Initialize together with configDir.
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)
// main starts the alert service and blocks until a terminating signal
// arrives. SIGINT, SIGTERM and SIGQUIT lead to exit code 0 after cleanup;
// SIGHUP is received but ignored; any other delivered signal leads to exit
// code 1 after cleanup.
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Println(version.Version)
		os.Exit(0)
	}

	printEnv()

	cleanFunc, err := alert.Initialize(*configDir, *cryptoKey)
	if err != nil {
		log.Fatalln("failed to initialize:", err)
	}

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	exitCode := 1

WAIT:
	for received := range signals {
		fmt.Println("received signal:", received.String())

		switch received {
		case syscall.SIGHUP:
			// reload configuration?
		case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT:
			exitCode = 0
			break WAIT
		default:
			break WAIT
		}
	}

	cleanFunc()
	fmt.Println("process exited")
	os.Exit(exitCode)
}
// printEnv initialises the runner package and prints the working directory,
// hostname and the file-descriptor and virtual-memory limits it reports.
func printEnv() {
runner.Init()
fmt.Println("runner.cwd:", runner.Cwd)
fmt.Println("runner.hostname:", runner.Hostname)
fmt.Println("runner.fd_limits:", runner.FdLimits())
fmt.Println("runner.vm_limits:", runner.VMLimits())
}
================================================
FILE: cmd/center/main.go
================================================
package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"github.com/ccfos/nightingale/v6/center"
"github.com/ccfos/nightingale/v6/pkg/osx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/toolkits/pkg/net/tcpx"
"github.com/toolkits/pkg/runner"
)
// Command-line flags of the center binary.
var (
// showVersion makes main print the version and exit.
showVersion = flag.Bool("version", false, "Show version.")
// configDir defaults to the N9E_CONFIGS environment variable, or "etc".
configDir = flag.String("configs", osx.GetEnv("N9E_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_CONFIGS)")
// cryptoKey is passed to center.Initialize together with configDir.
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)
// main starts the center service and blocks until a terminating signal
// arrives. SIGINT, SIGTERM and SIGQUIT lead to exit code 0 after cleanup;
// SIGHUP is received but ignored; any other delivered signal leads to exit
// code 1 after cleanup.
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Println(version.Version)
		os.Exit(0)
	}

	printEnv()

	tcpx.WaitHosts()

	cleanFunc, err := center.Initialize(*configDir, *cryptoKey)
	if err != nil {
		log.Fatalln("failed to initialize:", err)
	}

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	exitCode := 1

WAIT:
	for received := range signals {
		fmt.Println("received signal:", received.String())

		switch received {
		case syscall.SIGHUP:
			// reload configuration?
		case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT:
			exitCode = 0
			break WAIT
		default:
			break WAIT
		}
	}

	cleanFunc()
	fmt.Println("process exited")
	os.Exit(exitCode)
}
// printEnv initialises the runner package and prints the working directory,
// hostname and the file-descriptor and virtual-memory limits it reports.
func printEnv() {
runner.Init()
fmt.Println("runner.cwd:", runner.Cwd)
fmt.Println("runner.hostname:", runner.Hostname)
fmt.Println("runner.fd_limits:", runner.FdLimits())
fmt.Println("runner.vm_limits:", runner.VMLimits())
}
================================================
FILE: cmd/cli/main.go
================================================
package main
import (
"flag"
"fmt"
"os"
"github.com/ccfos/nightingale/v6/cli"
"github.com/ccfos/nightingale/v6/pkg/version"
)
// Command-line flags of the n9e-cli binary.
var (
// upgrade selects the database upgrade action.
upgrade = flag.Bool("upgrade", false, "Upgrade the database.")
// showVersion makes main print the version and exit.
showVersion = flag.Bool("version", false, "Show version.")
// configFile is the path handed to cli.Upgrade; it must be non-empty when upgrade is set.
configFile = flag.String("config", "", "Specify webapi.conf of v5.x version")
)
// main dispatches on the command-line flags.
//
//   - --version prints the version and exits with code 0.
//   - --upgrade runs the database upgrade using the file given by --config;
//     it exits with code 1 when --config is missing or the upgrade fails, and
//     with code 0 on success.
//   - With no action flag, the usage text is printed so the invocation does
//     not end silently.
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Println(version.Version)
		os.Exit(0)
	}

	if *upgrade {
		// --config names a file (the v5 webapi.conf), not a directory.
		if *configFile == "" {
			fmt.Println("Please specify the v5 webapi.conf file with --config.")
			os.Exit(1)
		}

		if err := cli.Upgrade(*configFile); err != nil {
			fmt.Println(err)
			os.Exit(1)
		}

		fmt.Println("Upgrade successfully.")
		os.Exit(0)
	}

	// No action was requested.
	flag.Usage()
}
================================================
FILE: cmd/edge/edge.go
================================================
package main
import (
"context"
"errors"
"fmt"
"github.com/ccfos/nightingale/v6/alert"
"github.com/ccfos/nightingale/v6/alert/astats"
"github.com/ccfos/nightingale/v6/alert/dispatch"
"github.com/ccfos/nightingale/v6/alert/process"
alertrt "github.com/ccfos/nightingale/v6/alert/router"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/ccfos/nightingale/v6/prom"
"github.com/ccfos/nightingale/v6/pushgw/idents"
pushgwrt "github.com/ccfos/nightingale/v6/pushgw/router"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/storage"
"github.com/flashcatcloud/ibex/src/cmd/ibex"
)
// Initialize wires up the edge process: it loads the configuration, sets up
// logging, Redis, the in-memory caches, the push-gateway routes and, unless
// alerting is disabled, the alert engine and its routes, then starts the HTTP
// server.
//
// It returns a cleanup function that shuts down logging and the HTTP server,
// or an error when the configuration, logging or Redis setup fails or when no
// CenterApi address is configured.
func Initialize(configDir string, cryptoKey string) (func(), error) {
config, err := conf.InitConfig(configDir, cryptoKey)
if err != nil {
return nil, fmt.Errorf("failed to init config: %v", err)
}
logxClean, err := logx.Init(config.Log)
if err != nil {
return nil, err
}
// At least one CenterApi address is required; the context below is built
// with a nil second argument and the CenterApi settings.
if len(config.CenterApi.Addrs) < 1 {
return nil, errors.New("failed to init config: the CenterApi configuration is missing")
}
ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi)
var redis storage.Redis
redis, err = storage.NewRedis(config.Redis)
if err != nil {
return nil, err
}
// Caches and helpers shared by the push gateway and the alert engine.
syncStats := memsto.NewSyncStats()
targetCache := memsto.NewTargetCache(ctx, syncStats, redis)
busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
configCvalCache := memsto.NewCvalCache(ctx, syncStats)
idents := idents.New(ctx, redis, config.Pushgw)
metas := metas.New(redis)
writers := writer.NewWriters(config.Pushgw)
// Push-gateway routes are always registered.
pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, config.Alert, targetCache, busiGroupCache, idents, metas, writers, ctx)
r := httpx.GinEngine(config.Global.RunMode, config.HTTP, configCvalCache.PrintBodyPaths, configCvalCache.PrintAccessLog)
pushgwRouter.Config(r)
macros.RegisterMacro(macros.MacroInVain)
dscache.Init(ctx, false)
// The alert engine, its caches and its routes are optional.
if !config.Alert.Disable {
configCache := memsto.NewConfigCache(ctx, syncStats, nil, "")
alertStats := astats.NewSyncStats()
dsCache := memsto.NewDatasourceCache(ctx, syncStats)
alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
notifyConfigCache := memsto.NewNotifyConfigCache(ctx, configCache)
userCache := memsto.NewUserCache(ctx, syncStats)
userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
taskTplsCache := memsto.NewTaskTplCache(ctx)
notifyRuleCache := memsto.NewNotifyRuleCache(ctx, syncStats)
notifyChannelCache := memsto.NewNotifyChannelCache(ctx, syncStats)
messageTemplateCache := memsto.NewMessageTemplateCache(ctx, syncStats)
promClients := prom.NewPromClient(ctx)
dispatch.InitRegisterQueryFunc(promClients)
externalProcessors := process.NewExternalProcessors()
alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache,
alertRuleCache, notifyConfigCache, taskTplsCache, dsCache, ctx, promClients, userCache, userGroupCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, configCvalCache)
alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors, config.Log.Dir)
alertrtRouter.Config(r)
// Ibex is only started when alerting is enabled and Ibex itself is enabled.
if config.Ibex.Enable {
ibex.ServerStart(false, nil, redis, config.HTTP.APIForService.BasicAuth, config.Alert.Heartbeat, &config.CenterApi, r, nil, config.Ibex, config.HTTP.Port)
}
}
dumper.ConfigRouter(r)
httpClean := httpx.Init(config.HTTP, r)
// The returned cleanup runs the logging cleanup first, then the HTTP one.
return func() {
logxClean()
httpClean()
}, nil
}
================================================
FILE: cmd/edge/main.go
================================================
package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"github.com/ccfos/nightingale/v6/pkg/osx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/toolkits/pkg/runner"
)
// Command-line flags of the edge binary.
var (
// showVersion makes main print the version and exit.
showVersion = flag.Bool("version", false, "Show version.")
// configDir defaults to the N9E_EDGE_CONFIGS environment variable, or "etc".
configDir = flag.String("configs", osx.GetEnv("N9E_EDGE_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_EDGE_CONFIGS)")
// cryptoKey is passed to Initialize together with configDir.
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)
// main starts the edge service and blocks until a terminating signal
// arrives. SIGINT, SIGTERM and SIGQUIT lead to exit code 0 after cleanup;
// SIGHUP is received but ignored; any other delivered signal leads to exit
// code 1 after cleanup.
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Println(version.Version)
		os.Exit(0)
	}

	printEnv()

	cleanFunc, err := Initialize(*configDir, *cryptoKey)
	if err != nil {
		log.Fatalln("failed to initialize:", err)
	}

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	exitCode := 1

WAIT:
	for received := range signals {
		fmt.Println("received signal:", received.String())

		switch received {
		case syscall.SIGHUP:
			// reload configuration?
		case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT:
			exitCode = 0
			break WAIT
		default:
			break WAIT
		}
	}

	cleanFunc()
	fmt.Println("process exited")
	os.Exit(exitCode)
}
// printEnv initialises the runner package and prints the working directory,
// hostname and the file-descriptor and virtual-memory limits it reports.
func printEnv() {
runner.Init()
fmt.Println("runner.cwd:", runner.Cwd)
fmt.Println("runner.hostname:", runner.Hostname)
fmt.Println("runner.fd_limits:", runner.FdLimits())
fmt.Println("runner.vm_limits:", runner.VMLimits())
}
================================================
FILE: cmd/pushgw/main.go
================================================
package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"github.com/ccfos/nightingale/v6/pkg/osx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/ccfos/nightingale/v6/pushgw"
"github.com/toolkits/pkg/runner"
)
// Command-line flags of the pushgw binary.
var (
// showVersion makes main print the version and exit.
showVersion = flag.Bool("version", false, "Show version.")
// configDir defaults to the N9E_PUSHGW_CONFIGS environment variable, or "etc".
configDir = flag.String("configs", osx.GetEnv("N9E_PUSHGW_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_PUSHGW_CONFIGS)")
// cryptoKey is passed to pushgw.Initialize together with configDir.
cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.")
)
// main starts the push-gateway service and blocks until a terminating signal
// arrives. SIGINT, SIGTERM and SIGQUIT lead to exit code 0 after cleanup;
// SIGHUP is received but ignored; any other delivered signal leads to exit
// code 1 after cleanup.
func main() {
	flag.Parse()

	if *showVersion {
		fmt.Println(version.Version)
		os.Exit(0)
	}

	printEnv()

	cleanFunc, err := pushgw.Initialize(*configDir, *cryptoKey)
	if err != nil {
		log.Fatalln("failed to initialize:", err)
	}

	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	exitCode := 1

WAIT:
	for received := range signals {
		fmt.Println("received signal:", received.String())

		switch received {
		case syscall.SIGHUP:
			// reload configuration?
		case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT:
			exitCode = 0
			break WAIT
		default:
			break WAIT
		}
	}

	cleanFunc()
	fmt.Println("process exited")
	os.Exit(exitCode)
}
// printEnv initialises the runner package and prints the working directory,
// hostname and the file-descriptor and virtual-memory limits it reports.
func printEnv() {
runner.Init()
fmt.Println("runner.cwd:", runner.Cwd)
fmt.Println("runner.hostname:", runner.Hostname)
fmt.Println("runner.fd_limits:", runner.FdLimits())
fmt.Println("runner.vm_limits:", runner.VMLimits())
}
================================================
FILE: conf/conf.go
================================================
package conf
import (
"fmt"
"net"
"os"
"strings"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/ccfos/nightingale/v6/pkg/cfg"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/storage"
)
// ConfigType is the root configuration structure filled by InitConfig from
// the files in the configuration directory.
type ConfigType struct {
Global GlobalConfig
Log logx.Config
HTTP httpx.Config
// DB.DSN and Redis.Password are run through decryption by decryptConfig.
DB ormx.DBConfig
Redis storage.RedisConfig
CenterApi CenterApi
Pushgw pconf.Pushgw
Alert aconf.Alert
Center cconf.Center
Ibex Ibex
}
// CenterApi holds the addresses and basic-auth credentials of the center API.
type CenterApi struct {
Addrs []string
BasicAuthUser string
BasicAuthPass string
// Timeout is an integer; its unit is not visible here.
Timeout int64
}
// GlobalConfig holds process-wide settings.
type GlobalConfig struct {
// RunMode is handed to httpx.GinEngine by the edge process.
RunMode string
}
// Ibex holds the settings of the embedded Ibex server.
type Ibex struct {
// Enable controls whether the edge process starts the Ibex server.
Enable bool
RPCListen string
Output Output
}
// Output is the Output section of the Ibex configuration.
// NOTE(review): the meaning of ComeFrom and AgtdPort is not visible in this
// file; confirm against the Ibex documentation.
type Output struct {
ComeFrom string
AgtdPort int
}
// InitConfig loads the configuration found in configDir, runs the pre-checks
// of the push-gateway, alert and center sections, decrypts encrypted fields
// with cryptoKey and fills in the alert heartbeat address.
//
// When Alert.Heartbeat.IP is not configured it is detected automatically:
// the outbound IP is used when it can be determined, otherwise the hostname.
// Alert.Heartbeat.Endpoint is always set to "<ip>:<http port>".
//
// It returns an error when the configuration cannot be loaded or decrypted,
// or when neither an outbound IP nor a hostname can be obtained.
func InitConfig(configDir, cryptoKey string) (*ConfigType, error) {
	config := new(ConfigType)

	if err := cfg.LoadConfigByDir(configDir, config); err != nil {
		return nil, fmt.Errorf("failed to load configs of directory: %s error: %s", configDir, err)
	}

	config.Pushgw.PreCheck()
	config.Alert.PreCheck(configDir)
	config.Center.PreCheck()

	if err := decryptConfig(config, cryptoKey); err != nil {
		return nil, err
	}

	if config.Alert.Heartbeat.IP == "" {
		// auto detect
		//
		// GetOutboundIP returns an empty net.IP on failure. Formatting an
		// empty net.IP yields "<nil>" rather than "", so the length is
		// checked instead of the formatted string; otherwise the hostname
		// fallback would never run.
		if ip := GetOutboundIP(); len(ip) > 0 {
			config.Alert.Heartbeat.IP = ip.String()
		} else {
			hostname, err := os.Hostname()
			if err != nil {
				return nil, fmt.Errorf("failed to get hostname: %w", err)
			}

			if strings.Contains(hostname, "localhost") {
				fmt.Println("Warning! hostname contains substring localhost, setting a more unique hostname is recommended")
			}

			config.Alert.Heartbeat.IP = hostname
		}
	}

	config.Alert.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", config.Alert.Heartbeat.IP, config.HTTP.Port)
	return config, nil
}
// GetOutboundIP reports the local address chosen for a UDP "connection" to
// 223.5.5.5:80. When the dial fails, the failure is printed and an empty
// (zero-length, non-nil) IP is returned.
func GetOutboundIP() net.IP {
	udpConn, dialErr := net.Dial("udp", "223.5.5.5:80")
	if dialErr != nil {
		fmt.Println("auto get outbound ip fail:", dialErr)
		return net.IP{}
	}
	defer udpConn.Close()

	return udpConn.LocalAddr().(*net.UDPAddr).IP
}
================================================
FILE: conf/crypto.go
================================================
package conf
import (
"fmt"
"github.com/ccfos/nightingale/v6/pkg/secu"
)
// decryptConfig replaces the encrypted secrets in config with the values
// returned by secu.DealWithDecrypt for cryptoKey: the database DSN, the Redis
// password, every basic-auth password of the service and agent HTTP APIs, and
// the basic-auth password of every push-gateway writer.
//
// It stops at the first failure and returns an error naming the field.
func decryptConfig(config *ConfigType, cryptoKey string) error {
	dsn, dsnErr := secu.DealWithDecrypt(config.DB.DSN, cryptoKey)
	if dsnErr != nil {
		return fmt.Errorf("failed to decrypt the db dsn: %s", dsnErr)
	}
	config.DB.DSN = dsn

	redisPassword, redisErr := secu.DealWithDecrypt(config.Redis.Password, cryptoKey)
	if redisErr != nil {
		return fmt.Errorf("failed to decrypt the redis password: %s", redisErr)
	}
	config.Redis.Password = redisPassword

	for account := range config.HTTP.APIForService.BasicAuth {
		plain, authErr := secu.DealWithDecrypt(config.HTTP.APIForService.BasicAuth[account], cryptoKey)
		if authErr != nil {
			return fmt.Errorf("failed to decrypt http basic auth password: %s", authErr)
		}
		config.HTTP.APIForService.BasicAuth[account] = plain
	}

	for account := range config.HTTP.APIForAgent.BasicAuth {
		plain, authErr := secu.DealWithDecrypt(config.HTTP.APIForAgent.BasicAuth[account], cryptoKey)
		if authErr != nil {
			return fmt.Errorf("failed to decrypt http basic auth password: %s", authErr)
		}
		config.HTTP.APIForAgent.BasicAuth[account] = plain
	}

	for idx, writerConf := range config.Pushgw.Writers {
		plain, writerErr := secu.DealWithDecrypt(writerConf.BasicAuthPass, cryptoKey)
		if writerErr != nil {
			return fmt.Errorf("failed to decrypt writer basic auth password: %s", writerErr)
		}
		config.Pushgw.Writers[idx].BasicAuthPass = plain
	}

	return nil
}
================================================
FILE: cron/clean_notify_record.go
================================================
package cron
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/robfig/cron/v3"
"github.com/toolkits/pkg/logger"
)
// cleanNotifyRecord deletes the notification records whose created_at is
// older than day days before now. A database failure is logged.
func cleanNotifyRecord(ctx *ctx.Context, day int) {
	cutoff := time.Now().Unix() - 86400*int64(day)

	result := models.DB(ctx).
		Model(&models.NotificationRecord{}).
		Where("created_at < ?", cutoff).
		Delete(&models.NotificationRecord{})

	if result.Error != nil {
		logger.Errorf("Failed to clean notify record: %v", result.Error)
	}
}
// CleanNotifyRecord schedules a daily cleanup of notification records at
// 01:00. day is the number of days of records to keep; values below 1 fall
// back to 7. If the job cannot be registered the failure is logged and the
// scheduler is not started.
func CleanNotifyRecord(ctx *ctx.Context, day int) {
	if day < 1 {
		day = 7
	}

	scheduler := cron.New()

	// Cron expression for "every day at 01:00".
	job := func() {
		cleanNotifyRecord(ctx, day)
	}
	if _, err := scheduler.AddFunc("0 1 * * *", job); err != nil {
		logger.Errorf("Failed to add clean notify record cron job: %v", err)
		return
	}

	scheduler.Start()
}
================================================
FILE: cron/clean_pipeline_execution.go
================================================
package cron
import (
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/robfig/cron/v3"
"github.com/toolkits/pkg/logger"
)
// Tuning values for the batched cleanup of pipeline execution records.
const (
defaultBatchSize = 100 // number of records deleted per batch
defaultSleepMs = 10 // pause after each full batch, in milliseconds
)
// cleanPipelineExecutionInBatches removes pipeline execution records older
// than day days, one batch at a time, so that a single huge delete does not
// hurt database performance.
//
// It keeps deleting until a batch comes back smaller than the batch size,
// pausing briefly after every full batch. A failing batch is logged and ends
// the run immediately. The total is logged when at least one record was
// removed.
func cleanPipelineExecutionInBatches(ctx *ctx.Context, day int) {
	cutoff := time.Now().Unix() - 86400*int64(day)

	var removed int64
	for {
		n, err := models.DeleteEventPipelineExecutionsInBatches(ctx, cutoff, defaultBatchSize)
		if err != nil {
			logger.Errorf("Failed to clean pipeline execution records in batch: %v", err)
			return
		}
		removed += n

		// A full batch means more rows may remain: pause to ease the load on
		// the database, then go again.
		if n >= int64(defaultBatchSize) {
			time.Sleep(time.Duration(defaultSleepMs) * time.Millisecond)
			continue
		}

		// A short batch means everything eligible has been deleted.
		break
	}

	if removed > 0 {
		logger.Infof("Cleaned %d pipeline execution records older than %d days", removed, day)
	}
}
// CleanPipelineExecution schedules a daily cleanup of old pipeline execution
// records at 06:00.
//
// day is the retention period in days; values below 1 fall back to 7.
// Records are deleted in batches of 100 with a 10 ms pause between batches to
// avoid the database impact of one large delete. If the job cannot be
// registered the failure is logged and the scheduler is not started.
func CleanPipelineExecution(ctx *ctx.Context, day int) {
	if day < 1 {
		day = 7 // default retention: 7 days
	}

	scheduler := cron.New()

	job := func() {
		cleanPipelineExecutionInBatches(ctx, day)
	}
	if _, err := scheduler.AddFunc("0 6 * * *", job); err != nil {
		logger.Errorf("Failed to add clean pipeline execution cron job: %v", err)
		return
	}

	scheduler.Start()
	logger.Infof("Pipeline execution cleanup cron started, retention: %d days, batch_size: %d, sleep_ms: %d", day, defaultBatchSize, defaultSleepMs)
}
================================================
FILE: datasource/ck/clickhouse.go
================================================
package ck
import (
"context"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/datasource"
ck "github.com/ccfos/nightingale/v6/dskit/clickhouse"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/mitchellh/mapstructure"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/pkg/logx"
)
// Constants of the ClickHouse datasource plugin.
const (
// CKType is the name under which the plugin registers itself in init.
CKType = "ck"
// Names of the supported epoch time-field formats.
// NOTE(review): presumably matched against QueryParam.TimeFormat; confirm.
TimeFieldFormatEpochMilli = "epoch_millis"
TimeFieldFormatEpochSecond = "epoch_second"
// NOTE(review): presumably the row limit used when a query sets none; confirm.
DefaultLimit = 500
)
var (
// ckPrivBanned is a list of privilege keywords. It is not referenced in the
// visible part of this file.
// NOTE(review): presumably privileges the datasource account must not hold — confirm against its users.
ckPrivBanned = []string{
"INSERT",
"CREATE",
"DROP",
"DELETE",
"UPDATE",
"ALL",
}
// ckBannedOp is a set of SQL statement keywords. It is not referenced in the
// visible part of this file.
// NOTE(review): presumably statements that must be rejected before execution — confirm against its users.
ckBannedOp = map[string]struct{}{
"CREATE": {},
"INSERT": {},
"ALTER": {},
"REVOKE": {},
"DROP": {},
"RENAME": {},
"ATTACH": {},
"DETACH": {},
"OPTIMIZE": {},
"TRUNCATE": {},
"SET": {},
}
)
// init registers the ClickHouse plugin under CKType in the datasource registry.
func init() {
	datasource.RegisterDatasource(CKType, &Clickhouse{})
}
// CKShard holds the connection settings of one ClickHouse shard, keyed by the
// "ck.*" names used in both JSON and mapstructure decoding.
// It is not referenced in the visible part of this file.
type CKShard struct {
Addr string `json:"ck.addr" mapstructure:"ck.addr"`
User string `json:"ck.user" mapstructure:"ck.user"`
Password string `json:"ck.password" mapstructure:"ck.password"`
Database string `json:"ck.db" mapstructure:"ck.db"`
IsEncrypted bool `json:"ck.is_encrypt" mapstructure:"ck.is_encrypt"`
}
// QueryParam is the query payload QueryLog decodes with mapstructure.
// Sql may contain "$__" macros, which QueryLog expands using From and To.
// Limit caps the number of returned rows (see getLimit).
type QueryParam struct {
Limit int `json:"limit" mapstructure:"limit"`
Sql string `json:"sql" mapstructure:"sql"`
Ref string `json:"ref" mapstructure:"ref"`
From int64 `json:"from" mapstructure:"from"`
To int64 `json:"to" mapstructure:"to"`
TimeField string `json:"time_field" mapstructure:"time_field"`
TimeFormat string `json:"time_format" mapstructure:"time_format"`
Keys datasource.Keys `json:"keys" mapstructure:"keys"`
Database string `json:"database" mapstructure:"database"`
Table string `json:"table" mapstructure:"table"`
}
// Clickhouse is the ClickHouse datasource plugin. It embeds ck.Clickhouse
// (dskit/clickhouse) inline, so its settings decode at the top level.
type Clickhouse struct {
ck.Clickhouse `json:",inline" mapstructure:",squash"`
}
// Init decodes settings into a brand-new Clickhouse value and returns it
// together with any decode error. The receiver itself is left untouched.
func (c *Clickhouse) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	instance := &Clickhouse{}
	if err := mapstructure.Decode(settings, instance); err != nil {
		return instance, err
	}
	return instance, nil
}
// InitClient initializes the client by delegating to c.InitCli and returns
// its error unchanged.
func (c *Clickhouse) InitClient() error {
return c.InitCli()
}
// Validate checks the datasource settings: at least one node must be
// configured, and both the user and the first node address must be non-blank
// (spaces only count as blank). The password is not checked.
func (c *Clickhouse) Validate(ctx context.Context) error {
	if len(c.Nodes) == 0 {
		return fmt.Errorf("ck shard is invalid, please check datasource setting")
	}

	isBlank := func(s string) bool {
		return strings.Trim(s, " ") == ""
	}

	switch {
	case isBlank(c.User):
		return fmt.Errorf("ck shard user is invalid, please check datasource setting")
	case isBlank(c.Nodes[0]):
		return fmt.Errorf("ck shard addr is invalid, please check datasource setting")
	}
	return nil
}
// Equal reports whether p describes the same ClickHouse datasource as c; it is
// used for caching. Only the user, the password and the first node address
// are compared. A p of another type, or an empty node list on either side,
// is logged and treated as "not equal".
func (c *Clickhouse) Equal(p datasource.Datasource) bool {
	other, ok := p.(*Clickhouse)
	switch {
	case !ok:
		logger.Errorf("unexpected plugin type, expected is ck")
		return false
	case len(c.Nodes) == 0:
		logger.Errorf("ck shard is empty")
		return false
	case len(other.Nodes) == 0:
		logger.Errorf("new ck plugin obj shard is empty")
		return false
	}

	// Only the first shard is compared.
	return c.User == other.User &&
		c.Nodes[0] == other.Nodes[0] &&
		c.Password == other.Password
}
// MakeLogQuery is a stub for ClickHouse: it ignores all arguments and returns (nil, nil).
func (c *Clickhouse) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// MakeTSQuery is a stub for ClickHouse: it ignores all arguments and returns (nil, nil).
func (c *Clickhouse) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// QueryMapData is a stub for ClickHouse: it ignores all arguments and returns (nil, nil).
func (c *Clickhouse) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
return nil, nil
}
// QueryData runs a time-series query against ClickHouse.
//
// query is decoded into a ck.QueryParam. "$__" macros in the SQL are expanded
// with the From/To range, and Keys.ValueKey must be set. Every row returned by
// QueryTimeseries becomes one models.DataResp tagged with the query's Ref.
// The result is an empty (non-nil) slice when there are no rows.
func (c *Clickhouse) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) {
	param := &ck.QueryParam{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, err
	}

	if strings.Contains(param.Sql, "$__") {
		expanded, err := macros.Macro(param.Sql, param.From, param.To)
		if err != nil {
			return nil, err
		}
		param.Sql = expanded
	}

	if param.Keys.ValueKey == "" {
		return nil, fmt.Errorf("valueKey is required")
	}

	series, err := c.QueryTimeseries(ctx, param)
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", param, err)
		return nil, err
	}

	resp := make([]models.DataResp, 0)
	for idx := range series {
		item := models.DataResp{
			Ref:    param.Ref,
			Metric: series[idx].Metric,
			Values: series[idx].Values,
		}
		resp = append(resp, item)
	}
	return resp, nil
}
// QueryLog runs a log query against ClickHouse.
//
// query is decoded into a QueryParam and "$__" macros in the SQL are expanded
// with the From/To range. At most getLimit(len(rows), Limit) rows are
// returned; the int64 result is the number of rows actually returned.
func (c *Clickhouse) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) {
	param := &QueryParam{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, 0, err
	}

	if strings.Contains(param.Sql, "$__") {
		expanded, err := macros.Macro(param.Sql, param.From, param.To)
		if err != nil {
			return nil, 0, err
		}
		param.Sql = expanded
	}

	rows, err := c.Query(ctx, param)
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", param, err)
		return nil, 0, err
	}

	count := getLimit(len(rows), param.Limit)
	entries := make([]interface{}, 0)
	for idx := 0; idx < count; idx++ {
		entries = append(entries, rows[idx])
	}
	return entries, int64(count), nil
}
// getLimit returns how many rows to hand back: the smaller of rowLen and the
// effective limit, where the effective limit is pLimit when positive and
// DefaultLimit otherwise.
func getLimit(rowLen, pLimit int) int {
	ceiling := pLimit
	if ceiling <= 0 {
		ceiling = DefaultLimit
	}
	if rowLen <= ceiling {
		return rowLen
	}
	return ceiling
}
================================================
FILE: datasource/commons/eslike/eslike.go
================================================
package eslike
import (
"context"
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"github.com/araddon/dateparse"
"github.com/bitly/go-simplejson"
"github.com/mitchellh/mapstructure"
"github.com/olivere/elastic/v7"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/logx"
)
// FixedField names a built-in document field. QueryLog uses these as default
// tie-breaker sort fields for search_after pagination.
type FixedField string
const (
FieldIndex FixedField = "_index"
FieldId FixedField = "_id"
)
// LabelSeparator is the delimiter placed between multiple labels.
// It is the ASCII control character Record Separator (0x1E), chosen to avoid
// clashing with "--" in user data.
const LabelSeparator = "\x1e"
// Query is the request payload decoded (via mapstructure) by QueryData,
// QueryLog, MakeLogQuery and MakeTSQuery.
type Query struct {
Ref string `json:"ref" mapstructure:"ref"`
IndexType string `json:"index_type" mapstructure:"index_type"` // plain index: "index"; index pattern: "index_pattern"
Index string `json:"index" mapstructure:"index"`
IndexPatternId int64 `json:"index_pattern" mapstructure:"index_pattern"`
Filter string `json:"filter" mapstructure:"filter"`
Offset int64 `json:"offset" mapstructure:"offset"`
MetricAggr MetricAggr `json:"value" mapstructure:"value"`
GroupBy []GroupBy `json:"group_by" mapstructure:"group_by"`
DateField string `json:"date_field" mapstructure:"date_field"`
Interval int64 `json:"interval" mapstructure:"interval"`
Start int64 `json:"start" mapstructure:"start"`
End int64 `json:"end" mapstructure:"end"`
P int `json:"page" mapstructure:"page"` // page number
Limit int `json:"limit" mapstructure:"limit"` // items per page
Ascending bool `json:"ascending" mapstructure:"ascending"` // sort order on the date field
Timeout int `json:"timeout" mapstructure:"timeout"`
MaxShard int `json:"max_shard" mapstructure:"max_shard"`
SearchAfter *SearchAfter `json:"search_after" mapstructure:"search_after"`
}
// SortField is one sort criterion: a field name and its direction.
type SortField struct {
Field string `json:"field" mapstructure:"field"`
Ascending bool `json:"ascending" mapstructure:"ascending"`
}
// SearchAfter carries the parameters for search_after style pagination.
type SearchAfter struct {
SortFields []SortField `json:"sort_fields" mapstructure:"sort_fields"` // sort fields; usually the combination timestamp:desc, _index:asc, _id:asc, which together form a unique sort key
SearchAfter []interface{} `json:"search_after" mapstructure:"search_after"` // search values for the sort fields; must follow the order of sort_fields and be the values of the last log of the previous query
}
// MetricAggr selects the metric aggregation: the function Func applied to Field.
type MetricAggr struct {
Field string `json:"field" mapstructure:"field"`
Func string `json:"func" mapstructure:"func"`
Ref string `json:"ref" mapstructure:"ref"` // variable name, e.g. A, B, C
}
// GroupBy describes one grouping level of an aggregation query.
type GroupBy struct {
Cate GroupByCate `json:"cate" mapstructure:"cate"` // grouping type
Field string `json:"field" mapstructure:"field"`
MinDocCount int64 `json:"min_doc_count" mapstructure:"min_doc_count"`
Order string `json:"order" mapstructure:"order"`
OrderBy string `json:"order_by" mapstructure:"order_by"`
Size int `json:"size" mapstructure:"size"`
Params []Param `json:"params" mapstructure:"params"` // used when the type is filter
Interval int64 `json:"interval" mapstructure:"interval"` // grouping interval
}
// SearchFunc executes the search request `source` against `indices`, given a
// timeout and a max-shard setting. QueryData and QueryLog receive it as a
// parameter rather than talking to a client directly.
type SearchFunc func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error)
// QueryFieldsFunc returns a list of strings for the given indices.
// NOTE(review): presumably the field names of those indices — confirm with implementations.
type QueryFieldsFunc func(indices []string) ([]string, error)
// GroupByCate is the grouping type of a GroupBy level.
type GroupByCate string
const (
Filters GroupByCate = "filters"
Histogram GroupByCate = "histogram"
Terms GroupByCate = "terms"
)
// Param is a parameter of a filter-type GroupBy.
type Param struct {
Alias string `json:"alias,omitempty"` // alias in a=b form; filter only
Query string `json:"query,omitempty"` // query condition; filter only
}
// MetricPtr is the accumulator GetBuckets fills in. Data maps a label string
// (labels joined with LabelSeparator) to its list of [timestamp, value] points.
type MetricPtr struct {
Data map[string][][]float64
}
// IterGetMap flattens the nested map m into ret.
//
// Nested maps are walked recursively and their keys are joined with ".", so
// {"host": {"name": "n1"}} yields the key "host.name". Every leaf value is
// stored wrapped in a one-element []interface{}. prefixKey is the dotted path
// of m itself; pass "" for the root, in which case top-level leaves keep
// their plain key (no leading dot).
func IterGetMap(m, ret map[string]interface{}, prefixKey string) {
	for k, v := range m {
		// Build the dotted path once so leaves and nested maps agree on it.
		key := k
		if prefixKey != "" {
			key = prefixKey + "." + k
		}

		if nested, ok := v.(map[string]interface{}); ok {
			IterGetMap(nested, ret, key)
			continue
		}
		ret[key] = []interface{}{v}
	}
}
// TransferData converts the aggregated points in m into models.DataResp items.
//
// Each key of m is a label string: "name=value" pairs joined by
// LabelSeparator. The pairs become metric labels (pieces without "=" are
// ignored), the raw key is kept in Labels, and the points go into Values.
// The "__name__" label starts out as `metric` and is finally prefixed with
// ref + "_". The result is nil when m is empty.
func TransferData(metric, ref string, m map[string][][]float64) []models.DataResp {
	var out []models.DataResp
	for labelStr, points := range m {
		series := model.Metric{"__name__": model.LabelValue(metric)}

		for _, pair := range strings.Split(labelStr, LabelSeparator) {
			kv := strings.SplitN(pair, "=", 2)
			if len(kv) != 2 {
				continue
			}
			series[model.LabelName(kv[0])] = model.LabelValue(kv[1])
		}

		// Prefix whatever ended up as the metric name (a "__name__" label in
		// the key overrides the default) with the query reference.
		if name, ok := series["__name__"]; ok {
			series["__name__"] = model.LabelValue(ref) + "_" + name
		}

		out = append(out, models.DataResp{
			Ref:    ref,
			Metric: series,
			Labels: labelStr,
			Values: points,
		})
	}
	return out
}
// GetQueryString combines the user filter with the time range query q.
//
//   - empty filter: a bool query with q as its only "should" clause;
//   - filter containing ":", "AND", "OR" or "NOT": the filter is used as a
//     query_string "must" clause, with q as a filter clause;
//   - anything else: the filter is matched as a lenient multi-match phrase,
//     with q as an additional filter clause.
func GetQueryString(filter string, q *elastic.RangeQuery) *elastic.BoolQuery {
	if filter == "" {
		return elastic.NewBoolQuery().Should(q)
	}

	for _, marker := range []string{":", "AND", "OR", "NOT"} {
		if strings.Contains(filter, marker) {
			return elastic.NewBoolQuery().Must(elastic.NewQueryStringQuery(filter)).Filter(q)
		}
	}

	phrase := elastic.NewMultiMatchQuery(filter).Lenient(true).Type("phrase")
	return elastic.NewBoolQuery().Filter(phrase).Filter(q)
}
// getUnixTs turns a bucket's time string into an integer timestamp.
// A string that parses as a base-10 integer is returned as is; otherwise it
// is parsed as a date and converted to Unix milliseconds. An unparseable
// string is logged and yields 0.
func getUnixTs(timeStr string) int64 {
	if numeric, err := strconv.ParseInt(timeStr, 10, 64); err == nil {
		return numeric
	}

	parsed, err := dateparse.ParseAny(timeStr)
	if err != nil {
		logger.Error("failed to ParseAny: ", err)
		return 0
	}
	return parsed.UnixMilli()
}
// GetBuckets walks a (possibly nested) aggregation bucket array and appends
// one [timestamp, value] point per leaf bucket to metrics.Data, keyed by the
// label string accumulated on the way down.
//
// Parameters:
//   - labelKey: label name for the keys of the buckets in arr.
//   - keys: names of the sub-aggregations still to descend into; keys[0] is
//     looked up in every bucket. An empty keys means "use doc_count".
//   - arr: the decoded "buckets" array; numbers are expected as json.Number.
//   - metrics: accumulator; metrics.Data must be a non-nil map.
//   - labels: label string built by the enclosing levels.
//   - ts: timestamp inherited from the enclosing level; a bucket carrying
//     "key_as_string" replaces it. It is divided by 1000 before being stored.
//   - f: metric function name; "count" and "nodata" read doc_count.
//
// Malformed buckets (unexpected types, null metric values) are skipped with a
// warning instead of panicking.
func GetBuckets(labelKey string, keys []string, arr []interface{}, metrics *MetricPtr, labels string, ts int64, f string) {
	var err error
	bucketsKey := ""
	if len(keys) > 0 {
		bucketsKey = keys[0]
	}

	// newlabels lives outside the loop: a bucket that carries key_as_string
	// does not rebuild it and therefore reuses the value left by the
	// previous iteration.
	newlabels := ""
	for i := 0; i < len(arr); i++ {
		tmp, ok := arr[i].(map[string]interface{})
		if !ok {
			logger.Warningf("labelKey:%s unexpected bucket type %T, skipped", labelKey, arr[i])
			continue
		}

		keyAsString, getTs := tmp["key_as_string"]
		if getTs {
			if s, isStr := keyAsString.(string); isStr {
				ts = getUnixTs(s)
			}
		}

		keyValue := tmp["key"]
		switch keyValue.(type) {
		case json.Number, string:
			if !getTs {
				if labels != "" {
					newlabels = fmt.Sprintf("%s%s%s=%v", labels, LabelSeparator, labelKey, keyValue)
				} else {
					newlabels = fmt.Sprintf("%s=%v", labelKey, keyValue)
				}
			}
		default:
			continue
		}

		var finalValue float64
		if len(keys) == 0 { // doc_count case: nothing left to descend into
			count, isNum := tmp["doc_count"].(json.Number)
			if !isNum {
				logger.Warningf("labelKey:%s doc_count missing or not a number, skipped", labelKey)
				continue
			}
			finalValue, err = count.Float64()
			if err != nil {
				logger.Warningf("labelKey:%s get value error:%v", labelKey, err)
			}
			metrics.Data[newlabels] = append(metrics.Data[newlabels], []float64{float64(ts / 1000), finalValue})
			continue
		}

		innerBuckets, exists := tmp[bucketsKey]
		if !exists {
			continue
		}
		inner, ok := innerBuckets.(map[string]interface{})
		if !ok {
			logger.Warningf("labelKey:%s unexpected aggregation type %T for %s, skipped", labelKey, innerBuckets, bucketsKey)
			continue
		}

		// A nested bucket aggregation: descend one level. keys is non-empty
		// here, so keys[1:] is valid (and empty at the innermost level).
		if nextBucketsArr, hasBuckets := inner["buckets"]; hasBuckets {
			next, isArr := nextBucketsArr.([]interface{})
			if !isArr {
				logger.Warningf("labelKey:%s unexpected buckets type %T for %s, skipped", labelKey, nextBucketsArr, bucketsKey)
				continue
			}
			GetBuckets(bucketsKey, keys[1:], next, metrics, newlabels, ts, f)
			continue
		}

		// Leaf: extract the metric value.
		if f == "count" || f == "nodata" {
			count, isNum := tmp["doc_count"].(json.Number)
			if !isNum {
				logger.Warningf("labelKey:%s doc_count missing or not a number, skipped", labelKey)
				continue
			}
			finalValue, err = count.Float64()
			if err != nil {
				logger.Warningf("get %v value error:%v", count, err)
			}
		} else if raw, hasValue := inner["value"]; hasValue {
			// Single-value metric (avg/max/min/sum/...). A non-numeric value
			// (e.g. null) leaves finalValue at 0.
			if num, isNum := raw.(json.Number); isNum {
				value, err := num.Float64()
				if err != nil {
					logger.Warningf("labelKey:%s get value error:%v", labelKey, err)
				}
				finalValue = value
			}
		} else {
			// Multi-value metric (percentiles): {"values": {"95.0": 1.2}}.
			percentiles, isMap := inner["values"].(map[string]interface{})
			if !isMap {
				logger.Warningf("labelKey:%s has neither value nor values for %s, skipped", labelKey, bucketsKey)
				continue
			}
			found := false
			for _, v := range percentiles {
				num, isNum := v.(json.Number)
				if !isNum {
					// A null percentile means there is no value for this bucket.
					continue
				}
				finalValue, err = num.Float64()
				if err != nil {
					logger.Warningf("labelKey:%s get value error:%v", labelKey, err)
				}
				found = true
			}
			if !found {
				logger.Warningf("labelKey:%s no numeric percentile value for %s, skipped", labelKey, bucketsKey)
				continue
			}
		}

		metrics.Data[newlabels] = append(metrics.Data[newlabels], []float64{float64(ts / 1000), finalValue})
	}
}
// buildTaggedQuery decodes query into a Query, ANDs the event tags onto its
// filter and sets the time range. Tags of the form "k=v" are rewritten to
// k:"v" (value quoted with strconv.Quote); other tags are used verbatim.
func buildTaggedQuery(query interface{}, eventTags []string, start, end int64) (*Query, error) {
	param := new(Query)
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, err
	}

	// Build the clauses in a fresh slice so the caller's eventTags is never
	// modified; re-processing an already rewritten tag would corrupt values
	// that contain "=".
	clauses := make([]string, 0, len(eventTags))
	for _, tag := range eventTags {
		arr := strings.SplitN(tag, "=", 2)
		if len(arr) == 2 {
			tag = fmt.Sprintf("%s:%s", arr[0], strconv.Quote(arr[1]))
		}
		clauses = append(clauses, tag)
	}

	if len(clauses) > 0 {
		joined := strings.Join(clauses, " AND ")
		if param.Filter == "" {
			param.Filter = joined
		} else {
			param.Filter = param.Filter + " AND " + joined
		}
	}

	param.Start = start
	param.End = end
	return param, nil
}

// MakeLogQuery builds the log query for an event: query decoded into a
// *Query whose filter is narrowed by eventTags and whose range is
// [start, end]. eventTags is not modified.
func MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
	param, err := buildTaggedQuery(query, eventTags, start, end)
	if err != nil {
		// Return an untyped nil so callers comparing the result to nil see nil.
		return nil, err
	}
	return param, nil
}

// MakeTSQuery builds the time-series query for an event: query decoded into
// a *Query whose filter is narrowed by eventTags and whose range is
// [start, end]. eventTags is not modified.
func MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
	param, err := buildTaggedQuery(query, eventTags, start, end)
	if err != nil {
		// Return an untyped nil so callers comparing the result to nil see nil.
		return nil, err
	}
	return param, nil
}
// esIndexPatternCache is the package-level index pattern cache. QueryData and
// QueryLog use it to resolve an index pattern ID. It is nil until
// SetEsIndexPatternCacheType is called.
var esIndexPatternCache *memsto.EsIndexPatternCacheType
// SetEsIndexPatternCacheType installs c as the package-level index pattern cache.
func SetEsIndexPatternCacheType(c *memsto.EsIndexPatternCacheType) {
esIndexPatternCache = c
}
// GetEsIndexPatternCacheType returns the package-level index pattern cache
// (nil if none has been installed).
func GetEsIndexPatternCacheType() *memsto.EsIndexPatternCacheType {
return esIndexPatternCache
}
// QueryData runs a date-histogram metric aggregation and returns one
// models.DataResp per resulting series.
//
// Parameters:
//   - queryParam: decoded into a Query with mapstructure.
//   - cliTimeout: fallback for Query.Timeout, used as cliTimeout/1000
//     (NOTE(review): presumably milliseconds to seconds — confirm).
//   - version: server version string ("major.minor..."); it selects between
//     fixed_interval (7.2+ and 8+) and interval, and whether a bucket offset
//     is applied (7+).
//   - search: the function that actually executes the request.
//
// Defaults: Interval 60, MaxShard 5, DateField "@timestamp". The window is
// [Start, End) when both are set, otherwise the last Interval seconds; it is
// then shifted back by the "delay" context value (int64) and by Offset.
//
// An error is returned for an unknown index pattern, an unsupported metric
// function, an unusable first group_by, a failed search, or shard failures
// that left no aggregation data.
func QueryData(ctx context.Context, queryParam interface{}, cliTimeout int64, version string, search SearchFunc) ([]models.DataResp, error) {
	param := new(Query)
	if err := mapstructure.Decode(queryParam, param); err != nil {
		return nil, err
	}

	if param.Timeout == 0 {
		param.Timeout = int(cliTimeout) / 1000
	}
	if param.Interval == 0 {
		param.Interval = 60
	}
	if param.MaxShard < 1 {
		param.MaxShard = 5
	}
	if param.DateField == "" {
		param.DateField = "@timestamp"
	}

	var indexArr []string
	if param.IndexType == "index_pattern" {
		ip, found := GetEsIndexPatternCacheType().Get(param.IndexPatternId)
		if !found {
			return nil, fmt.Errorf("index pattern:%d not found", param.IndexPatternId)
		}
		// The index pattern dictates both the time field and the index name.
		param.DateField = ip.TimeField
		indexArr = []string{ip.Name}
		param.Index = ip.Name
	} else {
		indexArr = strings.Split(param.Index, ",")
	}

	q := elastic.NewRangeQuery(param.DateField)
	now := time.Now().Unix()
	var start, end int64
	if param.End != 0 && param.Start != 0 {
		end = param.End
		start = param.Start
	} else {
		end = now
		start = end - param.Interval
	}

	delay, ok := ctx.Value("delay").(int64)
	if ok && delay != 0 {
		end = end - delay
		start = start - delay
	}

	if param.Offset > 0 {
		end = end - param.Offset
		start = start - param.Offset
	}

	q.Gte(time.Unix(start, 0).UnixMilli())
	q.Lt(time.Unix(end, 0).UnixMilli())
	q.Format("epoch_millis")

	field := param.MetricAggr.Field
	groupBys := param.GroupBy
	queryString := GetQueryString(param.Filter, q)

	var aggr elastic.Aggregation
	switch param.MetricAggr.Func {
	case "avg":
		aggr = elastic.NewAvgAggregation().Field(field)
	case "max":
		aggr = elastic.NewMaxAggregation().Field(field)
	case "min":
		aggr = elastic.NewMinAggregation().Field(field)
	case "sum":
		aggr = elastic.NewSumAggregation().Field(field)
	case "count":
		aggr = elastic.NewValueCountAggregation().Field(field)
	case "p90":
		aggr = elastic.NewPercentilesAggregation().Percentiles(90).Field(field)
	case "p95":
		aggr = elastic.NewPercentilesAggregation().Percentiles(95).Field(field)
	case "p99":
		aggr = elastic.NewPercentilesAggregation().Percentiles(99).Field(field)
	case "median":
		aggr = elastic.NewPercentilesAggregation().Percentiles(50).Field(field)
	default:
		return nil, fmt.Errorf("func %s not support", param.MetricAggr.Func)
	}

	tsAggr := elastic.NewDateHistogramAggregation().
		Field(param.DateField).
		MinDocCount(1)

	// Unparseable version parts count as 0.
	versionParts := strings.Split(version, ".")
	major := 0
	if len(versionParts) > 0 {
		if m, err := strconv.Atoi(versionParts[0]); err == nil {
			major = m
		}
	}
	minor := 0
	if len(versionParts) > 1 {
		if m, err := strconv.Atoi(versionParts[1]); err == nil {
			minor = m
		}
	}

	if major >= 7 {
		// Add an offset so that the left edge of the first bucket aligns with start.
		offset := (start % param.Interval) - param.Interval

		// fixed_interval is used for ES 7.2+ and for any major > 7 (e.g. ES 8).
		if (major > 7) || (major == 7 && minor >= 2) {
			tsAggr.FixedInterval(fmt.Sprintf("%ds", param.Interval)).Offset(fmt.Sprintf("%ds", offset))
		} else {
			// 7.0-7.1 use interval (with offset).
			tsAggr.Interval(fmt.Sprintf("%ds", param.Interval)).Offset(fmt.Sprintf("%ds", offset))
		}
	} else {
		// Compatible with versions below 7.0; OpenSearch uses this field too.
		tsAggr.Interval(fmt.Sprintf("%ds", param.Interval))
	}

	// group by
	var groupByAggregation elastic.Aggregation
	if len(groupBys) > 0 {
		groupBy := groupBys[0]

		if groupBy.MinDocCount == 0 {
			groupBy.MinDocCount = 1
		}
		if groupBy.Size == 0 {
			groupBy.Size = 300
		}

		// Innermost grouping level: it carries the metric aggregation, except
		// for "count", which is read from doc_count.
		switch groupBy.Cate {
		case Terms:
			if param.MetricAggr.Func != "count" {
				groupByAggregation = elastic.NewTermsAggregation().Field(groupBy.Field).SubAggregation(field, aggr).OrderByKeyDesc().Size(groupBy.Size).MinDocCount(int(groupBy.MinDocCount))
			} else {
				groupByAggregation = elastic.NewTermsAggregation().Field(groupBy.Field).OrderByKeyDesc().Size(groupBy.Size).MinDocCount(int(groupBy.MinDocCount))
			}
		case Histogram:
			if param.MetricAggr.Func != "count" {
				groupByAggregation = elastic.NewHistogramAggregation().Field(groupBy.Field).Interval(float64(groupBy.Interval)).SubAggregation(field, aggr)
			} else {
				groupByAggregation = elastic.NewHistogramAggregation().Field(groupBy.Field).Interval(float64(groupBy.Interval))
			}
		case Filters:
			// Each iteration overwrites the previous one: only the last param takes effect.
			for _, filterParam := range groupBy.Params {
				if param.MetricAggr.Func != "count" {
					groupByAggregation = elastic.NewFilterAggregation().Filter(elastic.NewTermQuery(filterParam.Query, "true")).SubAggregation(field, aggr)
				} else {
					groupByAggregation = elastic.NewFilterAggregation().Filter(elastic.NewTermQuery(filterParam.Query, "true"))
				}
			}
		}

		// An unknown cate, or filters without params, builds nothing. Attaching
		// a nil aggregation would panic when the request is serialized.
		if groupByAggregation == nil {
			return nil, fmt.Errorf("group_by cate %q is not supported or has no params", groupBy.Cate)
		}

		// Every further level wraps the aggregation built so far.
		for i := 1; i < len(groupBys); i++ {
			groupBy := groupBys[i]

			if groupBy.MinDocCount == 0 {
				groupBy.MinDocCount = 1
			}
			if groupBy.Size == 0 {
				groupBy.Size = 300
			}

			switch groupBy.Cate {
			case Terms:
				groupByAggregation = elastic.NewTermsAggregation().Field(groupBy.Field).SubAggregation(groupBys[i-1].Field, groupByAggregation).OrderByKeyDesc().Size(groupBy.Size).MinDocCount(int(groupBy.MinDocCount))
			case Histogram:
				groupByAggregation = elastic.NewHistogramAggregation().Field(groupBy.Field).Interval(float64(groupBy.Interval)).SubAggregation(groupBys[i-1].Field, groupByAggregation)
			case Filters:
				for _, filterParam := range groupBy.Params {
					groupByAggregation = elastic.NewFilterAggregation().Filter(elastic.NewTermQuery(filterParam.Query, "true")).SubAggregation(groupBys[i-1].Field, groupByAggregation)
				}
			}
		}

		tsAggr.SubAggregation(groupBys[len(groupBys)-1].Field, groupByAggregation)
	} else if param.MetricAggr.Func != "count" {
		tsAggr.SubAggregation(field, aggr)
	}

	source, _ := queryString.Source()
	b, _ := json.Marshal(source)
	logx.Debugf(ctx, "query_data q:%+v indexArr:%+v tsAggr:%+v query_string:%s", param, indexArr, tsAggr, string(b))

	searchSource := elastic.NewSearchSource().
		Query(queryString).
		Aggregation("ts", tsAggr)

	searchSourceString, err := searchSource.Source()
	if err != nil {
		logx.Warningf(ctx, "query_data searchSource:%s to string error:%v", searchSourceString, err)
	}

	jsonSearchSource, err := json.Marshal(searchSourceString)
	if err != nil {
		logx.Warningf(ctx, "query_data searchSource:%s to json error:%v", searchSourceString, err)
	}

	result, err := search(ctx, indexArr, searchSource, param.Timeout, param.MaxShard)
	if err != nil {
		logx.Warningf(ctx, "query_data searchSource:%s query_data error:%v", searchSourceString, err)
		return nil, err
	}

	// Check for shard failures. With partial data available the failure is
	// only logged (by checkShardFailures) and processing continues.
	if shardErr := checkShardFailures(ctx, result.Shards, "query_data", searchSourceString); shardErr != nil {
		if len(result.Aggregations["ts"]) == 0 {
			return nil, shardErr
		}
	}

	logx.Infof(ctx, "query_data searchSource:%s resp:%s", string(jsonSearchSource), string(result.Aggregations["ts"]))

	js, err := simplejson.NewJson(result.Aggregations["ts"])
	if err != nil {
		return nil, err
	}

	bucketsData, err := js.Get("buckets").Array()
	if err != nil {
		return nil, err
	}

	// Aggregation names to descend through, outermost first: group-by fields
	// in reverse order, then the metric field (unless counting).
	var keys []string
	for i := len(groupBys) - 1; i >= 0; i-- {
		keys = append(keys, groupBys[i].Field)
	}
	if param.MetricAggr.Func != "count" {
		keys = append(keys, field)
	}

	metrics := &MetricPtr{Data: make(map[string][][]float64)}

	GetBuckets("", keys, bucketsData, metrics, "", 0, param.MetricAggr.Func)

	// Drop the last incomplete bucket to avoid inaccurate values at the boundary.
	// When the last bucket's time range extends beyond or reaches the query end time,
	// it may contain only partial data, making aggregated values (count, sum, etc.) artificially low.
	for k, v := range metrics.Data {
		if len(v) <= 1 {
			continue
		}
		lastTs := v[len(v)-1][0]
		if int64(lastTs)+param.Interval > end {
			metrics.Data[k] = v[:len(v)-1]
		}
	}

	items := TransferData(fmt.Sprintf("%s_%s", field, param.MetricAggr.Func), param.Ref, metrics.Data)

	// Attach the original query, with the resolved index, to every item.
	var m map[string]interface{}
	if bs, marshalErr := json.Marshal(queryParam); marshalErr != nil {
		logx.Warningf(ctx, "query_data marshal query param error:%v", marshalErr)
	} else if unmarshalErr := json.Unmarshal(bs, &m); unmarshalErr != nil {
		logx.Warningf(ctx, "query_data unmarshal query param error:%v", unmarshalErr)
	}
	// m stays nil when queryParam is nil (JSON null) or is not a JSON object;
	// writing to a nil map would panic.
	if m == nil {
		m = make(map[string]interface{})
	}
	m["index"] = param.Index
	for i := range items {
		items[i].Query = fmt.Sprintf("%+v", m)
	}
	return items, nil
}
// checkShardFailures inspects the shard section of an ES search result.
// When at least one reported failure carries a textual reason, it logs a
// warning (prefixed with logPrefix and the query context) and returns an
// error listing every failure as "index=… shard=…: type: reason".
// It returns nil when shards is nil, nothing failed, or no failure has a
// usable reason.
func checkShardFailures(ctx context.Context, shards *elastic.ShardsInfo, logPrefix string, queryContext interface{}) error {
	if shards == nil || shards.Failed == 0 || len(shards.Failures) == 0 {
		return nil
	}

	var details []string
	for _, failure := range shards.Failures {
		if failure.Reason == nil {
			continue
		}

		// Combine "type" and "reason" into "type: reason", or whichever exists.
		text, _ := failure.Reason["type"].(string)
		if msg, ok := failure.Reason["reason"].(string); ok {
			if text == "" {
				text = msg
			} else {
				text += ": " + msg
			}
		}
		if text == "" {
			continue
		}

		details = append(details, fmt.Sprintf("index=%s shard=%d: %s", failure.Index, failure.Shard, text))
	}

	if len(details) == 0 {
		return nil
	}

	errMsg := fmt.Sprintf("elasticsearch shard failures (%d/%d failed): %s", shards.Failed, shards.Total, strings.Join(details, "; "))
	logx.Warningf(ctx, "%s query:%v %s", logPrefix, queryContext, errMsg)
	return fmt.Errorf("%s", errMsg)
}
// HitFilter reports whether a field of the given mapping type is filtered:
// it returns false for keyword, date and the numeric types listed below, and
// true for every other type. The comparison is case-sensitive.
func HitFilter(typ string) bool {
	exempt := [...]string{
		"keyword", "date", "long", "integer", "short", "byte",
		"double", "float", "half_float", "scaled_float", "unsigned_long",
	}
	for _, name := range exempt {
		if typ == name {
			return false
		}
	}
	return true
}
// QueryLog fetches raw log hits and returns them with the total hit count.
//
// Parameters:
//   - queryParam: decoded into a Query with mapstructure.
//   - timeout: fallback for Query.Timeout when it is 0.
//   - version: server version string; for versions starting with "6" each
//     hit's _source is flattened into its Fields map (see IterGetMap).
//   - maxShard: fallback for Query.MaxShard when it is below 1.
//   - search: the function that actually executes the request.
//
// Defaults: DateField "@timestamp", Limit 10. The window is [Start, End) when
// both are set, otherwise [now-Interval, now). Pagination uses search_after
// when Query.SearchAfter is set, and from+size otherwise.
//
// An error is returned for an unknown index pattern, a failed search, or
// shard failures that left no hits.
func QueryLog(ctx context.Context, queryParam interface{}, timeout int64, version string, maxShard int, search SearchFunc) ([]interface{}, int64, error) {
	param := new(Query)
	if err := mapstructure.Decode(queryParam, param); err != nil {
		return nil, 0, err
	}

	if param.Timeout == 0 {
		param.Timeout = int(timeout)
	}

	// Same default as QueryData: without it an empty field name would be used
	// for both the range query and the sort.
	if param.DateField == "" {
		param.DateField = "@timestamp"
	}

	var indexArr []string
	if param.IndexType == "index_pattern" {
		ip, found := GetEsIndexPatternCacheType().Get(param.IndexPatternId)
		if !found {
			return nil, 0, fmt.Errorf("index pattern:%d not found", param.IndexPatternId)
		}
		param.DateField = ip.TimeField
		indexArr = []string{ip.Name}
	} else {
		indexArr = strings.Split(param.Index, ",")
	}

	now := time.Now().Unix()
	var start, end int64
	if param.End != 0 && param.Start != 0 {
		end = param.End
		start = param.Start
	} else {
		end = now
		start = end - param.Interval
	}

	q := elastic.NewRangeQuery(param.DateField)
	q.Gte(time.Unix(start, 0).UnixMilli())
	q.Lt(time.Unix(end, 0).UnixMilli())
	q.Format("epoch_millis")

	queryString := GetQueryString(param.Filter, q)

	if param.Limit <= 0 {
		param.Limit = 10
	}

	if param.MaxShard < 1 {
		param.MaxShard = maxShard
	}

	// from+size pagination is bounded by ES's max_result_window setting
	// (10,000 logs by default); search_after can be used to fetch more.
	source := elastic.NewSearchSource().
		TrackTotalHits(true).
		Query(queryString).
		Size(param.Limit)

	// Use search_after pagination when requested.
	if param.SearchAfter != nil {
		// Fall back to the default sort fields.
		if len(param.SearchAfter.SortFields) == 0 {
			source = source.Sort(param.DateField, param.Ascending).Sort(string(FieldIndex), true).Sort(string(FieldId), true)
		} else {
			for _, field := range param.SearchAfter.SortFields {
				source = source.Sort(field.Field, field.Ascending)
			}
		}
		if len(param.SearchAfter.SearchAfter) > 0 {
			source = source.SearchAfter(param.SearchAfter.SearchAfter...)
		}
	} else {
		source = source.From(param.P).Sort(param.DateField, param.Ascending)
	}

	sourceBytes, _ := json.Marshal(source)
	result, err := search(ctx, indexArr, source, param.Timeout, param.MaxShard)
	if err != nil {
		logx.Warningf(ctx, "query_log source:%s error:%v", string(sourceBytes), err)
		return nil, 0, err
	}

	// Check for shard failures. With partial data available the failure is
	// only logged (by checkShardFailures) and processing continues.
	if shardErr := checkShardFailures(ctx, result.Shards, "query_log", string(sourceBytes)); shardErr != nil {
		if len(result.Hits.Hits) == 0 {
			return nil, 0, shardErr
		}
	}

	total := result.TotalHits()

	var ret []interface{}
	logx.Debugf(ctx, "query_log source:%s len:%d total:%d", string(sourceBytes), len(result.Hits.Hits), total)

	resultBytes, _ := json.Marshal(result)
	logx.Debugf(ctx, "query_log source:%s result:%s", string(sourceBytes), string(resultBytes))

	if strings.HasPrefix(version, "6") {
		for i := 0; i < len(result.Hits.Hits); i++ {
			var x map[string]interface{}
			err := json.Unmarshal(result.Hits.Hits[i].Source, &x)
			if err != nil {
				// A hit whose _source cannot be decoded is dropped.
				logx.Warningf(ctx, "Unmarshal source error:%v", err)
				continue
			}

			if result.Hits.Hits[i].Fields == nil {
				result.Hits.Hits[i].Fields = make(map[string]interface{})
			}

			IterGetMap(x, result.Hits.Hits[i].Fields, "")
			ret = append(ret, result.Hits.Hits[i])
		}
	} else {
		for _, hit := range result.Hits.Hits {
			ret = append(ret, hit)
		}
	}

	return ret, total, nil
}
================================================
FILE: datasource/datasource.go
================================================
package datasource
import (
"context"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/models"
)
// DatasourceType describes one built-in datasource plugin type: its numeric
// ID, its category, the plugin type key and a display name.
type DatasourceType struct {
Id int64 `json:"id"`
Category string `json:"category"`
PluginType string `json:"type"`
PluginTypeName string `json:"type_name"`
}
// Keys names the result columns a SQL-style datasource query maps to values,
// labels and time.
type Keys struct {
ValueKey string `json:"valueKey" mapstructure:"valueKey"` // multiple keys are separated by spaces
LabelKey string `json:"labelKey" mapstructure:"labelKey"` // multiple keys are separated by spaces
TimeKey string `json:"timeKey" mapstructure:"timeKey"`
TimeFormat string `json:"timeFormat" mapstructure:"timeFormat"`
}
// DatasourceTypes maps a plugin ID to its built-in datasource type. It is
// populated by init.
var DatasourceTypes map[int64]DatasourceType

// init fills DatasourceTypes with the built-in plugin types, keyed by ID.
func init() {
	builtin := []DatasourceType{
		{Id: 1, Category: "timeseries", PluginType: "prometheus", PluginTypeName: "Prometheus Like"},
		{Id: 2, Category: "logging", PluginType: "elasticsearch", PluginTypeName: "Elasticsearch"},
		{Id: 3, Category: "logging", PluginType: "aliyun-sls", PluginTypeName: "SLS"},
		{Id: 4, Category: "timeseries", PluginType: "ck", PluginTypeName: "ClickHouse"},
		{Id: 5, Category: "timeseries", PluginType: "mysql", PluginTypeName: "MySQL"},
		{Id: 6, Category: "timeseries", PluginType: "pgsql", PluginTypeName: "PostgreSQL"},
		{Id: 7, Category: "logging", PluginType: "victorialogs", PluginTypeName: "VictoriaLogs"},
	}

	DatasourceTypes = make(map[int64]DatasourceType)
	for _, dt := range builtin {
		DatasourceTypes[dt.Id] = dt
	}
}
// NewDatasourceFn builds a Datasource from its raw settings map.
type NewDatasourceFn func(settings map[string]interface{}) (Datasource, error)
// datasourceRegister maps a plugin type name to its constructor. It is
// written by RegisterDatasource and read by GetDatasourceByType.
var datasourceRegister = map[string]NewDatasourceFn{}
// Datasource is the contract every datasource plugin implements.
type Datasource interface {
Init(settings map[string]interface{}) (Datasource, error) // initialize from configuration
InitClient() error // initialize the client
Validate(ctx context.Context) error // validate parameters
Equal(p Datasource) bool // report whether two datasources are equal
MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error)
MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error)
QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error)
QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error)
// Called when an alert event is generated, to fetch additional data.
QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error)
}
// RegisterDatasource registers p's Init method as the constructor for plugin
// type typ. The first registration wins: a type that is already registered
// is left unchanged.
func RegisterDatasource(typ string, p Datasource) {
	if _, exists := datasourceRegister[typ]; !exists {
		datasourceRegister[typ] = p.Init
	}
}
// GetDatasourceByType builds a Datasource of the given plugin type from
// settings. Every ".logging" occurrence is removed from typ before the
// lookup. It returns an error when the type is not registered or when the
// constructor fails.
func GetDatasourceByType(typ string, settings map[string]interface{}) (Datasource, error) {
	key := strings.ReplaceAll(typ, ".logging", "")

	construct, ok := datasourceRegister[key]
	if !ok {
		return nil, fmt.Errorf("plugin type %s not found", key)
	}

	ds, err := construct(settings)
	if err != nil {
		return nil, err
	}
	return ds, nil
}
// DatasourceInfo is the JSON representation of a configured datasource:
// identity, plugin type, raw plugin settings, HTTP and auth configuration,
// status and timestamps.
type DatasourceInfo struct {
Id int64 `json:"id"`
Name string `json:"name"`
Identifier string `json:"identifier"`
Description string `json:"description"`
ClusterName string `json:"cluster_name"`
Category string `json:"category"`
PluginId int64 `json:"plugin_id"`
Type string `json:"plugin_type"`
PluginTypeName string `json:"plugin_type_name"`
Settings map[string]interface{} `json:"settings"`
HTTPJson models.HTTP `json:"http"`
AuthJson models.Auth `json:"auth"`
Status string `json:"status"`
CreatedAt int64 `json:"created_at"`
UpdatedAt int64 `json:"updated_at"`
IsDefault bool `json:"is_default"`
Weight int `json:"weight"`
}
================================================
FILE: datasource/doris/doris.go
================================================
package doris
import (
"context"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/ccfos/nightingale/v6/dskit/doris"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/mitchellh/mapstructure"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/pkg/logx"
)
const (
// DorisType is the plugin type name under which init registers this datasource.
DorisType = "doris"
)
// init registers the Doris plugin under DorisType in the datasource registry.
func init() {
	datasource.RegisterDatasource(DorisType, &Doris{})
}
// Doris is the Doris datasource plugin. It embeds doris.Doris (dskit/doris)
// inline, so its settings decode at the top level.
type Doris struct {
doris.Doris `json:",inline" mapstructure:",squash"`
}
// QueryParam is the query payload QueryData decodes with mapstructure.
type QueryParam struct {
Ref string `json:"ref" mapstructure:"ref"`
Database string `json:"database" mapstructure:"database"`
Table string `json:"table" mapstructure:"table"`
SQL string `json:"sql" mapstructure:"sql"`
Keys datasource.Keys `json:"keys" mapstructure:"keys"`
Limit int `json:"limit" mapstructure:"limit"`
From int64 `json:"from" mapstructure:"from"`
To int64 `json:"to" mapstructure:"to"`
TimeField string `json:"time_field" mapstructure:"time_field"`
TimeFormat string `json:"time_format" mapstructure:"time_format"`
Interval int64 `json:"interval" mapstructure:"interval"` // query time interval (seconds)
Offset int `json:"offset" mapstructure:"offset"` // evaluation delay; the generic delay setting is no longer used
}
// InitClient verifies that an address is configured and that a connection can
// be created (NewConn with an empty second argument). The connection value
// itself is discarded; only the error is reported.
func (d *Doris) InitClient() error {
	if d.Addr == "" {
		return fmt.Errorf("not found doris addr, please check datasource config")
	}

	_, err := d.NewConn(context.TODO(), "")
	return err
}
// Init decodes settings into a brand-new Doris value and returns it together
// with any decode error. The receiver itself is left untouched.
func (d *Doris) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	instance := &Doris{}
	if err := mapstructure.Decode(settings, instance); err != nil {
		return instance, err
	}
	return instance, nil
}
// Validate checks that both the address and the user are set to something
// other than whitespace.
func (d *Doris) Validate(ctx context.Context) error {
	if strings.TrimSpace(d.Addr) == "" {
		return fmt.Errorf("doris addr is invalid, please check datasource setting")
	}
	if strings.TrimSpace(d.User) == "" {
		return fmt.Errorf("doris user is invalid, please check datasource setting")
	}
	return nil
}
// Equal reports whether p is a *Doris carrying the same connection
// settings as d. It is used for caching: a false result means the two
// configurations differ. A p of another type is logged and treated as
// not equal.
func (d *Doris) Equal(p datasource.Datasource) bool {
	other, ok := p.(*Doris)
	if !ok {
		logger.Errorf("unexpected plugin type, expected is doris")
		return false
	}

	switch {
	case d.Addr != other.Addr,
		d.FeAddr != other.FeAddr,
		d.User != other.User,
		d.Password != other.Password,
		d.EnableWrite != other.EnableWrite,
		d.UserWrite != other.UserWrite,
		d.PasswordWrite != other.PasswordWrite,
		d.MaxQueryRows != other.MaxQueryRows,
		d.Timeout != other.Timeout,
		d.MaxIdleConns != other.MaxIdleConns,
		d.MaxOpenConns != other.MaxOpenConns,
		d.ConnMaxLifetime != other.ConnMaxLifetime,
		d.ClusterName != other.ClusterName:
		return false
	}
	return true
}
// MakeLogQuery is a stub for Doris: it ignores its arguments and
// returns (nil, nil).
func (d *Doris) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// MakeTSQuery is a stub for Doris: it ignores its arguments and
// returns (nil, nil).
func (d *Doris) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// QueryMapData is a stub for Doris: it ignores its arguments and
// returns (nil, nil).
func (d *Doris) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
return nil, nil
}
// QueryData runs the SQL carried by query as a time-series query and
// converts every returned series into a models.DataResp tagged with the
// query's Ref.
//
// query is decoded into a QueryParam; keys.valueKey is mandatory. The
// time range is From..To when both are set, otherwise the last Interval
// seconds (default 60) ending now. Offset, when non-zero, shifts both
// bounds into the past. "$__" macros in the SQL are expanded against the
// resulting range before the query is executed.
func (d *Doris) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) {
	param := &QueryParam{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, err
	}
	if param.Keys.ValueKey == "" {
		return nil, fmt.Errorf("valueKey is required")
	}

	// Fall back to a one-minute window when no interval was given.
	if param.Interval == 0 {
		param.Interval = 60
	}

	// Resolve the time range: explicit bounds win, otherwise use the
	// trailing interval ending at the current time.
	from, to := param.From, param.To
	if from == 0 || to == 0 {
		to = time.Now().Unix()
		from = to - param.Interval
	}
	if shift := int64(param.Offset); shift != 0 {
		from -= shift
		to -= shift
	}
	param.From, param.To = from, to

	if strings.Contains(param.SQL, "$__") {
		expanded, err := macros.Macro(param.SQL, param.From, param.To)
		if err != nil {
			return nil, err
		}
		param.SQL = expanded
	}

	series, err := d.QueryTimeseries(ctx, &doris.QueryParam{
		Database: param.Database,
		Sql:      param.SQL,
		Keys: types.Keys{
			ValueKey: param.Keys.ValueKey,
			LabelKey: param.Keys.LabelKey,
			TimeKey:  param.Keys.TimeKey,
			Offset:   param.Offset,
		},
	})
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", param, err)
		return []models.DataResp{}, err
	}

	// Wrap each series in the response type expected by callers.
	data := make([]models.DataResp, 0, len(series))
	for _, s := range series {
		data = append(data, models.DataResp{
			Ref:    param.Ref,
			Metric: s.Metric,
			Values: s.Values,
		})
	}

	logx.Infof(ctx, "req:%+v keys:%+v \n data:%v", param, param.Keys, data)
	return data, nil
}
// QueryLog runs the SQL carried by query as a log query and returns the
// rows together with their count.
//
// When neither From nor To is set but Interval is (the rule-preview
// case), the range becomes the last Interval seconds ending now. A
// non-zero Offset shifts both bounds into the past. "$__" macros in the
// SQL are expanded against the resulting range.
func (d *Doris) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) {
	param := &QueryParam{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, 0, err
	}

	// Rule preview only sends an interval, never From/To.
	if param.From == 0 && param.To == 0 && param.Interval != 0 {
		now := time.Now().Unix()
		param.To = now
		param.From = now - param.Interval
	}
	if shift := int64(param.Offset); shift != 0 {
		param.To -= shift
		param.From -= shift
	}

	if strings.Contains(param.SQL, "$__") {
		expanded, err := macros.Macro(param.SQL, param.From, param.To)
		if err != nil {
			return nil, 0, err
		}
		param.SQL = expanded
	}

	rows, err := d.QueryLogs(ctx, &doris.QueryParam{
		Database: param.Database,
		Sql:      param.SQL,
	})
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", param, err)
		return []interface{}{}, 0, err
	}

	logs := make([]interface{}, 0, len(rows))
	for _, row := range rows {
		logs = append(logs, row)
	}
	return logs, int64(len(logs)), nil
}
// DescribeTable decodes query into a QueryParam and returns the column
// properties of param.Table in param.Database as reported by DescTable.
func (d *Doris) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) {
	var param QueryParam
	if err := mapstructure.Decode(query, &param); err != nil {
		return nil, err
	}
	return d.DescTable(ctx, param.Database, param.Table)
}
================================================
FILE: datasource/es/es.go
================================================
package es
import (
"context"
"encoding/json"
"fmt"
"net"
"net/http"
"net/url"
"reflect"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/ccfos/nightingale/v6/datasource/commons/eslike"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/tlsx"
"github.com/mitchellh/mapstructure"
"github.com/olivere/elastic/v7"
"github.com/ccfos/nightingale/v6/pkg/logx"
)
// ESType is the plugin type name under which the Elasticsearch
// datasource is registered (see init).
const (
ESType = "elasticsearch"
)
// Elasticsearch is the datasource plugin for Elasticsearch. Its fields
// are decoded from the datasource settings map using the "es.*" keys.
type Elasticsearch struct {
// Addr is overwritten with Nodes[0] by InitClient when Nodes is non-empty.
Addr string `json:"es.addr" mapstructure:"es.addr"`
Nodes []string `json:"es.nodes" mapstructure:"es.nodes"`
Timeout int64 `json:"es.timeout" mapstructure:"es.timeout"` // millis; Validate defaults 0 to 60000
Basic BasicAuth `json:"es.basic" mapstructure:"es.basic"`
TLS TLS `json:"es.tls" mapstructure:"es.tls"`
Version string `json:"es.version" mapstructure:"es.version"`
Headers map[string]string `json:"es.headers" mapstructure:"es.headers"`
MinInterval int `json:"es.min_interval" mapstructure:"es.min_interval"` // seconds; Validate raises values below 10 to 10
MaxShard int `json:"es.max_shard" mapstructure:"es.max_shard"` // Validate defaults 0 to 5
ClusterName string `json:"es.cluster_name" mapstructure:"es.cluster_name"`
EnableWrite bool `json:"es.enable_write" mapstructure:"es.enable_write"` // allow write operations
// Client is created by InitClient.
Client *elastic.Client `json:"es.client" mapstructure:"es.client"`
}
// TLS holds the TLS options of an Elasticsearch datasource.
type TLS struct {
// SkipTlsVerify is copied to InsecureSkipVerify of the TLS client
// config built in InitClient.
SkipTlsVerify bool `json:"es.tls.skip_tls_verify" mapstructure:"es.tls.skip_tls_verify"`
}
// BasicAuth holds HTTP basic-auth credentials for an Elasticsearch
// datasource. Validate requires Username and Password when Enable is
// true; InitClient sends credentials whenever Username is non-empty.
type BasicAuth struct {
Enable bool `json:"es.auth.enable" mapstructure:"es.auth.enable"`
Username string `json:"es.user" mapstructure:"es.user"`
Password string `json:"es.password" mapstructure:"es.password"`
}
// init registers a zero-value Elasticsearch plugin under ESType with
// the datasource registry.
func init() {
datasource.RegisterDatasource(ESType, new(Elasticsearch))
}
// Init decodes settings into a fresh Elasticsearch value and returns it
// together with the decode error, if any. The receiver is not modified.
func (e *Elasticsearch) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	ds := &Elasticsearch{}
	if err := mapstructure.Decode(settings, ds); err != nil {
		return ds, err
	}
	return ds, nil
}
// InitClient builds the elastic client for this datasource and stores it
// in e.Client.
//
// The HTTP transport uses e.Timeout (milliseconds) for both dialing and
// waiting for response headers. e.Addr is set to the first node when
// nodes are configured; if it contains "https" a TLS config honouring
// e.TLS.SkipTlsVerify is attached. Basic auth is sent whenever a
// username is configured. Sniffing and health checks are disabled.
func (e *Elasticsearch) InitClient() error {
	timeout := time.Duration(e.Timeout) * time.Millisecond
	dialer := &net.Dialer{Timeout: timeout}
	transport := &http.Transport{
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		ResponseHeaderTimeout: timeout,
	}

	if len(e.Nodes) > 0 {
		e.Addr = e.Nodes[0]
	}

	if strings.Contains(e.Addr, "https") {
		tlsConfig := tlsx.ClientConfig{
			InsecureSkipVerify: e.TLS.SkipTlsVerify,
			UseTLS:             true,
		}
		cfg, err := tlsConfig.TLSConfig()
		if err != nil {
			return err
		}
		transport.TLSClientConfig = cfg
	}

	// Custom headers are stored under the exact keys from the settings.
	headers := make(http.Header, len(e.Headers))
	for name, value := range e.Headers {
		headers[name] = []string{value}
	}

	opts := []elastic.ClientOptionFunc{elastic.SetURL(e.Nodes...)}
	if e.Basic.Username != "" {
		opts = append(opts, elastic.SetBasicAuth(e.Basic.Username, e.Basic.Password))
	}
	opts = append(opts,
		elastic.SetHeaders(headers),
		elastic.SetHttpClient(&http.Client{Transport: transport}),
		elastic.SetSniff(false),
		elastic.SetHealthcheck(false),
	)

	client, err := elastic.NewClient(opts...)
	e.Client = client
	return err
}
// Equal reports whether other is an *Elasticsearch with the same node
// list (order-insensitive), basic-auth credentials, TLS verification
// setting, write flag and custom headers as e.
//
// A datasource of any other type (or a nil *Elasticsearch) is never
// equal; the type assertion is checked so that a mismatch returns false
// instead of panicking, matching the other datasource plugins.
//
// Note: both node slices are sorted in place as part of the comparison.
func (e *Elasticsearch) Equal(other datasource.Datasource) bool {
	o, ok := other.(*Elasticsearch)
	if !ok || o == nil {
		return false
	}

	sort.Strings(e.Nodes)
	sort.Strings(o.Nodes)
	if strings.Join(e.Nodes, ",") != strings.Join(o.Nodes, ",") {
		return false
	}

	if e.Basic.Username != o.Basic.Username {
		return false
	}
	if e.Basic.Password != o.Basic.Password {
		return false
	}
	if e.TLS.SkipTlsVerify != o.TLS.SkipTlsVerify {
		return false
	}
	if e.EnableWrite != o.EnableWrite {
		return false
	}
	return reflect.DeepEqual(e.Headers, o.Headers)
}
// Validate checks the configuration and fills in defaults.
//
// It requires at least one node, each of which must parse as a URL, and
// a username plus password when basic auth is enabled. It then defaults
// MaxShard to 5, raises MinInterval to at least 10 and defaults Timeout
// to 60000. ctx is not used.
func (e *Elasticsearch) Validate(ctx context.Context) (err error) {
	if len(e.Nodes) == 0 {
		return fmt.Errorf("need a valid addr")
	}

	for _, node := range e.Nodes {
		if _, err = url.Parse(node); err != nil {
			return fmt.Errorf("parse addr error: %v", err)
		}
	}

	missingCreds := len(e.Basic.Username) == 0 || len(e.Basic.Password) == 0
	if e.Basic.Enable && missingCreds {
		return fmt.Errorf("need a valid user, password")
	}

	// Apply defaults for unset or too-small values.
	if e.MaxShard == 0 {
		e.MaxShard = 5
	}
	if e.MinInterval < 10 {
		e.MinInterval = 10
	}
	if e.Timeout == 0 {
		e.Timeout = 60000
	}

	return nil
}
// MakeLogQuery delegates to eslike.MakeLogQuery with the same arguments.
func (e *Elasticsearch) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return eslike.MakeLogQuery(ctx, query, eventTags, start, end)
}
// MakeTSQuery delegates to eslike.MakeTSQuery with the same arguments.
func (e *Elasticsearch) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return eslike.MakeTSQuery(ctx, query, eventTags, start, end)
}
// QueryData executes a time-series query through eslike.QueryData,
// supplying a search callback that runs the request on e.Client.
//
// The callback ignores unavailable indices, passes the timeout as
// "<n>s" and caps concurrent shard requests at maxShard.
func (e *Elasticsearch) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) {
	doSearch := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) {
		svc := e.Client.Search().
			Index(indices...).
			IgnoreUnavailable(true).
			Source(source).
			Timeout(fmt.Sprintf("%ds", timeout)).
			MaxConcurrentShardRequests(maxShard)
		return svc.Do(ctx)
	}

	return eslike.QueryData(ctx, queryParam, e.Timeout, e.Version, doSearch)
}
// QueryIndices returns the index names reported by the client.
func (e *Elasticsearch) QueryIndices() ([]string, error) {
	return e.Client.IndexNames()
}
// QueryFields returns the sorted, de-duplicated field names found in the
// field mappings of indexes, excluding fields whose type is rejected by
// eslike.HitFilter.
//
// For Elasticsearch 6 (Version prefixed with "6") the fields are read
// from beneath the "doc" mapping type; otherwise every entry under
// "mappings" is treated as a field.
//
// The mapping response is decoded into untyped maps, so every level is
// type-checked: entries with an unexpected shape are skipped rather than
// causing a panic.
func (e *Elasticsearch) QueryFields(indexes []string) ([]string, error) {
	var fields []string
	result, err := elastic.NewGetFieldMappingService(e.Client).Index(indexes...).IgnoreUnavailable(true).Do(context.Background())
	if err != nil {
		return fields, err
	}

	seen := make(map[string]struct{})
	// addField records name unless its definition is malformed, its type
	// is filtered out, or it has been recorded already.
	addField := func(name string, def interface{}) {
		defMap, ok := def.(map[string]interface{})
		if !ok {
			return
		}
		if eslike.HitFilter(getFieldType(name, defMap)) {
			return
		}
		if _, dup := seen[name]; dup {
			return
		}
		seen[name] = struct{}{}
		fields = append(fields, name)
	}

	isES6 := strings.HasPrefix(e.Version, "6")
	for _, indexMap := range result {
		index, ok := indexMap.(map[string]interface{})
		if !ok {
			continue
		}
		mappings, ok := index["mappings"].(map[string]interface{})
		if !ok {
			continue
		}

		for k, v := range mappings {
			// ES 6 compatibility: fields are nested under the "doc" type.
			if k == "doc" && isES6 {
				docFields, ok := v.(map[string]interface{})
				if !ok {
					continue
				}
				for kk, vv := range docFields {
					addField(kk, vv)
				}
				continue
			}

			// ES 7 layout: k is the field name itself.
			addField(k, v)
		}
	}

	sort.Strings(fields)
	return fields, nil
}
// QueryLog executes a log query through eslike.QueryLog, supplying a
// search callback that runs the request on e.Client.
//
// The callback ignores unavailable indices, caps concurrent shard
// requests at maxShard and passes the timeout as "<n>s". An earlier
// compatibility step that looked up the index fields and requested them
// as docvalue fields is no longer performed.
func (e *Elasticsearch) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) {
	doSearch := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) {
		svc := e.Client.Search().
			Index(indices...).
			IgnoreUnavailable(true).
			MaxConcurrentShardRequests(maxShard).
			Source(source).
			Timeout(fmt.Sprintf("%ds", timeout))
		return svc.Do(ctx)
	}

	return eslike.QueryLog(ctx, queryParam, e.Timeout, e.Version, e.MaxShard, doSearch)
}
// QueryFieldValue returns up to 10000 distinct values of field across
// indexes, optionally restricted by a query-string query.
//
// The values come from a terms aggregation named "distinct". Bucket keys
// are not always strings (numeric and boolean fields yield numeric
// keys), so non-string keys are rendered via key_as_string when the
// server provides it and via fmt otherwise, instead of panicking on a
// failed type assertion.
func (e *Elasticsearch) QueryFieldValue(indexes []string, field string, query string) ([]string, error) {
	var values []string
	search := e.Client.Search().
		IgnoreUnavailable(true).
		Index(indexes...).
		Size(0)

	if query != "" {
		search = search.Query(elastic.NewBoolQuery().Must(elastic.NewQueryStringQuery(query)))
	}
	search = search.Aggregation("distinct", elastic.NewTermsAggregation().Field(field).Size(10000))

	result, err := search.Do(context.Background())
	if err != nil {
		return values, err
	}

	agg, found := result.Aggregations.Terms("distinct")
	if !found {
		return values, nil
	}

	for _, bucket := range agg.Buckets {
		switch key := bucket.Key.(type) {
		case nil:
			// No usable key in this bucket.
			continue
		case string:
			values = append(values, key)
		default:
			if bucket.KeyAsString != nil {
				values = append(values, *bucket.KeyAsString)
			} else {
				values = append(values, fmt.Sprintf("%v", key))
			}
		}
	}

	return values, nil
}
// Test validates the configuration and then tries to reach the server.
//
// After Validate succeeds it requires a non-empty Addr and Version equal
// to "7.10+". It then builds a throw-away client for Addr (with basic
// auth when enabled) and asks the server for its version; any failure is
// reported as an invalid configuration.
func (e *Elasticsearch) Test(ctx context.Context) (err error) {
	if err = e.Validate(ctx); err != nil {
		return err
	}
	if e.Addr == "" {
		return fmt.Errorf("addr is invalid")
	}
	if e.Version != "7.10+" {
		return fmt.Errorf("version must be 7.10+")
	}

	opts := []elastic.ClientOptionFunc{elastic.SetURL(e.Addr)}
	if e.Basic.Enable {
		opts = append(opts, elastic.SetBasicAuth(e.Basic.Username, e.Basic.Password))
	}

	client, clientErr := elastic.NewClient(opts...)
	if clientErr != nil {
		return fmt.Errorf("config is invalid:%v", clientErr)
	}

	if _, versionErr := client.ElasticsearchVersion(e.Addr); versionErr != nil {
		return fmt.Errorf("config is invalid:%v", versionErr)
	}
	return nil
}
// getFieldType extracts the type of field key from one entry of a
// field-mapping response.
//
// m is expected to contain a "mapping" object keyed by field name, each
// value being an object with a "type" string. The field is looked up by
// its full key first; only if that key is absent is the last
// dot-separated segment of key tried instead.
//
// It returns "" when the type cannot be determined, including when any
// level of m has an unexpected shape (previously such input caused a
// type-assertion panic).
func getFieldType(key string, m map[string]interface{}) string {
	mapping, ok := m["mapping"].(map[string]interface{})
	if !ok {
		return ""
	}

	entry, found := mapping[key]
	if !found {
		// Fall back to the leaf name, e.g. "status" for "http.status".
		segments := strings.Split(key, ".")
		entry, found = mapping[segments[len(segments)-1]]
		if !found {
			return ""
		}
	}

	props, ok := entry.(map[string]interface{})
	if !ok {
		return ""
	}
	typ, _ := props["type"].(string)
	return typ
}
// QueryMapData runs query as a log query and returns the _source of the
// matching hits as string maps (every value formatted with "%v").
//
// query is decoded into an eslike.Query whose Interval is widened by 30
// so that a slow previous run does not leave a gap in the covered time
// range. When the query's Limit is greater than zero every hit is
// returned; otherwise only the first decodable hit is. Hits whose source
// cannot be unmarshalled are logged and skipped.
func (e *Elasticsearch) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
	doSearch := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) {
		svc := e.Client.Search().
			Index(indices...).
			IgnoreUnavailable(true).
			Source(source).
			Timeout(fmt.Sprintf("%ds", timeout))
		return svc.Do(ctx)
	}

	param := &eslike.Query{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, err
	}

	// Widen the window to compensate for lag caused by the previous query.
	param.Interval += 30

	hits, _, err := eslike.QueryLog(ctx, param, e.Timeout, e.Version, e.MaxShard, doSearch)
	if err != nil {
		return nil, err
	}

	var result []map[string]string
	for _, item := range hits {
		logx.Debugf(ctx, "query:%v item:%v", query, item)

		hit, ok := item.(*elastic.SearchHit)
		if !ok {
			continue
		}

		source := make(map[string]interface{})
		if err := json.Unmarshal(hit.Source, &source); err != nil {
			logx.Warningf(ctx, "unmarshal source%s error:%v", string(hit.Source), err)
			continue
		}

		// Flatten every source value to its default string form.
		row := make(map[string]string)
		for k, v := range source {
			row[k] = fmt.Sprintf("%v", v)
		}
		result = append(result, row)

		// Without a positive limit only the first row is wanted.
		if param.Limit <= 0 {
			break
		}
	}

	return result, nil
}
================================================
FILE: datasource/mysql/mysql.go
================================================
package mysql
import (
"context"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/ccfos/nightingale/v6/dskit/mysql"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/mitchellh/mapstructure"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/pkg/logx"
)
// MySQLType is the plugin type name under which the MySQL datasource is
// registered (see init).
const (
MySQLType = "mysql"
)
// init registers a zero-value MySQL plugin under MySQLType with the
// datasource registry.
func init() {
datasource.RegisterDatasource(MySQLType, new(MySQL))
}
// MySQL is the datasource plugin for MySQL. It embeds mysql.MySQL from
// dskit; the struct tags flatten the embedded fields when the settings
// are decoded with mapstructure.
type MySQL struct {
mysql.MySQL `json:",inline" mapstructure:",squash"`
}
// QueryParam is the decoded form of a query sent to the MySQL plugin
// (QueryData, QueryLog and DescribeTable decode into it).
type QueryParam struct {
Ref string `json:"ref" mapstructure:"ref"`
Database string `json:"database" mapstructure:"database"`
Table string `json:"table" mapstructure:"table"`
SQL string `json:"sql" mapstructure:"sql"`
Keys datasource.Keys `json:"keys" mapstructure:"keys"`
// From and To are passed to macros.Macro together with SQL when the
// SQL contains "$__".
From int64 `json:"from" mapstructure:"from"`
To int64 `json:"to" mapstructure:"to"`
}
// InitClient checks that at least one shard is configured and then opens
// a connection with an empty database name, returning any connection
// error.
func (m *MySQL) InitClient() error {
	if len(m.Shards) == 0 {
		return fmt.Errorf("not found mysql addr, please check datasource config")
	}

	_, err := m.NewConn(context.TODO(), "")
	return err
}
// Init decodes settings into a fresh MySQL value and returns it together
// with the decode error, if any. The receiver itself is not modified.
func (m *MySQL) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	ds := &MySQL{}
	if err := mapstructure.Decode(settings, ds); err != nil {
		return ds, err
	}
	return ds, nil
}
// Validate requires a first shard whose address and user are both
// non-blank. Only the first shard is inspected; ctx is not used.
func (m *MySQL) Validate(ctx context.Context) error {
	if len(m.Shards) == 0 {
		return fmt.Errorf("mysql addr is invalid, please check datasource setting")
	}

	first := m.Shards[0]
	if strings.TrimSpace(first.Addr) == "" {
		return fmt.Errorf("mysql addr is invalid, please check datasource setting")
	}
	if strings.TrimSpace(first.User) == "" {
		return fmt.Errorf("mysql user is invalid, please check datasource setting")
	}
	return nil
}
// Equal reports whether p is a *MySQL whose first shard has the same
// connection settings as the first shard of m. It is used for caching.
//
// A p of another type is logged and treated as not equal; so is any
// comparison in which either side has no shards.
func (m *MySQL) Equal(p datasource.Datasource) bool {
	other, ok := p.(*MySQL)
	if !ok {
		logger.Errorf("unexpected plugin type, expected is mysql")
		return false
	}
	if len(m.Shards) == 0 || len(other.Shards) == 0 {
		return false
	}

	cur, next := m.Shards[0], other.Shards[0]
	switch {
	case cur.Addr != next.Addr,
		cur.User != next.User,
		cur.Password != next.Password,
		cur.MaxQueryRows != next.MaxQueryRows,
		cur.Timeout != next.Timeout,
		cur.MaxIdleConns != next.MaxIdleConns,
		cur.MaxOpenConns != next.MaxOpenConns,
		cur.ConnMaxLifetime != next.ConnMaxLifetime:
		return false
	}
	return true
}
// MakeLogQuery is a stub for MySQL: it ignores its arguments and
// returns (nil, nil).
func (m *MySQL) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// MakeTSQuery is a stub for MySQL: it ignores its arguments and
// returns (nil, nil).
func (m *MySQL) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// QueryMapData is a stub for MySQL: it ignores its arguments and
// returns (nil, nil).
func (m *MySQL) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
return nil, nil
}
// QueryData runs the SQL carried by query as a time-series query and
// converts every returned series into a models.DataResp tagged with the
// query's Ref.
//
// query is decoded into a QueryParam. "$__" macros in the SQL are
// expanded against From/To first, then keys.valueKey is required. The
// query runs under a timeout taken from the first shard (60 seconds
// when that is zero).
func (m *MySQL) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) {
	param := &QueryParam{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, err
	}

	if strings.Contains(param.SQL, "$__") {
		expanded, err := macros.Macro(param.SQL, param.From, param.To)
		if err != nil {
			return nil, err
		}
		param.SQL = expanded
	}

	if param.Keys.ValueKey == "" {
		return nil, fmt.Errorf("valueKey is required")
	}

	// Bound the query by the shard timeout, defaulting to one minute.
	seconds := m.Shards[0].Timeout
	if seconds == 0 {
		seconds = 60
	}
	queryCtx, cancel := context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
	defer cancel()

	series, err := m.QueryTimeseries(queryCtx, &sqlbase.QueryParam{
		Sql: param.SQL,
		Keys: types.Keys{
			ValueKey: param.Keys.ValueKey,
			LabelKey: param.Keys.LabelKey,
			TimeKey:  param.Keys.TimeKey,
		},
	})
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", param, err)
		return []models.DataResp{}, err
	}

	data := make([]models.DataResp, 0, len(series))
	for _, s := range series {
		data = append(data, models.DataResp{
			Ref:    param.Ref,
			Metric: s.Metric,
			Values: s.Values,
		})
	}
	return data, nil
}
// QueryLog runs the SQL carried by query and returns the resulting rows.
// The reported total is always 0.
//
// "$__" macros in the SQL are expanded against From/To. The query runs
// under a timeout taken from the first shard (60 seconds when that is
// zero).
func (m *MySQL) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) {
	param := &QueryParam{}
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, 0, err
	}

	if strings.Contains(param.SQL, "$__") {
		expanded, err := macros.Macro(param.SQL, param.From, param.To)
		if err != nil {
			return nil, 0, err
		}
		param.SQL = expanded
	}

	// Bound the query by the shard timeout, defaulting to one minute.
	seconds := m.Shards[0].Timeout
	if seconds == 0 {
		seconds = 60
	}
	queryCtx, cancel := context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
	defer cancel()

	rows, err := m.Query(queryCtx, &sqlbase.QueryParam{
		Sql: param.SQL,
	})
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", param, err)
		return []interface{}{}, 0, err
	}

	logs := make([]interface{}, 0, len(rows))
	for _, row := range rows {
		logs = append(logs, row)
	}
	return logs, 0, nil
}
// DescribeTable decodes query into a QueryParam and returns the column
// properties of param.Table in param.Database as reported by DescTable.
func (m *MySQL) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) {
	var param QueryParam
	if err := mapstructure.Decode(query, &param); err != nil {
		return nil, err
	}
	return m.DescTable(ctx, param.Database, param.Table)
}
================================================
FILE: datasource/opensearch/opensearch.go
================================================
package opensearch
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"net/url"
"reflect"
"regexp"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/ccfos/nightingale/v6/datasource/commons/eslike"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/tlsx"
"github.com/mitchellh/mapstructure"
"github.com/olivere/elastic/v7"
oscliv2 "github.com/opensearch-project/opensearch-go/v2"
osapiv2 "github.com/opensearch-project/opensearch-go/v2/opensearchapi"
)
// OpenSearchType is the plugin type name under which the OpenSearch
// datasource is registered (see init).
const (
OpenSearchType = "opensearch"
)
// OpenSearch is the datasource plugin for OpenSearch. Its fields are
// decoded from the datasource settings map using the "os.*" keys.
type OpenSearch struct {
// Addr is overwritten with Nodes[0] by InitClient when Nodes is non-empty.
Addr string `json:"os.addr" mapstructure:"os.addr"`
Nodes []string `json:"os.nodes" mapstructure:"os.nodes"`
Timeout int64 `json:"os.timeout" mapstructure:"os.timeout"` // millis; Validate defaults 0 to 6000
Basic BasicAuth `json:"os.basic" mapstructure:"os.basic"`
TLS TLS `json:"os.tls" mapstructure:"os.tls"`
Version string `json:"os.version" mapstructure:"os.version"` // Validate requires the prefix "2"
Headers map[string]string `json:"os.headers" mapstructure:"os.headers"`
MinInterval int `json:"os.min_interval" mapstructure:"os.min_interval"` // seconds; Validate raises values below 10 to 10
MaxShard int `json:"os.max_shard" mapstructure:"os.max_shard"` // Validate defaults 0 to 5
ClusterName string `json:"os.cluster_name" mapstructure:"os.cluster_name"`
// Client is created by InitClient.
Client *oscliv2.Client `json:"os.client" mapstructure:"os.client"`
}
// TLS holds the TLS options of an OpenSearch datasource.
type TLS struct {
// SkipTlsVerify is copied to InsecureSkipVerify of the TLS client
// config built in InitClient.
SkipTlsVerify bool `json:"os.tls.skip_tls_verify" mapstructure:"os.tls.skip_tls_verify"`
}
// BasicAuth holds HTTP basic-auth credentials for an OpenSearch
// datasource. InitClient and Validate key off Username being non-empty;
// neither of them reads Enable.
type BasicAuth struct {
Enable bool `json:"os.auth.enable" mapstructure:"os.auth.enable"`
Username string `json:"os.user" mapstructure:"os.user"`
Password string `json:"os.password" mapstructure:"os.password"`
}
// init registers a zero-value OpenSearch plugin under OpenSearchType
// with the datasource registry.
func init() {
datasource.RegisterDatasource(OpenSearchType, new(OpenSearch))
}
// Init decodes settings into a fresh OpenSearch value and returns it
// together with the decode error, if any. The receiver is not modified.
func (os *OpenSearch) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	ds := &OpenSearch{}
	if err := mapstructure.Decode(settings, ds); err != nil {
		return ds, err
	}
	return ds, nil
}
// InitClient builds the OpenSearch client for this datasource and stores
// it in os.Client.
//
// The HTTP transport uses os.Timeout (milliseconds) for both dialing and
// waiting for response headers. os.Addr is set to the first node when
// nodes are configured; if it contains "https" a TLS config honouring
// os.TLS.SkipTlsVerify is attached. Credentials are sent whenever a
// username is configured, regardless of the Enable flag.
func (os *OpenSearch) InitClient() error {
	timeout := time.Duration(os.Timeout) * time.Millisecond
	dialer := &net.Dialer{Timeout: timeout}
	transport := &http.Transport{
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		ResponseHeaderTimeout: timeout,
	}

	if len(os.Nodes) > 0 {
		os.Addr = os.Nodes[0]
	}

	if strings.Contains(os.Addr, "https") {
		tlsConfig := tlsx.ClientConfig{
			InsecureSkipVerify: os.TLS.SkipTlsVerify,
			UseTLS:             true,
		}
		cfg, err := tlsConfig.TLSConfig()
		if err != nil {
			return err
		}
		transport.TLSClientConfig = cfg
	}

	// Custom headers are stored under the exact keys from the settings.
	headers := make(http.Header, len(os.Headers))
	for name, value := range os.Headers {
		headers[name] = []string{value}
	}

	cfg := oscliv2.Config{
		Addresses: os.Nodes,
		Transport: transport,
		Header:    headers,
	}

	// Authenticate whenever a username is present; Enable is not consulted.
	if os.Basic.Username != "" {
		cfg.Username = os.Basic.Username
		cfg.Password = os.Basic.Password
	}

	client, err := oscliv2.NewClient(cfg)
	os.Client = client
	return err
}
// Equal reports whether other is an *OpenSearch with the same node list
// (order-insensitive), basic-auth credentials, TLS verification setting,
// timeout and custom headers as os.
//
// A datasource of any other type (or a nil *OpenSearch) is never equal;
// the type assertion is checked so that a mismatch returns false instead
// of panicking, matching the other datasource plugins.
//
// Note: both node slices are sorted in place as part of the comparison.
func (os *OpenSearch) Equal(other datasource.Datasource) bool {
	o, ok := other.(*OpenSearch)
	if !ok || o == nil {
		return false
	}

	sort.Strings(os.Nodes)
	sort.Strings(o.Nodes)
	if strings.Join(os.Nodes, ",") != strings.Join(o.Nodes, ",") {
		return false
	}

	if os.Basic.Username != o.Basic.Username {
		return false
	}
	if os.Basic.Password != o.Basic.Password {
		return false
	}
	if os.TLS.SkipTlsVerify != o.TLS.SkipTlsVerify {
		return false
	}
	if os.Timeout != o.Timeout {
		return false
	}
	return reflect.DeepEqual(os.Headers, o.Headers)
}
// Validate checks the configuration and fills in defaults.
//
// It requires at least one node, each of which must parse as a URL, and
// a password whenever a username is given. It then defaults MaxShard to
// 5, raises MinInterval to at least 10 and defaults Timeout to 6000.
// Finally the version must start with "2". ctx is not used.
func (os *OpenSearch) Validate(ctx context.Context) (err error) {
	if len(os.Nodes) == 0 {
		return fmt.Errorf("need a valid addr")
	}

	for _, node := range os.Nodes {
		if _, err = url.Parse(node); err != nil {
			return fmt.Errorf("parse addr error: %v", err)
		}
	}

	// A username without a password is rejected.
	hasUser := len(os.Basic.Username) > 0
	if hasUser && len(os.Basic.Password) == 0 {
		return fmt.Errorf("password is required when username is provided")
	}

	// Apply defaults for unset or too-small values.
	if os.MaxShard == 0 {
		os.MaxShard = 5
	}
	if os.MinInterval < 10 {
		os.MinInterval = 10
	}
	if os.Timeout == 0 {
		os.Timeout = 6000
	}

	if strings.HasPrefix(os.Version, "2") {
		return nil
	}
	return fmt.Errorf("version must be 2.0+")
}
// MakeLogQuery delegates to eslike.MakeLogQuery with the same arguments.
func (os *OpenSearch) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return eslike.MakeLogQuery(ctx, query, eventTags, start, end)
}
// MakeTSQuery delegates to eslike.MakeTSQuery with the same arguments.
func (os *OpenSearch) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return eslike.MakeTSQuery(ctx, query, eventTags, start, end)
}
// search sends a search request for indices through cli and decodes the
// response into an elastic.SearchResult.
//
// source, when non-nil, is JSON-encoded and sent as the request body.
// timeout, when positive, is the request timeout in seconds. Any
// response outside the 2xx range is returned as an error.
//
// The request body is assigned only when source is non-nil. Storing a
// nil *bytes.Buffer in the io.Reader field would make it a non-nil
// interface wrapping a nil pointer, which downstream code would then try
// to read from.
func search(ctx context.Context, indices []string, source interface{}, timeout int, cli *oscliv2.Client) (*elastic.SearchResult, error) {
	req := osapiv2.SearchRequest{
		Index: indices,
	}

	if source != nil {
		body := new(bytes.Buffer)
		if err := json.NewEncoder(body).Encode(source); err != nil {
			return nil, err
		}
		req.Body = body
	}

	if timeout > 0 {
		req.Timeout = time.Second * time.Duration(timeout)
	}

	resp, err := req.Do(ctx, cli)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("opensearch response not 2xx, resp is %v", resp)
	}

	bs, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	result := new(elastic.SearchResult)
	if err := json.Unmarshal(bs, result); err != nil {
		return nil, err
	}

	return result, nil
}
// QueryData executes a time-series query through eslike.QueryData,
// supplying a callback that forwards to the package-level search
// function with os.Client. The callback's maxShard argument is unused.
func (os *OpenSearch) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) {
	doSearch := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) {
		return search(ctx, indices, source, timeout, os.Client)
	}

	return eslike.QueryData(ctx, queryParam, os.Timeout, os.Version, doSearch)
}
// QueryIndices lists index names by issuing a cat-indices request in
// JSON format and collecting the "index" field of every returned row.
func (os *OpenSearch) QueryIndices() ([]string, error) {
	req := osapiv2.CatIndicesRequest{Format: "json"}

	rsp, err := req.Do(context.Background(), os.Client)
	if err != nil {
		return nil, err
	}
	defer rsp.Body.Close()

	raw, err := io.ReadAll(rsp.Body)
	if err != nil {
		return nil, err
	}

	// Only the index name of each row is of interest.
	var rows []struct {
		Index string `json:"index"`
	}
	if err := json.Unmarshal(raw, &rows); err != nil {
		return nil, err
	}

	var names []string
	for i := range rows {
		names = append(names, rows[i].Index)
	}
	return names, nil
}
// QueryFields returns the sorted field names of one index matching the
// first entry of indices.
//
// It fetches the mappings for indices and picks a single index from the
// response whose mapping is a non-empty object and whose name equals,
// contains, or regex-matches indices[0] (a pattern that fails to compile
// is ignored). The fields of that index are collected with
// propertyMappingRange. When no index qualifies, an empty result is
// returned without error.
func (os *OpenSearch) QueryFields(indices []string) ([]string, error) {
	var fields []string

	req := osapiv2.IndicesGetMappingRequest{Index: indices}
	resp, err := req.Do(context.Background(), os.Client)
	if err != nil {
		return fields, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return fields, err
	}

	result := map[string]interface{}{}
	if err = json.Unmarshal(raw, &result); err != nil {
		return fields, err
	}

	var idx string
	if len(indices) > 0 {
		idx = indices[0]
	}
	indexReg, _ := regexp.Compile(idx)

	// Find one index in the response that corresponds to idx.
	matched := ""
	for name, value := range result {
		mappings, ok := value.(map[string]interface{})
		if !ok || len(mappings) == 0 {
			continue
		}

		byName := name == idx || strings.Contains(name, idx)
		byPattern := indexReg != nil && indexReg.MatchString(name)
		if byName || byPattern {
			matched = name
			break
		}
	}

	if matched == "" {
		return fields, nil
	}

	fields = propertyMappingRange(result[matched], 1)
	sort.Strings(fields)
	return fields, nil
}
func propertyMappingRange(v interface{}, depth int) (fields []string) {
mapping, ok := v.(map[string]interface{})
if !ok {
return
}
if len(mapping) == 0 {
return
}
for key, value := range mapping {
if reflect.TypeOf(value).Kind() == reflect.Map {
valueMap := value.(map[string]interface{})
if prop, found := valueMap["properties"]; found {
subFields := propertyMappingRange(prop, depth+1)
for i := range subFields {
if depth == 1 {
fields = append(fields, subFields[i])
} else {
fields = append(fields, key+"."+subFields[i])
}
}
} else if typ, found := valueMap["type"]; found {
if eslike.HitFilter(typ.(string)) {
continue
}
fields = append(fields, key)
}
}
}
return
}
// QueryLog executes a log query through eslike.QueryLog, supplying a
// callback that forwards to the package-level search function with
// os.Client. A max-shard value of 0 is passed to eslike.QueryLog and the
// callback's maxShard argument is unused.
func (os *OpenSearch) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) {
	doSearch := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) {
		return search(ctx, indices, source, timeout, os.Client)
	}

	return eslike.QueryLog(ctx, queryParam, os.Timeout, os.Version, 0, doSearch)
}
// QueryFieldValue returns up to 10000 distinct values of field across
// indexes, optionally restricted by a query-string query.
//
// The values come from a terms aggregation named "distinct". Bucket keys
// are not always strings (numeric and boolean fields yield numeric
// keys), so non-string keys are rendered via key_as_string when the
// server provides it and via fmt otherwise, instead of panicking on a
// failed type assertion.
func (os *OpenSearch) QueryFieldValue(indexes []string, field string, query string) ([]string, error) {
	var values []string
	source := elastic.NewSearchSource().
		Size(0)

	if query != "" {
		source = source.Query(elastic.NewBoolQuery().Must(elastic.NewQueryStringQuery(query)))
	}
	source = source.Aggregation("distinct", elastic.NewTermsAggregation().Field(field).Size(10000))

	result, err := search(context.Background(), indexes, source, 0, os.Client)
	if err != nil {
		return values, err
	}

	agg, found := result.Aggregations.Terms("distinct")
	if !found {
		return values, nil
	}

	for _, bucket := range agg.Buckets {
		switch key := bucket.Key.(type) {
		case nil:
			// No usable key in this bucket.
			continue
		case string:
			values = append(values, key)
		default:
			if bucket.KeyAsString != nil {
				values = append(values, *bucket.KeyAsString)
			} else {
				values = append(values, fmt.Sprintf("%v", key))
			}
		}
	}

	return values, nil
}
// QueryMapData is a stub for OpenSearch: it ignores its arguments and
// returns (nil, nil).
func (os *OpenSearch) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
return nil, nil
}
================================================
FILE: datasource/postgresql/postgresql.go
================================================
package postgresql
import (
"context"
"fmt"
"regexp"
"strings"
"time"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/ccfos/nightingale/v6/pkg/macros"
"github.com/ccfos/nightingale/v6/dskit/postgres"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/ccfos/nightingale/v6/models"
"github.com/mitchellh/mapstructure"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/pkg/logx"
)
// PostgreSQLType is the plugin type name under which the PostgreSQL
// datasource is registered (see init).
const (
PostgreSQLType = "pgsql"
)
// regx is a case-insensitive pattern matching "from" followed by a
// three-part dotted name (a.b.c). Each part is either a double-quoted
// identifier or a run of letters, digits and underscores, and each is
// captured in its own group.
// NOTE(review): presumably used by the SQL helpers that extract or
// rewrite the database name; its users are not visible here.
var (
regx = `(?i)from\s+((?:"[^"]+"|[a-zA-Z0-9_]+))\.((?:"[^"]+"|[a-zA-Z0-9_]+))\.((?:"[^"]+"|[a-zA-Z0-9_]+))`
)
// init registers a zero-value PostgreSQL plugin under PostgreSQLType
// with the datasource registry.
func init() {
datasource.RegisterDatasource(PostgreSQLType, new(PostgreSQL))
}
// PostgreSQL is the datasource plugin for PostgreSQL. It holds the
// configured shards, decoded from the "pgsql.shards" setting. Apart from
// InitClient, the methods visible in this file operate on Shards[0].
type PostgreSQL struct {
Shards []*postgres.PostgreSQL `json:"pgsql.shards" mapstructure:"pgsql.shards"`
}
// QueryParam is the decoded form of a query sent to the PostgreSQL
// plugin.
type QueryParam struct {
Ref string `json:"ref" mapstructure:"ref"`
// Database, when non-empty, selects the database used by QueryData.
Database string `json:"database" mapstructure:"database"`
Table string `json:"table" mapstructure:"table"`
SQL string `json:"sql" mapstructure:"sql"`
Keys datasource.Keys `json:"keys" mapstructure:"keys"`
// From and To are passed to macros.Macro together with SQL when the
// SQL contains "$__".
From int64 `json:"from" mapstructure:"from"`
To int64 `json:"to" mapstructure:"to"`
}
// InitClient checks that at least one shard is configured and opens a
// connection to the "postgres" database on every shard. It stops at the
// first shard that fails to connect, passes that shard's connection
// handle to sqlbase.CloseDB and returns the error.
func (p *PostgreSQL) InitClient() error {
	if len(p.Shards) == 0 {
		return fmt.Errorf("not found postgresql addr, please check datasource config")
	}

	for _, shard := range p.Shards {
		db, err := shard.NewConn(context.TODO(), "postgres")
		if err == nil {
			continue
		}
		// Release whatever handle came back alongside the error.
		defer sqlbase.CloseDB(db)
		return err
	}
	return nil
}
// Init decodes settings into a fresh PostgreSQL value and returns it
// together with the decode error, if any. The receiver is not modified.
func (p *PostgreSQL) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	ds := &PostgreSQL{}
	if err := mapstructure.Decode(settings, ds); err != nil {
		return ds, err
	}
	return ds, nil
}
// Validate requires a first shard whose address and user are both
// non-blank. Only the first shard is inspected; ctx is not used.
func (p *PostgreSQL) Validate(ctx context.Context) error {
	if len(p.Shards) == 0 {
		return fmt.Errorf("postgresql addr is invalid, please check datasource setting")
	}

	first := p.Shards[0]
	if strings.TrimSpace(first.Addr) == "" {
		return fmt.Errorf("postgresql addr is invalid, please check datasource setting")
	}
	if strings.TrimSpace(first.User) == "" {
		return fmt.Errorf("postgresql user is invalid, please check datasource setting")
	}
	return nil
}
// Equal reports whether d is a *PostgreSQL whose first shard has the
// same connection settings as the first shard of p. It is used for
// caching.
//
// A d of another type is logged and treated as not equal; so is any
// comparison in which either side has no shards.
func (p *PostgreSQL) Equal(d datasource.Datasource) bool {
	other, ok := d.(*PostgreSQL)
	if !ok {
		logger.Errorf("unexpected plugin type, expected is postgresql")
		return false
	}
	if len(p.Shards) == 0 || len(other.Shards) == 0 {
		return false
	}

	cur, next := p.Shards[0], other.Shards[0]
	switch {
	case cur.Addr != next.Addr,
		cur.User != next.User,
		cur.Password != next.Password,
		cur.MaxQueryRows != next.MaxQueryRows,
		cur.Timeout != next.Timeout,
		cur.MaxIdleConns != next.MaxIdleConns,
		cur.MaxOpenConns != next.MaxOpenConns,
		cur.ConnMaxLifetime != next.ConnMaxLifetime:
		return false
	}
	return true
}
// ShowDatabases delegates to the first shard's ShowDatabases, passing an
// empty string as the second argument, and returns its result unchanged.
// NOTE(review): p.Shards[0] is indexed without a length check; presumably
// Validate/InitClient have already run — confirm against callers.
func (p *PostgreSQL) ShowDatabases(ctx context.Context) ([]string, error) {
return p.Shards[0].ShowDatabases(ctx, "")
}
// ShowTables points the first shard at database, asks it for its tables and
// flattens the schema -> tables result into "schema.table" strings.
//
// Note that the shard's DB field is overwritten as a side effect.
func (p *PostgreSQL) ShowTables(ctx context.Context, database string) ([]string, error) {
	p.Shards[0].DB = database

	bySchema, err := p.Shards[0].ShowTables(ctx, "")
	if err != nil {
		return nil, err
	}

	qualified := make([]string, 0, len(bySchema))
	for schema, names := range bySchema {
		for i := range names {
			qualified = append(qualified, schema+"."+names[i])
		}
	}
	return qualified, nil
}
// MakeLogQuery is a no-op for PostgreSQL: it ignores every argument and
// always returns (nil, nil).
func (p *PostgreSQL) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// MakeTSQuery is a no-op for PostgreSQL: it ignores every argument and
// always returns (nil, nil).
func (p *PostgreSQL) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// QueryMapData is a no-op for PostgreSQL: it ignores its arguments and
// always returns (nil, nil).
func (p *PostgreSQL) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
return nil, nil
}
// QueryData runs a time-series SQL query against the first shard.
//
// Steps, in order:
//  1. decode query into a QueryParam;
//  2. quote db.schema.table references and expand "$__" macros using the
//     From/To range;
//  3. select the target database: the explicit Database field if set,
//     otherwise the name parsed out of the (already rewritten) SQL;
//  4. execute with a timeout taken from the shard (60 seconds when unset)
//     and wrap every returned series into a models.DataResp tagged with Ref.
//
// The shard's DB field is overwritten as a side effect. On query failure an
// empty, non-nil slice is returned together with the error.
func (p *PostgreSQL) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) {
	qp := new(QueryParam)
	if err := mapstructure.Decode(query, qp); err != nil {
		return nil, err
	}

	qp.SQL = formatSQLDatabaseNameWithRegex(qp.SQL)
	if strings.Contains(qp.SQL, "$__") {
		expanded, err := macros.Macro(qp.SQL, qp.From, qp.To)
		if err != nil {
			return nil, err
		}
		qp.SQL = expanded
	}

	dbName := qp.Database
	if dbName == "" {
		parsed, err := parseDBName(qp.SQL)
		if err != nil {
			return nil, err
		}
		dbName = parsed
	}
	p.Shards[0].DB = dbName

	timeout := p.Shards[0].Timeout
	if timeout == 0 {
		timeout = 60
	}
	timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
	defer cancel()

	keys := types.Keys{
		ValueKey: qp.Keys.ValueKey,
		LabelKey: qp.Keys.LabelKey,
		TimeKey:  qp.Keys.TimeKey,
	}
	items, err := p.Shards[0].QueryTimeseries(timeoutCtx, &sqlbase.QueryParam{
		Sql:  qp.SQL,
		Keys: keys,
	})
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", qp, err)
		return []models.DataResp{}, err
	}

	// Wrap each returned series, attaching the caller's reference name.
	data := make([]models.DataResp, 0, len(items))
	for i := range items {
		data = append(data, models.DataResp{
			Ref:    qp.Ref,
			Metric: items[i].Metric,
			Values: items[i].Values,
		})
	}

	logx.Infof(ctx, "req:%+v keys:%+v \n data:%v", qp, qp.Keys, data)
	return data, nil
}
// QueryLog runs a raw SQL query against the first shard and returns the rows
// as a slice of opaque values. The reported total is always 0.
//
// Unlike QueryData, the target database is resolved from the SQL *before*
// the db.schema.table references are quoted and "$__" macros are expanded.
// The shard's DB field is overwritten as a side effect. The query runs with
// the shard's timeout (60 seconds when unset). On query failure an empty,
// non-nil slice is returned together with the error.
func (p *PostgreSQL) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) {
	qp := new(QueryParam)
	if err := mapstructure.Decode(query, qp); err != nil {
		return nil, 0, err
	}

	dbName := qp.Database
	if dbName == "" {
		parsed, err := parseDBName(qp.SQL)
		if err != nil {
			return nil, 0, err
		}
		dbName = parsed
	}
	p.Shards[0].DB = dbName

	qp.SQL = formatSQLDatabaseNameWithRegex(qp.SQL)
	if strings.Contains(qp.SQL, "$__") {
		expanded, err := macros.Macro(qp.SQL, qp.From, qp.To)
		if err != nil {
			return nil, 0, err
		}
		qp.SQL = expanded
	}

	timeout := p.Shards[0].Timeout
	if timeout == 0 {
		timeout = 60
	}
	timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
	defer cancel()

	items, err := p.Shards[0].Query(timeoutCtx, &sqlbase.QueryParam{Sql: qp.SQL})
	if err != nil {
		logx.Warningf(ctx, "query:%+v get data err:%v", qp, err)
		return []interface{}{}, 0, err
	}

	// Re-box every row as interface{} for the generic return type.
	logs := make([]interface{}, 0, len(items))
	for i := range items {
		logs = append(logs, items[i])
	}
	return logs, 0, nil
}
// DescribeTable decodes query into a QueryParam, points the first shard at
// the requested database and returns the column properties of the table.
//
// The Table field may be given as "schema.table"; when it splits into exactly
// two dot-separated parts they are used as schema and table, otherwise the
// schema is left empty and the whole string is used as the table name.
func (p *PostgreSQL) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) {
	qp := new(QueryParam)
	if err := mapstructure.Decode(query, qp); err != nil {
		return nil, err
	}
	p.Shards[0].DB = qp.Database

	scheme, table := "", qp.Table
	if parts := strings.Split(qp.Table, "."); len(parts) == 2 {
		scheme, table = parts[0], parts[1]
	}
	return p.Shards[0].DescTable(ctx, scheme, table)
}
// parseDBNameRe is the compiled form of regx. It is built once at package
// initialisation instead of on every parseDBName call.
var parseDBNameRe = regexp.MustCompile(regx)

// parseDBName extracts the database name from the first
// database.schema.table reference found in sql.
//
// The pattern must yield exactly three capture groups (four submatches);
// the first group is taken as the database name and any surrounding double
// quotes are trimmed. An error is returned when no such reference is found.
func parseDBName(sql string) (db string, err error) {
	matches := parseDBNameRe.FindStringSubmatch(sql)
	if len(matches) != 4 {
		return "", fmt.Errorf("no valid table name in format database.schema.table found")
	}
	return strings.Trim(matches[1], `"`), nil
}
// fromThreePartNameRe matches "from dbname.schema.table" (case-insensitive
// keyword, optional whitespace around the dots) and captures the three
// identifiers. It is compiled once at package initialisation.
var fromThreePartNameRe = regexp.MustCompile(`(?i)\bfrom\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\.\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\.\s*([a-zA-Z_][a-zA-Z0-9_]*)`)

// formatSQLDatabaseNameWithRegex rewrites every unquoted
// "from dbname.schema.table" reference in sql to
// `from "dbname"."schema"."table"`.
//
// PostgreSQL folds unquoted identifiers to lower case, so all three parts are
// wrapped in double quotes to preserve their original case. References with
// fewer than three parts, or whose parts are already quoted, are left
// untouched. The matched keyword is always emitted as lower-case "from".
func formatSQLDatabaseNameWithRegex(sql string) string {
	return fromThreePartNameRe.ReplaceAllString(sql, `from "$1"."$2"."$3"`)
}
// selectListRe captures the text between "select" and the first following
// "from" in an already lower-cased statement. It is compiled once at package
// initialisation instead of on every extractColumns call.
var selectListRe = regexp.MustCompile(`select\s+(.*?)\s+from`)

// extractColumns returns the column expressions of the first SELECT list in
// sql, trimmed of surrounding whitespace.
//
// The statement is lower-cased before matching, so the returned expressions
// are lower-case as well. Splitting honours parentheses and quotes (see
// splitColumns). An error is returned when no "select ... from" is found.
func extractColumns(sql string) ([]string, error) {
	// Lower-case the SQL so the keyword match is case-insensitive.
	matches := selectListRe.FindStringSubmatch(strings.ToLower(sql))
	if len(matches) < 2 {
		return nil, fmt.Errorf("no columns found or invalid SQL syntax")
	}

	columns := splitColumns(matches[1])
	for i := range columns {
		columns[i] = strings.TrimSpace(columns[i])
	}
	return columns, nil
}
// splitColumns splits a SELECT list on top-level commas.
//
// A comma separates two columns only when it is outside any parentheses and
// outside any quoted span. Quoted spans are tracked per quote character: a
// span opened by ' is closed only by ', and one opened by " only by ", so the
// other quote character (and any parentheses) inside the span are treated as
// plain text. The pieces are returned untrimmed; a trailing empty piece is
// dropped, and an empty input yields a nil slice.
func splitColumns(columnsString string) []string {
	var (
		columns []string
		current strings.Builder
		depth   int  // parenthesis nesting outside quotes
		quote   rune // active quote character; 0 when outside quotes
	)

	for _, char := range columnsString {
		switch {
		case quote != 0:
			// Inside a quoted span only the matching quote is significant.
			if char == quote {
				quote = 0
			}
		case char == '\'' || char == '"':
			quote = char
		case char == '(':
			depth++
		case char == ')':
			depth--
		case char == ',' && depth == 0:
			columns = append(columns, current.String())
			current.Reset()
			continue // the separator itself is not part of any column
		}
		current.WriteRune(char)
	}

	if current.Len() > 0 {
		columns = append(columns, current.String())
	}
	return columns
}
================================================
FILE: datasource/prom/prom.go
================================================
package prom
// Prometheus holds the settings of a Prometheus datasource. Every field is
// (de)serialised under a JSON key carrying the "prometheus." prefix.
type Prometheus struct {
PrometheusAddr string `json:"prometheus.addr"`
// PrometheusBasic groups a user/password pair (presumably HTTP basic-auth
// credentials — confirm against the client code).
PrometheusBasic struct {
PrometheusUser string `json:"prometheus.user"`
PrometheusPass string `json:"prometheus.password"`
} `json:"prometheus.basic"`
Headers map[string]string `json:"prometheus.headers"`
// NOTE(review): the unit of PrometheusTimeout is not visible here — confirm.
PrometheusTimeout int64 `json:"prometheus.timeout"`
ClusterName string `json:"prometheus.cluster_name"`
WriteAddr string `json:"prometheus.write_addr"`
TsdbType string `json:"prometheus.tsdb_type"`
InternalAddr string `json:"prometheus.internal_addr"`
}
================================================
FILE: datasource/tdengine/tdengine.go
================================================
package tdengine
import (
"context"
"encoding/json"
"fmt"
"reflect"
"strconv"
"strings"
"time"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/datasource"
td "github.com/ccfos/nightingale/v6/dskit/tdengine"
"github.com/ccfos/nightingale/v6/models"
"github.com/mitchellh/mapstructure"
)
const (
// TDEngineType is the name under which the TDengine datasource is
// registered (see init).
TDEngineType = "tdengine"
)
// TDengine is the datasource plugin type for TDengine. It embeds
// td.Tdengine; the struct tags ask for the embedded fields to be inlined
// (JSON) / squashed (mapstructure) into this level.
type TDengine struct {
td.Tdengine `json:",inline" mapstructure:",squash"`
}
// TdengineQuery is the query parameter accepted by Query and QueryLog.
// From and To are substituted (single-quoted) for the "$from" and "$to"
// placeholders of Query; Interval is substituted as "<n>s" for "$interval".
type TdengineQuery struct {
From string `json:"from"`
Interval int64 `json:"interval"`
Keys Keys `json:"keys"`
Query string `json:"query"` // query statement
Ref string `json:"ref"` // variable (reference) name
To string `json:"to"`
}
// Keys selects which result columns ConvertToTStData treats as labels and
// which as metrics.
type Keys struct {
LabelKey string `json:"labelKey"` // multiple values are separated by spaces
MetricKey string `json:"metricKey"` // multiple values are separated by spaces
TimeFormat string `json:"timeFormat"`
}
// init registers a TDengine prototype under TDEngineType with the
// datasource registry.
func init() {
datasource.RegisterDatasource(TDEngineType, new(TDengine))
}
// Init decodes settings into a freshly allocated TDengine value using
// mapstructure and returns it together with the decode error (if any).
// The receiver itself is not modified.
func (td *TDengine) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	decoded := &TDengine{}
	if err := mapstructure.Decode(settings, decoded); err != nil {
		// The partially decoded value is still returned, as before.
		return decoded, err
	}
	return decoded, nil
}
// InitClient calls InitCli on the embedded client and always returns nil.
func (td *TDengine) InitClient() error {
td.InitCli()
return nil
}
// Equal reports whether other is a *TDengine with the same connection
// settings as the receiver: address, basic-auth credentials, token,
// timeouts, per-host idle connection limit and headers.
//
// Basic-auth is compared by presence as well as by content: if only one side
// has Basic set the two configurations are considered different, so adding
// or removing credentials is detected.
func (td *TDengine) Equal(other datasource.Datasource) bool {
	otherTD, ok := other.(*TDengine)
	if !ok {
		return false
	}

	if td.Addr != otherTD.Addr {
		return false
	}

	// One side with credentials and one without is a real difference.
	if (td.Basic == nil) != (otherTD.Basic == nil) {
		return false
	}
	if td.Basic != nil {
		if td.Basic.User != otherTD.Basic.User {
			return false
		}
		if td.Basic.Password != otherTD.Basic.Password {
			return false
		}
	}

	if td.Token != otherTD.Token {
		return false
	}
	if td.Timeout != otherTD.Timeout {
		return false
	}
	if td.DialTimeout != otherTD.DialTimeout {
		return false
	}
	if td.MaxIdleConnsPerHost != otherTD.MaxIdleConnsPerHost {
		return false
	}

	if len(td.Headers) != len(otherTD.Headers) {
		return false
	}
	for k, v := range td.Headers {
		if otherV, ok := otherTD.Headers[k]; !ok || v != otherV {
			return false
		}
	}

	return true
}
// Validate performs no checks for TDengine and always returns nil.
func (td *TDengine) Validate(ctx context.Context) (err error) {
return nil
}
// MakeLogQuery is a no-op for TDengine: it ignores every argument and always
// returns (nil, nil).
func (td *TDengine) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// MakeTSQuery is a no-op for TDengine: it ignores every argument and always
// returns (nil, nil).
func (td *TDengine) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
return nil, nil
}
// QueryData runs a time-series query by delegating to Query with a delay
// of 0 seconds.
func (td *TDengine) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) {
return td.Query(ctx, queryParam, 0)
}
// QueryLog runs a table-style query and returns each row as a
// column-name -> value map, plus the number of rows returned.
//
// queryParam is converted to a TdengineQuery through a JSON round trip.
// Interval defaults to 60 (seconds). When From is empty the window
// [now-Interval, now] is used, formatted as RFC 3339 in UTC. The
// placeholders "$from", "$to" and "$interval" in the statement are replaced
// before execution.
//
// If the statement does not mention "limit" in any letter case, " limit 200"
// is appended; the check is case-insensitive so that a statement already
// carrying "LIMIT n" is not given a second limit clause.
func (td *TDengine) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) {
	b, err := json.Marshal(queryParam)
	if err != nil {
		return nil, 0, err
	}
	var q TdengineQuery
	if err = json.Unmarshal(b, &q); err != nil {
		return nil, 0, err
	}

	if q.Interval == 0 {
		q.Interval = 60
	}

	if q.From == "" {
		// e.g. 2023-09-21T05:37:30Z
		to := time.Now().Unix()
		q.To = time.Unix(to, 0).UTC().Format(time.RFC3339)

		from := to - q.Interval
		q.From = time.Unix(from, 0).UTC().Format(time.RFC3339)
	}

	replacements := map[string]string{
		"$from":     fmt.Sprintf("'%s'", q.From),
		"$to":       fmt.Sprintf("'%s'", q.To),
		"$interval": fmt.Sprintf("%ds", q.Interval),
	}
	for key, val := range replacements {
		q.Query = strings.ReplaceAll(q.Query, key, val)
	}

	// Cap unbounded queries; accept the keyword in any letter case.
	if !strings.Contains(strings.ToLower(q.Query), "limit") {
		q.Query = q.Query + " limit 200"
	}

	data, err := td.QueryTable(q.Query)
	if err != nil {
		return nil, 0, err
	}

	return ConvertToTable(data), int64(len(data.Data)), nil
}
// QueryMapData is a no-op for TDengine: it ignores its arguments and always
// returns (nil, nil).
func (td *TDengine) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
return nil, nil
}
// Query executes a time-series statement and converts the table response
// into series via ConvertToTStData.
//
// query is converted to a TdengineQuery through a JSON round trip. Interval
// defaults to 60 (seconds). When From is empty, the window ends at
// now-delay[0] seconds (delay defaults to 0) and spans Interval seconds,
// formatted as RFC 3339 in UTC. The placeholders "$from", "$to" and
// "$interval" in the statement are replaced before execution.
func (td *TDengine) Query(ctx context.Context, query interface{}, delay ...int) ([]models.DataResp, error) {
	raw, err := json.Marshal(query)
	if err != nil {
		return nil, err
	}

	var q TdengineQuery
	if err = json.Unmarshal(raw, &q); err != nil {
		return nil, err
	}

	if q.Interval == 0 {
		q.Interval = 60
	}

	var delaySec int
	if len(delay) > 0 {
		delaySec = delay[0]
	}

	if q.From == "" {
		// e.g. 2023-09-21T05:37:30Z
		end := time.Now().Unix() - int64(delaySec)
		begin := end - q.Interval
		q.To = time.Unix(end, 0).UTC().Format(time.RFC3339)
		q.From = time.Unix(begin, 0).UTC().Format(time.RFC3339)
	}

	for placeholder, value := range map[string]string{
		"$from":     fmt.Sprintf("'%s'", q.From),
		"$to":       fmt.Sprintf("'%s'", q.To),
		"$interval": fmt.Sprintf("%ds", q.Interval),
	} {
		q.Query = strings.ReplaceAll(q.Query, placeholder, value)
	}

	data, err := td.QueryTable(q.Query)
	if err != nil {
		return nil, err
	}
	logx.Debugf(ctx, "tdengine query:%s result: %+v", q.Query, data)

	return ConvertToTStData(data, q.Keys, q.Ref)
}
// tdColumnTypeName normalises the type entry of a column-meta record to its
// string name. A float64 is treated as a v2 numeric type code (unknown codes
// map to "UNKNOWN"), a string is returned as is (v3); any other
// representation is rejected.
func tdColumnTypeName(raw interface{}) (string, bool) {
	switch t := raw.(type) {
	case float64:
		switch int(t) {
		case 1:
			return "BOOL", true
		case 2:
			return "TINYINT", true
		case 3:
			return "SMALLINT", true
		case 4:
			return "INT", true
		case 5:
			return "BIGINT", true
		case 6:
			return "FLOAT", true
		case 7:
			return "DOUBLE", true
		case 8:
			return "BINARY", true
		case 9:
			return "TIMESTAMP", true
		case 10:
			return "NCHAR", true
		default:
			return "UNKNOWN", true
		}
	case string:
		return t, true
	default:
		return "", false
	}
}

// tdTimestampText renders a timestamp cell as text for parseTimeString.
// Strings pass through; a float64 (JSON number) is rendered as an integer so
// that parseTimeString's digit-count handling of Unix timestamps applies.
func tdTimestampText(cell interface{}) (string, bool) {
	switch ts := cell.(type) {
	case string:
		return ts, true
	case float64:
		return strconv.FormatInt(int64(ts), 10), true
	default:
		return "", false
	}
}

// tdLabelText renders a label cell as text: nil (NULL) becomes the empty
// string, strings pass through and everything else is formatted with %v.
func tdLabelText(cell interface{}) string {
	switch v := cell.(type) {
	case nil:
		return ""
	case string:
		return v
	default:
		return fmt.Sprint(v)
	}
}

// ConvertToTStData converts a TDengine table response into time series.
//
// Columns are classified by their declared type: the TIMESTAMP column
// supplies the sample time (the last one wins if there are several), numeric
// and BOOL columns become metrics, and every other column becomes a label.
// key.MetricKey / key.LabelKey are optional space-separated allow-lists; an
// empty list accepts every column of that class. Each (metric name, label
// set) pair forms one series whose samples are [unix seconds, value]; ref is
// copied onto every series.
//
// Malformed input is skipped with a warning rather than causing a panic:
// column-meta records that are too short or have a non-string name, rows
// shorter than the referenced column index, timestamp cells that are neither
// string nor number, and values that cannot be converted to float64. NULL or
// non-string label cells are rendered as text instead of being
// type-asserted.
//
// An error is returned only when the response has no TIMESTAMP column. The
// order of the returned series is unspecified.
func ConvertToTStData(src td.APIResponse, key Keys, ref string) ([]models.DataResp, error) {
	// Optional allow-lists; empty means "accept every column of that class".
	metricFilter := make(map[string]struct{})
	if key.MetricKey != "" {
		for _, name := range strings.Split(key.MetricKey, " ") {
			metricFilter[name] = struct{}{}
		}
	}
	labelFilter := make(map[string]struct{})
	if key.LabelKey != "" {
		for _, name := range strings.Split(key.LabelKey, " ") {
			labelFilter[name] = struct{}{}
		}
	}

	metricIdxMap := make(map[string]int)
	labelIdxMap := make(map[string]int)
	tsIdx := -1

	for colIndex, colData := range src.ColumnMeta {
		if len(colData) < 2 {
			logger.Warningf("unexpected column meta format: %v", colData)
			continue
		}
		colName, ok := colData[0].(string)
		if !ok {
			logger.Warningf("unexpected column name format: %v", colData[0])
			continue
		}
		// v2 reports numeric type codes, v3 reports type names.
		colType, ok := tdColumnTypeName(colData[1])
		if !ok {
			logger.Warningf("unexpected column type format: %v", colData[1])
			continue
		}

		switch colType {
		case "TIMESTAMP":
			tsIdx = colIndex
		case "BIGINT", "INT", "INT UNSIGNED", "BIGINT UNSIGNED", "FLOAT", "DOUBLE",
			"SMALLINT", "SMALLINT UNSIGNED", "TINYINT", "TINYINT UNSIGNED", "BOOL":
			if _, wanted := metricFilter[colName]; len(metricFilter) == 0 || wanted {
				metricIdxMap[colName] = colIndex
			}
		default:
			if _, wanted := labelFilter[colName]; len(labelFilter) == 0 || wanted {
				labelIdxMap[colName] = colIndex
			}
		}
	}

	if tsIdx == -1 {
		return nil, fmt.Errorf("timestamp column not found, please check your query")
	}

	var result []models.DataResp
	if len(metricIdxMap) == 0 {
		// No metric columns: nothing can be emitted.
		return result, nil
	}

	series := make(map[string]*models.DataResp)
	for _, row := range src.Data {
		if tsIdx >= len(row) {
			logger.Warningf("parse %v timestamp failed: row has no timestamp column", row)
			continue
		}
		tsText, ok := tdTimestampText(row[tsIdx])
		if !ok {
			logger.Warningf("parse %v timestamp failed: unsupported type %T", row, row[tsIdx])
			continue
		}
		// e.g. 2022-06-29T05:52:16.603Z -> unix timestamp
		t, err := parseTimeString(tsText)
		if err != nil {
			logger.Warningf("parse %v timestamp failed: %v", row, err)
			continue
		}
		timestamp := float64(t.Unix())

		// The label set is identical for every metric of the row.
		labels := make(model.Metric, len(labelIdxMap)+1)
		for labelName, labelIdx := range labelIdxMap {
			if labelIdx >= len(row) {
				continue
			}
			labels[model.LabelName(labelName)] = model.LabelValue(tdLabelText(row[labelIdx]))
		}

		for metricName, metricIdx := range metricIdxMap {
			if metricIdx >= len(row) {
				logger.Warningf("parse %v value failed: row has no column %d", row, metricIdx)
				continue
			}
			value, err := interfaceToFloat64(row[metricIdx])
			if err != nil {
				logger.Warningf("parse %v value failed: %v", row, err)
				continue
			}

			metric := make(model.Metric, len(labels)+1)
			for name, val := range labels {
				metric[name] = val
			}
			metric[model.MetricNameLabel] = model.LabelValue(metricName)

			seriesKey := metric.String()
			if existing, found := series[seriesKey]; found {
				existing.Values = append(existing.Values, []float64{timestamp, value})
				continue
			}
			series[seriesKey] = &models.DataResp{
				Ref:    ref,
				Metric: metric,
				Values: [][]float64{
					{timestamp, value},
				},
			}
		}
	}

	for _, s := range series {
		result = append(result, *s)
	}

	return result, nil
}
// interfaceToFloat64 converts a dynamically typed value to float64.
//
// Supported kinds: floats, signed and unsigned integers, strings (parsed with
// strconv.ParseFloat) and booleans (true -> 1, false -> 0). The conversion is
// driven by the value's reflect.Kind, so named types such as json.Number
// (kind String) are handled the same way as their underlying type.
//
// nil and every other kind yield an "unsupported type" error; an unparsable
// string yields the strconv error.
func interfaceToFloat64(input interface{}) (float64, error) {
	if input == nil {
		return 0, fmt.Errorf("unsupported type: %T", input)
	}

	// Read through reflect.Value rather than type-asserting to the built-in
	// type: an assertion panics for named types that merely share the kind.
	v := reflect.ValueOf(input)
	switch v.Kind() {
	case reflect.Float32, reflect.Float64:
		return v.Float(), nil
	case reflect.Int, reflect.Int32, reflect.Int64, reflect.Int8, reflect.Int16:
		return float64(v.Int()), nil
	case reflect.Uint, reflect.Uint32, reflect.Uint64, reflect.Uint8, reflect.Uint16:
		return float64(v.Uint()), nil
	case reflect.String:
		return strconv.ParseFloat(v.String(), 64)
	case reflect.Bool:
		if v.Bool() {
			return 1.0, nil
		}
		return 0.0, nil
	default:
		return 0, fmt.Errorf("unsupported type: %T", input)
	}
}
// parseTimeString parses ts by trying a fixed list of layouts in order and
// returning the first successful result.
//
// If no layout matches and ts is a base-10 integer, it is interpreted as a
// Unix timestamp whose unit is inferred from the number of characters:
// 10 -> seconds, 13 -> milliseconds, 16 -> microseconds, 19 -> nanoseconds.
// Otherwise an error wrapping the last layout failure is returned.
func parseTimeString(ts string) (time.Time, error) {
	// Order matters: the first layout that parses wins.
	layouts := [...]string{
		// standard library layouts
		time.Layout,      // "01/02 03:04:05PM '06 -0700"
		time.ANSIC,       // "Mon Jan _2 15:04:05 2006"
		time.UnixDate,    // "Mon Jan _2 15:04:05 MST 2006"
		time.RubyDate,    // "Mon Jan 02 15:04:05 -0700 2006"
		time.RFC822,      // "02 Jan 06 15:04 MST"
		time.RFC822Z,     // "02 Jan 06 15:04 -0700"
		time.RFC850,      // "Monday, 02-Jan-06 15:04:05 MST"
		time.RFC1123,     // "Mon, 02 Jan 2006 15:04:05 MST"
		time.RFC1123Z,    // "Mon, 02 Jan 2006 15:04:05 -0700"
		time.RFC3339,     // "2006-01-02T15:04:05Z07:00"
		time.RFC3339Nano, // "2006-01-02T15:04:05.999999999Z07:00"
		time.Kitchen,     // "3:04PM"

		// handy timestamp layouts
		time.Stamp,      // "Jan _2 15:04:05"
		time.StampMilli, // "Jan _2 15:04:05.000"
		time.StampMicro, // "Jan _2 15:04:05.000000"
		time.StampNano,  // "Jan _2 15:04:05.000000000"
		time.DateTime,   // "2006-01-02 15:04:05"
		time.DateOnly,   // "2006-01-02"
		time.TimeOnly,   // "15:04:05"

		// common custom layouts
		"2006-01-02T15:04:05", // ISO format without a zone
		"2006-01-02T15:04:05.000Z",
		"2006-01-02T15:04:05Z",
		"2006-01-02 15:04:05.999999999", // nanoseconds
		"2006-01-02 15:04:05.999999",    // microseconds
		"2006-01-02 15:04:05.999",       // milliseconds
		"2006/01/02",
		"20060102",
		"01/02/2006",
		"2006年01月02日",
		"2006年01月02日 15:04:05",
	}

	var lastErr error
	for i := 0; i < len(layouts); i++ {
		parsed, err := time.Parse(layouts[i], ts)
		if err != nil {
			lastErr = err
			continue
		}
		return parsed, nil
	}

	// Fall back to a numeric Unix timestamp; the unit follows from the width.
	if n, err := strconv.ParseInt(ts, 10, 64); err == nil {
		switch len(ts) {
		case 10: // seconds
			return time.Unix(n, 0), nil
		case 13: // milliseconds
			return time.Unix(n/1000, (n%1000)*1000000), nil
		case 16: // microseconds
			return time.Unix(n/1000000, (n%1000000)*1000), nil
		case 19: // nanoseconds
			return time.Unix(n/1000000000, n%1000000000), nil
		}
	}

	return time.Time{}, fmt.Errorf("failed to parse time with any format: %v", lastErr)
}
// ConvertToTable turns every data row of src into a map keyed by column name
// (the first entry of the matching column-meta record) and returns the maps
// in row order. A response without rows yields a nil slice.
func ConvertToTable(src td.APIResponse) []interface{} {
	var rows []interface{}
	for _, row := range src.Data {
		record := make(map[string]interface{})
		for col, cell := range row {
			name := src.ColumnMeta[col][0].(string)
			record[name] = cell
		}
		rows = append(rows, record)
	}
	return rows
}
================================================
FILE: datasource/victorialogs/victorialogs.go
================================================
package victorialogs
import (
"context"
"fmt"
"net/url"
"reflect"
"strconv"
"time"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/ccfos/nightingale/v6/dskit/victorialogs"
"github.com/ccfos/nightingale/v6/models"
"github.com/mitchellh/mapstructure"
"github.com/prometheus/common/model"
)
const (
// VictoriaLogsType is the name under which the VictoriaLogs datasource is
// registered (see init).
VictoriaLogsType = "victorialogs"
)
// VictoriaLogs is the datasource implementation for VictoriaLogs. It embeds
// victorialogs.VictoriaLogs; the struct tags ask for the embedded fields to
// be inlined (JSON) / squashed (mapstructure) into this level.
type VictoriaLogs struct {
victorialogs.VictoriaLogs `json:",inline" mapstructure:",squash"`
}
// Query carries the parameters of a VictoriaLogs query.
type Query struct {
	Query string `json:"query" mapstructure:"query"` // LogsQL query statement
	Start int64  `json:"start" mapstructure:"start"` // start time (seconds)
	End   int64  `json:"end" mapstructure:"end"`     // end time (seconds)
	Time  int64  `json:"time" mapstructure:"time"`   // single point in time (seconds) - used for alerting
	Step  string `json:"step" mapstructure:"step"`   // step, e.g. "1m", "5m"
	Limit int    `json:"limit" mapstructure:"limit"` // maximum number of results to return
	Ref   string `json:"ref" mapstructure:"ref"`     // variable reference name (e.g. A, B)
}

// IsInstantQuery reports whether the query targets a single point in time
// (the alerting scenario): either Time is set, or Start and End are equal
// and non-negative. Note that a zero-valued Query therefore counts as an
// instant query.
func (q *Query) IsInstantQuery() bool {
	if q.Time > 0 {
		return true
	}
	return q.Start >= 0 && q.Start == q.End
}
// init registers a VictoriaLogs prototype under VictoriaLogsType with the
// datasource registry.
func init() {
datasource.RegisterDatasource(VictoriaLogsType, new(VictoriaLogs))
}
// Init decodes settings into a freshly allocated VictoriaLogs value using
// mapstructure and returns it together with the decode error (if any).
// The receiver itself is not modified.
func (vl *VictoriaLogs) Init(settings map[string]interface{}) (datasource.Datasource, error) {
	decoded := &VictoriaLogs{}
	if err := mapstructure.Decode(settings, decoded); err != nil {
		// The partially decoded value is still returned, as before.
		return decoded, err
	}
	return decoded, nil
}
// InitClient initialises the HTTP client of the embedded VictoriaLogs value,
// wrapping any failure with context.
func (vl *VictoriaLogs) InitClient() error {
	err := vl.InitHTTPClient()
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to init victorialogs http client: %w", err)
}
// Validate checks the datasource settings and fills in defaults.
//
// It requires a non-empty address that url.Parse accepts, and rejects
// configurations where only one of username/password is provided. As a side
// effect, a zero Timeout is set to 10000 and a zero MaxQueryRows to 1000.
func (vl *VictoriaLogs) Validate(ctx context.Context) error {
	if vl.VictorialogsAddr == "" {
		return fmt.Errorf("victorialogs.addr is required")
	}

	// The address must at least be a parseable URL.
	if _, err := url.Parse(vl.VictorialogsAddr); err != nil {
		return fmt.Errorf("invalid victorialogs.addr: %w", err)
	}

	// Username and password must be provided together or not at all.
	hasUser := vl.VictorialogsBasic.VictorialogsUser != ""
	hasPass := vl.VictorialogsBasic.VictorialogsPass != ""
	if hasUser != hasPass {
		return fmt.Errorf("both username and password must be provided")
	}

	// Defaults.
	if vl.Timeout == 0 {
		vl.Timeout = 10000 // 10 seconds by default
	}
	if vl.MaxQueryRows == 0 {
		vl.MaxQueryRows = 1000
	}
	return nil
}
// Equal reports whether other is a *VictoriaLogs with the same address,
// basic-auth credentials, TLS-verification flag, timeout and headers as the
// receiver. Other settings are not compared.
func (vl *VictoriaLogs) Equal(other datasource.Datasource) bool {
	o, ok := other.(*VictoriaLogs)
	if !ok {
		return false
	}

	switch {
	case vl.VictorialogsAddr != o.VictorialogsAddr,
		vl.VictorialogsBasic.VictorialogsUser != o.VictorialogsBasic.VictorialogsUser,
		vl.VictorialogsBasic.VictorialogsPass != o.VictorialogsBasic.VictorialogsPass,
		vl.VictorialogsTls.SkipTlsVerify != o.VictorialogsTls.SkipTlsVerify,
		vl.Timeout != o.Timeout:
		return false
	}
	return reflect.DeepEqual(vl.Headers, o.Headers)
}
// QueryLog runs a log query and returns the matching entries together with
// the total hit count.
//
// queryParam is decoded into a Query. The entries come from Query; the total
// comes from a separate HitsLogs call over the same statement and time
// range. If HitsLogs fails, the number of entries actually returned is used
// as the total instead of failing the whole request.
func (vl *VictoriaLogs) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) {
	param := new(Query)
	if err := mapstructure.Decode(queryParam, param); err != nil {
		return nil, 0, fmt.Errorf("decode query param failed: %w", err)
	}

	logs, err := vl.Query(ctx, param.Query, param.Start, param.End, param.Limit)
	if err != nil {
		return nil, 0, err
	}

	// Re-box every entry as interface{} for the generic return type.
	result := make([]interface{}, 0, len(logs))
	for i := range logs {
		result = append(result, logs[i])
	}

	// Ask HitsLogs for the real total; fall back to the page size on error.
	total, hitsErr := vl.HitsLogs(ctx, param.Query, param.Start, param.End)
	if hitsErr != nil {
		total = int64(len(logs))
	}

	return result, total, nil
}
// QueryData runs a metric (stats) query. queryParam is decoded into a Query;
// instant queries (see Query.IsInstantQuery) are served by queryDataInstant,
// everything else by queryDataRange.
func (vl *VictoriaLogs) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) {
	param := new(Query)
	if err := mapstructure.Decode(queryParam, param); err != nil {
		return nil, fmt.Errorf("decode query param failed: %w", err)
	}

	// Pick the API that matches the query shape.
	if !param.IsInstantQuery() {
		return vl.queryDataRange(ctx, param)
	}
	return vl.queryDataInstant(ctx, param)
}
// queryDataInstant serves the alerting scenario by calling StatsQuery
// (/select/logsql/stats_query) at a single point in time.
//
// The evaluation time is param.Time if set, otherwise param.End, otherwise
// the current time.
func (vl *VictoriaLogs) queryDataInstant(ctx context.Context, param *Query) ([]models.DataResp, error) {
	var at int64
	switch {
	case param.Time != 0:
		at = param.Time
	case param.End != 0:
		// No explicit time: use the end of the range as the query point.
		at = param.End
	default:
		at = time.Now().Unix()
	}

	result, err := vl.StatsQuery(ctx, param.Query, at)
	if err != nil {
		return nil, err
	}
	return convertPrometheusInstantToDataResp(result, param.Ref), nil
}
// queryDataRange serves the charting scenario by calling StatsQueryRange
// (/select/logsql/stats_query_range) over [param.Start, param.End].
//
// When no step is given, one is derived from the range length: "1m" for up
// to one hour, "5m" for up to one day and "1h" beyond that.
func (vl *VictoriaLogs) queryDataRange(ctx context.Context, param *Query) ([]models.DataResp, error) {
	step := param.Step
	if step == "" {
		switch span := param.End - param.Start; {
		case span <= 3600:
			step = "1m" // within 1 hour: 1-minute step
		case span <= 86400:
			step = "5m" // within 1 day: 5-minute step
		default:
			step = "1h" // more than 1 day: 1-hour step
		}
	}

	result, err := vl.StatsQueryRange(ctx, param.Query, param.Start, param.End, step)
	if err != nil {
		return nil, err
	}
	return convertPrometheusRangeToDataResp(result, param.Ref), nil
}
// convertPrometheusInstantToDataResp converts a Prometheus-style instant
// query response into DataResp values, one per result item, each tagged
// with ref.
//
// An item's sample is expected as [timestamp(float64), value(string)]. The
// sample is attached only when it has exactly two elements of those types
// and the value parses as a float; otherwise the series is still returned,
// with its labels but without Values. This avoids both a panic on an
// unexpected payload shape and reporting an unparsable value as 0.
// A nil resp yields nil.
func convertPrometheusInstantToDataResp(resp *victorialogs.PrometheusResponse, ref string) []models.DataResp {
	if resp == nil {
		return nil
	}

	var dataResps []models.DataResp
	for _, item := range resp.Data.Result {
		dataResp := models.DataResp{
			Ref: ref,
		}

		// Copy the labels.
		dataResp.Metric = make(model.Metric)
		for k, v := range item.Metric {
			dataResp.Metric[model.LabelName(k)] = model.LabelValue(v)
		}

		if len(item.Value) == 2 {
			// [timestamp, value]
			timestamp, tsOK := item.Value[0].(float64)
			raw, rawOK := item.Value[1].(string)
			if tsOK && rawOK {
				if value, err := strconv.ParseFloat(raw, 64); err == nil {
					dataResp.Values = [][]float64{
						{timestamp, value},
					}
				}
			}
		}

		dataResps = append(dataResps, dataResp)
	}

	return dataResps
}
// convertPrometheusRangeToDataResp converts a Prometheus-style range query
// response into DataResp values, one per result item, each tagged with ref.
//
// Every sample is expected as [timestamp(float64), value(string)]. Samples
// that do not have exactly two elements of those types, or whose value does
// not parse as a float, are skipped. This avoids both a panic on an
// unexpected payload shape and reporting an unparsable value as 0.
// A nil resp yields nil.
func convertPrometheusRangeToDataResp(resp *victorialogs.PrometheusResponse, ref string) []models.DataResp {
	if resp == nil {
		return nil
	}

	var dataResps []models.DataResp
	for _, item := range resp.Data.Result {
		dataResp := models.DataResp{
			Ref: ref,
		}

		// Copy the labels.
		dataResp.Metric = make(model.Metric)
		for k, v := range item.Metric {
			dataResp.Metric[model.LabelName(k)] = model.LabelValue(v)
		}

		var values [][]float64
		for _, v := range item.Values {
			if len(v) != 2 {
				continue
			}
			timestamp, tsOK := v[0].(float64)
			raw, rawOK := v[1].(string)
			if !tsOK || !rawOK {
				continue
			}
			value, err := strconv.ParseFloat(raw, 64)
			if err != nil {
				continue
			}
			values = append(values, []float64{timestamp, value})
		}
		dataResp.Values = values

		dataResps = append(dataResps, dataResp)
	}

	return dataResps
}
// MakeLogQuery builds the *Query used for a log search over [start, end]
// with a default limit of 1000.
//
// query may be a plain string (used as the statement) or a
// map[string]interface{}, from which "query" (formatted with %v) and "limit"
// (int or float64) are taken when present. Any other type leaves the
// statement empty. eventTags is not used. The error is always nil.
func (vl *VictoriaLogs) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
	q := &Query{
		Start: start,
		End:   end,
		Limit: 1000,
	}

	switch src := query.(type) {
	case string:
		// A bare string is the statement itself.
		q.Query = src
	case map[string]interface{}:
		if stmt, found := src["query"]; found {
			q.Query = fmt.Sprintf("%v", stmt)
		}
		if rawLimit, found := src["limit"]; found {
			switch n := rawLimit.(type) {
			case int:
				q.Limit = n
			case float64:
				q.Limit = int(n)
			}
		}
	}

	return q, nil
}
// MakeTSQuery builds the *Query used for a time-series query over
// [start, end].
//
// query may be a plain string (used as the statement) or a
// map[string]interface{}, from which "query" and "step" are taken when
// present (both formatted with %v). Any other type leaves the statement
// empty. eventTags is not used. The error is always nil.
func (vl *VictoriaLogs) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) {
	q := &Query{
		Start: start,
		End:   end,
	}

	switch src := query.(type) {
	case string:
		// A bare string is the statement itself.
		q.Query = src
	case map[string]interface{}:
		if stmt, found := src["query"]; found {
			q.Query = fmt.Sprintf("%v", stmt)
		}
		if step, found := src["step"]; found {
			q.Step = fmt.Sprintf("%v", step)
		}
	}

	return q, nil
}
// QueryMapData fetches extra data when an alert event is generated: it runs
// the log query with a limit of 1 and returns the first entry that is a
// map[string]interface{}, with every value formatted as a string via %v.
//
// When both Start and End are positive, Start is moved 30 seconds earlier to
// compensate for ingestion lag. The result holds at most one map and is nil
// when no suitable entry is found.
func (vl *VictoriaLogs) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) {
	param := new(Query)
	if err := mapstructure.Decode(query, param); err != nil {
		return nil, err
	}

	// Widen the window to cope with delayed data.
	if param.End > 0 && param.Start > 0 {
		param.Start -= 30
	}

	// Only one entry is needed.
	param.Limit = 1

	logs, _, err := vl.QueryLog(ctx, param)
	if err != nil {
		return nil, err
	}

	var result []map[string]string
	for _, entry := range logs {
		fields, ok := entry.(map[string]interface{})
		if !ok {
			continue
		}
		converted := make(map[string]string)
		for name, val := range fields {
			converted[name] = fmt.Sprintf("%v", val)
		}
		result = append(result, converted)
		break // keep only the first usable entry
	}

	return result, nil
}
================================================
FILE: doc/README.bak.md
================================================
All-in-one 的开源观测平台
开箱即用 ,集数据采集、可视化、监控告警于一体
推荐升级您的 Prometheus + AlertManager + Grafana + ELK + Jaeger 组合方案到夜莺!
[English](./README_en.md) | [中文](./README.md)
## 功能和特点
- **开箱即用**
- 支持 Docker、Helm Chart、云服务等多种部署方式,集数据采集、监控告警、可视化为一体,内置多种监控仪表盘、快捷视图、告警规则模板,导入即可快速使用,**大幅降低云原生监控系统的建设成本、学习成本、使用成本**;
- **专业告警**
- 可视化的告警配置和管理,支持丰富的告警规则,提供屏蔽规则、订阅规则的配置能力,支持告警多种送达渠道,支持告警自愈、告警事件管理等;
- **推荐您使用夜莺的同时,无缝搭配[FlashDuty](https://flashcat.cloud/product/flashcat-duty/),实现告警聚合收敛、认领、升级、排班、协同,让告警的触达既高效,又确保告警处理不遗漏、做到件件有回响**。
- **云原生**
- 以交钥匙的方式快速构建企业级的云原生监控体系,支持 [Categraf](https://github.com/flashcatcloud/categraf)、Telegraf、Grafana-agent 等多种采集器,支持 Prometheus、VictoriaMetrics、M3DB、ElasticSearch、Jaeger 等多种数据源,兼容支持导入 Grafana 仪表盘,**与云原生生态无缝集成**;
- **高性能 高可用**
- 得益于夜莺的多数据源管理引擎,和夜莺引擎侧优秀的架构设计,借助于高性能时序库,可以满足数亿时间线的采集、存储、告警分析场景,节省大量成本;
- 夜莺监控组件均可水平扩展,无单点,已在上千家企业部署落地,经受了严苛的生产实践检验。众多互联网头部公司,夜莺集群机器达百台,处理数亿级时间线,重度使用夜莺监控;
- **灵活扩展 中心化管理**
- 夜莺监控,可部署在 1 核 1G 的云主机,可在上百台机器集群化部署,可运行在 K8s 中;也可将时序库、告警引擎等组件下沉到各机房、各 Region,兼顾边缘部署和中心化统一管理,**解决数据割裂,缺乏统一视图的难题**;
- **开放社区**
- 托管于[中国计算机学会开源发展委员会](https://www.ccf.org.cn/kyfzwyh/),有[快猫星云](https://flashcat.cloud)和众多公司的持续投入,和数千名社区用户的积极参与,以及夜莺监控项目清晰明确的定位,都保证了夜莺开源社区健康、长久的发展。活跃、专业的社区用户也在持续迭代和沉淀更多的最佳实践于产品中;
## 使用场景
1. **如果您希望在一个平台中,统一管理和查看 Metrics、Logging、Tracing 数据,推荐你使用夜莺**:
- 请参考阅读:[不止于监控,夜莺 V6 全新升级为开源观测平台](http://flashcat.cloud/blog/nightingale-v6-release/)
2. **如果您在使用 Prometheus 过程中,有以下的一个或者多个需求场景,推荐您无缝升级到夜莺**:
- Prometheus、Alertmanager、Grafana 等多个系统较为割裂,缺乏统一视图,无法开箱即用;
- 通过修改配置文件来管理 Prometheus、Alertmanager 的方式,学习曲线大,协同有难度;
- 数据量过大而无法扩展您的 Prometheus 集群;
- 生产环境运行多套 Prometheus 集群,面临管理和使用成本高的问题;
3. **如果您在使用 Zabbix,有以下的场景,推荐您升级到夜莺**:
- 监控的数据量太大,希望有更好的扩展解决方案;
- 学习曲线高,多人多团队模式下,希望有更好的协同使用效率;
- 微服务和云原生架构下,监控数据的生命周期多变、监控数据维度基数高,Zabbix 数据模型不易适配;
- 了解更多Zabbix和夜莺监控的对比,推荐您进一步阅读[Zabbix 和夜莺监控选型对比](https://flashcat.cloud/blog/zabbx-vs-nightingale/)
4. **如果您在使用 [Open-Falcon](https://github.com/open-falcon/falcon-plus),我们推荐您升级到夜莺:**
- 关于 Open-Falcon 和夜莺的详细介绍,请参考阅读:[云原生监控的十个特点和趋势](http://flashcat.cloud/blog/10-trends-of-cloudnative-monitoring/)
- 监控系统和可观测平台的区别,请参考阅读:[从监控系统到可观测平台,Gap有多大](https://flashcat.cloud/blog/gap-of-monitoring-to-o11y/)
5. **我们推荐您使用 [Categraf](https://github.com/flashcatcloud/categraf) 作为首选的监控数据采集器**:
- [Categraf](https://github.com/flashcatcloud/categraf) 是夜莺监控的默认采集器,采用开放插件机制和 All-in-one 的设计理念,同时支持 metric、log、trace、event 的采集。Categraf 不仅可以采集 CPU、内存、网络等系统层面的指标,也集成了众多开源组件的采集能力,支持K8s生态。Categraf 内置了对应的仪表盘和告警规则,开箱即用。
## 文档
[English Doc](https://n9e.github.io/) | [中文文档](https://flashcat.cloud/docs/)
## 产品示意图
https://user-images.githubusercontent.com/792850/216888712-2565fcea-9df5-47bd-a49e-d60af9bd76e8.mp4
## 夜莺架构
夜莺监控可以接收各种采集器上报的监控数据(比如 [Categraf](https://github.com/flashcatcloud/categraf)、telegraf、grafana-agent、Prometheus),并写入多种流行的时序数据库中(可以支持Prometheus、M3DB、VictoriaMetrics、Thanos、TDEngine等),提供告警规则、屏蔽规则、订阅规则的配置能力,提供监控数据的查看能力,提供告警自愈机制(告警触发之后自动回调某个webhook地址或者执行某个脚本),提供历史告警事件的存储管理、分组查看的能力。
### 中心汇聚式部署方案

夜莺只有一个模块,就是 n9e,可以部署多个 n9e 实例组成集群,n9e 依赖 2 个存储,数据库、Redis,数据库可以使用 MySQL 或 Postgres,自己按需选用。
n9e 提供的是 HTTP 接口,前面负载均衡可以是 4 层的,也可以是 7 层的。一般就选用 Nginx 就可以了。
n9e 这个模块接收到数据之后,需要转发给后端的时序库,相关配置是:
```toml
[Pushgw]
LabelRewrite = true
[[Pushgw.Writers]]
Url = "http://127.0.0.1:9090/api/v1/write"
```
> 注意:虽然数据源可以在页面配置了,但是上报转发链路,还是需要在配置文件指定。
所有机房的 agent( 比如 Categraf、Telegraf、 Grafana-agent、Datadog-agent ),都直接推数据给 n9e,这个架构最为简单,维护成本最低。当然,前提是要求机房之间网络链路比较好,一般有专线。如果网络链路不好,则要使用下面的部署方式了。
### 边缘下沉式混杂部署方案

这个图尝试解释 3 种不同的情形,比如 A 机房和中心网络链路很好,Categraf 可以直接汇报数据给中心 n9e 模块,另一个机房网络链路不好,就需要把时序库下沉部署,时序库下沉了,对应的告警引擎和转发网关也都要跟随下沉,这样数据不会跨机房传输,比较稳定。但是心跳还是需要往中心心跳,要不然在对象列表里看不到机器的 CPU、内存使用率。还有的时候,可能是接入的一个已有的 Prometheus,数据采集没有走 Categraf,那此时只需要把 Prometheus 作为数据源接入夜莺即可,可以在夜莺里看图、配告警规则,但是就是在对象列表里看不到,也不能使用告警自愈的功能,问题也不大,核心功能都不受影响。
边缘机房,下沉部署时序库、告警引擎、转发网关的时候,要注意,告警引擎需要依赖数据库,因为要同步告警规则,转发网关也要依赖数据库,因为要注册对象到数据库里去,需要打通相关网络,告警引擎和转发网关都不用Redis,所以无需为 Redis 打通网络。
### VictoriaMetrics 集群架构
如果单机版本的时序数据库(比如 Prometheus) 性能有瓶颈或容灾较差,我们推荐使用 [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics),VictoriaMetrics 架构较为简单,性能优异,易于部署和运维,架构图如上。VictoriaMetrics 更详尽的文档,还请参考其[官网](https://victoriametrics.com/)。
## 夜莺社区
开源项目要更有生命力,离不开开放的治理架构和源源不断的开发者和用户共同参与,我们致力于建立开放、中立的开源治理架构,吸纳更多来自企业、高校等各方面对云原生监控感兴趣、有热情的开发者,一起打造有活力的夜莺开源社区。关于《夜莺开源项目和社区治理架构(草案)》,请查阅 [COMMUNITY GOVERNANCE](./doc/community-governance.md).
**我们欢迎您以各种方式参与到夜莺开源项目和开源社区中来,工作包括不限于**:
- 补充和完善文档 => [n9e.github.io](https://n9e.github.io/)
- 分享您在使用夜莺监控过程中的最佳实践和经验心得 => [文章分享](https://flashcat.cloud/docs/content/flashcat-monitor/nightingale/share/)
- 提交产品建议 => [github issue](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md)
- 提交代码,让夜莺监控更快、更稳、更好用 => [github pull request](https://github.com/ccfos/nightingale/pulls)
**尊重、认可和记录每一位贡献者的工作**是夜莺开源社区的第一指导原则,我们提倡**高效的提问**,这既是对开发者时间的尊重,也是对整个社区知识沉淀的贡献:
- 提问之前请先查阅 [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq)
- 我们使用[论坛](https://answer.flashcat.cloud/)进行交流,有问题可以到这里搜索、提问
## Who is using Nightingale
您可以通过在 **[Who is Using Nightingale](https://github.com/ccfos/nightingale/issues/897)** 登记您的使用情况,分享您的使用经验。
## Stargazers over time
[](https://starchart.cc/ccfos/nightingale)
## Contributors
## License
[Apache License V2.0](https://github.com/ccfos/nightingale/blob/main/LICENSE)
## 加入交流群
================================================
FILE: doc/active-contributors.md
================================================
## Active Contributors
- [xiaoziv](https://github.com/xiaoziv)
- [tanxiao1990](https://github.com/tanxiao1990)
- [bbaobelief](https://github.com/bbaobelief)
- [freedomkk-qfeng](https://github.com/freedomkk-qfeng)
- [lsy1990](https://github.com/lsy1990)
================================================
FILE: doc/committers.md
================================================
## Committers
- [YeningQin](https://github.com/710leo)
- [FeiKong](https://github.com/kongfei605)
- [XiaqingDai](https://github.com/jsers)
================================================
FILE: doc/community-governance.md
================================================
[夜莺监控](https://github.com/ccfos/nightingale "夜莺监控")是一款开源云原生监控系统,由滴滴设计开发,2020 年 3 月份开源之后,凭借其优秀的产品设计、灵活性架构和明确清晰的定位,夜莺监控快速发展为国内最活跃的企业级云原生监控方案。[截止当前](具体指2022年8月 "截止当前"),在 [Github](https://github.com/ccfos/nightingale "Github") 上已经迭代发布了 **70** 多个版本,获得了 **5K** 多个 Star,**80** 多位代码贡献者。快速的迭代,也让夜莺监控的用户群越来越大,涉及各行各业。
更进一步,夜莺监控于 2022 年 5 月 11 日,正式捐赠予中国计算机学会开源发展委员会 [CCF ODC](https://www.ccf.org.cn/kyfzwyh/ "CCF ODC"),为 CCF ODC 成立后接受捐赠的第一个开源项目。
开源项目要更有生命力,离不开开放的治理架构和源源不断的开发者共同参与。夜莺监控项目加入 CCF 开源大家庭后,能在计算机学会的支持和带动下,进一步结合云原生、可观测、国产化等多个技术发展的需求,建立开放、中立的开源治理架构,打造更专业、有活力的开发者社区。
**今天,我们郑重发布夜莺监控开源社区治理架构,并公示相关的任命和社区荣誉,期待开源的道路上,一起同行。**
# 夜莺监控开源社区架构
### User|用户
> 欢迎任何个人、公司以及组织,使用夜莺监控,并积极的反馈 bug、提交功能需求、以及相互帮助,我们推荐使用 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 来跟踪 bug 和管理需求。
社区用户,可以通过在 **[Who is Using Nightingale](https://github.com/ccfos/nightingale/issues/897 "Who is Using Nightingale")** 登记您的使用情况,并分享您使用夜莺监控的经验,将会自动进入 **[END USERS](https://github.com/ccfos/nightingale/blob/main/doc/end-users.md "END USERS")** 文件列表,并获得社区的 **VIP Support**。
### Contributor|贡献者
> 欢迎每一位用户,包括但不限于以下方式参与到夜莺开源社区并做出贡献:
1. 在 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 中积极参与讨论,参与社区活动;
1. 提交代码补丁;
1. 翻译、修订、补充和完善[文档](https://n9e.github.io "文档");
1. 分享夜莺监控的使用经验,积极布道;
1. 提交建议 / 批评;
年度累计向 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 提交 **5** 个PR(被合并),或者因为其他贡献被**项目管委会**一致认可,将会自动进入到 **[ACTIVE CONTRIBUTORS](https://github.com/ccfos/nightingale/blob/main/doc/active-contributors.md "ACTIVE CONTRIBUTORS")** 列表,并获得夜莺开源社区颁发的证书,享有夜莺开源社区一定的权益和福利。
所有向 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 提交过PR(被合并),或者做出过重要贡献的 Contributor,都会被永久记载于 [CONTRIBUTORS](https://github.com/ccfos/nightingale/blob/main/doc/contributors.md "CONTRIBUTORS") 列表。
### Committer|提交者
> Committer 是指拥有 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 代码仓库写操作权限的贡献者。原则上 Committer 能够自主决策某个代码补丁是否可以合入到夜莺代码仓库,但是项目管委会拥有最终的决策权。
Committer 承担以下一个或多个职责:
- 积极回应 Issues;
- Review PRs;
- 参加开发者例行会议,积极讨论项目规划和技术方案;
- 代表夜莺开源社区出席相关技术会议并做演讲;
Committer 记录并公示于 **[COMMITTERS](https://github.com/ccfos/nightingale/blob/main/doc/committers.md "COMMITTERS")** 列表,并获得夜莺开源社区颁发的证书,以及享有夜莺开源社区的各种权益和福利。
### PMC|项目管委会
> PMC(项目管委会)作为一个实体,来管理和领导夜莺项目,为整个项目的发展全权负责。项目管委会相关内容记录并公示于文件[PMC](https://github.com/ccfos/nightingale/blob/main/doc/pmc.md "PMC").
- 项目管委会成员(PMC Member),从 Contributor 或者 Committer 中选举产生,他们拥有 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 代码仓库的写操作权限,拥有 Nightingale 社区相关事务的投票权、以及提名 Committer 候选人的权利。
- 项目管委会主席(PMC Chair),从项目管委会成员中投票产生。管委会主席是 **[CCF ODC](https://www.ccf.org.cn/kyfzwyh/ "CCF ODC")** 和项目管委会之间的沟通桥梁,履行特定的项目管理职责。
## Communication|沟通机制
1. 我们推荐使用邮件列表来反馈建议(待发布);
2. 我们推荐使用 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 跟踪 bug 和管理需求;
3. 我们推荐使用 [Github Milestone](https://github.com/ccfos/nightingale/milestones "Github Milestone") 来管理项目进度和规划;
4. 我们推荐使用腾讯会议来定期召开项目例会(会议 ID 待发布);
## Documentation|文档
1. 我们推荐使用 [Github Pages](https://n9e.github.io "Github Pages") 来沉淀文档;
2. 我们推荐使用 [Gitlink Wiki](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq "Gitlink Wiki") 来沉淀 FAQ;
## Operation|运营机制
1. 我们定期组织用户、贡献者、项目管委会成员之间的沟通会议,讨论项目开发的目标、方案、进度,以及讨论相关需求的合理性、优先级等议题;
2. 我们定期组织 meetup (线上&线下),创造良好的用户交流分享环境,并沉淀相关内容到文档站点;
3. 我们定期组织夜莺开发者大会,分享 [best user story](https://n9e.github.io/docs/prologue/share/ "best user story")、同步年度开发目标和计划、讨论新技术方向等;
## Philosophy|社区指导原则
>尊重、认可和记录每一位贡献者的工作。
## 关于提问的原则
按照**尊重、认可、记录每一位贡献者的工作**原则,我们提倡**高效的提问**,这既是对开发者时间的尊重,也是对整个社区的知识沉淀的贡献:
1. 提问之前请先查阅 [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq "FAQ") ;
2. 提问之前请先搜索 [Github Issues](https://github.com/ccfos/nightingale/issues "Github Issue");
3. 我们优先推荐通过提交 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 来提问,如果[有问题点击这里](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&template=bug_report.yml "有问题点击这里") | [有需求建议点击这里](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md "有需求建议点击这里");
================================================
FILE: doc/contributors.md
================================================
## Contributors
================================================
FILE: doc/end-users.md
================================================
## End Users
- [中移动](https://github.com/ccfos/nightingale/issues/897#issuecomment-1086573166)
- [inke](https://github.com/ccfos/nightingale/issues/897#issuecomment-1099840636)
- [方正证券](https://github.com/ccfos/nightingale/issues/897#issuecomment-1110492461)
================================================
FILE: doc/pmc.md
================================================
### PMC Chair
- [laiwei](https://github.com/laiwei)
### PMC Co-Chair
- [UlricQin](https://github.com/UlricQin)
### PMC Member
================================================
FILE: doc/server-dash.json
================================================
{
"name": "夜莺大盘",
"tags": "",
"configs": {
"var": [],
"panels": [
{
"targets": [
{
"refId": "A",
"expr": "rate(n9e_server_samples_received_total[1m])"
}
],
"name": "每秒接收的数据点个数",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 0,
"i": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"isResizable": true
},
"id": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4"
},
{
"targets": [
{
"refId": "A",
"expr": "rate(n9e_server_alerts_total[10m])"
}
],
"name": "每秒产生的告警事件个数",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 0,
"i": "47fc6252-9cc8-4b53-8e27-0c5c59a47269",
"isResizable": true
},
"id": "f70dcb8b-b58b-4ef9-9e48-f230d9e17140"
},
{
"targets": [
{
"refId": "A",
"expr": "n9e_server_alert_queue_size"
}
],
"name": "告警事件内存队列长度",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 4,
"i": "ad1af16c-de0c-45f4-8875-cea4e85d51d0",
"isResizable": true
},
"id": "caf23e58-d907-42b0-9ed6-722c8c6f3c5f"
},
{
"targets": [
{
"refId": "A",
"expr": "n9e_server_http_request_duration_seconds_sum/n9e_server_http_request_duration_seconds_count"
}
],
"name": "数据接收接口平均响应时间(单位:秒)",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "normal"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 4,
"i": "64c3abc2-404c-4462-a82f-c109a21dac91",
"isResizable": true
},
"id": "6b8d2db1-efca-4b9e-b429-57a9d2272bc5"
},
{
"targets": [
{
"refId": "A",
"expr": "n9e_server_sample_queue_size"
}
],
"name": "内存数据队列长度",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 8,
"i": "1c7da942-58c2-40dc-b42f-983e4a35b89b",
"isResizable": true
},
"id": "bd41677d-40d3-482e-bb6e-fbd25df46d87"
},
{
"targets": [
{
"refId": "A",
"expr": "avg(n9e_server_forward_duration_seconds_sum/n9e_server_forward_duration_seconds_count)"
}
],
"name": "数据发往TSDB平均耗时(单位:秒)",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 8
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "normal"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 8,
"i": "eed94a0b-954f-48ac-82e5-a2eada1c8a3d",
"isResizable": true
},
"id": "c8642e72-f384-46a5-8410-1e6be2953c3c"
}
],
"version": "2.0.0"
}
}
================================================
FILE: docker/.dockerignore
================================================
compose-host-network
compose-postgres
compose-bridge
initsql
build.sh
================================================
FILE: docker/Dockerfile.goreleaser
================================================
FROM --platform=$TARGETPLATFORM python:3-slim
WORKDIR /app
ADD n9e /app/
ADD etc /app/etc/
ADD integrations /app/integrations/
RUN pip install requests Jinja2
EXPOSE 17000
CMD ["/app/n9e", "-h"]
================================================
FILE: docker/Dockerfile.goreleaser.arm64
================================================
FROM --platform=$TARGETPLATFORM python:3-slim
WORKDIR /app
ADD n9e /app/
ADD etc /app/etc/
ADD integrations /app/integrations/
EXPOSE 17000
CMD ["/app/n9e", "-h"]
================================================
FILE: docker/build.sh
================================================
#!/bin/sh
# Build the nightingale image from the n9e binary in the parent directory,
# tag it for the ulric2019 repository and push it.
#
# Usage: build.sh <tag>

if [ $# -ne 1 ]; then
	# A usage error is a failure: report it on stderr with a non-zero status.
	echo "usage: $0 <tag>" >&2
	exit 1
fi

tag=$1
echo "tag: ${tag}"

# Remove the files staged for the build context.
cleanup() {
	rm -rf n9e pub
}

# Start from a clean context and clean up again however the script ends.
cleanup
trap cleanup EXIT

# Stop at the first failing step so a broken build is never tagged or pushed.
set -e

cp ../n9e .
docker build -t "nightingale:${tag}" .
docker tag "nightingale:${tag}" "ulric2019/nightingale:${tag}"
docker push "ulric2019/nightingale:${tag}"
================================================
FILE: docker/compose-bridge/docker-compose.yaml
================================================
networks:
nightingale:
driver: bridge
services:
mysql:
image: "mysql:8"
container_name: mysql
hostname: mysql
restart: always
environment:
TZ: Asia/Shanghai
MYSQL_ROOT_PASSWORD: 1234
volumes:
- ./mysqldata:/var/lib/mysql/
- ../initsql:/docker-entrypoint-initdb.d/
- ./etc-mysql/my.cnf:/etc/my.cnf
networks:
- nightingale
ports:
- "3306:3306"
redis:
image: "redis:6.2"
container_name: redis
hostname: redis
restart: always
environment:
TZ: Asia/Shanghai
networks:
- nightingale
ports:
- "6379:6379"
# prometheus:
# image: prom/prometheus
# container_name: prometheus
# hostname: prometheus
# restart: always
# environment:
# TZ: Asia/Shanghai
# volumes:
# - ./etc-prometheus:/etc/prometheus
# command:
# - "--config.file=/etc/prometheus/prometheus.yml"
# - "--storage.tsdb.path=/prometheus"
# - "--web.console.libraries=/usr/share/prometheus/console_libraries"
# - "--web.console.templates=/usr/share/prometheus/consoles"
# - "--enable-feature=remote-write-receiver"
# - "--query.lookback-delta=2m"
# networks:
# - nightingale
# ports:
# - "9090:9090"
victoriametrics:
image: victoriametrics/victoria-metrics:v1.79.12
container_name: victoriametrics
hostname: victoriametrics
restart: always
environment:
TZ: Asia/Shanghai
ports:
- "8428:8428"
networks:
- nightingale
command:
- "--loggerTimezone=Asia/Shanghai"
nightingale:
image: flashcatcloud/nightingale:latest
container_name: nightingale
hostname: nightingale
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: mysql:3306, redis:6379
volumes:
- ./etc-nightingale:/app/etc
networks:
- nightingale
ports:
- "17000:17000"
- "20090:20090"
depends_on:
- mysql
- redis
- victoriametrics
command:
- /app/n9e
categraf:
image: "flashcatcloud/categraf:latest"
container_name: "categraf"
hostname: "categraf01"
restart: always
environment:
TZ: Asia/Shanghai
HOST_PROC: /hostfs/proc
HOST_SYS: /hostfs/sys
HOST_MOUNT_PREFIX: /hostfs
WAIT_HOSTS: nightingale:17000, nightingale:20090
volumes:
- ./etc-categraf:/etc/categraf/conf
- /:/hostfs
networks:
- nightingale
depends_on:
- nightingale
================================================
FILE: docker/compose-bridge/etc-categraf/config.toml
================================================
[global]
# whether print configs
print_configs = false
# add label(agent_hostname) to series
# "" -> auto detect hostname
# "xx" -> use specified string xx
# "$hostname" -> auto detect hostname
# "$ip" -> auto detect ip
# "$hostname-$ip" -> auto detect hostname and ip to replace the vars
hostname = "$HOSTNAME"
# will not add label(agent_hostname) if true
omit_hostname = false
# s | ms
precision = "ms"
# global collect interval
interval = 15
# [global.labels]
# source="categraf"
# region = "shanghai"
# env = "localhost"
[writer_opt]
# default: 2000
batch = 2000
# channel(as queue) size
chan_size = 10000
[[writers]]
url = "http://nightingale:17000/prometheus/v1/write"
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"
[heartbeat]
enable = true
# report os version cpu.util mem.util metadata
url = "http://nightingale:17000/v1/n9e/heartbeat"
# interval, unit: s
interval = 10
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[ibex]
enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["nightingale:20090"]
## temp script dir
meta_dir = "./meta"
================================================
FILE: docker/compose-bridge/etc-categraf/input.cpu/cpu.toml
================================================
# # collect interval
# interval = 15
# # whether collect per cpu
# collect_per_cpu = false
================================================
FILE: docker/compose-bridge/etc-categraf/input.disk/disk.toml
================================================
# # collect interval
# interval = 15
# # By default stats will be gathered for all mount points.
# # Set mount_points will restrict the stats to only the specified mount points.
mount_points = ["/"]
# Ignore mount points by filesystem type.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs", "nsfs"]
================================================
FILE: docker/compose-bridge/etc-categraf/input.diskio/diskio.toml
================================================
# # collect interval
# interval = 15
# # By default, categraf will gather stats for all devices including disk partitions.
# # Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
================================================
FILE: docker/compose-bridge/etc-categraf/input.kernel/kernel.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-bridge/etc-categraf/input.mem/mem.toml
================================================
# # collect interval
# interval = 15
# # whether collect platform specified metrics
collect_platform_fields = true
================================================
FILE: docker/compose-bridge/etc-categraf/input.mysql/mysql.toml
================================================
[[instances]]
address = "mysql:3306"
username = "root"
password = "1234"
# # set tls=custom to enable tls
# parameters = "tls=false"
# extra_status_metrics = true
# extra_innodb_metrics = false
# gather_processlist_processes_by_state = false
# gather_processlist_processes_by_user = false
# gather_schema_size = true
# gather_table_size = false
# gather_system_table_size = false
# gather_slave_status = true
# # timeout
# timeout_seconds = 3
# # interval = global.interval * interval_times
# interval_times = 1
# important! use global unique string to specify instance
labels = { instance="docker-compose-mysql" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
#[[instances.queries]]
# measurement = "lock_wait"
# metric_fields = [ "total" ]
# timeout = "3s"
# request = '''
#SELECT count(*) as total FROM information_schema.innodb_trx WHERE trx_state='LOCK WAIT'
#'''
================================================
FILE: docker/compose-bridge/etc-categraf/input.net/net.toml
================================================
# # collect interval
# interval = 15
# # whether collect protocol stats on Linux
# collect_protocol_stats = false
# # setting interfaces will tell categraf to gather these explicit interfaces
# interfaces = ["eth0"]
================================================
FILE: docker/compose-bridge/etc-categraf/input.netstat/netstat.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-bridge/etc-categraf/input.processes/processes.toml
================================================
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
================================================
FILE: docker/compose-bridge/etc-categraf/input.prometheus/prometheus.toml
================================================
[[instances]]
urls = [
"http://nightingale:17000/metrics"
]
================================================
FILE: docker/compose-bridge/etc-categraf/input.redis/redis.toml
================================================
[[instances]]
address = "redis:6379"
username = ""
password = ""
# pool_size = 2
## 是否开启slowlog 收集
# gather_slowlog = true
## 最多收集多少条slowlog
# slowlog_max_len = 100
## 收集距离现在多少秒以内的slowlog
## 注意插件的采集周期,该参数不要小于采集周期,否则会有slowlog查不到
# slowlog_time_window=30
# 指标
# redis_slow_log{ident=dev-01 client_addr=127.0.0.1:56364 client_name= cmd="info ALL" log_id=983} 74 (单位微秒)
# # Optional. Specify redis commands to retrieve values
# commands = [
# {command = ["get", "sample-key1"], metric = "custom_metric_name1"},
# {command = ["get", "sample-key2"], metric = "custom_metric_name2"}
# ]
# # interval = global.interval * interval_times
# interval_times = 1
# important! use global unique string to specify instance
labels = { instance="docker-compose-redis" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: docker/compose-bridge/etc-categraf/input.system/system.toml
================================================
# # collect interval
# interval = 15
# # whether collect metric: system_n_users
# collect_user_number = false
================================================
FILE: docker/compose-bridge/etc-mysql/my.cnf
================================================
[mysqld]
pid-file = /var/run/mysqld/mysqld.pid
socket = /var/run/mysqld/mysqld.sock
datadir = /var/lib/mysql
bind-address = 0.0.0.0
================================================
FILE: docker/compose-bridge/etc-nightingale/config.toml
================================================
[Global]
RunMode = "release"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[HTTP.ShowCaptcha]
Enable = false
[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[HTTP.TokenAuth]
Enable = false
HeaderUserTokenKey = "X-User-Token"
[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]
[HTTP.RSA]
# open RSA
OpenRSA = false
[DB]
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
DSN="root:1234@tcp(mysql:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "redis:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""
[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"
# [Alert.Alerting]
# NotifyConcurrency = 10
[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"
[Center.AnonymousAccess]
PromQuerier = false
AlertDetail = false
[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true
# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"
# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000
[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://victoriametrics:8428/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"
================================================
FILE: docker/compose-bridge/etc-nightingale/metrics.yaml
================================================
zh:
ip_conntrack_count: 连接跟踪表条目总数(单位:int, count)
ip_conntrack_max: 连接跟踪表最大容量(单位:int, size)
cpu_usage_idle: CPU空闲率(单位:%)
cpu_usage_active: CPU使用率(单位:%)
cpu_usage_system: CPU内核态时间占比(单位:%)
cpu_usage_user: CPU用户态时间占比(单位:%)
cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%)
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%)
cpu_usage_irq: CPU处理硬中断的时间占比(单位:%)
cpu_usage_softirq: CPU处理软中断的时间占比(单位:%)
cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%)
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%)
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%)
disk_free: 硬盘分区剩余量(单位:byte)
disk_used: 硬盘分区使用量(单位:byte)
disk_used_percent: 硬盘分区使用率(单位:%)
disk_total: 硬盘分区总量(单位:byte)
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型
diskio_merged_reads: 相邻读请求merge读的次数,counter类型
diskio_merged_writes: 相邻写请求merge写的次数,counter类型
diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒)
diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 可用内存大小(bytes)
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache )
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存大小(bytes)
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存)
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和available的内存)
mem_low_free: 未被使用的低位内存大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab)
mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_totalL: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数
net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
netstat_sockets_used: 已使用的所有协议套接字总量
netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量
netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数)
netstat_tcp_tw: TIME_WAIT状态的TCP连接数
netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量
netstat_tcp_mem: TCP套接字内存Page使用量
netstat_udp_inuse: 在使用的UDP套接字数量
netstat_udp_mem: UDP套接字内存Page使用量
netstat_udplite_inuse: 正在使用的 udp lite 数量
netstat_raw_inuse: 正在使用的 raw socket 数量
netstat_frag_inuse: ip fragment 数量
netstat_frag_memory: ip fragment 已经分配的内存(byte)
#[ping]
ping_percent_packet_loss: ping数据包丢失百分比(%)
ping_result_code: ping返回码('0','1')
net_response_result_code: 网络探测结果,0表示正常,非0表示异常
net_response_response_time: 网络探测时延,单位:秒
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
swap_used_percent: Swap空间使用率(单位:%)
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_load_norm_1: 1分钟平均load值/逻辑CPU个数
system_load_norm_5: 5分钟平均load值/逻辑CPU个数
system_load_norm_15: 15分钟平均load值/逻辑CPU个数
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统启动时间
nginx_accepts: 自nginx启动起,与客户端建立过的连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果0为正常否则url无法访问
# [aws cloudwatch rds]
cloudwatch_aws_rds_bin_log_disk_usage_average: rds binlog 磁盘使用量平均值
cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds binlog 磁盘使用量最大值
cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低
cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数
cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和
cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值
cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值
cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低
cloudwatch_aws_rds_burst_balance_sample_count: rds 突发余额样本计数
cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和
cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值
cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值
cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低
cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数
cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和
cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值
cloudwatch_aws_rds_database_connections_maximum: rds 数据库连接数最大值
cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小
cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数
cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和
cloudwatch_aws_rds_db_load_average: rds db 平均负载
cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值
cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值
cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值
cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 负载 cpu 样本计数
cloudwatch_aws_rds_db_load_cpu_sum: rds db 负载 cpu 总和
cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值
cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值
cloudwatch_aws_rds_db_load_non_cpu_average: rds db 非 cpu 负载平均值
cloudwatch_aws_rds_db_load_non_cpu_maximum: rds db 非 cpu 负载最大值
cloudwatch_aws_rds_db_load_non_cpu_minimum: rds db 非 cpu 负载最小值
cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds db 非 cpu 负载样本计数
cloudwatch_aws_rds_db_load_non_cpu_sum: rds db 非 cpu 负载总和
cloudwatch_aws_rds_db_load_sample_count: rds db 负载样本计数
cloudwatch_aws_rds_db_load_sum: rds db 负载总和
cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值
cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值
cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值
cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数
cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和
cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值
cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值
cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低
cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数
cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和
cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值
cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值
cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低
cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 余额样本计数
cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和
cloudwatch_aws_rds_free_storage_space_average: rds 可用存储空间平均值
cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间
cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间
cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数
cloudwatch_aws_rds_free_storage_space_sum: rds 可用存储空间总和
cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值
cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存
cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存
cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数
cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和
cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值
cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值
cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低
cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数
cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和
cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值
cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值
cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低
cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数
cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和
cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均
cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值
cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值
cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数
cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和
cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值
cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大
cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值
cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数
cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和
cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值
cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops
cloudwatch_aws_rds_read_iops_minimum: rds 读取 iops 最低
cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数
cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和
cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值
cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值
cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟
cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数
cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和
cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值
cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量
cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量
cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数
cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和
cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值
cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值
cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低
cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用样本计数
cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和
cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值
cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值
cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低
cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数
cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和
cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值
cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟
cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值
cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数
cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和
cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值
cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量
cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值
cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数
cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和
en:
ip_conntrack_count: the number of entries in the conntrack table(unit:int, count)
ip_conntrack_max: the max capacity of the conntrack table(unit:int, size)
cpu_usage_idle: "CPU idle rate(unit:%)"
cpu_usage_active: "CPU usage rate(unit:%)"
cpu_usage_system: "CPU kernel-mode time proportion(unit:%)"
cpu_usage_user: "CPU user-mode time proportion(unit:%)"
cpu_usage_nice: "The proportion of CPU time spent on low-priority processes, i.e. processes whose NICE value has been adjusted to between 1 and 19. Note that NICE ranges from -20 to 19; the larger the value, the lower the priority(unit:%)"
cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)"
cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)"
cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)"
cpu_usage_steal: "Only present in virtual machine environments; the proportion of time the CPU is taken by other virtual machines. A value above 20 indicates severe contention(unit:%)"
cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)"
cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)"
disk_free: "The remaining amount of the hard disk partition (unit: byte)"
disk_used: "Used space of the hard disk partition (unit: byte)"
disk_used_percent: "Usage rate of the hard disk partition (unit:%)"
disk_total: "Total amount of hard disk partition (unit: byte)"
disk_inodes_free: "Hard disk partition INODE remaining amount"
disk_inodes_used: "Hard disk partition INODE usage amount"
disk_inodes_total: "The total amount of hard disk partition INODE"
diskio_io_time: "Total time of I/O requests from the device's perspective; time is counted whenever there are I/O requests in the queue (unit: millisecond). Counter type, only useful after applying a rate function"
diskio_iops_in_progress: "I/O requests that have been issued to the device driver but have not yet completed; does not include requests that are in the queue but not yet issued to the device driver. Gauge type"
diskio_merged_reads: "The number of times adjacent read requests were merged. Counter type"
diskio_merged_writes: "The number of times adjacent write requests were merged. Counter type"
diskio_read_bytes: "The number of bytes read. Counter type, only useful after applying a rate function"
diskio_read_time: "Total time of read requests (unit: millisecond). Counter type, only useful after applying a rate function"
diskio_reads: "The number of read requests. Counter type, only useful after applying a rate function"
diskio_weighted_io_time: "Total I/O wait time from the I/O requests' perspective; if multiple I/O requests are in flight at the same time, their times are added together (unit: millisecond)"
diskio_write_bytes: "The number of bytes written. Counter type, only useful after applying a rate function"
diskio_write_time: "Total time of write requests (unit: millisecond). Counter type, only useful after applying a rate function"
diskio_writes: "The number of write requests. Counter type, only useful after applying a rate function"
kernel_boot_time: "Kernel startup time"
kernel_context_switches: "Number of kernel context switches"
kernel_entropy_avail: "Entropy pool inside the Linux system"
kernel_interrupts: "Number of kernel interrupts"
kernel_processes_forked: "Number of forked processes"
mem_active: "Total amount of actively used memory (including cache and buffer memory)"
mem_available: "Available memory size (bytes)"
mem_available_percent: "Percentage of memory available (0 ~ 100)"
mem_buffered: "Memory used for file buffers"
mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )"
mem_commit_limit: "Based on the overcommit ratio ('vm.overcommit_ratio'), this is the total amount of memory currently available to be allocated on the system. This limit is only enforced in mode 2 ('vm.overcommit_memory')"
mem_committed_as: "The amount of memory currently allocated on the system. It is the sum of the memory requested by all processes"
mem_dirty: "Size of memory waiting to be written back to disk"
mem_free: "Free memory size (bytes)"
mem_high_free: "Unused high memory size"
mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )"
mem_huge_page_size: "The size of each huge page"
mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated"
mem_huge_pages_total: "Total number of reserved Huge Pages"
mem_inactive: "Free memory (including the memory of free and available)"
mem_low_free: "Unused low memory size"
mem_low_total: "Total size of low memory. Low memory can serve the same purposes as high memory, and it can also be used by the kernel to store some of its own data structures"
mem_mapped: "Size of memory used for mapped devices and files"
mem_page_tables: "Size of the page tables used to manage memory pages"
mem_shared: "The total memory shared by multiple processes"
mem_slab: "Size of the kernel data structure cache, which reduces the overhead of allocating and freeing memory"
mem_sreclaimable: "Size of reclaimable Slab"
mem_sunreclaim: "Size of unreclaimable Slab(SUnreclaim+SReclaimable=Slab)"
mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile. Used to be quickly replaced when needed without opening the I/O port again"
mem_swap_free: "Size of unused swap space"
mem_swap_total: "Total size of swap space"
mem_total: "Total memory"
mem_used: "Used memory"
mem_used_percent: "Percentage of memory used (0 ~ 100)"
mem_vmalloc_chunk: "The largest continuous unused vmalloc area"
mem_vmalloc_totalL: "Total size of the vmalloc virtual memory area"
mem_vmalloc_used: "Size of vmalloc virtual memory in use"
mem_write_back: "Size of memory currently being written back to disk"
mem_write_back_tmp: "Memory used by FUSE for temporary writeback buffers"
net_bytes_recv: "Total inbound traffic(bytes) of network card"
net_bytes_sent: "Total outbound traffic(bytes) of network card"
net_bits_recv: "Total inbound traffic(bits) of network card"
net_bits_sent: "Total outbound traffic(bits) of network card"
net_drop_in: "Number of inbound packets dropped by the network card"
net_drop_out: "Number of outbound packets dropped by the network card"
net_err_in: "Number of inbound packet errors on the network card"
net_err_out: "Number of outbound packet errors on the network card"
net_packets_recv: "Number of packets received by the network card"
net_packets_sent: "Number of packets sent by the network card"
netstat_tcp_established: "Number of network connections in ESTABLISHED state"
netstat_tcp_fin_wait1: "Number of network connections in FIN_WAIT1 state"
netstat_tcp_fin_wait2: "Number of network connections in FIN_WAIT2 state"
netstat_tcp_last_ack: "Number of network connections in LAST_ACK state"
netstat_tcp_listen: "Number of network connections in LISTEN state"
netstat_tcp_syn_recv: "Number of network connections in SYN_RECV state"
netstat_tcp_syn_sent: "Number of network connections in SYN_SENT state"
netstat_tcp_time_wait: "Number of network connections in TIME_WAIT state"
netstat_udp_socket: "Number of network connections in UDP state"
processes_blocked: "Number of processes in uninterruptible sleep state('U','D','L')"
processes_dead: "Number of processes being reaped('X')"
processes_idle: "Number of suspended idle processes('I')"
processes_paging: "Number of paging processes('P')"
processes_running: "Number of running processes('R')"
processes_sleeping: "Number of processes in interruptible sleep('S')"
processes_stopped: "Number of stopped processes('T')"
processes_total: "Total number of processes"
processes_total_threads: "Total number of threads"
processes_unknown: "Number of processes in unknown state"
processes_zombies: "Number of zombie processes('Z')"
swap_used_percent: "Percentage of swap space used"
system_load1: "1 minute average load value"
system_load5: "5 minutes average load value"
system_load15: "15 minutes average load value"
system_load_norm_1: "1 minute average load value divided by the number of logical CPUs"
system_load_norm_5: "5 minutes average load value divided by the number of logical CPUs"
system_load_norm_15: "15 minutes average load value divided by the number of logical CPUs"
system_n_users: "Number of users"
system_n_cpus: "Number of CPU cores"
system_uptime: "System startup time"
nginx_accepts: "Total number of connections established with clients since Nginx started"
nginx_active: "Number of active connections Nginx is currently handling, equal to the sum of Reading/Writing/Waiting"
nginx_handled: "Total number of client connections handled since Nginx started"
nginx_reading: "Total number of connections currently reading the HTTP request header"
nginx_requests: "Since nginx is started, the total number of client requests processed, due to the existence of HTTP Keep-Alive requests, this value will be greater than the handled value"
nginx_upstream_check_fall: "Number of backend failures detected by the upstream_check module"
nginx_upstream_check_rise: "Number of backend checks performed by the upstream_check module"
nginx_upstream_check_status_code: "Status of the backend upstream, 1 for up and 0 for down"
nginx_waiting: "When keep-alive is enabled, this value is equal to active – (reading+writing), which means that Nginx has processed the resident connection that is waiting for the next request command"
nginx_writing: "Total number of connections currently sending a response to the client"
http_response_content_length: "HTTP message entity transmission length"
http_response_http_response_code: "http response status code"
http_response_response_time: "HTTP response time"
http_response_result_code: "URL detection result 0 is normal, otherwise the URL cannot be accessed"
# [mysqld_exporter]
mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge)
mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge)
mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter)
mysql_global_status_threads_connected: The number of currently open connections.(Gauge)
mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter)
mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge)
mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge)
mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter)
mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge)
mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. The cache miss rate can be calculated as Threads_created/Connections.(Counter)
mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter)
mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter)
mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter)
mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter)
mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter)
mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter)
mysql_global_status_sort_rows: The number of sorted rows.(Counter)
mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter)
mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter)
mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter)
mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter)
mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter)
mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter)
mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter)
mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter)
mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter)
mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter)
mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge)
mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge)
mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter)
mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. This is in fact the layer between the Storage Engine and MySQL.(Counter)
mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter)
mysql_global_status_open_tables: The number of tables that are open.(Gauge)
mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter)
mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter)
mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge)
mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter)
mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter)
mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter)
mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge)
mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge)
mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. The default value is 134217728 bytes (128MB).(Gauge)
mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge)
mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge)
mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge)
mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge)
mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge)
# [redis_exporter]
redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize.
redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation.
redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory.
redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes.
redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio).
redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting).
redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active.
redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS.
redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any.
redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation.
redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation.
redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds.
redis_aof_last_write_status: Status of the last write operation to the AOF.
redis_aof_rewrite_in_progress: Flag indicating an AOF rewrite operation is on-going.
redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete.
redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX).
redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections.
redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections.
redis_cluster_enabled: Indicate Redis cluster is enabled.
redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter)
redis_commands_processed_total: Total number of commands processed by the server.(Counter)
redis_commands_total: The number of calls that reached command execution (not rejected).(Counter)
redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections.
redis_config_maxmemory: The value of the maxmemory configuration directive.
redis_connected_clients: Number of client connections (excluding connections from replicas).
redis_connected_slaves: Number of connected replicas.
redis_connections_received_total: Total number of connections accepted by the server.(Counter)
redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter)
redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter)
redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_db_keys: Total number of keys by DB.
redis_db_keys_expiring: Total number of expiring keys by DB
redis_defrag_hits: Number of value reallocations performed by the active defragmentation process.
redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process.
redis_defrag_key_hits: Number of keys that were actively defragmented.
redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process.
redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter)
redis_expired_keys_total: Total number of key expiration events.(Counter)
redis_expired_stale_percentage: The percentage of keys probably expired.
redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early.
redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape.
redis_exporter_last_scrape_duration_seconds: The last scrape duration.
redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter
redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter
redis_exporter_scrapes_total: Current total redis scrapes.(Counter)
redis_instance_info: Information about the Redis instance.
redis_keyspace_hits_total: Hits total.(Counter)
redis_keyspace_misses_total: Misses total.(Counter)
redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds.
redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds.
redis_latest_fork_seconds: The amount of time needed for last fork, in seconds.
redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option).
redis_master_repl_offset: The server's current replication offset.
redis_mem_clients_normal: Memory used by normal clients.(Gauge)
redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage.
redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue.
redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only include fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc.
redis_mem_not_counted_for_eviction_bytes: Used memory that is not counted for key eviction (basically transient replica and AOF buffers).(Gauge)
redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc)
redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory)
redis_memory_used_lua_bytes: Number of bytes used by the Lua engine.
redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures.
redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes)
redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1)
redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts
redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes
redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes
redis_net_input_bytes_total: Total input bytes(Counter)
redis_net_output_bytes_total: Total output bytes(Counter)
redis_process_id: Process ID
redis_pubsub_channels: Global number of pub/sub channels with client subscriptions
redis_pubsub_patterns: Global number of pub/sub patterns with client subscriptions
redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going
redis_rdb_changes_since_last_save: Number of changes since the last dump
redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any
redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds
redis_rdb_last_bgsave_status: Status of the last RDB save operation
redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation
redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save
redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter)
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer
redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer
redis_repl_backlog_is_active: Flag indicating replication backlog is active
redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge)
redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge)
redis_replica_resyncs_full: The number of full resyncs with replicas
redis_replication_backlog_bytes: Memory used by replication backlog
redis_second_repl_offset: The offset up to which replication IDs are accepted.
redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge)
redis_slowlog_last_id: Last id of slowlog
redis_slowlog_length: Total slowlog
redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds.
redis_target_scrape_request_errors_total: Errors in requests to the exporter
redis_up: Flag indicating redis instance is up
redis_uptime_in_seconds: Number of seconds since Redis server start
# [windows_exporter]
windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter)
windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge)
windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter)
windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter)
windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter)
windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter)
windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge)
windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the processor. On some processors, Processor Performance may exceed 100%(gauge)
windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter)
windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge)
windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge)
windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge)
windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge)
windows_exporter_collector_duration_seconds: Duration of a collection.(gauge)
windows_exporter_collector_success: Whether the collector was successful.(gauge)
windows_exporter_collector_timeout: Whether the collector timed out.(gauge)
windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge)
windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge)
windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter)
windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter)
windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter)
windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter)
windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter)
windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter)
windows_logical_disk_requests_queued: The number of requests queued to the disk (LogicalDisk.CurrentDiskQueueLength)(gauge)
windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge)
windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter)
windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter)
windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter)
windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter)
windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter)
windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter)
windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter)
windows_net_bytes_total: (Network.BytesTotalPerSec)(counter)
windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge)
windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter)
windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter)
windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter)
windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter)
windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter)
windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter)
windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter)
windows_net_packets_total: (Network.PacketsPerSec)(counter)
windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge)
windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge)
windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge)
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge)
windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge)
windows_os_processes: OperatingSystem.NumberOfProcesses(gauge)
windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge)
windows_os_time: OperatingSystem.LocalDateTime(gauge)
windows_os_timezone: OperatingSystem.LocalDateTime(gauge)
windows_os_users: OperatingSystem.NumberOfUsers(gauge)
windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge)
windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge)
windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge)
windows_service_info: A metric with a constant '1' value labeled with service information(gauge)
windows_service_start_mode: The start mode of the service (StartMode)(gauge)
windows_service_state: The state of the service (State)(gauge)
windows_service_status: The status of the service (Status)(gauge)
windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter)
windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter)
windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge)
windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter)
windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge)
windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge)
# [node_exporter]
# SYSTEM
# CPU context switch 次数
node_context_switches_total: context_switches
# Interrupts 次数
node_intr_total: Interrupts
# 运行的进程数
node_procs_running: Processes in runnable state
# 熵池大小
node_entropy_available_bits: Entropy available to random number generators
node_time_seconds: System time in seconds since epoch (1970)
node_boot_time_seconds: Node boot time, in unixtime
# CPU
node_cpu_seconds_total: Seconds the CPUs spent in each mode
node_load1: cpu load 1m
node_load5: cpu load 5m
node_load15: cpu load 15m
# MEM
# 内核态
# 内核用于缓存数据结构供自己使用的内存
node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use
# slab中可回收的部分
node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches
# slab中不可回收的部分
node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure
# Vmalloc内存区的大小
node_memory_VmallocTotal_bytes: Total size of vmalloc memory area
# vmalloc已分配的内存,虚拟地址空间上的连续的内存
node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used
# vmalloc区可用的连续最大快的大小,通过此指标可以知道vmalloc可分配连续内存的最大值
node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free
# 内存的硬件故障删除掉的内存页的总大小
node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working
# 用于在虚拟和物理内存地址之间映射的内存
node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge)
# 内核栈内存,常驻内存,不可回收
node_memory_KernelStack_bytes: Kernel memory stack. This is not reclaimable
# 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能
node_memory_Bounce_bytes: Memory used for block device bounce buffers
#用户态
# 单个巨页大小
node_memory_Hugepagesize_bytes: Huge Page size
# 系统分配的常驻巨页数
node_memory_HugePages_Total: Total size of the pool of huge pages
# 系统空闲的巨页数
node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated
# 进程已申请但未使用的巨页数
node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation
# 超过系统设定的常驻HugePages数量的个数
node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages
# 透明巨页 Transparent HugePages (THP)
node_memory_AnonHugePages_bytes: Memory in anonymous huge pages
# inactivelist中的File-backed内存
node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list
# inactivelist中的Anonymous内存
node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)
# activelist中的File-backed内存
node_memory_Active_file_bytes: File-backed memory on active LRU list
# activelist中的Anonymous内存
node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs
# 禁止换出的页,对应 Unevictable 链表
node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons
# 共享内存
node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks)
# 匿名页内存大小
node_memory_AnonPages_bytes: Memory in user pages not backed by files
# 被关联的内存页大小
node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries
# file-backed内存页缓存大小
node_memory_Cached_bytes: Parked file data (file content) cache
# 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 被mlock()系统调用锁定的内存大小
node_memory_Mlocked_bytes: Size of pages locked to memory using the mlock() system call
# 块设备(block device)所占用的缓存页
node_memory_Buffers_bytes: Block device (e.g. harddisk) cache
node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes
node_memory_SwapFree_bytes: Memory information field SwapFree_bytes
# DISK
node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes
node_filesystem_free_bytes: Filesystem free space in bytes
node_filesystem_size_bytes: Filesystem size in bytes
node_filesystem_files_free: Filesystem total free file nodes
node_filesystem_files: Filesystem total file nodes
node_filefd_maximum: Max open files
node_filefd_allocated: Open files
node_filesystem_readonly: Filesystem read-only status
node_filesystem_device_error: Whether an error occurred while getting statistics for the given device
node_disk_reads_completed_total: The total number of reads completed successfully
node_disk_writes_completed_total: The total number of writes completed successfully
node_disk_reads_merged_total: The number of reads merged
node_disk_writes_merged_total: The number of writes merged
node_disk_read_bytes_total: The total number of bytes read successfully
node_disk_written_bytes_total: The total number of bytes written successfully
node_disk_io_time_seconds_total: Total seconds spent doing I/Os
node_disk_read_time_seconds_total: The total number of seconds spent by all reads
node_disk_write_time_seconds_total: The total number of seconds spent by all writes
node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os
# NET
node_network_receive_bytes_total: Network device statistic receive_bytes (counter)
node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter)
node_network_receive_packets_total: Network device statistic receive_packets
node_network_transmit_packets_total: Network device statistic transmit_packets
node_network_receive_errs_total: Network device statistic receive_errs
node_network_transmit_errs_total: Network device statistic transmit_errs
node_network_receive_drop_total: Network device statistic receive_drop
node_network_transmit_drop_total: Network device statistic transmit_drop
node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking
node_sockstat_TCP_alloc: Number of TCP sockets in state alloc
node_sockstat_TCP_inuse: Number of TCP sockets in state inuse
node_sockstat_TCP_orphan: Number of TCP sockets in state orphan
node_sockstat_TCP_tw: Number of TCP sockets in state tw
node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab
node_sockstat_sockets_used: Number of IPv4 sockets in use
# [kafka_exporter]
kafka_brokers: count of kafka_brokers (gauge)
kafka_topic_partitions: Number of partitions for this Topic (gauge)
kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge)
kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge)
kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge)
kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated
# [zookeeper_exporter]
zk_znode_count: The total count of znodes stored
zk_ephemerals_count: The number of Ephemerals nodes
zk_watch_count: The number of watchers setup over Zookeeper nodes.
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree
zk_outstanding_requests: Number of currently executing requests
zk_packets_sent: Count of the number of zookeeper packets sent from a server
zk_packets_received: Count of the number of zookeeper packets received by a server
zk_num_alive_connections: Number of active clients connected to a zookeeper server
zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open
zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open
zk_avg_latency: Average time in milliseconds for requests to be processed
zk_min_latency: Minimum time in milliseconds for a request to be processed
zk_max_latency: Maximum time in milliseconds for a request to be processed
================================================
FILE: docker/compose-bridge/etc-nightingale/script/notify.bak.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import urllib2
import smtplib
from email.mime.text import MIMEText
# Python 2 idiom: reload sys, then make UTF-8 the default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')
# Maps a notify channel name found in the event payload to the suffix of the
# Sender.send_<suffix> method that main() dispatches to.
notify_channel_funcs = {
"email":"email",
"sms":"sms",
"voice":"voice",
"dingtalk":"dingtalk",
"wecom":"wecom",
"feishu":"feishu"
}
# SMTP settings used by Sender.send_email. The user/password below are
# placeholders: send_email prints "invalid smtp configuration" and returns
# while both still hold these exact values.
mail_host = "smtp.163.com"
mail_port = 994
mail_user = "ulricqin"
mail_pass = "password"
mail_from = "ulricqin@163.com"
class Sender(object):
    """Notification senders, one ``send_<channel>`` classmethod per channel.

    Every sender receives the payload dict that main() decodes from stdin.
    The fields read here are ``payload['event']['notify_users_obj']`` (a list
    of user dicts with optional ``email``, ``phone`` and ``contacts`` entries)
    and ``payload['tpls']`` (rendered message templates keyed by file name).
    Missing or null user lists and contact maps are treated as empty.
    """

    @staticmethod
    def _notify_users(payload):
        """Return the event's user list, or [] when it is missing or null."""
        event = payload.get('event') or {}
        return event.get("notify_users_obj") or []

    @staticmethod
    def _phones(users):
        """Return the distinct non-empty phone numbers of ``users``."""
        phones = []
        for u in users:
            phone = u.get("phone")
            if phone and phone not in phones:
                phones.append(phone)
        return phones

    @staticmethod
    def _robot_tokens(users, contact_key):
        """Return the distinct non-empty ``contacts[contact_key]`` values."""
        tokens = []
        for u in users:
            # "contacts" may be absent or null for users without robot tokens.
            contacts = u.get("contacts") or {}
            token = contacts.get(contact_key, "")
            if token and token not in tokens:
                tokens.append(token)
        return tokens

    @staticmethod
    def _post_json(url, body):
        """POST ``body`` as JSON to ``url`` and print the response or the error."""
        opener = urllib2.build_opener(urllib2.HTTPHandler())
        request = urllib2.Request(url, data=json.dumps(body))
        request.add_header("Content-Type", 'application/json;charset=utf-8')
        request.get_method = lambda: "POST"
        try:
            connection = opener.open(request)
            print(connection.read())
        except urllib2.URLError as error:
            # URLError also covers HTTPError, so connection failures are
            # reported instead of aborting the remaining notifications.
            print(error)

    @classmethod
    def send_email(cls, payload):
        """Mail the ``email.tpl`` body to every user that has an email address.

        Does nothing but print a warning while the module-level SMTP
        credentials are still the shipped placeholders.
        """
        if mail_user == "ulricqin" and mail_pass == "password":
            print("invalid smtp configuration")
            return

        recipients = []
        for u in cls._notify_users(payload):
            email = u.get("email")
            if email and email not in recipients:
                recipients.append(email)
        if not recipients:
            return

        mail_body = payload.get('tpls').get("email.tpl", "email.tpl not found")
        message = MIMEText(mail_body, 'html', 'utf-8')
        message['From'] = mail_from
        message['To'] = ", ".join(recipients)
        message["Subject"] = payload.get('tpls').get("subject.tpl", "subject.tpl not found")

        try:
            smtp = smtplib.SMTP_SSL(mail_host, mail_port)
            try:
                smtp.login(mail_user, mail_pass)
                smtp.sendmail(mail_from, recipients, message.as_string())
            finally:
                # Release the connection even when login or sendmail fails.
                smtp.close()
        except smtplib.SMTPException as error:
            print(error)

    @classmethod
    def send_wecom(cls, payload):
        """Post the ``wecom.tpl`` markdown to every user's WeCom robot."""
        users = cls._notify_users(payload)
        for t in cls._robot_tokens(users, "wecom_robot_token"):
            url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found")
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_dingtalk(cls, payload):
        """Post the ``dingtalk.tpl`` markdown to every user's DingTalk robot.

        The title is "<Triggered|Recovered> - <rule_name>" and every collected
        phone number is @-mentioned.
        """
        event = payload.get('event') or {}
        users = cls._notify_users(payload)
        rule_name = event.get("rule_name")
        event_state = "Recovered" if event.get("is_recovered") else "Triggered"

        phones = cls._phones(users)
        for t in cls._robot_tokens(users, "dingtalk_robot_token"):
            url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "title": "{} - {}".format(event_state, rule_name),
                    "text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@"+i for i in phones])
                },
                "at": {
                    "atMobiles": phones,
                    "isAtAll": False
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_feishu(cls, payload):
        """Post the ``feishu.tpl`` text to every user's Feishu robot."""
        users = cls._notify_users(payload)
        phones = cls._phones(users)
        for t in cls._robot_tokens(users, "feishu_robot_token"):
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": payload.get('tpls').get("feishu.tpl", "feishu.tpl not found")
                },
                "at": {
                    "atMobiles": phones,
                    "isAtAll": False
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: print the phone numbers an SMS would be sent to."""
        phones = cls._phones(cls._notify_users(payload))
        if phones:
            print("send_sms not implemented, phones: {}".format(phones))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: print the phone numbers a voice call would be made to."""
        phones = cls._phones(cls._notify_users(payload))
        if phones:
            print("send_voice not implemented, phones: {}".format(phones))
def main():
    """Read a JSON payload from stdin and dispatch it to each notify channel.

    A pretty-printed copy of the payload is written to ``.payload`` in the
    current directory. Each entry of ``payload['event']['notify_channels']``
    is mapped through ``notify_channel_funcs`` to a ``Sender.send_<name>``
    method; channels without a matching method are reported and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A null channel list in the payload means "no channels", not an error.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip()))
        if not hasattr(Sender, send_func_name):
            # The name must go through format(); passing it as a second
            # print() argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting; used as a smoke test of the script."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # With no arguments the payload is processed from stdin; a first argument
    # of "hello" runs the smoke test; anything else is rejected.
    if len(sys.argv) != 1:
        if sys.argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: docker/compose-bridge/etc-nightingale/script/notify.py
================================================
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import sys
import json
class Sender(object):
    """Notification senders, one ``send_<channel>`` classmethod per channel.

    main() looks these methods up by name, so a channel called ``foo`` is
    handled by ``send_foo``. Channels already delivered by the Go code are
    kept as no-ops so the lookup still succeeds for them.
    """

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_feishu(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @staticmethod
    def _phones(payload):
        """Return the distinct non-empty phone numbers of the notified users.

        ``payload['event']['notify_users_obj']`` may be missing or null; both
        are treated as an empty user list.
        """
        users = (payload.get('event') or {}).get("notify_users_obj") or []
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        # A plain list formats as ['...'] rather than dict_keys(['...']).
        return list(phones)

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: print the phone numbers an SMS would be sent to."""
        phones = cls._phones(payload)
        if phones:
            print("send_sms not implemented, phones: {}".format(phones))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: print the phone numbers a voice call would be made to."""
        phones = cls._phones(payload)
        if phones:
            print("send_voice not implemented, phones: {}".format(phones))
def main():
    """Read a JSON payload from stdin and dispatch it to each notify channel.

    A pretty-printed copy of the payload is written to ``.payload`` in the
    current directory. For every entry of
    ``payload['event']['notify_channels']`` the matching
    ``Sender.send_<channel>`` method is called with the payload; channels
    without a matching method are reported and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A null channel list in the payload means "no channels", not an error.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must go through format(); passing it as a second
            # print() argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting; used as a smoke test of the script."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # With no arguments the payload is processed from stdin; a first argument
    # of "hello" runs the smoke test; anything else is rejected.
    if len(sys.argv) != 1:
        if sys.argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: docker/compose-bridge/etc-nightingale/script/notify_feishu.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import requests
class Sender(object):
    """Notification senders, one ``send_<channel>`` classmethod per channel.

    main() looks these methods up by name. Only ``send_ifeishu`` does real
    work here; the other channels are no-ops so the lookup still succeeds.
    """

    # Seconds to wait for the Feishu webhook; without a timeout a stalled
    # connection would block the script indefinitely.
    REQUEST_TIMEOUT = 10

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_ifeishu(cls, payload):
        """Post the ``feishu`` template text to every user's Feishu robot.

        Robot tokens come from each user's ``contacts['feishu_robot_token']``
        and phone numbers from each user's ``phone``; both are de-duplicated.
        A missing or null user list or contact map is treated as empty. The
        outcome of every request (status or error) is printed, and a failure
        for one token does not stop delivery to the remaining ones.
        """
        users = (payload.get('event') or {}).get("notify_users_obj") or []
        tokens = {}
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
            contacts = u.get("contacts") or {}
            if contacts.get("feishu_robot_token", ""):
                tokens[contacts.get("feishu_robot_token", "")] = 1
        if not tokens:
            return

        headers = {
            "Content-Type": "application/json;charset=utf-8",
            "Host": "open.feishu.cn"
        }
        text = (payload.get('tpls') or {}).get("feishu", "feishu not found")
        for t in tokens:
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": text
                },
                "at": {
                    "atMobiles": list(phones.keys()),
                    "isAtAll": False
                }
            }
            try:
                response = requests.post(url, headers=headers, data=json.dumps(body),
                                         timeout=cls.REQUEST_TIMEOUT)
            except requests.RequestException as error:
                print(f"notify_ifeishu: token={t} error={error}")
                continue
            print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}")

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_sms(cls, payload):
        pass

    @classmethod
    def send_voice(cls, payload):
        pass
def main():
    """Read a JSON payload from stdin and dispatch it to each notify channel.

    A pretty-printed copy of the payload is written to ``.payload`` in the
    current directory. For every entry of
    ``payload['event']['notify_channels']`` the matching
    ``Sender.send_<channel>`` method is called with the payload; channels
    without a matching method are reported and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A null channel list in the payload means "no channels", not an error.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must go through format(); passing it as a second
            # print() argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting; used as a smoke test of the script."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # With no arguments the payload is processed from stdin; a first argument
    # of "hello" runs the smoke test; anything else is rejected.
    if len(sys.argv) != 1:
        if sys.argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: docker/compose-bridge/etc-nightingale/script/rule_converter.py
================================================
import json
import yaml
'''
将prometheus/vmalert的rule转换为n9e中的rule
支持k8s的rule configmap
'''
rule_file = 'rules.yaml'
def convert_interval(interval):
    """Convert a Prometheus-style duration into a whole number of seconds.

    Accepted forms:
      * a bare integer, as a string or a number (``"30"``, ``60``), taken as
        seconds;
      * one or more ``<digits><unit>`` groups (``"15s"``, ``"10m"``,
        ``"1h30m"``), where unit is one of ms, s, m, h, d, w, y in either
        case. A year counts as 365 days, and sub-second remainders are
        truncated.

    Args:
        interval: the duration as found in a rule file; YAML may deliver it
            as an int when it is written without a unit.

    Returns:
        int: the duration in seconds.

    Raises:
        ValueError: if the value is neither an integer nor a valid duration.
    """
    import re  # local import so the block does not depend on file-level imports

    unit_ms = {
        'ms': 1,
        's': 1000,
        'm': 60 * 1000,
        'h': 60 * 60 * 1000,
        'd': 24 * 60 * 60 * 1000,
        'w': 7 * 24 * 60 * 60 * 1000,
        'y': 365 * 24 * 60 * 60 * 1000,
    }

    text = str(interval).strip()
    # "ms" is listed before the single-letter units so "5ms" is not read as "5m".
    matched = re.match(r'^([+-]?)((?:\d+(?:ms|[smhdwy]))+)$', text, re.IGNORECASE)
    if matched is None:
        # No unit suffix: keep the plain-seconds behaviour (int() raises
        # ValueError for anything that is not an integer).
        return int(text)

    total_ms = 0
    for amount, unit in re.findall(r'(\d+)(ms|[smhdwy])', matched.group(2), re.IGNORECASE):
        total_ms += int(amount) * unit_ms[unit.lower()]

    seconds = total_ms // 1000
    return -seconds if matched.group(1) == '-' else seconds
def convert_alert(rule, interval):
    """Build an n9e alert-rule dict from one Prometheus/vmalert alerting rule.

    Args:
        rule: the rule mapping; ``alert`` and ``expr`` are required, ``for``,
            ``annotations`` and ``labels`` are optional.
        interval: evaluation interval of the enclosing group, as a duration
            string understood by convert_interval.

    Returns:
        dict: the n9e alert rule. ``note`` is the first annotation value (or
        ''), every label except ``severity`` becomes a ``key=value`` entry of
        ``append_tags``, and ``severity`` is 1 for "critical", 3 for "info"
        and 2 otherwise.
    """
    name = rule['alert']
    prom_ql = rule['expr']
    prom_for_duration = convert_interval(rule['for']) if 'for' in rule else 0
    prom_eval_interval = convert_interval(interval)

    note = ''
    annotations = {}
    if 'annotations' in rule:
        # The first annotation (in file order) doubles as the rule note.
        note = next(iter(rule['annotations'].values()), '')
        annotations = {key: value for key, value in rule['annotations'].items()}

    append_tags = []
    severity = 2  # default level; also what a "warning" label maps to
    if 'labels' in rule:
        for label, value in rule['labels'].items():
            if label == 'severity':
                if value == 'critical':
                    severity = 1
                elif value == 'info':
                    severity = 3
            else:
                append_tags.append('{}={}'.format(label, value))

    every_day = ["1", "2", "3", "4", "5", "6", "0"]
    return {
        "name": name,
        "note": note,
        "severity": severity,
        "disabled": 0,
        "prom_for_duration": prom_for_duration,
        "prom_ql": prom_ql,
        "prom_eval_interval": prom_eval_interval,
        "enable_stime": "00:00",
        "enable_etime": "23:59",
        "enable_days_of_week": every_day,
        "enable_in_bg": 0,
        "notify_recovered": 1,
        "notify_channels": [],
        "notify_repeat_step": 60,
        "recover_duration": 0,
        "callbacks": [],
        "runbook_url": "",
        "append_tags": append_tags,
        "annotations": annotations,
    }
def convert_record(rule, interval):
    """Build an n9e recording-rule dict from one Prometheus recording rule.

    Args:
        rule: the rule mapping; ``record`` and ``expr`` are required,
            ``labels`` is optional.
        interval: evaluation interval of the enclosing group, as a duration
            string understood by convert_interval.

    Returns:
        dict: the n9e recording rule, with every label rendered as a
        ``key=value`` entry of ``append_tags``.
    """
    labels = rule['labels'] if 'labels' in rule else {}
    return {
        "name": rule['record'],
        "note": '',
        "disabled": 0,
        "prom_ql": rule['expr'],
        "prom_eval_interval": convert_interval(interval),
        "append_tags": ['{}={}'.format(key, value) for key, value in labels.items()],
    }
'''
example of rule group file
---
groups:
- name: example
rules:
- alert: HighRequestLatency
expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
for: 10m
labels:
severity: page
annotations:
summary: High request latency
'''
def deal_group(group):
    """
    Convert a parsed Prometheus/vmalert rule file into n9e rules.

    Every entry of ``group['groups']`` contributes its ``rules``; a rule with
    an ``alert`` key becomes an alert rule, any other rule becomes a recording
    rule. A group without an ``interval`` is evaluated every '15s'.

    Returns a tuple ``(alert_rules, record_rules)`` of lists of dicts.
    """
    alert_rules, record_rules = [], []
    for segment in group['groups']:
        interval = segment.get('interval', '15s')
        for rule in segment['rules']:
            if 'alert' not in rule:
                record_rules.append(convert_record(rule, interval))
                continue
            alert_rules.append(convert_alert(rule, interval))
    return alert_rules, record_rules
'''
example of k8s rule configmap
---
apiVersion: v1
kind: ConfigMap
metadata:
name: rulefiles-0
data:
etcdrules.yaml: |
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"})
by (job) + 1) / 2)
for: 3m
labels:
severity: critical
'''
def deal_configmap(rule_configmap):
    """
    Convert a parsed Kubernetes rule ConfigMap into n9e rules.

    Each value under ``rule_configmap['data']`` is the YAML text of one rule
    file; it is parsed and handed to deal_group, and the results of all files
    are concatenated.

    Returns a tuple ``(all_alert_rules, all_record_rules)`` of lists of dicts.
    """
    all_alert_rules, all_record_rules = [], []
    for rule_group_str in rule_configmap['data'].values():
        # NOTE(review): yaml.load with FullLoader is applied to the ConfigMap
        # content; only run this on rule files you trust.
        parsed_group = yaml.load(rule_group_str, Loader=yaml.FullLoader)
        alerts, records = deal_group(parsed_group)
        all_alert_rules += alerts
        all_record_rules += records
    return all_alert_rules, all_record_rules
def main():
    """
    Load the rule file named by rule_file, convert its rules, and write the
    results to alert-rules.json and record-rules.json in the working directory.
    """
    with open(rule_file, 'r') as src:
        rule_config = yaml.load(src, Loader=yaml.FullLoader)

    # If the file is a k8s ConfigMap, use the call below instead:
    # alert_rules, record_rules = deal_configmap(rule_config)
    alert_rules, record_rules = deal_group(rule_config)

    # Alert rules are written first, then record rules.
    outputs = (
        ("alert-rules.json", alert_rules),
        ("record-rules.json", record_rules),
    )
    for out_path, rules in outputs:
        with open(out_path, 'w') as dst:
            json.dump(rules, dst, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()
================================================
FILE: docker/compose-host-network/docker-compose.yaml
================================================
version: "3.7"
services:
mysql:
image: "mysql:8"
container_name: mysql
hostname: mysql
restart: always
environment:
TZ: Asia/Shanghai
MYSQL_ROOT_PASSWORD: 1234
volumes:
- ./mysqldata:/var/lib/mysql/
- ../initsql:/docker-entrypoint-initdb.d/
- ./etc-mysql/my.cnf:/etc/my.cnf
network_mode: host
redis:
image: "redis:6.2"
container_name: redis
hostname: redis
restart: always
environment:
TZ: Asia/Shanghai
network_mode: host
prometheus:
image: prom/prometheus:v2.55.1
container_name: prometheus
hostname: prometheus
restart: always
environment:
TZ: Asia/Shanghai
volumes:
- ./etc-prometheus:/etc/prometheus
network_mode: host
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
- "--enable-feature=remote-write-receiver"
- "--query.lookback-delta=2m"
n9e:
image: flashcatcloud/nightingale:latest
container_name: n9e
hostname: n9e
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: 127.0.0.1:3306, 127.0.0.1:6379
volumes:
- ./etc-nightingale:/app/etc
network_mode: host
depends_on:
- mysql
- redis
- prometheus
command:
- /app/n9e
categraf:
image: "flashcatcloud/categraf:latest"
container_name: "categraf"
hostname: "categraf01"
restart: always
environment:
TZ: Asia/Shanghai
HOST_PROC: /hostfs/proc
HOST_SYS: /hostfs/sys
HOST_MOUNT_PREFIX: /hostfs
WAIT_HOSTS: 127.0.0.1:17000, 127.0.0.1:20090
volumes:
- ./etc-categraf:/etc/categraf/conf
- /:/hostfs
network_mode: host
depends_on:
- n9e
================================================
FILE: docker/compose-host-network/etc-categraf/config.toml
================================================
[global]
# whether print configs
print_configs = false
# add label(agent_hostname) to series
# "" -> auto detect hostname
# "xx" -> use specified string xx
# "$hostname" -> auto detect hostname
# "$ip" -> auto detect ip
# "$hostname-$ip" -> auto detect hostname and ip to replace the vars
hostname = "$HOSTNAME"
# will not add label(agent_hostname) if true
omit_hostname = false
# s | ms
precision = "ms"
# global collect interval
interval = 15
[global.labels]
source="categraf"
# region = "shanghai"
# env = "localhost"
[writer_opt]
# default: 2000
batch = 2000
# channel(as queue) size
chan_size = 10000
[[writers]]
url = "http://127.0.0.1:17000/prometheus/v1/write"
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"
[heartbeat]
enable = true
# report os version cpu.util mem.util metadata
url = "http://127.0.0.1:17000/v1/n9e/heartbeat"
# interval, unit: s
interval = 10
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[ibex]
enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["127.0.0.1:20090"]
## temp script dir
meta_dir = "./meta"
================================================
FILE: docker/compose-host-network/etc-categraf/input.cpu/cpu.toml
================================================
# # collect interval
# interval = 15
# # whether collect per cpu
# collect_per_cpu = false
================================================
FILE: docker/compose-host-network/etc-categraf/input.disk/disk.toml
================================================
# # collect interval
# interval = 15
# # By default stats will be gathered for all mount points.
# # Set mount_points will restrict the stats to only the specified mount points.
# mount_points = ["/"]
# Ignore mount points by filesystem type.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
ignore_mount_points = ["/boot"]
================================================
FILE: docker/compose-host-network/etc-categraf/input.diskio/diskio.toml
================================================
# # collect interval
# interval = 15
# # By default, categraf will gather stats for all devices including disk partitions.
# # Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
================================================
FILE: docker/compose-host-network/etc-categraf/input.kernel/kernel.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-host-network/etc-categraf/input.mem/mem.toml
================================================
# # collect interval
# interval = 15
# # whether collect platform specified metrics
collect_platform_fields = true
================================================
FILE: docker/compose-host-network/etc-categraf/input.net/net.toml
================================================
# # collect interval
# interval = 15
# # whether collect protocol stats on Linux
# collect_protocol_stats = false
# # setting interfaces will tell categraf to gather these explicit interfaces
# interfaces = ["eth0"]
================================================
FILE: docker/compose-host-network/etc-categraf/input.netstat/netstat.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-host-network/etc-categraf/input.processes/processes.toml
================================================
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
================================================
FILE: docker/compose-host-network/etc-categraf/input.system/system.toml
================================================
# # collect interval
# interval = 15
# # whether collect metric: system_n_users
# collect_user_number = false
================================================
FILE: docker/compose-host-network/etc-mysql/my.cnf
================================================
[mysqld]
pid-file = /var/run/mysqld/mysqld.pid
socket = /var/run/mysqld/mysqld.sock
datadir = /var/lib/mysql
bind-address = 127.0.0.1
================================================
FILE: docker/compose-host-network/etc-nightingale/config.toml
================================================
[Global]
RunMode = "release"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[HTTP.ShowCaptcha]
Enable = false
[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]
[HTTP.RSA]
# open RSA
OpenRSA = false
[DB]
# postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
DSN="root:1234@tcp(127.0.0.1:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "127.0.0.1:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""
[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"
# [Alert.Alerting]
# NotifyConcurrency = 10
[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"
[Center.AnonymousAccess]
PromQuerier = true
AlertDetail = true
[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true
# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"
# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000
[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://127.0.0.1:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"
================================================
FILE: docker/compose-host-network/etc-nightingale/metrics.yaml
================================================
zh:
ip_conntrack_count: 连接跟踪表条目总数(单位:int, count)
ip_conntrack_max: 连接跟踪表最大容量(单位:int, size)
cpu_usage_idle: CPU空闲率(单位:%)
cpu_usage_active: CPU使用率(单位:%)
cpu_usage_system: CPU内核态时间占比(单位:%)
cpu_usage_user: CPU用户态时间占比(单位:%)
cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%)
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%)
cpu_usage_irq: CPU处理硬中断的时间占比(单位:%)
cpu_usage_softirq: CPU处理软中断的时间占比(单位:%)
cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%)
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%)
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%)
disk_free: 硬盘分区剩余量(单位:byte)
disk_used: 硬盘分区使用量(单位:byte)
disk_used_percent: 硬盘分区使用率(单位:%)
disk_total: 硬盘分区总量(单位:byte)
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型
diskio_merged_reads: 相邻读请求merge读的次数,counter类型
diskio_merged_writes: 相邻写请求merge写的次数,counter类型
diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒)
diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 可用内存大小(bytes)
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache )
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存大小(bytes)
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存)
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和available的内存)
mem_low_free: 未被使用的低位大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab)
mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_total: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数
net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
netstat_sockets_used: 已使用的所有协议套接字总量
netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量
netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数)
netstat_tcp_tw: TIME_WAIT状态的TCP连接数
netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量
netstat_tcp_mem: TCP套接字内存Page使用量
netstat_udp_inuse: 在使用的UDP套接字数量
netstat_udp_mem: UDP套接字内存Page使用量
netstat_udplite_inuse: 正在使用的 udp lite 数量
netstat_raw_inuse: 正在使用的 raw socket 数量
netstat_frag_inuse: ip fragment 数量
netstat_frag_memory: ip fragment 已经分配的内存(byte)
#[ping]
ping_percent_packet_loss: ping数据包丢失百分比(%)
ping_result_code: ping返回码('0','1')
net_response_result_code: 网络探测结果,0表示正常,非0表示异常
net_response_response_time: 网络探测时延,单位:秒
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
swap_used_percent: Swap空间换出数据量
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_load_norm_1: 1分钟平均load值/逻辑CPU个数
system_load_norm_5: 5分钟平均load值/逻辑CPU个数
system_load_norm_15: 15分钟平均load值/逻辑CPU个数
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统启动时间
nginx_accepts: 自nginx启动起,与客户端建立过的连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果0为正常否则url无法访问
# [aws cloudwatch rds]
cloudwatch_aws_rds_bin_log_disk_usage_average: rds 磁盘使用平均值
cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds 磁盘使用量最大值
cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低
cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数
cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和
cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值
cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值
cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低
cloudwatch_aws_rds_burst_balance_sample_count: rds 突发平衡样本计数
cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和
cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值
cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值
cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低
cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数
cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和
cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值
cloudwatch_aws_rds_database_connections_maximum: rds 数据库连接数最大值
cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小
cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数
cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和
cloudwatch_aws_rds_db_load_average: rds db 平均负载
cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值
cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值
cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值
cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 加载 CPU 样本数
cloudwatch_aws_rds_db_load_cpu_sum: rds db 加载cpu总和
cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值
cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值
cloudwatch_aws_rds_db_load_non_cpu_average: rds 加载非 CPU 平均值
cloudwatch_aws_rds_db_load_non_cpu_maximum: rds 加载非 cpu 最大值
cloudwatch_aws_rds_db_load_non_cpu_minimum: rds 加载非 cpu 最小值
cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds 加载非 cpu 样本计数
cloudwatch_aws_rds_db_load_non_cpu_sum: rds 加载非cpu总和
cloudwatch_aws_rds_db_load_sample_count: rds db 加载样本计数
cloudwatch_aws_rds_db_load_sum: rds db 负载总和
cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值
cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值
cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值
cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数
cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和
cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值
cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值
cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低
cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数
cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和
cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值
cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值
cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低
cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 平衡样本计数
cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和
cloudwatch_aws_rds_free_storage_space_average: rds 免费存储空间平均
cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间
cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间
cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数
cloudwatch_aws_rds_free_storage_space_sum: rds 免费存储空间总和
cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值
cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存
cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存
cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数
cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和
cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值
cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值
cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低
cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数
cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和
cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值
cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值
cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低
cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数
cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和
cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均
cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值
cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值
cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数
cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和
cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值
cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大
cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值
cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数
cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和
cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值
cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops
cloudwatch_aws_rds_read_iops_minimum: rds 读取 iops 最低
cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数
cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和
cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值
cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值
cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟
cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数
cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和
cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值
cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量
cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量
cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数
cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和
cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值
cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值
cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低
cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用示例计数
cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和
cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值
cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值
cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低
cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数
cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和
cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值
cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟
cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值
cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数
cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和
cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值
cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量
cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值
cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数
cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和
en:
ip_conntrack_count: the number of entries in the conntrack table(unit:int, count)
ip_conntrack_max: the max capacity of the conntrack table(unit:int, size)
cpu_usage_idle: "CPU idle rate(unit:%)"
cpu_usage_active: "CPU usage rate(unit:%)"
cpu_usage_system: "CPU kernel state time proportion(unit:%)"
cpu_usage_user: "CPU user-mode time proportion(unit:%)"
cpu_usage_nice: "The proportion of low-priority user-mode CPU time, i.e. CPU time spent on processes whose nice value has been adjusted to between 1 and 19. Note that nice ranges from -20 to 19, and the larger the value, the lower the priority(unit:%)"
cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)"
cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)"
cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)"
cpu_usage_steal: "Only present in virtual machine environments: the proportion of time the CPU is contended for by other virtual machines; a value above 20 indicates severe contention(unit:%)"
cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)"
cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)"
disk_free: "The remaining amount of the hard disk partition (unit: byte)"
disk_used: "Used space of the hard disk partition (unit: byte)"
disk_used_percent: "Usage rate of the hard disk partition (unit:%)"
disk_total: "Total amount of hard disk partition (unit: byte)"
disk_inodes_free: "Hard disk partition INODE remaining amount"
disk_inodes_used: "Hard disk partition INODE usage amount"
disk_inodes_total: "The total amount of hard disk partition INODE"
diskio_io_time: "Total time of I/O requests from the device's perspective; time is counted whenever there are I/O requests in the queue (unit: millisecond), counter type, apply a rate function for it to be useful"
diskio_iops_in_progress: "IO requests that have been dispatched to the device driver but not yet completed; requests still in the queue and not yet dispatched to the device driver are not included, gauge type"
diskio_merged_reads: "The number of times of adjacent reading request Merge, the counter type"
diskio_merged_writes: "The number of times the request Merge writes, the counter type"
diskio_read_bytes: "The number of byte reads, the counter type, you need to use the function to find the Rate to use the value"
diskio_read_time: "The total time of reading request (unit: millisecond), the counter type, you need to use the function to find the Rate to have the value of use"
diskio_reads: "Read the number of requests, the counter type, you need to use the function to find the Rate to use the value"
diskio_weighted_io_time: "From the perspective of the I/O request perspective, I/O wait for the total time. If there are multiple I/O requests at the same time, the time will be superimposed (unit: millisecond)"
diskio_write_bytes: "The number of bytes written, the counter type, you need to use the function to find the Rate to use the value"
diskio_write_time: "The total time of the request (unit: millisecond), the counter type, you need to use the function to find the rate to have the value of use"
diskio_writes: "Write the number of requests, the counter type, you need to use the function to find the rate to use value"
kernel_boot_time: "Kernel startup time"
kernel_context_switches: "Number of kernel context switching times"
kernel_entropy_avail: "Entropy pool inside the Linux system"
kernel_interrupts: "Number of kernel interruption"
kernel_processes_forked: "Number of forked processes"
mem_active: "The total amount of actively used memory (including cache and buffer memory)"
mem_available: "Available memory size (bytes)"
mem_available_percent: "Memory remaining percentage (0 ~ 100)"
mem_buffered: "Used to make buffer size for the file"
mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )"
mem_commit_limit: "Based on the overcommit ratio ('vm.overcommit_ratio'), this is the total amount of memory currently available for allocation on the system. This limit is only enforced in mode 2 ('vm.overcommit_memory')"
mem_committed_as: "Currently allocated on the system. It is the sum of the memory of all process applications"
mem_dirty: "Waiting to be written back to the memory size of the disk"
mem_free: "Free memory size (bytes)"
mem_high_free: "Unused high memory size"
mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )"
mem_huge_page_size: "The size of each big page"
mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated"
mem_huge_pages_total: "Reserve the total number of Huge Pages"
mem_inactive: "Free memory (including the memory of free and available)"
mem_low_free: "Unused low memory size"
mem_low_total: "The total size of the low memory memory can achieve the same role of high memory, and it can be used by the kernel to record some of its own data structure"
mem_mapped: "The size of the mapping of equipment and files"
mem_page_tables: "The size of the index table of the management of the memory paging page"
mem_shared: "The total memory shared by multiple processes"
mem_slab: "The size of the kernel data structure cache can reduce the consumption of application and release memory"
mem_sreclaimable: "The size of reclaimable Slab"
mem_sunreclaim: "The size of unreclaimable Slab(SUnreclaim+SReclaimable=Slab)"
mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile. Used to be quickly replaced when needed without opening the I/O port again"
mem_swap_free: "The size of unused swap space"
mem_swap_total: "The total size of swap space"
mem_total: "Total memory"
mem_used: "Used memory"
mem_used_percent: "Percentage of memory used (0 ~ 100)"
mem_vmalloc_chunk: "The largest continuous unused vmalloc area"
mem_vmalloc_total: "You can vmalloc virtual memory size"
mem_vmalloc_used: "Vmalloc's virtual memory size"
mem_write_back: "The size of memory being written back to the disk"
mem_write_back_tmp: "Fuse is used to temporarily write back the memory of the buffer area"
net_bytes_recv: "Total inbound traffic(bytes) of network card"
net_bytes_sent: "Total outbound traffic(bytes) of network card"
net_bits_recv: "Total inbound traffic(bits) of network card"
net_bits_sent: "Total outbound traffic(bits) of network card"
net_drop_in: "Number of inbound packets dropped by the network card"
net_drop_out: "Number of outbound packets dropped by the network card"
net_err_in: "Number of inbound packet errors on the network card"
net_err_out: "Number of outbound packet errors on the network card"
net_packets_recv: "Number of packets received by the network card"
net_packets_sent: "Number of packets sent by the network card"
netstat_tcp_established: "ESTABLISHED status network link number"
netstat_tcp_fin_wait1: "Number of network connections in FIN_WAIT1 state"
netstat_tcp_fin_wait2: "Number of network connections in FIN_WAIT2 state"
netstat_tcp_last_ack: "Number of network connections in LAST_ACK state"
netstat_tcp_listen: "Number of network connections in LISTEN state"
netstat_tcp_syn_recv: "Number of network connections in SYN_RECV state"
netstat_tcp_syn_sent: "Number of network connections in SYN_SENT state"
netstat_tcp_time_wait: "Number of network connections in TIME_WAIT state"
netstat_udp_socket: "Number of network links in UDP status"
processes_blocked: "Number of processes in uninterruptible sleep state('U','D','L')"
processes_dead: "Number of processes in recycling('X')"
processes_idle: "Number of idle processes hanging('I')"
processes_paging: "Number of paging processes('P')"
processes_running: "Number of processes during operation('R')"
processes_sleeping: "Number of processes in interruptible sleep('S')"
processes_stopped: "Number of stopped processes('T')"
processes_total: "Total process number"
processes_total_threads: "Number of threads"
processes_unknown: "Unknown status process number"
processes_zombies: "Number of zombies('Z')"
swap_used_percent: "SWAP space replace the data volume"
system_load1: "1 minute average load value"
system_load5: "5 minutes average load value"
system_load15: "15 minutes average load value"
system_load_norm_1: "1 minute average load value/logical CPU number"
system_load_norm_5: "5 minutes average load value/logical CPU number"
system_load_norm_15: "15 minutes average load value/logical CPU number"
system_n_users: "Number of logged-in users"
system_n_cpus: "Number of CPU cores"
system_uptime: "System uptime, in seconds"
nginx_accepts: "Total number of client connections accepted since Nginx started"
nginx_active: "Number of active connections Nginx is currently handling, equal to Reading + Writing + Waiting"
nginx_handled: "Total number of client connections handled since Nginx started"
nginx_reading: "Number of connections on which Nginx is currently reading the HTTP request header"
nginx_requests: "Total number of client requests processed since Nginx started; because of HTTP keep-alive requests, this value will be greater than the handled value"
nginx_upstream_check_fall: "Number of failed backend checks detected by the upstream_check module"
nginx_upstream_check_rise: "Number of successful backend checks detected by the upstream_check module"
nginx_upstream_check_status_code: "State of the upstream backend: 1 for up, 0 for down"
nginx_waiting: "When keep-alive is enabled, this value equals active - (reading + writing): the number of idle keep-alive connections that Nginx has finished processing and that are waiting for the next request"
nginx_writing: "Number of connections on which Nginx is currently sending a response to the client"
http_response_content_length: "Length of the HTTP response body"
http_response_http_response_code: "HTTP response status code"
http_response_response_time: "HTTP response time"
http_response_result_code: "URL probe result: 0 means normal, any other value means the URL cannot be accessed"
# [mysqld_exporter]
mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge)
mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge)
mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter)
mysql_global_status_threads_connected: The number of currently open connections.(Gauge)
mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter)
mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge)
mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge)
mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter)
mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge)
mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. The cache miss rate can be calculated as Threads_created/Connections.(Counter)
mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter)
mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter)
mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter)
mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter)
mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter)
mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter)
mysql_global_status_sort_rows: The number of sorted rows.(Counter)
mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter)
mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter)
mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter)
mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter)
mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter)
mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter)
mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter)
mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter)
mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter)
mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter)
mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge)
mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge)
mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter)
mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. This is in fact the layer between the Storage Engine and MySQL.(Counter)
mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter)
mysql_global_status_open_tables: The number of tables that are open.(Gauge)
mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter)
mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter)
mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge)
mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter)
mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter)
mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter)
mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge)
mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge)
mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. The default value is 134217728 bytes (128MB).(Gauge)
mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge)
mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge)
mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge)
mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge)
mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge)
# [redis_exporter]
redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize.
redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation.
redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory.
redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes.
redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio).
redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting).
redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active.
redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS.
redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any.
redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation.
redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation.
redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds.
redis_aof_last_write_status: Status of the last write operation to the AOF.
redis_aof_rewrite_in_progress: Flag indicating a AOF rewrite operation is on-going.
redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete.
redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX).
redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections.
redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections.
redis_cluster_enabled: Indicate Redis cluster is enabled.
redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter)
redis_commands_processed_total: Total number of commands processed by the server.(Counter)
redis_commands_total: The number of calls that reached command execution (not rejected).(Counter)
redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections.
redis_config_maxmemory: The value of the maxmemory configuration directive.
redis_connected_clients: Number of client connections (excluding connections from replicas).
redis_connected_slaves: Number of connected replicas.
redis_connections_received_total: Total number of connections accepted by the server.(Counter)
redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter)
redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter)
redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_db_keys: Total number of keys by DB.
redis_db_keys_expiring: Total number of expiring keys by DB
redis_defrag_hits: Number of value reallocations performed by the active defragmentation process.
redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process.
redis_defrag_key_hits: Number of keys that were actively defragmented.
redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process.
redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter)
redis_expired_keys_total: Total number of key expiration events.(Counter)
redis_expired_stale_percentage: The percentage of keys probably expired.
redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early.
redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape.
redis_exporter_last_scrape_duration_seconds: The last scrape duration.
redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter
redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter
redis_exporter_scrapes_total: Current total redis scrapes.(Counter)
redis_instance_info: Information about the Redis instance.
redis_keyspace_hits_total: Hits total.(Counter)
redis_keyspace_misses_total: Misses total.(Counter)
redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds.
redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds.
redis_latest_fork_seconds: The amount of time needed for last fork, in seconds.
redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option).
redis_master_repl_offset: The server's current replication offset.
redis_mem_clients_normal: Memory used by normal clients.(Gauge)
redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage.
redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue.
redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only include fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc.
redis_mem_not_counted_for_eviction_bytes: (Gauge)
redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc)
redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory)
redis_memory_used_lua_bytes: Number of bytes used by the Lua engine.
redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures.
redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes)
redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1)
redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts
redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes
redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes
redis_net_input_bytes_total: Total input bytes(Counter)
redis_net_output_bytes_total: Total output bytes(Counter)
redis_process_id: Process ID
redis_pubsub_channels: Global number of pub/sub channels with client subscriptions
redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions
redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going
redis_rdb_changes_since_last_save: Number of changes since the last dump
redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any
redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds
redis_rdb_last_bgsave_status: Status of the last RDB save operation
redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation
redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save
redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter)
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer
redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer
redis_repl_backlog_is_active: Flag indicating replication backlog is active
redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge)
redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge)
redis_replica_resyncs_full: The number of full resyncs with replicas
redis_replication_backlog_bytes: Memory used by replication backlog
redis_second_repl_offset: The offset up to which replication IDs are accepted.
redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge)
redis_slowlog_last_id: Last id of slowlog
redis_slowlog_length: Total slowlog
redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds.
redis_target_scrape_request_errors_total: Errors in requests to the exporter
redis_up: Flag indicating redis instance is up
redis_uptime_in_seconds: Number of seconds since Redis server start
# [windows_exporter]
windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter)
windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge)
windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter)
windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter)
windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter)
windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter)
windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge)
windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the processor. On some processors, Processor Performance may exceed 100%(gauge)
windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter)
windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge)
windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge)
windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge)
windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge)
windows_exporter_collector_duration_seconds: Duration of a collection.(gauge)
windows_exporter_collector_success: Whether the collector was successful.(gauge)
windows_exporter_collector_timeout: Whether the collector timed out.(gauge)
windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge)
windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge)
windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter)
windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter)
windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter)
windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter)
windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter)
windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter)
windows_logical_disk_requests_queued: The number of requests queued to the disk (LogicalDisk.CurrentDiskQueueLength)(gauge)
windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge)
windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter)
windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter)
windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter)
windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter)
windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter)
windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter)
windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter)
windows_net_bytes_total: (Network.BytesTotalPerSec)(counter)
windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge)
windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter)
windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter)
windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter)
windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter)
windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter)
windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter)
windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter)
windows_net_packets_total: (Network.PacketsPerSec)(counter)
windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge)
windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge)
windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge)
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge)
windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge)
windows_os_processes: OperatingSystem.NumberOfProcesses(gauge)
windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge)
windows_os_time: OperatingSystem.LocalDateTime(gauge)
windows_os_timezone: OperatingSystem.LocalDateTime(gauge)
windows_os_users: OperatingSystem.NumberOfUsers(gauge)
windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge)
windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge)
windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge)
windows_service_info: A metric with a constant '1' value labeled with service information(gauge)
windows_service_start_mode: The start mode of the service (StartMode)(gauge)
windows_service_state: The state of the service (State)(gauge)
windows_service_status: The status of the service (Status)(gauge)
windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter)
windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter)
windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge)
windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter)
windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge)
windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge)
# [node_exporter]
# SYSTEM
# CPU context switch 次数
node_context_switches_total: context_switches
# Interrupts 次数
node_intr_total: Interrupts
# 运行的进程数
node_procs_running: Processes in runnable state
# 熵池大小
node_entropy_available_bits: Entropy available to random number generators
node_time_seconds: System time in seconds since epoch (1970)
node_boot_time_seconds: Node boot time, in unixtime
# CPU
node_cpu_seconds_total: Seconds the CPUs spent in each mode
node_load1: cpu load 1m
node_load5: cpu load 5m
node_load15: cpu load 15m
# MEM
# 内核态
# 内核用于缓存数据结构供自己使用的内存
node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use
# slab中可回收的部分
node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches
# slab中不可回收的部分
node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure
# Vmalloc内存区的大小
node_memory_VmallocTotal_bytes: Total size of vmalloc memory area
# vmalloc已分配的内存,虚拟地址空间上的连续的内存
node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used
# vmalloc区可用的连续最大快的大小,通过此指标可以知道vmalloc可分配连续内存的最大值
node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free
# 内存的硬件故障删除掉的内存页的总大小
node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working
# 用于在虚拟和物理内存地址之间映射的内存
node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge)
# 内核栈内存,常驻内存,不可回收
node_memory_KernelStack_bytes: Kernel memory stack. This is not reclaimable
# 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能
node_memory_Bounce_bytes: Memory used for block device bounce buffers
#用户态
# 单个巨页大小
node_memory_Hugepagesize_bytes: Huge Page size
# 系统分配的常驻巨页数
node_memory_HugePages_Total: Total size of the pool of huge pages
# 系统空闲的巨页数
node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated
# 进程已申请但未使用的巨页数
node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation
# 超过系统设定的常驻HugePages数量的个数
node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages
# 透明巨页 Transparent HugePages (THP)
node_memory_AnonHugePages_bytes: Memory in anonymous huge pages
# inactivelist中的File-backed内存
node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list
# inactivelist中的Anonymous内存
node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)
# activelist中的File-backed内存
node_memory_Active_file_bytes: File-backed memory on active LRU list
# activelist中的Anonymous内存
node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs
# 禁止换出的页,对应 Unevictable 链表
node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons
# 共享内存
node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks)
# 匿名页内存大小
node_memory_AnonPages_bytes: Memory in user pages not backed by files
# 被关联的内存页大小
node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries
# file-backed内存页缓存大小
node_memory_Cached_bytes: Parked file data (file content) cache
# 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 被mlock()系统调用锁定的内存大小
node_memory_Mlocked_bytes: Size of pages locked to memory using the mlock() system call
# 块设备(block device)所占用的缓存页
node_memory_Buffers_bytes: Block device (e.g. harddisk) cache
node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes
node_memory_SwapFree_bytes: Memory information field SwapFree_bytes
# DISK
node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes
node_filesystem_free_bytes: Filesystem free space in bytes
node_filesystem_size_bytes: Filesystem size in bytes
node_filesystem_files_free: Filesystem total free file nodes
node_filesystem_files: Filesystem total file nodes
node_filefd_maximum: Max open files
node_filefd_allocated: Open files
node_filesystem_readonly: Filesystem read-only status
node_filesystem_device_error: Whether an error occurred while getting statistics for the given device
node_disk_reads_completed_total: The total number of reads completed successfully
node_disk_writes_completed_total: The total number of writes completed successfully
node_disk_reads_merged_total: The number of reads merged
node_disk_writes_merged_total: The number of writes merged
node_disk_read_bytes_total: The total number of bytes read successfully
node_disk_written_bytes_total: The total number of bytes written successfully
node_disk_io_time_seconds_total: Total seconds spent doing I/Os
node_disk_read_time_seconds_total: The total number of seconds spent by all reads
node_disk_write_time_seconds_total: The total number of seconds spent by all writes
node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os
# NET
node_network_receive_bytes_total: Network device statistic receive_bytes (counter)
node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter)
node_network_receive_packets_total: Network device statistic receive_packets
node_network_transmit_packets_total: Network device statistic transmit_packets
node_network_receive_errs_total: Network device statistic receive_errs
node_network_transmit_errs_total: Network device statistic transmit_errs
node_network_receive_drop_total: Network device statistic receive_drop
node_network_transmit_drop_total: Network device statistic transmit_drop
node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking
node_sockstat_TCP_alloc: Number of TCP sockets in state alloc
node_sockstat_TCP_inuse: Number of TCP sockets in state inuse
node_sockstat_TCP_orphan: Number of TCP sockets in state orphan
node_sockstat_TCP_tw: Number of TCP sockets in state tw
node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab
node_sockstat_sockets_used: Number of IPv4 sockets in use
# [kafka_exporter]
kafka_brokers: count of kafka_brokers (gauge)
kafka_topic_partitions: Number of partitions for this Topic (gauge)
kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge)
kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge)
kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge)
kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated
# [zookeeper_exporter]
zk_znode_count: The total count of znodes stored
zk_ephemerals_count: The number of Ephemerals nodes
zk_watch_count: The number of watchers setup over Zookeeper nodes.
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree
zk_outstanding_requests: Number of currently executing requests
zk_packets_sent: Count of the number of zookeeper packets sent from a server
zk_packets_received: Count of the number of zookeeper packets received by a server
zk_num_alive_connections: Number of active clients connected to a zookeeper server
zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open
zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open
zk_avg_latency: Average time in milliseconds for requests to be processed
zk_min_latency: Minimum time in milliseconds for a request to be processed
zk_max_latency: Maximum time in milliseconds for a request to be processed
================================================
FILE: docker/compose-host-network/etc-nightingale/script/notify.bak.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import urllib2
import smtplib
from email.mime.text import MIMEText
# Python 2 idiom: reload sys so setdefaultencoding is callable again, then
# make UTF-8 the process-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')
# Maps a notify channel name found in the event payload to the suffix of the
# Sender.send_<suffix> classmethod that main() dispatches to.
notify_channel_funcs = {
"email":"email",
"sms":"sms",
"voice":"voice",
"dingtalk":"dingtalk",
"wecom":"wecom",
"feishu":"feishu"
}
# SMTP settings read by Sender.send_email, which connects with smtplib.SMTP_SSL.
# send_email refuses to send while mail_user/mail_pass still hold the
# placeholder values below, so they must be replaced with real credentials.
mail_host = "smtp.163.com"
mail_port = 994
mail_user = "ulricqin"
mail_pass = "password"
mail_from = "ulricqin@163.com"
class Sender(object):
    """Channel senders for the legacy notify script.

    Each public ``send_<channel>`` classmethod receives the decoded JSON
    payload (a dict with an ``event`` object and a ``tpls`` mapping of
    rendered templates) and delivers it through one channel. All of them
    return ``None`` and report failures by printing instead of raising, so a
    broken channel does not stop the remaining ones.
    """

    @classmethod
    def _users(cls, payload):
        """Return the event's ``notify_users_obj`` list, or ``[]`` when absent/null."""
        return (payload.get('event') or {}).get("notify_users_obj") or []

    @classmethod
    def _tpl(cls, payload, name):
        """Return rendered template ``name``, or ``"<name> not found"`` when missing."""
        return (payload.get('tpls') or {}).get(name, "{} not found".format(name))

    @classmethod
    def _phones(cls, users):
        """Return the distinct non-empty ``phone`` values of ``users`` as a list."""
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        return list(phones.keys())

    @classmethod
    def _robot_tokens(cls, users, contact_key):
        """Return the distinct non-empty ``contacts[contact_key]`` values of ``users``."""
        tokens = {}
        for u in users:
            # A user without any contact info carries contacts = null.
            contacts = u.get("contacts") or {}
            if contacts.get(contact_key, ""):
                tokens[contacts.get(contact_key, "")] = 1
        return list(tokens.keys())

    @classmethod
    def _post_json(cls, url, body):
        """POST ``body`` as JSON to ``url`` and print the response or the error."""
        opener = urllib2.build_opener(urllib2.HTTPHandler())
        request = urllib2.Request(url, data=json.dumps(body))
        request.add_header("Content-Type", 'application/json;charset=utf-8')
        request.get_method = lambda: "POST"
        try:
            connection = opener.open(request)
            print(connection.read())
        except urllib2.URLError as error:
            # URLError is the base class of HTTPError, so this also covers
            # connection-level failures and lets the next token be tried.
            print(error)

    @classmethod
    def send_email(cls, payload):
        """Mail the ``email.tpl`` body to every user that has an ``email``."""
        if mail_user == "ulricqin" and mail_pass == "password":
            # The placeholder credentials were never replaced.
            print("invalid smtp configuration")
            return

        emails = {}
        for u in cls._users(payload):
            if u.get("email"):
                emails[u.get("email")] = 1

        if not emails:
            return

        recipients = list(emails.keys())
        message = MIMEText(cls._tpl(payload, "email.tpl"), 'html', 'utf-8')
        message['From'] = mail_from
        message['To'] = ", ".join(recipients)
        message["Subject"] = cls._tpl(payload, "subject.tpl")

        try:
            smtp = smtplib.SMTP_SSL(mail_host, mail_port)
            try:
                smtp.login(mail_user, mail_pass)
                smtp.sendmail(mail_from, recipients, message.as_string())
            finally:
                # Release the connection even when login/sendmail fails.
                smtp.close()
        except smtplib.SMTPException as error:
            print(error)

    @classmethod
    def send_wecom(cls, payload):
        """Post ``wecom.tpl`` as markdown to every user's ``wecom_robot_token`` webhook."""
        users = cls._users(payload)
        for t in cls._robot_tokens(users, "wecom_robot_token"):
            url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "content": cls._tpl(payload, "wecom.tpl")
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_dingtalk(cls, payload):
        """Post ``dingtalk.tpl`` as markdown to every ``dingtalk_robot_token`` webhook.

        The message title is ``"<Triggered|Recovered> - <rule_name>"`` and all
        collected phone numbers are @-mentioned.
        """
        event = payload.get('event') or {}
        users = cls._users(payload)

        rule_name = event.get("rule_name")
        event_state = "Triggered"
        if event.get("is_recovered"):
            event_state = "Recovered"

        phones = cls._phones(users)
        for t in cls._robot_tokens(users, "dingtalk_robot_token"):
            url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "title": "{} - {}".format(event_state, rule_name),
                    "text": cls._tpl(payload, "dingtalk.tpl") + ' '.join(["@" + i for i in phones])
                },
                "at": {
                    "atMobiles": phones,
                    "isAtAll": False
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_feishu(cls, payload):
        """Post ``feishu.tpl`` as text to every user's ``feishu_robot_token`` webhook."""
        users = cls._users(payload)
        phones = cls._phones(users)
        for t in cls._robot_tokens(users, "feishu_robot_token"):
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": cls._tpl(payload, "feishu.tpl")
                },
                "at": {
                    "atMobiles": phones,
                    "isAtAll": False
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: only prints the phone numbers that would receive an SMS."""
        phones = cls._phones(cls._users(payload))
        if phones:
            print("send_sms not implemented, phones: {}".format(phones))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: only prints the phone numbers that would receive a voice call."""
        phones = cls._phones(cls._users(payload))
        if phones:
            print("send_voice not implemented, phones: {}".format(phones))
def main():
    """Read the alert payload from stdin and hand it to every notify channel.

    The decoded payload is also written, pretty-printed, to ``.payload`` in
    the current directory. Each entry of ``event.notify_channels`` is mapped
    through ``notify_channel_funcs`` to a ``Sender.send_<name>`` classmethod;
    channels without a matching method are reported and skipped.
    """
    payload = json.load(sys.stdin)
    # Keep a copy of the most recent payload on disk.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # notify_channels may be null in the payload; treat that as "no channels".
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip()))
        if not hasattr(Sender, send_func_name):
            # The name must be formatted into the message; passing it as a
            # second print argument left the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting line to stdout."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # Command-line entry point: with no extra argument run main(); with the
    # single word "hello" call hello(); reject anything else.
    argument_count = len(sys.argv)
    if argument_count == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")
================================================
FILE: docker/compose-host-network/etc-nightingale/script/notify.py
================================================
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import sys
import json
class Sender(object):
    """Per-channel notification hooks looked up by ``main`` as ``send_<channel>``.

    The email, wecom, dingtalk, feishu and mm hooks are intentionally empty.
    ``send_sms`` and ``send_voice`` are placeholders that only print the
    phone numbers they would notify. Every hook takes the decoded payload
    dict and returns ``None``.
    """

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_feishu(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def _collect_phones(cls, payload):
        """Return a dict keyed by the distinct non-empty ``phone`` of each notified user.

        A missing or null ``event`` / ``notify_users_obj`` yields an empty
        dict instead of raising.
        """
        users = (payload.get('event') or {}).get("notify_users_obj") or []
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        return phones

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: print the phone numbers that would receive an SMS."""
        phones = cls._collect_phones(payload)
        if phones:
            print("send_sms not implemented, phones: {}".format(phones.keys()))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: print the phone numbers that would receive a voice call."""
        phones = cls._collect_phones(payload)
        if phones:
            print("send_voice not implemented, phones: {}".format(phones.keys()))
def main():
    """Read the alert payload from stdin and hand it to every notify channel.

    The decoded payload is also written, pretty-printed, to ``.payload`` in
    the current directory. For each entry of ``event.notify_channels`` the
    classmethod ``Sender.send_<channel>`` is called with the payload;
    channels without such a method are reported and skipped.
    """
    payload = json.load(sys.stdin)
    # Keep a copy of the most recent payload on disk.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # notify_channels may be null in the payload; treat that as "no channels".
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must be formatted into the message; passing it as a
            # second print argument left the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting line to stdout."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # Command-line entry point: with no extra argument run main(); with the
    # single word "hello" call hello(); reject anything else.
    argument_count = len(sys.argv)
    if argument_count == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")
================================================
FILE: docker/compose-host-network/etc-nightingale/script/notify_feishu.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import requests
class Sender(object):
    """Per-channel notification hooks looked up by ``main`` as ``send_<channel>``.

    Only ``send_ifeishu`` does any work here; the other hooks are empty.
    Every hook takes the decoded payload dict and returns ``None``.
    """

    # Seconds to wait for the Feishu webhook before giving up on one token.
    REQUEST_TIMEOUT = 10

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_ifeishu(cls, payload):
        """Post the ``feishu`` template as text to each user's Feishu robot webhook.

        Tokens come from ``contacts.feishu_robot_token`` of every entry in
        ``event.notify_users_obj``; the users' phone numbers are sent in
        ``at.atMobiles``. One line is printed per token with either the HTTP
        result or the request error, so a failing token does not prevent the
        remaining ones from being tried.
        """
        users = (payload.get('event') or {}).get("notify_users_obj") or []
        tokens = {}
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
            # A user without any contact info carries contacts = null.
            contacts = u.get("contacts") or {}
            if contacts.get("feishu_robot_token", ""):
                tokens[contacts.get("feishu_robot_token", "")] = 1

        if not tokens:
            return

        headers = {
            "Content-Type": "application/json;charset=utf-8",
            "Host": "open.feishu.cn"
        }
        # The message is identical for every token, so serialize it once.
        body = {
            "msg_type": "text",
            "content": {
                "text": (payload.get('tpls') or {}).get("feishu", "feishu not found")
            },
            "at": {
                "atMobiles": list(phones.keys()),
                "isAtAll": False
            }
        }
        data = json.dumps(body)

        for t in tokens:
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            try:
                # Without a timeout requests can block indefinitely.
                response = requests.post(url, headers=headers, data=data, timeout=cls.REQUEST_TIMEOUT)
            except requests.RequestException as error:
                print(f"notify_ifeishu: token={t} error={error}")
                continue
            print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}")

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_sms(cls, payload):
        pass

    @classmethod
    def send_voice(cls, payload):
        pass
def main():
    """Read the alert payload from stdin and hand it to every notify channel.

    The decoded payload is also written, pretty-printed, to ``.payload`` in
    the current directory. For each entry of ``event.notify_channels`` the
    classmethod ``Sender.send_<channel>`` is called with the payload;
    channels without such a method are reported and skipped.
    """
    payload = json.load(sys.stdin)
    # Keep a copy of the most recent payload on disk.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # notify_channels may be null in the payload; treat that as "no channels".
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must be formatted into the message; passing it as a
            # second print argument left the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting line to stdout."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # Command-line entry point: with no extra argument run main(); with the
    # single word "hello" call hello(); reject anything else.
    argument_count = len(sys.argv)
    if argument_count == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")
================================================
FILE: docker/compose-host-network/etc-nightingale/script/rule_converter.py
================================================
import json
import yaml
'''
将prometheus/vmalert的rule转换为n9e中的rule
支持k8s的rule configmap
'''
# Path of the YAML rule file that main() opens and converts.
rule_file = 'rules.yaml'
def convert_interval(interval):
    """Convert a Prometheus-style duration to a whole number of seconds.

    Accepted forms:
      * a bare number, as ``str`` or ``int`` (``"30"``, ``30``) -- taken as seconds;
      * one or more ``<number><unit>`` parts, case-insensitive, with units
        ``ms``, ``s``, ``m``, ``h``, ``d``, ``w`` and ``y`` (``"15s"``,
        ``"5M"``, ``"1h30m"``). A year counts as 365 days.

    Sub-second remainders are truncated, so ``"500ms"`` yields ``0``.

    Raises:
        ValueError: if ``interval`` is neither a duration nor an integer.
    """
    import re

    unit_ms = {
        'ms': 1,
        's': 1000,
        'm': 60 * 1000,
        'h': 60 * 60 * 1000,
        'd': 24 * 60 * 60 * 1000,
        'w': 7 * 24 * 60 * 60 * 1000,
        'y': 365 * 24 * 60 * 60 * 1000,
    }

    # YAML may deliver an unquoted value such as `interval: 30` as an int.
    text = str(interval).strip()
    lowered = text.lower()

    # "ms" is listed before the single letters so "5ms" is not read as "5m" + "s".
    if re.match(r'(?:\d+(?:ms|[smhdwy]))+\Z', lowered):
        total_ms = 0
        for amount, unit in re.findall(r'(\d+)(ms|[smhdwy])', lowered):
            total_ms += int(amount) * unit_ms[unit]
        return total_ms // 1000

    # No unit suffix: the value is already expressed in seconds.
    return int(text)
def convert_alert(rule, interval):
    """Translate one Prometheus/vmalert alerting rule into an n9e alert-rule dict.

    Args:
        rule: mapping with ``alert`` and ``expr`` and, optionally, ``for``,
            ``labels`` and ``annotations``.
        interval: evaluation interval of the enclosing group, as accepted by
            ``convert_interval``.

    Returns:
        dict in the n9e alert-rule layout. The ``severity`` label selects the
        level (``critical`` -> 1, ``info`` -> 3, anything else -> 2); every
        other label becomes a ``key=value`` entry of ``append_tags``. ``note``
        is the first annotation value, or ``''`` when there is none.
    """
    alert_name = rule['alert']
    expression = rule['expr']
    for_seconds = convert_interval(rule['for']) if 'for' in rule else 0
    eval_seconds = convert_interval(interval)

    first_note = ''
    copied_annotations = {}
    if 'annotations' in rule:
        first_note = next(iter(rule['annotations'].values()), '')
        copied_annotations = dict(rule['annotations'].items())

    tags = []
    level = 2  # default level, also used for 'warning' and unknown values
    if 'labels' in rule:
        for key, value in rule['labels'].items():
            if key == 'severity':
                if value == 'critical':
                    level = 1
                elif value == 'info':
                    level = 3
            else:
                tags.append('{}={}'.format(key, value))

    return {
        "name": alert_name,
        "note": first_note,
        "severity": level,
        "disabled": 0,
        "prom_for_duration": for_seconds,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "enable_stime": "00:00",
        "enable_etime": "23:59",
        "enable_days_of_week": ["1", "2", "3", "4", "5", "6", "0"],
        "enable_in_bg": 0,
        "notify_recovered": 1,
        "notify_channels": [],
        "notify_repeat_step": 60,
        "recover_duration": 0,
        "callbacks": [],
        "runbook_url": "",
        "append_tags": tags,
        "annotations": copied_annotations
    }
def convert_record(rule, interval):
    """Translate one Prometheus/vmalert recording rule into an n9e record-rule dict.

    Args:
        rule: mapping with ``record`` and ``expr`` and, optionally, ``labels``.
        interval: evaluation interval of the enclosing group, as accepted by
            ``convert_interval``.

    Returns:
        dict in the n9e record-rule layout; each label becomes a
        ``key=value`` entry of ``append_tags`` and ``note`` is always empty.
    """
    record_name = rule['record']
    expression = rule['expr']
    eval_seconds = convert_interval(interval)

    tags = []
    if 'labels' in rule:
        tags = ['{}={}'.format(key, value) for key, value in rule['labels'].items()]

    return {
        "name": record_name,
        "note": '',
        "disabled": 0,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "append_tags": tags
    }
'''
example of rule group file
---
groups:
- name: example
rules:
- alert: HighRequestLatency
expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
for: 10m
labels:
severity: page
annotations:
summary: High request latency
'''
def deal_group(group):
    """
    Convert every rule of a parsed Prometheus/vmalert rule file.

    ``group`` is the parsed YAML document whose ``groups`` list holds the
    rule groups. A group without its own ``interval`` is evaluated every
    ``15s``. Rules carrying an ``alert`` key go through ``convert_alert``;
    all others go through ``convert_record``.

    Returns a tuple ``(alert_rules, record_rules)`` of lists.
    """
    alerts, records = [], []
    for segment in group['groups']:
        step = segment['interval'] if 'interval' in segment else '15s'
        for entry in segment['rules']:
            if 'alert' not in entry:
                records.append(convert_record(entry, step))
                continue
            alerts.append(convert_alert(entry, step))
    return alerts, records
'''
example of k8s rule configmap
---
apiVersion: v1
kind: ConfigMap
metadata:
name: rulefiles-0
data:
etcdrules.yaml: |
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"})
by (job) + 1) / 2)
for: 3m
labels:
severity: critical
'''
def deal_configmap(rule_configmap):
    """
    Convert the rule files embedded in a Kubernetes ConfigMap.

    Each value under ``data`` is parsed as a YAML rule file and passed to
    ``deal_group``; the converted rules of all files are concatenated.

    Returns a tuple ``(alert_rules, record_rules)`` of lists.
    """
    alerts, records = [], []
    for raw_rules in rule_configmap['data'].values():
        parsed = yaml.load(raw_rules, Loader=yaml.FullLoader)
        file_alerts, file_records = deal_group(parsed)
        alerts += file_alerts
        records += file_records
    return alerts, records
def main():
    """Convert ``rule_file`` and write the result as two JSON files.

    Reads the YAML rule file named by ``rule_file``, converts it with
    ``deal_group`` and writes ``alert-rules.json`` and ``record-rules.json``
    into the current directory.

    All three files are opened as UTF-8 explicitly: the JSON is dumped with
    ``ensure_ascii=False``, so non-ASCII rule text would otherwise depend on
    the platform's default encoding. Requires Python 3 (``open(encoding=)``).
    """
    with open(rule_file, 'r', encoding='utf-8') as f:
        rule_config = yaml.load(f, Loader=yaml.FullLoader)

    # If the file is a Kubernetes ConfigMap, use the call below instead:
    # alert_rules, record_rules = deal_configmap(rule_config)
    alert_rules, record_rules = deal_group(rule_config)

    with open("alert-rules.json", 'w', encoding='utf-8') as fw:
        json.dump(alert_rules, fw, indent=2, ensure_ascii=False)

    with open("record-rules.json", 'w', encoding='utf-8') as fw:
        json.dump(record_rules, fw, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()
================================================
FILE: docker/compose-host-network/etc-prometheus/prometheus.yml
================================================
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
# The job name is added as a label `job=` to any timeseries scraped from this config.
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'nightingale'
static_configs:
- targets: ['localhost:17000']
================================================
FILE: docker/compose-host-network-metric-log/docker-compose.yaml
================================================
version: "3.7"
services:
mysql:
image: "mysql:8"
container_name: mysql
hostname: mysql
restart: always
environment:
TZ: Asia/Shanghai
MYSQL_ROOT_PASSWORD: 1234
volumes:
- ./mysqldata:/var/lib/mysql/
- ../initsql:/docker-entrypoint-initdb.d/
- ./etc-mysql/my.cnf:/etc/my.cnf
network_mode: host
redis:
image: "redis:6.2"
container_name: redis
hostname: redis
restart: always
environment:
TZ: Asia/Shanghai
network_mode: host
prometheus:
image: prom/prometheus
container_name: prometheus
hostname: prometheus
restart: always
environment:
TZ: Asia/Shanghai
volumes:
- ./etc-prometheus:/etc/prometheus
network_mode: host
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
- "--enable-feature=remote-write-receiver"
- "--query.lookback-delta=2m"
n9e:
image: flashcatcloud/nightingale:latest
container_name: n9e
hostname: n9e
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: 127.0.0.1:3306, 127.0.0.1:6379
volumes:
- ./etc-nightingale:/app/etc
- ./n9e-logs:/app/logs
network_mode: host
depends_on:
- mysql
- redis
- prometheus
command:
- /app/n9e
categraf:
image: "flashcatcloud/categraf:latest"
container_name: "categraf"
hostname: "categraf01"
restart: always
environment:
TZ: Asia/Shanghai
HOST_PROC: /hostfs/proc
HOST_SYS: /hostfs/sys
HOST_MOUNT_PREFIX: /hostfs
WAIT_HOSTS: 127.0.0.1:17000, 127.0.0.1:20090, 127.0.0.1:9092
volumes:
- ./etc-categraf:/etc/categraf/conf
- ./n9e-logs:/logs
- /:/hostfs
network_mode: host
depends_on:
- n9e
- kafka
zookeeper:
image: bitnami/zookeeper:3.9
container_name: "zookeeper"
restart: always
environment:
- TZ=Asia/Shanghai
- ALLOW_ANONYMOUS_LOGIN=yes
network_mode: host
depends_on:
- n9e
kafka:
image: bitnami/kafka:3.4
container_name: "kafka"
restart: always
environment:
TZ: Asia/Shanghai
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://127.0.0.1:9092
KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092
KAFKA_ZOOKEEPER_CONNECT: 127.0.0.1:2181
KAFKA_CFG_MESSAGE_MAX_BYTES: 2000000
network_mode: host
depends_on:
- zookeeper
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:7.10.1
container_name: "elasticsearch"
restart: always
environment:
- TZ=Asia/Shanghai
- discovery.type=single-node
network_mode: host
depends_on:
- kafka
logstash:
image: docker.elastic.co/logstash/logstash:8.11.3
container_name: "logstash"
restart: always
environment:
- TZ=Asia/Shanghai
- LS_JAVA_OPTS=-Xmx256m -Xms256m
volumes:
- ./etc-logstash/logstash.yaml:/etc/logstash/conf.d/logstash.yaml
entrypoint:
- logstash
- -f
- /etc/logstash/conf.d/logstash.yaml
network_mode: host
depends_on:
- elasticsearch
- kafka
logging:
driver: "json-file"
options:
max-size: "200m"
max-file: "3"
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/config.toml
================================================
[global]
# whether print configs
print_configs = false
# add label(agent_hostname) to series
# "" -> auto detect hostname
# "xx" -> use specified string xx
# "$hostname" -> auto detect hostname
# "$ip" -> auto detect ip
# "$hostname-$ip" -> auto detect hostname and ip to replace the vars
hostname = "$HOSTNAME"
# will not add label(agent_hostname) if true
omit_hostname = false
# s | ms
precision = "ms"
# global collect interval
interval = 15
[global.labels]
source="categraf"
# region = "shanghai"
# env = "localhost"
[writer_opt]
# default: 2000
batch = 2000
# channel(as queue) size
chan_size = 10000
[[writers]]
url = "http://127.0.0.1:17000/prometheus/v1/write"
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"
[heartbeat]
enable = true
# report os version cpu.util mem.util metadata
url = "http://127.0.0.1:17000/v1/n9e/heartbeat"
# interval, unit: s
interval = 10
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[ibex]
enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["127.0.0.1:20090"]
## temp script dir
meta_dir = "./meta"
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.cpu/cpu.toml
================================================
# # collect interval
# interval = 15
# # whether collect per cpu
# collect_per_cpu = false
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.disk/disk.toml
================================================
# # collect interval
# interval = 15
# # By default stats will be gathered for all mount points.
# # Set mount_points will restrict the stats to only the specified mount points.
# mount_points = ["/"]
# Ignore mount points by filesystem type.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
ignore_mount_points = ["/boot"]
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.diskio/diskio.toml
================================================
# # collect interval
# interval = 15
# # By default, categraf will gather stats for all devices including disk partitions.
# # Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.kernel/kernel.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.mem/mem.toml
================================================
# # collect interval
# interval = 15
# # whether collect platform specified metrics
collect_platform_fields = true
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.net/net.toml
================================================
# # collect interval
# interval = 15
# # whether collect protocol stats on Linux
# collect_protocol_stats = false
# # setting interfaces will tell categraf to gather these explicit interfaces
# interfaces = ["eth0"]
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.netstat/netstat.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.processes/processes.toml
================================================
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/input.system/system.toml
================================================
# # collect interval
# interval = 15
# # whether collect metric: system_n_users
# collect_user_number = false
================================================
FILE: docker/compose-host-network-metric-log/etc-categraf/logs.toml
================================================
[logs]
## just a placeholder
api_key = "ef4ahfbwzwwtlwfpbertgq1i6mq0ab1q"
## enable log collect or not
enable = true
## the server that receives the logs (http/tcp/kafka); only kafka accepts multiple brokers, given as ip:port pairs joined with ","
send_to = "127.0.0.1:9092"
## send logs with protocol: http/tcp/kafka
send_type = "kafka"
topic = "flashcatcloud"
## send logs with compression or not
use_compress = false
## use ssl or not
send_with_tls = false
## send logs in batches
batch_wait = 5
## save offset in this path
run_path = "/opt/categraf/run"
## max number of files that can be open
open_files_limit = 100
## scan config file in 10 seconds
scan_period = 10
## read buffer of udp
frame_size = 9000
## channel size, default 100
## 读取日志缓冲区,行数
chan_size = 1000
## pipeline num , default 4
## 有多少线程处理日志
pipeline=4
## configuration for kafka
## 指定kafka版本
kafka_version="2.8.1"
# 默认0 表示串行,如果对日志顺序有要求,保持默认配置
batch_max_concurrence = 0
# 最大并发批次, 默认100
batch_max_size=100
# 每次最大发送的内容上限 默认1000000
batch_max_contentsize=1000000
# client timeout in seconds
producer_timeout= 10
# 是否开启sasl模式
sasl_enable = false
sasl_user = "admin"
sasl_password = "admin"
# PLAIN
sasl_mechanism= "PLAIN"
# v1
sasl_version=1
# set true
sasl_handshake = true
# optional
# sasl_auth_identity=""
#
##
# v0.3.39以上版本新增,是否开启pod日志采集
enable_collect_container=false
# 是否采集所有pod的stdout stderr
collect_container_all = false
## glog processing rules
# [[logs.Processing_rules]]
## single log configure
[[logs.items]]
## file/journald/tcp/udp
type = "file"
## type=file, path is required; type=journald/tcp/udp, port is required
path = "/logs/*"
source = "n9e"
service = "n9e_service"
================================================
FILE: docker/compose-host-network-metric-log/etc-logstash/logstash.yaml
================================================
input {
kafka {
bootstrap_servers => "127.0.0.1:9092"
topics => ["flashcatcloud"]
codec => json
type => n9e
}
}
filter {
grok {
match => {"message" => "%{LOGLEVEL:status}"}
overwrite => ["status"]
}
}
output {
elasticsearch {
hosts => ["127.0.0.1:9200"]
index => "n9e-%{+YYYY.MM.DD}"
}
}
================================================
FILE: docker/compose-host-network-metric-log/etc-mysql/my.cnf
================================================
[mysqld]
pid-file = /var/run/mysqld/mysqld.pid
socket = /var/run/mysqld/mysqld.sock
datadir = /var/lib/mysql
bind-address = 127.0.0.1
================================================
FILE: docker/compose-host-network-metric-log/etc-nightingale/config.toml
================================================
[Global]
RunMode = "release"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "file"
# # rotate by time
KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[HTTP.ShowCaptcha]
Enable = false
[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]
[HTTP.RSA]
# open RSA
OpenRSA = false
[DB]
# postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
DSN="root:1234@tcp(127.0.0.1:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "127.0.0.1:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""
[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"
# [Alert.Alerting]
# NotifyConcurrency = 10
[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"
[Center.AnonymousAccess]
PromQuerier = true
AlertDetail = true
[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true
# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"
# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000
[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://127.0.0.1:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"
================================================
FILE: docker/compose-host-network-metric-log/etc-nightingale/metrics.yaml
================================================
zh:
ip_conntrack_count: 连接跟踪表条目总数(单位:int, count)
ip_conntrack_max: 连接跟踪表最大容量(单位:int, size)
cpu_usage_idle: CPU空闲率(单位:%)
cpu_usage_active: CPU使用率(单位:%)
cpu_usage_system: CPU内核态时间占比(单位:%)
cpu_usage_user: CPU用户态时间占比(单位:%)
cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%)
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%)
cpu_usage_irq: CPU处理硬中断的时间占比(单位:%)
cpu_usage_softirq: CPU处理软中断的时间占比(单位:%)
cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%)
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%)
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%)
disk_free: 硬盘分区剩余量(单位:byte)
disk_used: 硬盘分区使用量(单位:byte)
disk_used_percent: 硬盘分区使用率(单位:%)
disk_total: 硬盘分区总量(单位:byte)
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型
diskio_merged_reads: 相邻读请求merge读的次数,counter类型
diskio_merged_writes: 相邻写请求merge写的次数,counter类型
diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒)
diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 可用内存大小(bytes)
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache )
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存大小(bytes)
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存)
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和available的内存)
mem_low_free: 未被使用的低位大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab)
mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_totalL: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数
net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
netstat_sockets_used: 已使用的所有协议套接字总量
netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量
netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数)
netstat_tcp_tw: TIME_WAIT状态的TCP连接数
netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量
netstat_tcp_mem: TCP套接字内存Page使用量
netstat_udp_inuse: 在使用的UDP套接字数量
netstat_udp_mem: UDP套接字内存Page使用量
netstat_udplite_inuse: 正在使用的 udp lite 数量
netstat_raw_inuse: 正在使用的 raw socket 数量
netstat_frag_inuse: ip fragment 数量
netstat_frag_memory: ip fragment 已经分配的内存(byte)
#[ping]
ping_percent_packet_loss: ping数据包丢失百分比(%)
ping_result_code: ping返回码('0','1')
net_response_result_code: 网络探测结果,0表示正常,非0表示异常
net_response_response_time: 网络探测时延,单位:秒
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
swap_used_percent: Swap空间换出数据量
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_load_norm_1: 1分钟平均load值/逻辑CPU个数
system_load_norm_5: 5分钟平均load值/逻辑CPU个数
system_load_norm_15: 15分钟平均load值/逻辑CPU个数
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统启动时间
nginx_accepts: 自nginx启动起,与客户端建立过的连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果0为正常否则url无法访问
# [aws cloudwatch rds]
cloudwatch_aws_rds_bin_log_disk_usage_average: rds 磁盘使用平均值
cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds 磁盘使用量最大值
cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低
cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数
cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和
cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值
cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值
cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低
cloudwatch_aws_rds_burst_balance_sample_count: rds 突发平衡样本计数
cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和
cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值
cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值
cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低
cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数
cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和
cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值
cloudwatch_aws_rds_database_connections_maximum: rds 数据库连接数最大值
cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小
cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数
cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和
cloudwatch_aws_rds_db_load_average: rds db 平均负载
cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值
cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值
cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值
cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 加载 CPU 样本数
cloudwatch_aws_rds_db_load_cpu_sum: rds db 加载cpu总和
cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值
cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值
cloudwatch_aws_rds_db_load_non_cpu_average: rds 加载非 CPU 平均值
cloudwatch_aws_rds_db_load_non_cpu_maximum: rds 加载非 cpu 最大值
cloudwatch_aws_rds_db_load_non_cpu_minimum: rds 加载非 cpu 最小值
cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds 加载非 cpu 样本计数
cloudwatch_aws_rds_db_load_non_cpu_sum: rds 加载非cpu总和
cloudwatch_aws_rds_db_load_sample_count: rds db 加载样本计数
cloudwatch_aws_rds_db_load_sum: rds db 负载总和
cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值
cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值
cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值
cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数
cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和
cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值
cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值
cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低
cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数
cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和
cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值
cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值
cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低
cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 平衡样本计数
cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和
cloudwatch_aws_rds_free_storage_space_average: rds 免费存储空间平均
cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间
cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间
cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数
cloudwatch_aws_rds_free_storage_space_sum: rds 免费存储空间总和
cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值
cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存
cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存
cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数
cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和
cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值
cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值
cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低
cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数
cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和
cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值
cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值
cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低
cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数
cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和
cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均
cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值
cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值
cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数
cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和
cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值
cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大
cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值
cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数
cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和
cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值
cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops
cloudwatch_aws_rds_read_iops_minimum: rds 读取 iops 最低
cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数
cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和
cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值
cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值
cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟
cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数
cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和
cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值
cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量
cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量
cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数
cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和
cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值
cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值
cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低
cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用示例计数
cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和
cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值
cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值
cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低
cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数
cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和
cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值
cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟
cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值
cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数
cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和
cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值
cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量
cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值
cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数
cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和
en:
ip_conntrack_count: the number of entries in the conntrack table(unit:int, count)
ip_conntrack_max: the max capacity of the conntrack table(unit:int, size)
cpu_usage_idle: "CPU idle rate(unit:%)"
cpu_usage_active: "CPU usage rate(unit:%)"
cpu_usage_system: "CPU kernel state time proportion(unit:%)"
cpu_usage_user: "CPU user-mode time proportion(unit:%)"
cpu_usage_nice: "The proportion of low-priority user-mode CPU time, that is, CPU time of processes whose nice value has been adjusted to between 1 and 19. Note that the nice value ranges from -20 to 19; the larger the value, the lower the priority(unit:%)"
cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)"
cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)"
cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)"
cpu_usage_steal: "Present only in virtual machine environments; the proportion of time the CPU is taken by other virtual machines. A value above 20 indicates severe contention(unit:%)"
cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)"
cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)"
disk_free: "The remaining amount of the hard disk partition (unit: byte)"
disk_used: "Hard disk partitional use (unit: byte)"
disk_used_percent: "Hard disk partitional use rate (unit:%)"
disk_total: "Total amount of hard disk partition (unit: byte)"
disk_inodes_free: "Hard disk partition INODE remaining amount"
disk_inodes_used: "Hard disk partition INODE usage amount"
disk_inodes_total: "The total amount of hard disk partition INODE"
diskio_io_time: "From the device's perspective, the total time spent on I/O requests; it is counted whenever there are I/O requests in the queue (unit: millisecond). Counter type; apply a rate function to get a useful value"
diskio_iops_in_progress: "I/O requests that have been issued to the device driver and have not yet completed; requests that are in the queue but not yet issued to the device driver are not included. Gauge type"
diskio_merged_reads: "The number of times adjacent read requests were merged, counter type"
diskio_merged_writes: "The number of times adjacent write requests were merged, counter type"
diskio_read_bytes: "The number of bytes read. Counter type; apply a rate function to get a useful value"
diskio_read_time: "The total time spent on read requests (unit: millisecond). Counter type; apply a rate function to get a useful value"
diskio_reads: "The number of read requests. Counter type; apply a rate function to get a useful value"
diskio_weighted_io_time: "From the I/O requests' perspective, the total I/O wait time. If there are multiple I/O requests at the same time, their times are added together (unit: millisecond)"
diskio_write_bytes: "The number of bytes written. Counter type; apply a rate function to get a useful value"
diskio_write_time: "The total time spent on write requests (unit: millisecond). Counter type; apply a rate function to get a useful value"
diskio_writes: "The number of write requests. Counter type; apply a rate function to get a useful value"
kernel_boot_time: "Kernel startup time"
kernel_context_switches: "Number of kernel context switching times"
kernel_entropy_avail: "Entropy pool inside the Linux system"
kernel_interrupts: "Number of kernel interruption"
kernel_processes_forked: "Number of forked processes"
mem_active: "Total amount of actively used memory (including cache and buffer memory)"
mem_available: "Available memory size (bytes)"
mem_available_percent: "Memory remaining percentage (0 ~ 100)"
mem_buffered: "Size of memory used for file buffers"
mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )"
mem_commit_limit: "Based on the overcommit ratio ('vm.overcommit_ratio'), this is the total amount of memory currently available to be allocated on the system. This limit is only enforced in mode 2 ('vm.overcommit_memory')"
mem_committed_as: "The amount of memory currently allocated on the system. It is the sum of the memory requested by all processes"
mem_dirty: "Size of memory waiting to be written back to disk"
mem_free: "Free memory size (bytes)"
mem_high_free: "Unused high memory size"
mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )"
mem_huge_page_size: "The size of each big page"
mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated"
mem_huge_pages_total: "Reserve the total number of Huge Pages"
mem_inactive: "Free memory (including the memory of free and available)"
mem_low_free: "Unused low memory size"
mem_low_total: "The total size of low memory. Low memory can serve the same purposes as high memory, and it can also be used by the kernel to hold some of its own data structures"
mem_mapped: "The size of the mapping of equipment and files"
mem_page_tables: "The size of the index table of the management of the memory paging page"
mem_shared: "The total memory shared by multiple processes"
mem_slab: "The size of the kernel data structure cache can reduce the consumption of application and release memory"
mem_sreclaimable: "The size of the SLAB can be recovered"
mem_sunreclaim: "The size of the SLAB cannot be recovered(SUnreclaim+SReclaimable=Slab)"
mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile. Used to be quickly replaced when needed without opening the I/O port again"
mem_swap_free: "The size of unused swap space"
mem_swap_total: "The total size of swap space"
mem_total: "Total memory"
mem_used: "Used memory"
mem_used_percent: "Percentage of memory used (0 ~ 100)"
mem_vmalloc_chunk: "The largest continuous unused vmalloc area"
mem_vmalloc_totalL: "Total size of virtual memory available to vmalloc"
mem_vmalloc_used: "Size of vmalloc virtual memory in use"
mem_write_back: "Size of memory currently being written back to disk"
mem_write_back_tmp: "Memory used by FUSE for temporary writeback buffers"
net_bytes_recv: "Total inbound traffic(bytes) of network card"
net_bytes_sent: "Total outbound traffic(bytes) of network card"
net_bits_recv: "Total inbound traffic(bits) of network card"
net_bits_sent: "Total outbound traffic(bits) of network card"
net_drop_in: "Number of inbound packets dropped by the network card"
net_drop_out: "Number of outbound packets dropped by the network card"
net_err_in: "Number of inbound packet errors on the network card"
net_err_out: "Number of outbound packet errors on the network card"
net_packets_recv: "Number of packets received by the network card"
net_packets_sent: "Number of packets sent by the network card"
netstat_tcp_established: "Number of network connections in ESTABLISHED state"
netstat_tcp_fin_wait1: "Number of network connections in FIN_WAIT1 state"
netstat_tcp_fin_wait2: "Number of network connections in FIN_WAIT2 state"
netstat_tcp_last_ack: "Number of network connections in LAST_ACK state"
netstat_tcp_listen: "Number of network connections in LISTEN state"
netstat_tcp_syn_recv: "Number of network connections in SYN_RECV state"
netstat_tcp_syn_sent: "Number of network connections in SYN_SENT state"
netstat_tcp_time_wait: "Number of network connections in TIME_WAIT state"
netstat_udp_socket: "Number of network connections in UDP state"
processes_blocked: "Number of processes in uninterruptible sleep state('U','D','L')"
processes_dead: "Number of processes being reaped('X')"
processes_idle: "Number of suspended idle processes('I')"
processes_paging: "Number of paging processes('P')"
processes_running: "Number of running processes('R')"
processes_sleeping: "Number of processes in interruptible sleep('S')"
processes_stopped: "Number of stopped processes('T')"
processes_total: "Total number of processes"
processes_total_threads: "Total number of threads"
processes_unknown: "Number of processes in unknown state"
processes_zombies: "Number of zombie processes('Z')"
swap_used_percent: "SWAP space replace the data volume"
system_load1: "1 minute average load value"
system_load5: "5 minutes average load value"
system_load15: "15 minutes average load value"
system_load_norm_1: "1 minute average load value/logical CPU number"
system_load_norm_5: "5 minutes average load value/logical CPU number"
system_load_norm_15: "15 minutes average load value/logical CPU number"
system_n_users: "Number of users"
system_n_cpus: "Number of CPU cores"
system_uptime: "System startup time"
nginx_accepts: "Total number of connections established with clients since Nginx started"
nginx_active: "Number of active connections Nginx is currently handling; equal to the sum of Reading/Writing/Waiting"
nginx_handled: "Total number of client connections handled since Nginx started"
nginx_reading: "Total number of connections currently reading the HTTP request header"
nginx_requests: "Since nginx is started, the total number of client requests processed, due to the existence of HTTP Keep-Alive requests, this value will be greater than the handled value"
nginx_upstream_check_fall: "Number of backend failures detected by the upstream_check module"
nginx_upstream_check_rise: "Number of backend checks performed by the upstream_check module"
nginx_upstream_check_status_code: "Status of the backend upstream: 1 for up, 0 for down"
nginx_waiting: "When keep-alive is enabled, this value is equal to active – (reading+writing), which means that Nginx has processed the resident connection that is waiting for the next request command"
nginx_writing: "Total number of connections currently sending a response to the client"
http_response_content_length: "HTTP message entity transmission length"
http_response_http_response_code: "http response status code"
http_response_response_time: "HTTP response time"
http_response_result_code: "URL detection result 0 is normal, otherwise the URL cannot be accessed"
# [mysqld_exporter]
mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge)
mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge)
mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter)
mysql_global_status_threads_connected: The number of currently open connections.(Gauge)
mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter)
mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge)
mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge)
mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter)
mysql_global_status_threads_cached: The number of threads in the thread cache.(Counter)
mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. The cache miss rate can be calculated as Threads_created/Connections.(Counter)
mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter)
mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter)
mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter)
mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter)
mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter)
mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter)
mysql_global_status_sort_rows: The number of sorted rows.(Counter)
mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter)
mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter)
mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter)
mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter)
mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter)
mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter)
mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter)
mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter)
mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter)
mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter)
mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge)
mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge)
mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter)
mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. This is in fact the layer between the Storage Engine and MySQL.(Counter)
mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter)
mysql_global_status_open_tables: The number of tables that are open.(Gauge)
mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter)
mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter)
mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge)
mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter)
mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter)
mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter)
mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge)
mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge)
mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. The default value is 134217728 bytes (128MB).(Gauge)
mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge)
mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge)
mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge)
mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge)
mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge)
# [redis_exporter]
redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize.
redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation.
redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory.
redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes.
redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio).
redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting).
redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active.
redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS.
redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any.
redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation.
redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation.
redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds.
redis_aof_last_write_status: Status of the last write operation to the AOF.
redis_aof_rewrite_in_progress: Flag indicating a AOF rewrite operation is on-going.
redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete.
redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX).
redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections.
redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections.
redis_cluster_enabled: Indicate Redis cluster is enabled.
redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter)
redis_commands_processed_total: Total number of commands processed by the server.(Counter)
redis_commands_total: The number of calls that reached command execution (not rejected).(Counter)
redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections.
redis_config_maxmemory: The value of the maxmemory configuration directive.
redis_connected_clients: Number of client connections (excluding connections from replicas).
redis_connected_slaves: Number of connected replicas.
redis_connections_received_total: Total number of connections accepted by the server.(Counter)
redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter)
redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter)
redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_db_keys: Total number of keys by DB.
redis_db_keys_expiring: Total number of expiring keys by DB
redis_defrag_hits: Number of value reallocations performed by the active defragmentation process.
redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process.
redis_defrag_key_hits: Number of keys that were actively defragmented.
redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process.
redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter)
redis_expired_keys_total: Total number of key expiration events.(Counter)
redis_expired_stale_percentage: The percentage of keys probably expired.
redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early.
redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape.
redis_exporter_last_scrape_duration_seconds: The last scrape duration.
redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter
redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter
redis_exporter_scrapes_total: Current total redis scrapes.(Counter)
redis_instance_info: Information about the Redis instance.
redis_keyspace_hits_total: Hits total.(Counter)
redis_keyspace_misses_total: Misses total.(Counter)
redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds.
redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds.
redis_latest_fork_seconds: The amount of time needed for last fork, in seconds.
redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option).
redis_master_repl_offset: The server's current replication offset.
redis_mem_clients_normal: Memory used by normal clients.(Gauge)
redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage.
redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue.
redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only includes fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc.
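redis_mem_not_counted_for_eviction_bytes: Used memory that is not counted for key eviction. This is basically transient replica and AOF buffers.(Gauge)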
redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc)
redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory)
redis_memory_used_lua_bytes: Number of bytes used by the Lua engine.
redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures.
redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes)
redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1)
redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts
redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes
redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes
redis_net_input_bytes_total: Total input bytes(Counter)
redis_net_output_bytes_total: Total output bytes(Counter)
redis_process_id: Process ID
redis_pubsub_channels: Global number of pub/sub channels with client subscriptions
redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions
redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going
redis_rdb_changes_since_last_save: Number of changes since the last dump
redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any
redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds
redis_rdb_last_bgsave_status: Status of the last RDB save operation
redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation
redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save
redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter)
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer
redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer
redis_repl_backlog_is_active: Flag indicating replication backlog is active
redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge)
redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge)
redis_replica_resyncs_full: The number of full resyncs with replicas
redis_replication_backlog_bytes: Memory used by replication backlog
redis_second_repl_offset: The offset up to which replication IDs are accepted.
redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge)
redis_slowlog_last_id: Last id of slowlog
redis_slowlog_length: Total slowlog
redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds.
redis_target_scrape_request_errors_total: Errors in requests to the exporter
redis_up: Flag indicating redis instance is up
redis_uptime_in_seconds: Number of seconds since Redis server start
# [windows_exporter]
windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter)
windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge)
windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter)
windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter)
windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter)
windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter)
windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge)
windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the processor. On some processors, Processor Performance may exceed 100%(gauge)
windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter)
windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge)
windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge)
windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge)
windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge)
windows_exporter_collector_duration_seconds: Duration of a collection.(gauge)
windows_exporter_collector_success: Whether the collector was successful.(gauge)
windows_exporter_collector_timeout: Whether the collector timed out.(gauge)
windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge)
windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge)
windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter)
windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter)
windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter)
windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter)
windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter)
windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter)
windows_logical_disk_requests_queued: The number of requests queued to the disk (LogicalDisk.CurrentDiskQueueLength)(gauge)
windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge)
windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter)
windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter)
windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter)
windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter)
windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter)
windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter)
windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter)
windows_net_bytes_total: (Network.BytesTotalPerSec)(counter)
windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge)
windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter)
windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter)
windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter)
windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter)
windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter)
windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter)
windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter)
windows_net_packets_total: (Network.PacketsPerSec)(counter)
windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge)
windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge)
windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge)
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge)
windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge)
windows_os_processes: OperatingSystem.NumberOfProcesses(gauge)
windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge)
windows_os_time: OperatingSystem.LocalDateTime(gauge)
windows_os_timezone: OperatingSystem.LocalDateTime(gauge)
windows_os_users: OperatingSystem.NumberOfUsers(gauge)
windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge)
windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge)
windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge)
windows_service_info: A metric with a constant '1' value labeled with service information(gauge)
windows_service_start_mode: The start mode of the service (StartMode)(gauge)
windows_service_state: The state of the service (State)(gauge)
windows_service_status: The status of the service (Status)(gauge)
windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter)
windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter)
windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge)
windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter)
windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge)
windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge)
# [node_exporter]
# SYSTEM
# CPU context switch 次数
node_context_switches_total: context_switches
# Interrupts 次数
node_intr_total: Interrupts
# 运行的进程数
node_procs_running: Processes in runnable state
# 熵池大小
node_entropy_available_bits: Entropy available to random number generators
node_time_seconds: System time in seconds since epoch (1970)
node_boot_time_seconds: Node boot time, in unixtime
# CPU
node_cpu_seconds_total: Seconds the CPUs spent in each mode
node_load1: cpu load 1m
node_load5: cpu load 5m
node_load15: cpu load 15m
# MEM
# 内核态
# 内核用于缓存数据结构供自己使用的内存
node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use
# slab中可回收的部分
node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches
# slab中不可回收的部分
node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure
# Vmalloc内存区的大小
node_memory_VmallocTotal_bytes: Total size of vmalloc memory area
# vmalloc已分配的内存,虚拟地址空间上的连续的内存
node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used
# vmalloc区可用的连续最大块的大小,通过此指标可以知道vmalloc可分配连续内存的最大值
node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free
# 内存的硬件故障删除掉的内存页的总大小
node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working
# 用于在虚拟和物理内存地址之间映射的内存
node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge)
# 内核栈内存,常驻内存,不可回收
node_memory_KernelStack_bytes: Kernel memory stack. This is not reclaimable
# 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能
node_memory_Bounce_bytes: Memory used for block device bounce buffers
#用户态
# 单个巨页大小
node_memory_Hugepagesize_bytes: Huge Page size
# 系统分配的常驻巨页数
node_memory_HugePages_Total: Total size of the pool of huge pages
# 系统空闲的巨页数
node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated
# 进程已申请但未使用的巨页数
node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation
# 超过系统设定的常驻HugePages数量的个数
node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages
# 透明巨页 Transparent HugePages (THP)
node_memory_AnonHugePages_bytes: Memory in anonymous huge pages
# inactivelist中的File-backed内存
node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list
# inactivelist中的Anonymous内存
node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)
# activelist中的File-backed内存
node_memory_Active_file_bytes: File-backed memory on active LRU list
# activelist中的Anonymous内存
node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs
# 禁止换出的页,对应 Unevictable 链表
node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons
# 共享内存
node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks)
# 匿名页内存大小
node_memory_AnonPages_bytes: Memory in user pages not backed by files
# 被关联的内存页大小
node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries
# file-backed内存页缓存大小
node_memory_Cached_bytes: Parked file data (file content) cache
# 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 被mlock()系统调用锁定的内存大小
node_memory_Mlocked_bytes: Size of pages locked to memory using the mlock() system call
# 块设备(block device)所占用的缓存页
node_memory_Buffers_bytes: Block device (e.g. harddisk) cache
node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes
node_memory_SwapFree_bytes: Memory information field SwapFree_bytes
# DISK
node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes
node_filesystem_free_bytes: Filesystem free space in bytes
node_filesystem_size_bytes: Filesystem size in bytes
node_filesystem_files_free: Filesystem total free file nodes
node_filesystem_files: Filesystem total file nodes
node_filefd_maximum: Max open files
node_filefd_allocated: Open files
node_filesystem_readonly: Filesystem read-only status
node_filesystem_device_error: Whether an error occurred while getting statistics for the given device
node_disk_reads_completed_total: The total number of reads completed successfully
node_disk_writes_completed_total: The total number of writes completed successfully
node_disk_reads_merged_total: The number of reads merged
node_disk_writes_merged_total: The number of writes merged
node_disk_read_bytes_total: The total number of bytes read successfully
node_disk_written_bytes_total: The total number of bytes written successfully
node_disk_io_time_seconds_total: Total seconds spent doing I/Os
node_disk_read_time_seconds_total: The total number of seconds spent by all reads
node_disk_write_time_seconds_total: The total number of seconds spent by all writes
node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os
# NET
node_network_receive_bytes_total: Network device statistic receive_bytes (counter)
node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter)
node_network_receive_packets_total: Network device statistic receive_packets
node_network_transmit_packets_total: Network device statistic transmit_packets
node_network_receive_errs_total: Network device statistic receive_errs
node_network_transmit_errs_total: Network device statistic transmit_errs
node_network_receive_drop_total: Network device statistic receive_drop
node_network_transmit_drop_total: Network device statistic transmit_drop
node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking
node_sockstat_TCP_alloc: Number of TCP sockets in state alloc
node_sockstat_TCP_inuse: Number of TCP sockets in state inuse
node_sockstat_TCP_orphan: Number of TCP sockets in state orphan
node_sockstat_TCP_tw: Number of TCP sockets in state tw
node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab
node_sockstat_sockets_used: Number of IPv4 sockets in use
# [kafka_exporter]
kafka_brokers: count of kafka_brokers (gauge)
kafka_topic_partitions: Number of partitions for this Topic (gauge)
kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge)
kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge)
kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge)
kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated
# [zookeeper_exporter]
zk_znode_count: The total count of znodes stored
zk_ephemerals_count: The number of Ephemerals nodes
zk_watch_count: The number of watchers setup over Zookeeper nodes.
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree
zk_outstanding_requests: Number of currently executing requests
zk_packets_sent: Count of the number of zookeeper packets sent from a server
zk_packets_received: Count of the number of zookeeper packets received by a server
zk_num_alive_connections: Number of active clients connected to a zookeeper server
zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open
zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open
zk_avg_latency: Average time in milliseconds for requests to be processed
zk_min_latency: Minimum time in milliseconds for a request to be processed
zk_max_latency: Maximum time in milliseconds for a request to be processed
================================================
FILE: docker/compose-host-network-metric-log/etc-nightingale/script/notify.bak.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import urllib2
import smtplib
from email.mime.text import MIMEText
# Python 2 idiom: re-load the sys module, then force the interpreter's default
# string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Maps a channel name taken from the event's notify_channels list to the
# suffix of the Sender.send_<suffix> classmethod that main() dispatches to.
notify_channel_funcs = {
"email":"email",
"sms":"sms",
"voice":"voice",
"dingtalk":"dingtalk",
"wecom":"wecom",
"feishu":"feishu"
}
# SMTP settings read by Sender.send_email. send_email prints
# "invalid smtp configuration" and sends nothing while mail_user and mail_pass
# both still hold the values "ulricqin" / "password" assigned below.
mail_host = "smtp.163.com"
mail_port = 994
mail_user = "ulricqin"
mail_pass = "password"
mail_from = "ulricqin@163.com"
class Sender(object):
    """Notification senders: one ``send_<channel>`` classmethod per channel.

    Every sender receives the decoded payload dict. Recipients are read from
    ``payload['event']['notify_users_obj']`` and message bodies from the
    rendered templates in ``payload['tpls']``.
    """

    @classmethod
    def _users(cls, payload):
        """Return the event's user list, or an empty list when it is missing."""
        return payload.get('event').get("notify_users_obj") or []

    @classmethod
    def _unique(cls, values):
        """Return the truthy items of ``values`` without duplicates, in first-seen order."""
        seen = []
        for value in values:
            if value and value not in seen:
                seen.append(value)
        return seen

    @classmethod
    def _phones(cls, payload):
        """Return the distinct, non-empty ``phone`` values of the event's users."""
        return cls._unique([u.get("phone") for u in cls._users(payload)])

    @classmethod
    def _tokens(cls, payload, key):
        """Return the distinct, non-empty ``contacts[key]`` values of the event's users."""
        # A user may have no "contacts" entry (or a null one); treat that as
        # "no token" instead of failing on None.get().
        return cls._unique([(u.get("contacts") or {}).get(key, "")
                            for u in cls._users(payload)])

    @classmethod
    def _post_json(cls, url, body):
        """POST ``body`` as JSON to ``url``; print the response body or the error."""
        opener = urllib2.build_opener(urllib2.HTTPHandler())
        request = urllib2.Request(url, data=json.dumps(body))
        request.add_header("Content-Type", 'application/json;charset=utf-8')
        request.get_method = lambda: "POST"
        try:
            connection = opener.open(request)
            print(connection.read())
        except urllib2.URLError as error:
            # URLError is the base class of HTTPError, so connection failures
            # are reported too and do not abort the remaining notifications.
            print(error)

    @classmethod
    def send_email(cls, payload):
        """Send the rendered ``email.tpl`` to every user that has an email address.

        Does nothing when the module-level SMTP credentials still hold their
        placeholder values or when no user has an email address.
        """
        if mail_user == "ulricqin" and mail_pass == "password":
            print("invalid smtp configuration")
            return
        recipients = cls._unique([u.get("email") for u in cls._users(payload)])
        if not recipients:
            return
        mail_body = payload.get('tpls').get("email.tpl", "email.tpl not found")
        message = MIMEText(mail_body, 'html', 'utf-8')
        message['From'] = mail_from
        message['To'] = ", ".join(recipients)
        message["Subject"] = payload.get('tpls').get("subject.tpl", "subject.tpl not found")
        try:
            smtp = smtplib.SMTP_SSL(mail_host, mail_port)
            try:
                smtp.login(mail_user, mail_pass)
                smtp.sendmail(mail_from, recipients, message.as_string())
            finally:
                # Release the connection even when login/sendmail fails.
                smtp.close()
        except smtplib.SMTPException as error:
            print(error)

    @classmethod
    def send_wecom(cls, payload):
        """Post the rendered ``wecom.tpl`` as markdown to each user's WeCom robot."""
        tokens = cls._tokens(payload, "wecom_robot_token")
        if not tokens:
            return
        body = {
            "msgtype": "markdown",
            "markdown": {
                "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found")
            }
        }
        for t in tokens:
            cls._post_json("https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t), body)

    @classmethod
    def send_dingtalk(cls, payload):
        """Post the rendered ``dingtalk.tpl`` to each user's DingTalk robot.

        The message title is "<state> - <rule_name>" and every user phone
        number is @-mentioned.
        """
        tokens = cls._tokens(payload, "dingtalk_robot_token")
        if not tokens:
            return
        event = payload.get('event')
        event_state = "Recovered" if event.get("is_recovered") else "Triggered"
        phones = cls._phones(payload)
        body = {
            "msgtype": "markdown",
            "markdown": {
                "title": "{} - {}".format(event_state, event.get("rule_name")),
                "text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@" + i for i in phones])
            },
            "at": {
                "atMobiles": phones,
                "isAtAll": False
            }
        }
        for t in tokens:
            cls._post_json("https://oapi.dingtalk.com/robot/send?access_token={}".format(t), body)

    @classmethod
    def send_feishu(cls, payload):
        """Post the rendered ``feishu.tpl`` as text to each user's Feishu robot."""
        tokens = cls._tokens(payload, "feishu_robot_token")
        if not tokens:
            return
        body = {
            "msg_type": "text",
            "content": {
                "text": payload.get('tpls').get("feishu.tpl", "feishu.tpl not found")
            },
            "at": {
                "atMobiles": cls._phones(payload),
                "isAtAll": False
            }
        }
        for t in tokens:
            cls._post_json("https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t), body)

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: only prints the phone numbers an SMS would go to."""
        phones = cls._phones(payload)
        if phones:
            print("send_sms not implemented, phones: {}".format(phones))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: only prints the phone numbers a voice call would go to."""
        phones = cls._phones(payload)
        if phones:
            print("send_voice not implemented, phones: {}".format(phones))
def main():
    """Read a notification payload from stdin and dispatch it per channel.

    The payload is decoded as JSON and a pretty-printed copy is written to
    ``.payload`` in the current working directory. Each entry of
    ``payload['event']['notify_channels']`` is mapped through
    ``notify_channel_funcs`` to a ``Sender.send_<name>`` classmethod, which is
    called with the whole payload. Channels without a matching sender are
    reported on stdout and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    for ch in payload.get('event').get('notify_channels'):
        send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip()))
        if not hasattr(Sender, send_func_name):
            # Format the message; passing the name as a second print argument
            # left the "{}" placeholder unfilled (and printed a tuple on Python 2).
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting; invoked when the script is run with the "hello" argument."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # Script entry point: with no arguments the payload on stdin is processed,
    # the single argument "hello" prints a greeting, anything else is rejected.
    argc = len(sys.argv)
    if argc == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")
================================================
FILE: docker/compose-host-network-metric-log/etc-nightingale/script/notify.py
================================================
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import sys
import json
class Sender(object):
    """Per-channel notification hooks, looked up by name as ``send_<channel>``.

    The email, wecom, dingtalk, feishu and mm hooks are intentionally empty
    because those channels are already delivered by the Go code. The sms and
    voice hooks are placeholders that only print the target phone numbers.
    """

    @classmethod
    def _collect_phones(cls, payload):
        """Build a dict keyed by each non-empty ``phone`` of the event's users."""
        phones = {}
        for user in payload.get('event').get("notify_users_obj"):
            number = user.get("phone")
            if number:
                phones[number] = 1
        return phones

    @classmethod
    def send_email(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_wecom(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_dingtalk(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_feishu(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_mm(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: print the phone numbers an SMS would be sent to."""
        phones = cls._collect_phones(payload)
        if not phones:
            return
        print("send_sms not implemented, phones: {}".format(phones.keys()))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: print the phone numbers a voice call would be placed to."""
        phones = cls._collect_phones(payload)
        if not phones:
            return
        print("send_voice not implemented, phones: {}".format(phones.keys()))
def main():
    """Read one alert payload (JSON) from stdin and dispatch it per channel.

    The payload is also written, pretty-printed, to ``.payload`` in the
    current directory. For each entry of ``event.notify_channels`` the
    matching ``Sender.send_<channel>`` method is called; channels without a
    matching method are reported on stdout and skipped.
    """
    payload = json.load(sys.stdin)
    # Keep a copy of the most recent payload on disk.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A missing or null channel list means there is nothing to dispatch.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must go through format(); passing it as a second
            # print argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting (run when the script gets the ``hello`` argument)."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # No argument: process a payload from stdin. "hello": print the greeting.
    # Any other first argument is rejected with a message.
    if len(sys.argv) != 1:
        if sys.argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: docker/compose-host-network-metric-log/etc-nightingale/script/notify_feishu.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import requests
class Sender(object):
    """Per-channel notification senders, looked up by name as ``send_<channel>``.

    Every sender is a classmethod taking the decoded payload dict. Only
    ``send_ifeishu`` does work here; the other channels are no-ops.
    """

    # Seconds to wait for the Feishu webhook before giving up on one token.
    REQUEST_TIMEOUT = 10

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_ifeishu(cls, payload):
        """Post the rendered ``feishu`` template to every user's Feishu robot.

        Robot tokens come from each user's ``contacts.feishu_robot_token``
        and phone numbers from ``phone``; both are de-duplicated. One text
        message is posted per token, mentioning all collected phones. A
        failure for one token is printed and does not stop the others.
        """
        users = payload.get('event').get("notify_users_obj") or []
        tokens = {}
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
            # "contacts" may be absent or null for a user.
            contacts = u.get("contacts") or {}
            token = contacts.get("feishu_robot_token", "")
            if token:
                tokens[token] = 1
        headers = {
            "Content-Type": "application/json;charset=utf-8",
            "Host": "open.feishu.cn"
        }
        for t in tokens:
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": payload.get('tpls').get("feishu", "feishu not found")
                },
                "at": {
                    "atMobiles": list(phones.keys()),
                    "isAtAll": False
                }
            }
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.post(
                    url,
                    headers=headers,
                    data=json.dumps(body),
                    timeout=cls.REQUEST_TIMEOUT,
                )
            except requests.RequestException as error:
                print(f"notify_ifeishu: token={t} error={error}")
                continue
            print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}")

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_sms(cls, payload):
        pass

    @classmethod
    def send_voice(cls, payload):
        pass
def main():
    """Read one alert payload (JSON) from stdin and dispatch it per channel.

    The payload is also written, pretty-printed, to ``.payload`` in the
    current directory. For each entry of ``event.notify_channels`` the
    matching ``Sender.send_<channel>`` method is called; channels without a
    matching method are reported on stdout and skipped.
    """
    payload = json.load(sys.stdin)
    # Keep a copy of the most recent payload on disk.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A missing or null channel list means there is nothing to dispatch.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must go through format(); passing it as a second
            # print argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Print a fixed greeting (run when the script gets the ``hello`` argument)."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # No argument: process a payload from stdin. "hello": print the greeting.
    # Any other first argument is rejected with a message.
    if len(sys.argv) != 1:
        if sys.argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: docker/compose-host-network-metric-log/etc-nightingale/script/rule_converter.py
================================================
import json
import yaml
'''
Convert Prometheus/vmalert rules into n9e rules.
Rule ConfigMaps from k8s are supported as well.
'''
# Path of the rule file that main() opens and converts.
rule_file = 'rules.yaml'
def convert_interval(interval):
    """Convert a Prometheus-style duration into whole seconds.

    Accepted inputs:
      * a number (``60``), taken as seconds (YAML may yield ints);
      * a bare numeric string (``"30"``), taken as seconds;
      * one or more ``<int><unit>`` parts such as ``"15s"``, ``"10m"`` or
        ``"1h30m"``, with units ``ms``, ``s``, ``m``, ``h``, ``d``, ``w``
        (7 days) and ``y`` (365 days), case-insensitive.

    Any sub-second remainder is truncated. Raises ``ValueError`` when the
    value cannot be parsed.
    """
    import re  # local import: this file's top-level imports are json and yaml only

    if isinstance(interval, (int, float)) and not isinstance(interval, bool):
        return int(interval)

    text = str(interval).strip()
    unit_ms = {
        'ms': 1,
        's': 1000,
        'm': 60 * 1000,
        'h': 60 * 60 * 1000,
        'd': 24 * 60 * 60 * 1000,
        'w': 7 * 24 * 60 * 60 * 1000,
        'y': 365 * 24 * 60 * 60 * 1000,
    }
    if re.match(r'(?:\d+(?:ms|[smhdwy]))+\Z', text, re.IGNORECASE):
        total_ms = 0
        for amount, unit in re.findall(r'(\d+)(ms|[smhdwy])', text, re.IGNORECASE):
            total_ms += int(amount) * unit_ms[unit.lower()]
        return total_ms // 1000

    # Fallback keeps the earlier single-suffix parsing for anything the
    # pattern above does not cover (e.g. signed values such as "-5s").
    legacy_seconds = {'s': 1, 'm': 60, 'h': 60 * 60, 'd': 60 * 60 * 24}
    suffix = text[-1:].lower()
    if suffix in legacy_seconds:
        return int(text[:-1]) * legacy_seconds[suffix]
    return int(text)
def convert_alert(rule, interval):
    """Build an n9e alert-rule dict from one Prometheus alerting rule.

    ``rule`` must carry ``alert`` and ``expr``; ``for``, ``labels`` and
    ``annotations`` are optional. ``interval`` is the group's evaluation
    interval. The ``severity`` label maps ``critical`` to 1 and ``info`` to
    3; any other value leaves the default 2. Every other label becomes a
    ``key=value`` entry in ``append_tags``. ``note`` is the first annotation
    value, if any.
    """
    name = rule['alert']
    prom_ql = rule['expr']
    prom_for_duration = convert_interval(rule['for']) if 'for' in rule else 0
    prom_eval_interval = convert_interval(interval)

    if 'annotations' in rule:
        # The first annotation value doubles as the rule's note.
        note = next(iter(rule['annotations'].values()), '')
        annotations = dict(rule['annotations'].items())
    else:
        note, annotations = '', {}

    severity = 2
    append_tags = []
    for key, value in (rule['labels'].items() if 'labels' in rule else ()):
        if key == 'severity':
            severity = 1 if value == 'critical' else 3 if value == 'info' else severity
        else:
            append_tags.append('{}={}'.format(key, value))

    return {
        "name": name,
        "note": note,
        "severity": severity,
        "disabled": 0,
        "prom_for_duration": prom_for_duration,
        "prom_ql": prom_ql,
        "prom_eval_interval": prom_eval_interval,
        "enable_stime": "00:00",
        "enable_etime": "23:59",
        "enable_days_of_week": ["1", "2", "3", "4", "5", "6", "0"],
        "enable_in_bg": 0,
        "notify_recovered": 1,
        "notify_channels": [],
        "notify_repeat_step": 60,
        "recover_duration": 0,
        "callbacks": [],
        "runbook_url": "",
        "append_tags": append_tags,
        "annotations": annotations
    }
def convert_record(rule, interval):
    """Build an n9e recording-rule dict from one Prometheus recording rule.

    ``rule`` must carry ``record`` and ``expr``; each entry of the optional
    ``labels`` becomes a ``key=value`` item in ``append_tags``. ``interval``
    is the group's evaluation interval.
    """
    record_name = rule['record']
    expression = rule['expr']
    eval_seconds = convert_interval(interval)
    if 'labels' in rule:
        tags = ['{}={}'.format(key, value) for key, value in rule['labels'].items()]
    else:
        tags = []
    return {
        "name": record_name,
        "note": '',
        "disabled": 0,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "append_tags": tags
    }
'''
example of rule group file
---
groups:
- name: example
rules:
- alert: HighRequestLatency
expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
for: 10m
labels:
severity: page
annotations:
summary: High request latency
'''
def deal_group(group):
    """
    Convert one parsed Prometheus/vmalert rule file.

    Walks every entry of ``group['groups']``. A rule containing an ``alert``
    key is converted as an alert rule and every other rule as a recording
    rule. A group without ``interval`` falls back to ``'15s'``.
    Returns ``(alert_rules, record_rules)``.
    """
    alert_rules, record_rules = [], []
    for segment in group['groups']:
        interval = segment.get('interval', '15s')
        for rule in segment['rules']:
            if 'alert' not in rule:
                record_rules.append(convert_record(rule, interval))
            else:
                alert_rules.append(convert_alert(rule, interval))
    return alert_rules, record_rules
'''
example of k8s rule configmap
---
apiVersion: v1
kind: ConfigMap
metadata:
name: rulefiles-0
data:
etcdrules.yaml: |
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"})
by (job) + 1) / 2)
for: 3m
labels:
severity: critical
'''
def deal_configmap(rule_configmap):
    """
    Convert a k8s rule ConfigMap.

    Each value under ``rule_configmap['data']`` is parsed as a YAML rule
    file and passed to ``deal_group``; the results are concatenated.
    Returns ``(all_alert_rules, all_record_rules)``.
    """
    all_alert_rules, all_record_rules = [], []
    for rule_group_str in rule_configmap['data'].values():
        # NOTE(review): yaml.load with FullLoader is used on ConfigMap content;
        # consider yaml.safe_load if the ConfigMap can come from an untrusted source.
        parsed_group = yaml.load(rule_group_str, Loader=yaml.FullLoader)
        alerts, records = deal_group(parsed_group)
        all_alert_rules += alerts
        all_record_rules += records
    return all_alert_rules, all_record_rules
def main():
    """Convert ``rule_file`` and write the results as JSON.

    Alert rules go to ``alert-rules.json`` and recording rules to
    ``record-rules.json`` in the current directory.
    """
    with open(rule_file, 'r') as source:
        rule_config = yaml.load(source, Loader=yaml.FullLoader)

    # If the file is a k8s ConfigMap, use the call below instead:
    # alert_rules, record_rules = deal_configmap(rule_config)
    alert_rules, record_rules = deal_group(rule_config)

    outputs = (
        ("alert-rules.json", alert_rules),
        ("record-rules.json", record_rules),
    )
    for out_path, rules in outputs:
        with open(out_path, 'w') as fw:
            json.dump(rules, fw, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()
================================================
FILE: docker/compose-host-network-metric-log/etc-prometheus/prometheus.yml
================================================
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
# The job name is added as a label `job=` to any timeseries scraped from this config.
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'nightingale'
static_configs:
- targets: ['localhost:17000']
================================================
FILE: docker/compose-postgres/categraf/conf/config.toml
================================================
[global]
# whether print configs
print_configs = false
# add label(agent_hostname) to series
# "" -> auto detect hostname
# "xx" -> use specified string xx
# "$hostname" -> auto detect hostname
# "$ip" -> auto detect ip
# "$hostname-$ip" -> auto detect hostname and ip to replace the vars
hostname = "$HOSTNAME"
# will not add label(agent_hostname) if true
omit_hostname = false
# s | ms
precision = "ms"
# global collect interval
interval = 15
[global.labels]
source="categraf"
# region = "shanghai"
# env = "localhost"
[writer_opt]
# default: 2000
batch = 2000
# channel(as queue) size
chan_size = 10000
[[writers]]
url = "http://nightingale:17000/prometheus/v1/write"
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[http]
enable = false
address = ":9100"
print_access = false
run_mode = "release"
[heartbeat]
enable = true
# report os version cpu.util mem.util metadata
url = "http://nightingale:17000/v1/n9e/heartbeat"
# interval, unit: s
interval = 10
# Basic auth username
basic_auth_user = ""
# Basic auth password
basic_auth_pass = ""
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
# timeout settings, unit: ms
timeout = 5000
dial_timeout = 2500
max_idle_conns_per_host = 100
[ibex]
enable = true
## ibex flush interval
interval = "1000ms"
## n9e ibex server rpc address
servers = ["nightingale:20090"]
## temp script dir
meta_dir = "./meta"
================================================
FILE: docker/compose-postgres/categraf/conf/input.cpu/cpu.toml
================================================
# # collect interval
# interval = 15
# # whether collect per cpu
# collect_per_cpu = false
================================================
FILE: docker/compose-postgres/categraf/conf/input.disk/disk.toml
================================================
# # collect interval
# interval = 15
# # By default stats will be gathered for all mount points.
# # Set mount_points will restrict the stats to only the specified mount points.
# mount_points = ["/"]
# Ignore mount points by filesystem type.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
ignore_mount_points = ["/boot"]
================================================
FILE: docker/compose-postgres/categraf/conf/input.diskio/diskio.toml
================================================
# # collect interval
# interval = 15
# # By default, categraf will gather stats for all devices including disk partitions.
# # Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
================================================
FILE: docker/compose-postgres/categraf/conf/input.docker/docker.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Docker Endpoint
## To use TCP, set endpoint = "tcp://[ip]:[port]"
## To use environment variables (ie, docker-machine), set endpoint = "ENV"
endpoint = "unix:///var/run/docker.sock"
## Set to true to collect Swarm metrics(desired_replicas, running_replicas)
gather_services = false
gather_extend_memstats = false
container_id_label_enable = true
container_id_label_short_style = true
## Containers to include and exclude. Globs accepted.
## Note that an empty array for both will include all containers
container_name_include = []
container_name_exclude = []
## Container states to include and exclude. Globs accepted.
## When empty only containers in the "running" state will be captured.
## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
# container_state_include = []
# container_state_exclude = []
## Timeout for docker list, info, and stats commands
timeout = "5s"
## Specifies for which classes a per-device metric should be issued
## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) and 'network' (eth0, eth1, ...)
## Please note that this setting has no effect if 'perdevice' is set to 'true'
perdevice_include = []
## Specifies for which classes a total metric should be issued. Total is an aggregate of the 'perdevice' values.
## Possible values are 'cpu', 'blkio' and 'network'
## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin.
## Please note that this setting has no effect if 'total' is set to 'false'
total_include = ["cpu", "blkio", "network"]
## Which environment variables should we use as a tag
##tag_env = ["JAVA_HOME", "HEAP_SIZE"]
## docker labels to include and exclude as tags. Globs accepted.
## Note that an empty array for both will include all labels as tags
docker_label_include = []
docker_label_exclude = ["annotation*", "io.kubernetes*", "*description*", "*maintainer*", "*hash", "*author*"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: docker/compose-postgres/categraf/conf/input.kernel/kernel.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-postgres/categraf/conf/input.mem/mem.toml
================================================
# # collect interval
# interval = 15
# # whether collect platform specified metrics
collect_platform_fields = true
================================================
FILE: docker/compose-postgres/categraf/conf/input.net/net.toml
================================================
# # collect interval
# interval = 15
# # whether collect protocol stats on Linux
# collect_protocol_stats = false
# # setting interfaces will tell categraf to gather these explicit interfaces
# interfaces = ["eth0"]
================================================
FILE: docker/compose-postgres/categraf/conf/input.netstat/netstat.toml
================================================
# # collect interval
# interval = 15
================================================
FILE: docker/compose-postgres/categraf/conf/input.processes/processes.toml
================================================
# # collect interval
# interval = 15
# # force use ps command to gather
# force_ps = false
# # force use /proc to gather
# force_proc = false
================================================
FILE: docker/compose-postgres/categraf/conf/input.system/system.toml
================================================
# # collect interval
# interval = 15
# # whether collect metric: system_n_users
# collect_user_number = false
================================================
FILE: docker/compose-postgres/categraf/conf/prometheus.toml
================================================
[prometheus]
enable=true
scrape_config_file="/etc/prometheus/prometheus.yml"
## log level, debug warn info error
log_level="info"
## WAL file storage path, default ./data-agent
# wal_storage_path="/path/to/storage"
## WAL retention duration, default value is 2 hours
# wal_min_duration=2
================================================
FILE: docker/compose-postgres/docker-compose.yaml
================================================
version: "3.7"
networks:
nightingale:
driver: bridge
services:
postgres:
# platform: linux/x86_64
image: "postgres:12-alpine"
container_name: postgres
hostname: postgres
restart: always
ports:
- "5432:5432"
environment:
TZ: Asia/Shanghai
POSTGRES_USER: root
POSTGRES_PASSWORD: 1234
POSTGRES_DB: n9e_v6
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- ./pgdata:/var/lib/postgresql/data
- ./initsql_for_postgres:/docker-entrypoint-initdb.d/
networks:
- nightingale
redis:
image: "redis:7.0-alpine"
container_name: redis
hostname: redis
restart: always
ports:
- "6379:6379"
environment:
TZ: Asia/Shanghai
networks:
- nightingale
victoriametrics:
image: victoriametrics/victoria-metrics:v1.79.12
container_name: victoriametrics
hostname: victoriametrics
restart: always
environment:
TZ: Asia/Shanghai
ports:
- "8428:8428"
networks:
- nightingale
command:
- "--loggerTimezone=Asia/Shanghai"
nightingale:
image: flashcatcloud/nightingale:latest
container_name: nightingale
hostname: nightingale
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
WAIT_HOSTS: postgres:5432, redis:6379
volumes:
- ./n9eetc_pg:/app/etc
ports:
- "17000:17000"
networks:
- nightingale
depends_on:
- postgres
- redis
- victoriametrics
links:
- postgres:postgres
- redis:redis
- victoriametrics:victoriametrics
command:
- /app/n9e
categraf:
image: "flashcatcloud/categraf:latest"
container_name: "categraf"
hostname: "categraf01"
restart: always
environment:
TZ: Asia/Shanghai
HOST_PROC: /hostfs/proc
HOST_SYS: /hostfs/sys
HOST_MOUNT_PREFIX: /hostfs
WAIT_HOSTS: nightingale:17000, nightingale:20090
volumes:
- ./categraf/conf:/etc/categraf/conf
- /:/hostfs
- /var/run/docker.sock:/var/run/docker.sock
- ./prometc_vm:/etc/prometheus
# ports:
# - "9100:9100/tcp"
networks:
- nightingale
depends_on:
- nightingale
links:
- nightingale:nightingale
================================================
FILE: docker/compose-postgres/initsql_for_postgres/a-n9e-for-Postgres.sql
================================================
-- Login accounts. username is unique; roles is a space-separated list.
CREATE TABLE users (
    id bigserial,
    username varchar(64) not null,
    nickname varchar(64) not null,
    password varchar(128) not null default '',
    phone varchar(16) not null default '',
    email varchar(64) not null default '',
    portrait varchar(255) not null default '',
    roles varchar(255) not null,
    contacts varchar(1024),
    maintainer int not null default 0,
    belong varchar(16) not null default '',
    last_active_time bigint not null default 0,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id),
    UNIQUE (username)
);
COMMENT ON COLUMN users.id IS 'id';
COMMENT ON COLUMN users.username IS 'login name, cannot rename';
COMMENT ON COLUMN users.nickname IS 'display name, chinese name';
COMMENT ON COLUMN users.portrait IS 'portrait image url';
COMMENT ON COLUMN users.roles IS 'Admin | Standard | Guest, split by space';
COMMENT ON COLUMN users.contacts IS 'json e.g. {wecom:xx, dingtalk_robot_token:yy}';
COMMENT ON COLUMN users.belong IS 'belong';
-- Seed the built-in administrator with a fixed id of 1.
insert into users(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', date_part('epoch',current_timestamp)::int, 'system', date_part('epoch',current_timestamp)::int, 'system');
-- An explicit id does not advance the bigserial sequence; sync it so the
-- next insert that relies on the default id does not collide with id 1.
SELECT setval(pg_get_serial_sequence('users', 'id'), (SELECT max(id) FROM users));
-- Named groups of users, indexed by creator and by last update time.
CREATE TABLE user_group (
    id bigserial,
    name varchar(128) not null default '',
    note varchar(255) not null default '',
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id)
) ;
CREATE INDEX user_group_create_by_idx ON user_group (create_by);
CREATE INDEX user_group_update_at_idx ON user_group (update_at);
-- Seed the demo group with a fixed id of 1.
insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', date_part('epoch',current_timestamp)::int, 'root', date_part('epoch',current_timestamp)::int, 'root');
-- An explicit id does not advance the bigserial sequence; sync it so the
-- next insert that relies on the default id does not collide with id 1.
SELECT setval(pg_get_serial_sequence('user_group', 'id'), (SELECT max(id) FROM user_group));
-- Links a user (user_id) to a user group (group_id); indexed on both columns.
CREATE TABLE user_group_member (
id bigserial,
group_id bigint not null,
user_id bigint not null,
PRIMARY KEY(id)
) ;
CREATE INDEX user_group_member_group_id_idx ON user_group_member (group_id);
CREATE INDEX user_group_member_user_id_idx ON user_group_member (user_id);
-- Seed: put user 1 into user group 1 (the ids used by the seed rows of users and user_group).
insert into user_group_member(group_id, user_id) values(1, 1);
-- Key/value configuration entries; ckey is unique.
-- NOTE(review): external and encrypted look like 0/1 flags — confirm against the application model.
CREATE TABLE configs (
id bigserial,
ckey varchar(191) not null,
cval text not null default '',
note varchar(1024) not null default '',
external int not null default 0,
encrypted int not null default 0,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
PRIMARY KEY (id),
UNIQUE (ckey)
);
-- Role catalogue; name is unique.
CREATE TABLE role (
id bigserial,
name varchar(191) not null default '',
note varchar(255) not null default '',
PRIMARY KEY (id),
UNIQUE (name)
) ;
-- Seed the three built-in roles.
insert into role(name, note) values('Admin', 'Administrator role');
insert into role(name, note) values('Standard', 'Ordinary user role');
insert into role(name, note) values('Guest', 'Readonly user role');
-- One row per (role_name, operation) pair; indexed on both columns.
CREATE TABLE role_operation(
id bigserial,
role_name varchar(128) not null,
operation varchar(191) not null,
PRIMARY KEY(id)
) ;
CREATE INDEX role_operation_role_name_idx ON role_operation (role_name);
CREATE INDEX role_operation_operation_idx ON role_operation (operation);
-- Admin is special, who has no concrete operation but can do anything.
insert into role_operation(role_name, operation) values('Guest', '/metric/explorer');
insert into role_operation(role_name, operation) values('Guest', '/object/explorer');
insert into role_operation(role_name, operation) values('Guest', '/log/explorer');
insert into role_operation(role_name, operation) values('Guest', '/trace/explorer');
insert into role_operation(role_name, operation) values('Guest', '/help/version');
insert into role_operation(role_name, operation) values('Guest', '/help/contact');
insert into role_operation(role_name, operation) values('Standard', '/metric/explorer');
insert into role_operation(role_name, operation) values('Standard', '/object/explorer');
insert into role_operation(role_name, operation) values('Standard', '/log/explorer');
insert into role_operation(role_name, operation) values('Standard', '/trace/explorer');
insert into role_operation(role_name, operation) values('Standard', '/help/version');
insert into role_operation(role_name, operation) values('Standard', '/help/contact');
insert into role_operation(role_name, operation) values('Standard', '/help/servers');
insert into role_operation(role_name, operation) values('Standard', '/help/migrate');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules-built-in');
insert into role_operation(role_name, operation) values('Standard', '/dashboards-built-in');
insert into role_operation(role_name, operation) values('Standard', '/trace/dependencies');
insert into role_operation(role_name, operation) values('Admin', '/help/source');
insert into role_operation(role_name, operation) values('Admin', '/help/sso');
insert into role_operation(role_name, operation) values('Admin', '/help/notification-tpls');
insert into role_operation(role_name, operation) values('Admin', '/help/notification-settings');
insert into role_operation(role_name, operation) values('Standard', '/users');
insert into role_operation(role_name, operation) values('Standard', '/user-groups');
insert into role_operation(role_name, operation) values('Standard', '/user-groups/add');
insert into role_operation(role_name, operation) values('Standard', '/user-groups/put');
insert into role_operation(role_name, operation) values('Standard', '/user-groups/del');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups/add');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups/put');
insert into role_operation(role_name, operation) values('Standard', '/busi-groups/del');
insert into role_operation(role_name, operation) values('Standard', '/targets');
insert into role_operation(role_name, operation) values('Standard', '/targets/add');
insert into role_operation(role_name, operation) values('Standard', '/targets/put');
insert into role_operation(role_name, operation) values('Standard', '/targets/del');
insert into role_operation(role_name, operation) values('Standard', '/dashboards');
insert into role_operation(role_name, operation) values('Standard', '/dashboards/add');
insert into role_operation(role_name, operation) values('Standard', '/dashboards/put');
insert into role_operation(role_name, operation) values('Standard', '/dashboards/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules/add');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules/put');
insert into role_operation(role_name, operation) values('Standard', '/alert-rules/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-mutes');
insert into role_operation(role_name, operation) values('Standard', '/alert-mutes/add');
insert into role_operation(role_name, operation) values('Standard', '/alert-mutes/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/add');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/put');
insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-cur-events');
insert into role_operation(role_name, operation) values('Standard', '/alert-cur-events/del');
insert into role_operation(role_name, operation) values('Standard', '/alert-his-events');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls/add');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls/put');
insert into role_operation(role_name, operation) values('Standard', '/job-tpls/del');
insert into role_operation(role_name, operation) values('Standard', '/job-tasks');
insert into role_operation(role_name, operation) values('Standard', '/job-tasks/add');
insert into role_operation(role_name, operation) values('Standard', '/job-tasks/put');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules/add');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules/put');
insert into role_operation(role_name, operation) values('Standard', '/recording-rules/del');
-- for alert_rule | collect_rule | mute | dashboard grouping
-- Business groups; name is unique.
CREATE TABLE busi_group (
    id bigserial,
    name varchar(191) not null,
    label_enable smallint not null default 0,
    label_value varchar(191) not null default '' ,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id),
    UNIQUE (name)
) ;
COMMENT ON COLUMN busi_group.label_value IS 'if label_enable: label_value can not be blank';
-- Seed the default business group with a fixed id of 1.
insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', date_part('epoch',current_timestamp)::int, 'root', date_part('epoch',current_timestamp)::int, 'root');
-- An explicit id does not advance the bigserial sequence; sync it so the
-- next insert that relies on the default id does not collide with id 1.
SELECT setval(pg_get_serial_sequence('busi_group', 'id'), (SELECT max(id) FROM busi_group));
-- Grants a user group access to a business group; perm_flag is 'ro' or 'rw'.
CREATE TABLE busi_group_member (
id bigserial,
busi_group_id bigint not null ,
user_group_id bigint not null ,
perm_flag char(2) not null ,
PRIMARY KEY (id)
) ;
CREATE INDEX busi_group_member_busi_group_id_idx ON busi_group_member (busi_group_id);
CREATE INDEX busi_group_member_user_group_id_idx ON busi_group_member (user_group_id);
COMMENT ON COLUMN busi_group_member.busi_group_id IS 'busi group id';
COMMENT ON COLUMN busi_group_member.user_group_id IS 'user group id';
COMMENT ON COLUMN busi_group_member.perm_flag IS 'ro | rw';
-- Seed: give user group 1 read-write access to business group 1.
insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw');
-- for dashboard new version
-- Dashboard metadata; (group_id, name) is unique and ident is indexed.
CREATE TABLE board (
id bigserial,
group_id bigint not null default 0 ,
name varchar(191) not null,
ident varchar(200) not null default '',
tags varchar(255) not null ,
public smallint not null default 0 ,
built_in smallint not null default 0 ,
hide smallint not null default 0 ,
public_cate bigint NOT NULL DEFAULT 0,
create_at bigint not null default 0,
create_by varchar(64) not null default '',
update_at bigint not null default 0,
update_by varchar(64) not null default '',
note varchar(1024) not null default '',
PRIMARY KEY (id),
UNIQUE (group_id, name)
) ;
CREATE INDEX board_ident_idx ON board (ident);
COMMENT ON COLUMN board.group_id IS 'busi group id';
COMMENT ON COLUMN board.tags IS 'split by space';
COMMENT ON COLUMN board.public IS '0:false 1:true';
COMMENT ON COLUMN board.built_in IS '0:false 1:true';
COMMENT ON COLUMN board.hide IS '0:false 1:true';
COMMENT ON COLUMN board.public_cate IS '0 anonymous 1 login 2 busi';
COMMENT ON COLUMN board.note IS 'note';
-- for dashboard new version
-- board_payload: one payload text per dashboard id (id is unique, not a serial).
CREATE TABLE board_payload (
    id bigint NOT NULL,
    payload text NOT NULL,
    UNIQUE (id)
);
COMMENT ON COLUMN board_payload.id IS 'dashboard id';
-- deprecated
-- dashboard: legacy dashboard table; (group_id, name) is unique.
CREATE TABLE dashboard (
    id bigserial,
    group_id bigint NOT NULL DEFAULT 0,
    name varchar(191) NOT NULL,
    tags varchar(255) NOT NULL,
    configs varchar(8192),
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id),
    UNIQUE (group_id, name)
);
COMMENT ON COLUMN dashboard.group_id IS 'busi group id';
COMMENT ON COLUMN dashboard.tags IS 'split by space';
COMMENT ON COLUMN dashboard.configs IS 'dashboard variables';
-- deprecated
-- auto create the first subclass 'Default chart group' of dashboard
CREATE TABLE chart_group (
    id bigserial,
    dashboard_id bigint NOT NULL,
    name varchar(255) NOT NULL,
    weight int NOT NULL DEFAULT 0,
    PRIMARY KEY (id)
);
CREATE INDEX chart_group_dashboard_id_idx ON chart_group (dashboard_id);
-- deprecated
-- chart: chart configs belonging to a chart group (group_id), ordered by weight.
CREATE TABLE chart (
    id bigserial,
    group_id bigint NOT NULL,
    configs text,
    weight int NOT NULL DEFAULT 0,
    PRIMARY KEY (id)
);
CREATE INDEX chart_group_id_idx ON chart (group_id);
COMMENT ON COLUMN chart.group_id IS 'chart group id';
-- chart_share: chart configs stored per cluster / datasource; indexed on create_at.
CREATE TABLE chart_share (
    id bigserial,
    cluster varchar(128) NOT NULL,
    datasource_id bigint NOT NULL DEFAULT 0,
    configs text,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX chart_share_create_at_idx ON chart_share (create_at);
-- alert_rule: alert rule definitions, indexed by busi group and by update time.
CREATE TABLE alert_rule (
    id bigserial,
    group_id bigint NOT NULL DEFAULT 0,
    cate varchar(128) NOT NULL,
    datasource_ids varchar(255) NOT NULL DEFAULT '',
    cluster varchar(128) NOT NULL,
    name varchar(255) NOT NULL,
    note varchar(1024) NOT NULL DEFAULT '',
    prod varchar(255) NOT NULL DEFAULT '',
    algorithm varchar(255) NOT NULL DEFAULT '',
    algo_params varchar(255),
    delay int NOT NULL DEFAULT 0,
    severity smallint NOT NULL,
    disabled smallint NOT NULL,
    prom_for_duration int NOT NULL,
    rule_config text NOT NULL,
    prom_ql text NOT NULL,
    prom_eval_interval int NOT NULL,
    enable_stime varchar(255) NOT NULL DEFAULT '00:00',
    enable_etime varchar(255) NOT NULL DEFAULT '23:59',
    enable_days_of_week varchar(255) NOT NULL DEFAULT '',
    enable_in_bg smallint NOT NULL DEFAULT 0,
    notify_recovered smallint NOT NULL,
    notify_channels varchar(255) NOT NULL DEFAULT '',
    notify_groups varchar(255) NOT NULL DEFAULT '',
    notify_repeat_step int NOT NULL DEFAULT 0,
    notify_max_number int NOT NULL DEFAULT 0,
    recover_duration int NOT NULL DEFAULT 0,
    callbacks varchar(255) NOT NULL DEFAULT '',
    runbook_url varchar(255),
    append_tags varchar(255) NOT NULL DEFAULT '',
    annotations text NOT NULL,
    extra_config text NOT NULL,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    time_zone varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX alert_rule_group_id_idx ON alert_rule (group_id);
CREATE INDEX alert_rule_update_at_idx ON alert_rule (update_at);
COMMENT ON COLUMN alert_rule.group_id IS 'busi group id';
COMMENT ON COLUMN alert_rule.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_rule.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_rule.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN alert_rule.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_rule.rule_config IS 'rule_config';
COMMENT ON COLUMN alert_rule.prom_ql IS 'promql';
COMMENT ON COLUMN alert_rule.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_rule.enable_stime IS '00:00';
COMMENT ON COLUMN alert_rule.enable_etime IS '23:59';
COMMENT ON COLUMN alert_rule.enable_days_of_week IS 'split by space: 0 1 2 3 4 5 6';
COMMENT ON COLUMN alert_rule.enable_in_bg IS '1: only this bg 0: global';
COMMENT ON COLUMN alert_rule.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_rule.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_rule.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_rule.notify_repeat_step IS 'unit: min';
COMMENT ON COLUMN alert_rule.recover_duration IS 'unit: s';
COMMENT ON COLUMN alert_rule.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_rule.append_tags IS 'split by space: service=n9e mod=api';
COMMENT ON COLUMN alert_rule.annotations IS 'annotations';
COMMENT ON COLUMN alert_rule.extra_config IS 'extra_config';
-- alert_mute: mute definitions with a begin/end time window; tags are stored as jsonb.
CREATE TABLE alert_mute (
    id bigserial,
    group_id bigint NOT NULL DEFAULT 0,
    prod varchar(255) NOT NULL DEFAULT '',
    note varchar(1024) NOT NULL DEFAULT '',
    cate varchar(128) NOT NULL,
    cluster varchar(128) NOT NULL,
    datasource_ids varchar(255) NOT NULL DEFAULT '',
    tags jsonb NOT NULL,
    cause varchar(255) NOT NULL DEFAULT '',
    btime bigint NOT NULL DEFAULT 0,
    etime bigint NOT NULL DEFAULT 0,
    disabled smallint NOT NULL DEFAULT 0,
    mute_time_type smallint NOT NULL DEFAULT 0,
    periodic_mutes varchar(4096) NOT NULL DEFAULT '',
    severities varchar(32) NOT NULL DEFAULT '',
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX alert_mute_group_id_idx ON alert_mute (group_id);
CREATE INDEX alert_mute_update_at_idx ON alert_mute (update_at);
COMMENT ON COLUMN alert_mute.group_id IS 'busi group id';
COMMENT ON COLUMN alert_mute.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_mute.tags IS 'json,map,tagkey->regexp|value';
COMMENT ON COLUMN alert_mute.btime IS 'begin time';
COMMENT ON COLUMN alert_mute.etime IS 'end time';
COMMENT ON COLUMN alert_mute.disabled IS '0:enabled 1:disabled';
-- alert_subscribe: alert subscriptions that can redefine severity, channels and webhooks.
CREATE TABLE alert_subscribe (
    id bigserial,
    name varchar(255) NOT NULL DEFAULT '',
    disabled int NOT NULL DEFAULT 0,
    group_id bigint NOT NULL DEFAULT 0,
    prod varchar(255) NOT NULL DEFAULT '',
    cate varchar(128) NOT NULL,
    datasource_ids varchar(255) NOT NULL DEFAULT '',
    cluster varchar(128) NOT NULL,
    rule_id bigint NOT NULL DEFAULT 0,
    severities varchar(32) NOT NULL DEFAULT '',
    tags varchar(4096) NOT NULL DEFAULT '[]',
    redefine_severity smallint DEFAULT 0,
    new_severity smallint NOT NULL,
    redefine_channels smallint DEFAULT 0,
    new_channels varchar(255) NOT NULL DEFAULT '',
    user_group_ids varchar(250) NOT NULL,
    busi_groups varchar(4096) NOT NULL DEFAULT '[]',
    note varchar(1024) DEFAULT '',
    rule_ids varchar(1024) DEFAULT '',
    webhooks text NOT NULL,
    extra_config text NOT NULL,
    redefine_webhooks int DEFAULT 0,
    for_duration bigint NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
-- Indexes are named explicitly, like the other tables in this script. The names
-- equal the ones PostgreSQL would generate for the unnamed form.
CREATE INDEX alert_subscribe_update_at_idx ON alert_subscribe (update_at);
CREATE INDEX alert_subscribe_group_id_idx ON alert_subscribe (group_id);
COMMENT ON COLUMN alert_subscribe.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN alert_subscribe.group_id IS 'busi group id';
COMMENT ON COLUMN alert_subscribe.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_subscribe.tags IS 'json,map,tagkey->regexp|value';
COMMENT ON COLUMN alert_subscribe.redefine_severity IS 'is redefine severity?';
-- Severity scale matches alert_rule.severity and alert_cur_event.severity (1..3).
COMMENT ON COLUMN alert_subscribe.new_severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_subscribe.redefine_channels IS 'is redefine channels?';
COMMENT ON COLUMN alert_subscribe.new_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_subscribe.user_group_ids IS 'split by space 1 34 5, notify cc to user_group_ids';
COMMENT ON COLUMN alert_subscribe.note IS 'note';
COMMENT ON COLUMN alert_subscribe.rule_ids IS 'rule_ids';
COMMENT ON COLUMN alert_subscribe.extra_config IS 'extra_config';
-- target: monitored targets, unique by ident, with indexed host/agent attributes.
CREATE TABLE target (
    id bigserial,
    group_id bigint NOT NULL DEFAULT 0,
    ident varchar(191) NOT NULL,
    note varchar(255) NOT NULL DEFAULT '',
    tags varchar(512) NOT NULL DEFAULT '',
    host_tags text,
    host_ip varchar(15) DEFAULT '',
    agent_version varchar(255) DEFAULT '',
    engine_name varchar(255) DEFAULT '',
    os varchar(31) DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id),
    UNIQUE (ident)
);
-- Unnamed index: PostgreSQL chooses the name.
CREATE INDEX ON target (group_id);
CREATE INDEX idx_host_ip ON target (host_ip);
CREATE INDEX idx_agent_version ON target (agent_version);
CREATE INDEX idx_engine_name ON target (engine_name);
CREATE INDEX idx_os ON target (os);
COMMENT ON COLUMN target.group_id IS 'busi group id';
COMMENT ON COLUMN target.ident IS 'target id';
COMMENT ON COLUMN target.note IS 'append to alert event as field';
COMMENT ON COLUMN target.tags IS 'append to series data as tags, split by space, append external space at suffix';
COMMENT ON COLUMN target.host_tags IS 'global labels set in conf file';
COMMENT ON COLUMN target.host_ip IS 'IPv4 string';
COMMENT ON COLUMN target.agent_version IS 'agent version';
COMMENT ON COLUMN target.engine_name IS 'engine_name';
COMMENT ON COLUMN target.os IS 'os type';
-- metric_view: preset (cate 0) and custom (cate 1) metric views, indexed by creating user id.
CREATE TABLE metric_view (
    id bigserial,
    name varchar(191) NOT NULL DEFAULT '',
    cate smallint NOT NULL,
    configs varchar(8192) NOT NULL DEFAULT '',
    create_at bigint NOT NULL DEFAULT 0,
    create_by bigint NOT NULL DEFAULT 0,
    update_at bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id)
);
CREATE INDEX metric_view_create_by_idx ON metric_view (create_by);
COMMENT ON COLUMN metric_view.cate IS '0: preset 1: custom';
COMMENT ON COLUMN metric_view.create_by IS 'user id';
-- Seed one preset view.
INSERT INTO metric_view(name, cate, configs)
VALUES ('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}');
-- recording_rule: recording rules (promql -> new metric name), indexed by group and update time.
CREATE TABLE recording_rule (
    id bigserial,
    group_id bigint NOT NULL DEFAULT '0',
    datasource_ids varchar(255) NOT NULL DEFAULT '',
    cluster varchar(128) NOT NULL,
    name varchar(255) NOT NULL,
    note varchar(255) NOT NULL,
    disabled smallint NOT NULL DEFAULT 0,
    prom_ql varchar(8192) NOT NULL,
    prom_eval_interval int NOT NULL,
    append_tags varchar(255) DEFAULT '',
    query_configs text NOT NULL,
    create_at bigint DEFAULT '0',
    create_by varchar(64) DEFAULT '',
    update_at bigint DEFAULT '0',
    update_by varchar(64) DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX recording_rule_group_id_idx ON recording_rule (group_id);
CREATE INDEX recording_rule_update_at_idx ON recording_rule (update_at);
COMMENT ON COLUMN recording_rule.group_id IS 'group_id';
COMMENT ON COLUMN recording_rule.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN recording_rule.name IS 'new metric name';
COMMENT ON COLUMN recording_rule.note IS 'rule note';
COMMENT ON COLUMN recording_rule.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN recording_rule.prom_ql IS 'promql';
COMMENT ON COLUMN recording_rule.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN recording_rule.append_tags IS 'split by space: service=n9e mod=api';
COMMENT ON COLUMN recording_rule.query_configs IS 'query configs';
-- alert_aggr_view: preset (cate 0) and custom (cate 1) alert aggregation rules.
CREATE TABLE alert_aggr_view (
    id bigserial,
    name varchar(191) NOT NULL DEFAULT '',
    rule varchar(2048) NOT NULL DEFAULT '',
    cate smallint NOT NULL,
    create_at bigint NOT NULL DEFAULT 0,
    create_by bigint NOT NULL DEFAULT 0,
    update_at bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id)
);
CREATE INDEX alert_aggr_view_create_by_idx ON alert_aggr_view (create_by);
COMMENT ON COLUMN alert_aggr_view.cate IS '0: preset 1: custom';
COMMENT ON COLUMN alert_aggr_view.create_by IS 'user id';
-- Seed two preset aggregation views.
INSERT INTO alert_aggr_view(name, rule, cate) VALUES ('By BusiGroup, Severity', 'field:group_name::field:severity', 0);
INSERT INTO alert_aggr_view(name, rule, cate) VALUES ('By RuleName', 'field:rule_name', 0);
-- alert_cur_event: current alert events; id is not a serial (see the id column comment).
CREATE TABLE alert_cur_event (
    id bigint NOT NULL,
    cate varchar(128) NOT NULL,
    datasource_id bigint NOT NULL DEFAULT 0,
    cluster varchar(128) NOT NULL,
    group_id bigint NOT NULL,
    group_name varchar(255) NOT NULL DEFAULT '',
    hash varchar(64) NOT NULL,
    rule_id bigint NOT NULL,
    rule_name varchar(255) NOT NULL,
    rule_note varchar(2048) NOT NULL,
    rule_prod varchar(255) NOT NULL DEFAULT '',
    rule_algo varchar(255) NOT NULL DEFAULT '',
    severity smallint NOT NULL,
    prom_for_duration int NOT NULL,
    prom_ql varchar(8192) NOT NULL,
    prom_eval_interval int NOT NULL,
    callbacks varchar(255) NOT NULL DEFAULT '',
    runbook_url varchar(255),
    notify_recovered smallint NOT NULL,
    notify_channels varchar(255) NOT NULL DEFAULT '',
    notify_groups varchar(255) NOT NULL DEFAULT '',
    notify_repeat_next bigint NOT NULL DEFAULT 0,
    notify_cur_number int NOT NULL DEFAULT 0,
    target_ident varchar(191) NOT NULL DEFAULT '',
    target_note varchar(191) NOT NULL DEFAULT '',
    first_trigger_time bigint,
    trigger_time bigint NOT NULL,
    trigger_value varchar(2048) NOT NULL,
    annotations text NOT NULL,
    rule_config text NOT NULL,
    tags varchar(1024) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX alert_cur_event_hash_idx ON alert_cur_event (hash);
CREATE INDEX alert_cur_event_rule_id_idx ON alert_cur_event (rule_id);
CREATE INDEX alert_cur_event_tg_idx ON alert_cur_event (trigger_time, group_id);
CREATE INDEX alert_cur_event_nrn_idx ON alert_cur_event (notify_repeat_next);
COMMENT ON COLUMN alert_cur_event.id IS 'use alert_his_event.id';
COMMENT ON COLUMN alert_cur_event.datasource_id IS 'datasource id';
COMMENT ON COLUMN alert_cur_event.group_id IS 'busi group id of rule';
COMMENT ON COLUMN alert_cur_event.group_name IS 'busi group name';
COMMENT ON COLUMN alert_cur_event.hash IS 'rule_id + vector_pk';
COMMENT ON COLUMN alert_cur_event.rule_note IS 'alert rule note';
COMMENT ON COLUMN alert_cur_event.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_cur_event.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_cur_event.prom_ql IS 'promql';
COMMENT ON COLUMN alert_cur_event.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_cur_event.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_cur_event.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_cur_event.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_cur_event.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_cur_event.notify_repeat_next IS 'next timestamp to notify, get repeat settings from rule';
COMMENT ON COLUMN alert_cur_event.target_ident IS 'target ident, also in tags';
COMMENT ON COLUMN alert_cur_event.target_note IS 'target note';
COMMENT ON COLUMN alert_cur_event.annotations IS 'annotations';
COMMENT ON COLUMN alert_cur_event.rule_config IS 'rule_config';
COMMENT ON COLUMN alert_cur_event.tags IS 'merge data_tags rule_tags, split by ,,';
-- alert_his_event: alert event history; alert_cur_event.id reuses ids from this table.
CREATE TABLE alert_his_event (
    id bigserial,
    is_recovered smallint NOT NULL,
    cate varchar(128) NOT NULL,
    datasource_id bigint NOT NULL DEFAULT 0,
    cluster varchar(128) NOT NULL,
    group_id bigint NOT NULL,
    group_name varchar(255) NOT NULL DEFAULT '',
    hash varchar(64) NOT NULL,
    rule_id bigint NOT NULL,
    rule_name varchar(255) NOT NULL,
    -- NOTE(review): this default looks like a column comment that became a value; kept as-is — confirm.
    rule_note varchar(2048) NOT NULL DEFAULT 'alert rule note',
    rule_prod varchar(255) NOT NULL DEFAULT '',
    rule_algo varchar(255) NOT NULL DEFAULT '',
    severity smallint NOT NULL,
    prom_for_duration int NOT NULL,
    prom_ql varchar(8192) NOT NULL,
    prom_eval_interval int NOT NULL,
    callbacks varchar(255) NOT NULL DEFAULT '',
    runbook_url varchar(255),
    notify_recovered smallint NOT NULL,
    notify_channels varchar(255) NOT NULL DEFAULT '',
    notify_groups varchar(255) NOT NULL DEFAULT '',
    notify_cur_number int NOT NULL DEFAULT 0,
    target_ident varchar(191) NOT NULL DEFAULT '',
    target_note varchar(191) NOT NULL DEFAULT '',
    first_trigger_time bigint,
    trigger_time bigint NOT NULL,
    trigger_value varchar(2048) NOT NULL,
    recover_time bigint NOT NULL DEFAULT 0,
    last_eval_time bigint NOT NULL DEFAULT 0,
    tags varchar(1024) NOT NULL DEFAULT '',
    annotations text NOT NULL,
    rule_config text NOT NULL,
    PRIMARY KEY (id)
);
CREATE INDEX alert_his_event_hash_idx ON alert_his_event (hash);
CREATE INDEX alert_his_event_rule_id_idx ON alert_his_event (rule_id);
CREATE INDEX alert_his_event_tg_idx ON alert_his_event (trigger_time, group_id);
-- The index name says "nrn" but it covers last_eval_time; the name is kept unchanged.
CREATE INDEX alert_his_event_nrn_idx ON alert_his_event (last_eval_time);
COMMENT ON COLUMN alert_his_event.group_id IS 'busi group id of rule';
COMMENT ON COLUMN alert_his_event.datasource_id IS 'datasource id';
COMMENT ON COLUMN alert_his_event.group_name IS 'busi group name';
COMMENT ON COLUMN alert_his_event.hash IS 'rule_id + vector_pk';
COMMENT ON COLUMN alert_his_event.rule_note IS 'alert rule note';
-- Severity scale matches alert_rule.severity and alert_cur_event.severity (1..3).
COMMENT ON COLUMN alert_his_event.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_his_event.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_his_event.prom_ql IS 'promql';
COMMENT ON COLUMN alert_his_event.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_his_event.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_his_event.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_his_event.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_his_event.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_his_event.target_ident IS 'target ident, also in tags';
COMMENT ON COLUMN alert_his_event.target_note IS 'target note';
COMMENT ON COLUMN alert_his_event.last_eval_time IS 'for time filter';
COMMENT ON COLUMN alert_his_event.tags IS 'merge data_tags rule_tags, split by ,,';
COMMENT ON COLUMN alert_his_event.annotations IS 'annotations';
COMMENT ON COLUMN alert_his_event.rule_config IS 'rule_config';
-- task_tpl: task templates (script, args, batch/tolerance/timeout) per busi group.
CREATE TABLE task_tpl (
    id serial,
    group_id int NOT NULL,
    title varchar(255) NOT NULL DEFAULT '',
    account varchar(64) NOT NULL,
    batch int NOT NULL DEFAULT 0,
    tolerance int NOT NULL DEFAULT 0,
    timeout int NOT NULL DEFAULT 0,
    pause varchar(255) NOT NULL DEFAULT '',
    script text NOT NULL,
    args varchar(512) NOT NULL DEFAULT '',
    tags varchar(255) NOT NULL DEFAULT '',
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX task_tpl_group_id_idx ON task_tpl (group_id);
COMMENT ON COLUMN task_tpl.group_id IS 'busi group id';
COMMENT ON COLUMN task_tpl.tags IS 'split by space';
-- task_tpl_host: hosts attached to a task template; ii is the surrogate key, id is the template id.
CREATE TABLE task_tpl_host (
    ii serial,
    id int NOT NULL,
    host varchar(128) NOT NULL,
    PRIMARY KEY (ii)
);
CREATE INDEX task_tpl_host_id_host_idx ON task_tpl_host (id, host);
COMMENT ON COLUMN task_tpl_host.id IS 'task tpl id';
COMMENT ON COLUMN task_tpl_host.host IS 'ip or hostname';
-- task_record: records of ibex tasks; id is the ibex task id, not a serial.
CREATE TABLE task_record (
    id bigint NOT NULL,
    event_id bigint NOT NULL DEFAULT 0,
    group_id bigint NOT NULL,
    ibex_address varchar(128) NOT NULL,
    ibex_auth_user varchar(128) NOT NULL DEFAULT '',
    ibex_auth_pass varchar(128) NOT NULL DEFAULT '',
    title varchar(255) NOT NULL DEFAULT '',
    account varchar(64) NOT NULL,
    batch int NOT NULL DEFAULT 0,
    tolerance int NOT NULL DEFAULT 0,
    timeout int NOT NULL DEFAULT 0,
    pause varchar(255) NOT NULL DEFAULT '',
    script text NOT NULL,
    args varchar(512) NOT NULL DEFAULT '',
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX task_record_cg_idx ON task_record (create_at, group_id);
CREATE INDEX task_record_create_by_idx ON task_record (create_by);
CREATE INDEX task_record_event_id_idx ON task_record (event_id);
COMMENT ON COLUMN task_record.id IS 'ibex task id';
COMMENT ON COLUMN task_record.group_id IS 'busi group id';
COMMENT ON COLUMN task_record.event_id IS 'event id';
-- alerting_engines: one row per engine instance / datasource with a clock value.
CREATE TABLE alerting_engines (
    id serial,
    instance varchar(128) NOT NULL DEFAULT '',
    datasource_id bigint NOT NULL DEFAULT 0,
    engine_cluster varchar(128) NOT NULL DEFAULT '',
    clock bigint NOT NULL,
    PRIMARY KEY (id)
);
COMMENT ON COLUMN alerting_engines.instance IS 'instance identification, e.g. 10.9.0.9:9090';
COMMENT ON COLUMN alerting_engines.datasource_id IS 'datasource id';
COMMENT ON COLUMN alerting_engines.engine_cluster IS 'target reader cluster';
-- datasource: datasource definitions, unique by name.
CREATE TABLE datasource (
    id serial,
    name varchar(191) NOT NULL DEFAULT '',
    identifier varchar(255) NOT NULL DEFAULT '',
    description varchar(255) NOT NULL DEFAULT '',
    category varchar(255) NOT NULL DEFAULT '',
    plugin_id int NOT NULL DEFAULT 0,
    plugin_type varchar(255) NOT NULL DEFAULT '',
    plugin_type_name varchar(255) NOT NULL DEFAULT '',
    cluster_name varchar(255) NOT NULL DEFAULT '',
    settings text NOT NULL,
    status varchar(255) NOT NULL DEFAULT '',
    http varchar(4096) NOT NULL DEFAULT '',
    auth varchar(8192) NOT NULL DEFAULT '',
    is_default boolean NOT NULL DEFAULT false,
    weight int NOT NULL DEFAULT 0,
    created_at bigint NOT NULL DEFAULT 0,
    created_by varchar(64) NOT NULL DEFAULT '',
    updated_at bigint NOT NULL DEFAULT 0,
    updated_by varchar(64) NOT NULL DEFAULT '',
    UNIQUE (name),
    PRIMARY KEY (id)
);
-- builtin_cate: named categories, each carrying a user_id.
CREATE TABLE builtin_cate (
    id bigserial,
    name varchar(191) NOT NULL,
    user_id bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id)
);
-- notify_tpl: notification templates; at most one row per channel.
CREATE TABLE notify_tpl (
    id bigserial,
    channel varchar(32) NOT NULL,
    name varchar(255) NOT NULL,
    content text NOT NULL,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id),
    UNIQUE (channel)
);
-- sso_config: SSO configuration content, unique by name.
CREATE TABLE sso_config (
    id bigserial,
    name varchar(191) NOT NULL,
    content text NOT NULL,
    update_at bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id),
    UNIQUE (name)
);
-- es_index_pattern: index patterns per datasource; (datasource_id, name) is unique.
CREATE TABLE es_index_pattern (
    id bigserial,
    datasource_id bigint NOT NULL DEFAULT 0,
    name varchar(191) NOT NULL,
    time_field varchar(128) NOT NULL DEFAULT '@timestamp',
    allow_hide_system_indices smallint NOT NULL DEFAULT 0,
    fields_format varchar(4096) NOT NULL DEFAULT '',
    cross_cluster_enabled int NOT NULL DEFAULT 0,
    create_at bigint DEFAULT '0',
    create_by varchar(64) DEFAULT '',
    update_at bigint DEFAULT '0',
    update_by varchar(64) DEFAULT '',
    note varchar(4096) NOT NULL DEFAULT '',
    PRIMARY KEY (id),
    UNIQUE (datasource_id, name)
);
COMMENT ON COLUMN es_index_pattern.datasource_id IS 'datasource id';
-- The previous text ('description of metric in Chinese') was copied from
-- builtin_metrics.note and does not describe this column.
COMMENT ON COLUMN es_index_pattern.note IS 'note';
-- builtin_metrics: built-in metric definitions; (lang, collector, typ, name) is unique.
CREATE TABLE builtin_metrics (
    id bigserial,
    collector varchar(191) NOT NULL,
    typ varchar(191) NOT NULL,
    name varchar(191) NOT NULL,
    unit varchar(191) NOT NULL,
    lang varchar(191) NOT NULL DEFAULT '',
    note varchar(4096) NOT NULL,
    expression varchar(4096) NOT NULL,
    expression_type varchar(32) NOT NULL DEFAULT 'promql',
    metric_type varchar(191) NOT NULL DEFAULT '',
    extra_fields text,
    created_at bigint NOT NULL DEFAULT 0,
    created_by varchar(191) NOT NULL DEFAULT '',
    updated_at bigint NOT NULL DEFAULT 0,
    updated_by varchar(191) NOT NULL DEFAULT '',
    uuid bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id),
    UNIQUE (lang, collector, typ, name)
);
CREATE INDEX idx_collector ON builtin_metrics (collector);
CREATE INDEX idx_typ ON builtin_metrics (typ);
CREATE INDEX idx_name ON builtin_metrics (name);
CREATE INDEX idx_lang ON builtin_metrics (lang);
COMMENT ON COLUMN builtin_metrics.id IS 'unique identifier';
COMMENT ON COLUMN builtin_metrics.collector IS 'type of collector';
COMMENT ON COLUMN builtin_metrics.typ IS 'type of metric';
COMMENT ON COLUMN builtin_metrics.name IS 'name of metric';
COMMENT ON COLUMN builtin_metrics.unit IS 'unit of metric';
COMMENT ON COLUMN builtin_metrics.lang IS 'language of metric';
COMMENT ON COLUMN builtin_metrics.note IS 'description of metric in Chinese';
COMMENT ON COLUMN builtin_metrics.expression IS 'expression of metric';
COMMENT ON COLUMN builtin_metrics.expression_type IS 'expression type: metric_name or promql';
COMMENT ON COLUMN builtin_metrics.metric_type IS 'metric type like counter/gauge';
COMMENT ON COLUMN builtin_metrics.extra_fields IS 'custom extra fields';
COMMENT ON COLUMN builtin_metrics.created_at IS 'create time';
COMMENT ON COLUMN builtin_metrics.created_by IS 'creator';
COMMENT ON COLUMN builtin_metrics.updated_at IS 'update time';
COMMENT ON COLUMN builtin_metrics.updated_by IS 'updater';
COMMENT ON COLUMN builtin_metrics.uuid IS 'unique identifier';
-- metric_filter: named filter configs with optional group permissions; name is indexed.
CREATE TABLE metric_filter (
    id bigserial,
    name varchar(191) NOT NULL,
    configs varchar(4096) NOT NULL,
    groups_perm text,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(191) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(191) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX idx_metric_filter_name ON metric_filter (name);
-- board_busigroup: (busi_group_id, board_id) pairs; the pair is the primary key.
CREATE TABLE board_busigroup (
    busi_group_id bigint NOT NULL DEFAULT 0,
    board_id bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (busi_group_id, board_id)
);
-- builtin_components: built-in components (ident, logo, readme); ident is indexed.
CREATE TABLE builtin_components (
    id bigserial,
    ident varchar(191) NOT NULL,
    logo varchar(191) NOT NULL,
    readme text NOT NULL,
    disabled int NOT NULL DEFAULT 0,
    created_at bigint NOT NULL DEFAULT 0,
    created_by varchar(191) NOT NULL DEFAULT '',
    updated_at bigint NOT NULL DEFAULT 0,
    updated_by varchar(191) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX idx_ident ON builtin_components (ident);
-- builtin_payloads: built-in payload content, indexed by component, name, cate and type.
CREATE TABLE builtin_payloads (
    id bigserial,
    type varchar(191) NOT NULL,
    uuid bigint NOT NULL DEFAULT 0,
    component varchar(191) NOT NULL,
    cate varchar(191) NOT NULL,
    name varchar(191) NOT NULL,
    tags varchar(191) NOT NULL DEFAULT '',
    content text NOT NULL,
    note varchar(1024) NOT NULL DEFAULT '',
    created_at bigint NOT NULL DEFAULT 0,
    created_by varchar(191) NOT NULL DEFAULT '',
    updated_at bigint NOT NULL DEFAULT 0,
    updated_by varchar(191) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX idx_component ON builtin_payloads (component);
CREATE INDEX idx_builtin_payloads_name ON builtin_payloads (name);
CREATE INDEX idx_cate ON builtin_payloads (cate);
CREATE INDEX idx_type ON builtin_payloads (type);
-- dash_annotation: annotations on a dashboard panel with a start/end time range.
CREATE TABLE dash_annotation (
    id bigserial,
    dashboard_id bigint NOT NULL,
    panel_id varchar(191) NOT NULL,
    tags text,
    description text,
    config text,
    time_start bigint NOT NULL DEFAULT 0,
    time_end bigint NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
-- source_token: tokens bound to a (source_type, source_id) pair, with an expire_at value.
CREATE TABLE source_token (
    id bigserial,
    source_type varchar(64) NOT NULL DEFAULT '',
    source_id varchar(255) NOT NULL DEFAULT '',
    token varchar(255) NOT NULL DEFAULT '',
    expire_at bigint NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
CREATE INDEX idx_source_token_type_id_token ON source_token (source_type, source_id, token);
-- notification_record: per-event notification attempts (channel, target, status); indexed by event_id.
CREATE TABLE notification_record (
    id bigserial,
    notify_rule_id bigint NOT NULL DEFAULT 0,
    event_id bigint NOT NULL,
    sub_id bigint DEFAULT NULL,
    channel varchar(255) NOT NULL,
    status bigint DEFAULT NULL,
    target varchar(1024) NOT NULL,
    details varchar(2048) DEFAULT '',
    created_at bigint NOT NULL,
    PRIMARY KEY (id)
);
CREATE INDEX idx_evt ON notification_record (event_id);
COMMENT ON COLUMN notification_record.event_id IS 'event history id';
COMMENT ON COLUMN notification_record.sub_id IS 'subscribed rule id';
COMMENT ON COLUMN notification_record.channel IS 'notification channel name';
COMMENT ON COLUMN notification_record.status IS 'notification status';
COMMENT ON COLUMN notification_record.target IS 'notification target';
COMMENT ON COLUMN notification_record.details IS 'notification other info';
COMMENT ON COLUMN notification_record.created_at IS 'create time';
-- target_busi_group: (target_ident, group_id) pairs; the pair is unique.
CREATE TABLE target_busi_group (
    id bigserial,
    target_ident varchar(191) NOT NULL,
    group_id bigint NOT NULL,
    update_at bigint NOT NULL,
    PRIMARY KEY (id)
);
CREATE UNIQUE INDEX idx_target_group ON target_busi_group (target_ident, group_id);
-- user_token: named tokens per username with creation and last-used timestamps.
CREATE TABLE user_token (
    id bigserial,
    username varchar(255) NOT NULL DEFAULT '',
    token_name varchar(255) NOT NULL DEFAULT '',
    token varchar(255) NOT NULL DEFAULT '',
    create_at bigint NOT NULL DEFAULT 0,
    last_used bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (id)
);
-- notify_rule: notification rules holding notify and pipeline configs as text.
CREATE TABLE notify_rule (
    id bigserial,
    name varchar(255) NOT NULL,
    description text,
    enable boolean DEFAULT false,
    user_group_ids varchar(255) NOT NULL DEFAULT '',
    notify_configs text,
    pipeline_configs text,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
-- notify_channel: notification channels (ident, request type, param/request configs as text).
CREATE TABLE notify_channel (
    id bigserial,
    name varchar(255) NOT NULL,
    ident varchar(255) NOT NULL,
    description text,
    enable boolean DEFAULT false,
    param_config text,
    request_type varchar(50) NOT NULL,
    request_config text,
    weight int NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
-- message_template: message templates carrying a notify_channel_ident value.
CREATE TABLE message_template (
    id bigserial,
    name varchar(64) NOT NULL,
    ident varchar(64) NOT NULL,
    content text,
    user_group_ids varchar(64),
    notify_channel_ident varchar(64) NOT NULL DEFAULT '',
    private int NOT NULL DEFAULT 0,
    weight int NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
-- event_pipeline: named event pipelines.
-- filter_enable is a smallint flag defaulting to 0; label_filters,
-- attribute_filters and processors are free-form text columns (their
-- serialization format is not visible in this script). team_ids is text.
-- create_*/update_* are the audit columns shared by the neighbouring tables.
CREATE TABLE event_pipeline (
    id                 bigserial,
    name               varchar(128) NOT NULL,
    team_ids           text,
    description        varchar(255) NOT NULL DEFAULT '',
    filter_enable      smallint     NOT NULL DEFAULT 0,
    label_filters      text,
    attribute_filters  text,
    processors         text,
    create_at          bigint       NOT NULL DEFAULT 0,
    create_by          varchar(64)  NOT NULL DEFAULT '',
    update_at          bigint       NOT NULL DEFAULT 0,
    update_by          varchar(64)  NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
-- embedded_product: a named URL entry with an is_private flag and a team_ids
-- string. name, url and is_private are nullable and default to NULL.
-- create_*/update_* are the audit columns shared by the neighbouring tables.
CREATE TABLE embedded_product (
    id          bigserial,
    name        varchar(255)          DEFAULT NULL,
    url         varchar(255)          DEFAULT NULL,
    is_private  boolean               DEFAULT NULL,
    team_ids    varchar(255),
    create_at   bigint       NOT NULL DEFAULT 0,
    create_by   varchar(64)  NOT NULL DEFAULT '',
    update_at   bigint       NOT NULL DEFAULT 0,
    update_by   varchar(64)  NOT NULL DEFAULT '',
    PRIMARY KEY (id)
);
================================================
FILE: docker/compose-postgres/initsql_for_postgres/b-ibex-for-Postgres.sql
================================================
-- task_meta: one row per task, holding the script to run together with its
-- args/stdin and execution knobs (batch, tolerance, timeout, pause).
-- created defaults to the insertion time; creator and created are indexed.
create table task_meta
(
    id         bigserial     primary key,
    title      varchar(255)  not null default '',
    account    varchar(64)   not null,
    batch      integer       not null default 0,
    tolerance  integer       not null default 0,
    timeout    integer       not null default 0,
    pause      varchar(255)  not null default '',
    script     text          not null,
    args       varchar(512)  not null default '',
    stdin      varchar(1024) not null default '',
    creator    varchar(64)   not null default '',
    created    timestamp     not null default current_timestamp
);
create index task_meta_creator_idx on task_meta (creator);
create index task_meta_created_idx on task_meta (created);
-- task_action: one row per id (the primary key) recording an action and a clock value.
-- action is one of: start | cancel | kill | pause
-- (documented values only; no CHECK constraint enforces them).
create table task_action
(
    id      bigint      not null primary key,
    action  varchar(32) not null,
    clock   bigint      not null default 0
);
-- task_scheduler: pairs an id with a scheduler name.
-- The table has no primary key; lookups go through the (id, scheduler) index.
create table task_scheduler
(
    id         bigint       not null,
    scheduler  varchar(128) not null default ''
);
create index task_scheduler_id_scheduler_idx
    on task_scheduler (id, scheduler);
-- task_scheduler_health: at most one row per scheduler (unique), each with a
-- clock value (presumably the last heartbeat time -- confirm with the writer).
-- clock is indexed.
create table task_scheduler_health
(
    scheduler  varchar(128) not null unique,
    clock      bigint       not null
);
create index task_scheduler_health_clock_idx
    on task_scheduler_health (clock);
-- task_host_doing: (id, host) rows with a clock value and an action.
-- No primary key or uniqueness is declared; id and host are indexed separately.
create table task_host_doing
(
    id      bigint       not null,
    host    varchar(128) not null,
    clock   bigint       not null default 0,
    action  varchar(16)  not null
);
create index task_host_doing_id_idx   on task_host_doing (id);
create index task_host_doing_host_idx on task_host_doing (host);
-- task_host_0 .. task_host_99: one hundred identically shaped tables that
-- store, per (id, host), a status plus the captured stdout/stderr.
-- (Presumably rows are spread over the tables by task id -- confirm against
-- the ibex code before changing the shard count.)
--
-- The tables are generated in a loop instead of being spelled out 100 times,
-- so the column list lives in exactly one place. Auto-generated object names
-- (task_host_N_pkey, task_host_N_id_host_key, task_host_N_ii_seq) are the
-- same as with hand-written CREATE TABLE statements.
DO $$
BEGIN
    FOR shard IN 0..99 LOOP
        -- %I quotes the generated table name as an identifier.
        EXECUTE format(
            'CREATE TABLE %I
             (
                 ii     bigserial,
                 id     bigint       not null,
                 host   varchar(128) not null,
                 status varchar(32)  not null,
                 stdout text,
                 stderr text,
                 UNIQUE (id, host),
                 PRIMARY KEY (ii)
             )',
            'task_host_' || shard
        );
    END LOOP;
END
$$;
================================================
FILE: docker/compose-postgres/n9eetc_pg/config.toml
================================================
[Global]
RunMode = "release"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[HTTP.ShowCaptcha]
Enable = false
[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]
[HTTP.RSA]
# open RSA
OpenRSA = false
# RSA public key
RSAPublicKeyPath = "/etc/n9e/public.pem"
# RSA private key
RSAPrivateKeyPath = "/etc/n9e/private.pem"
# RSA private key password
RSAPassWord = ""
[DB]
DSN="host=postgres port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "postgres"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "redis:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""
[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"
# [Alert.Alerting]
# NotifyConcurrency = 10
[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"
[Center.AnonymousAccess]
PromQuerier = true
AlertDetail = true
[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true
# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"
# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000
[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://victoriametrics:8428/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# extra http request headers
Headers = ["X-From", "n9e"]
# timeout settings, unit: ms
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Pushgw.Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"
================================================
FILE: docker/compose-postgres/n9eetc_pg/metrics.yaml
================================================
cpu_usage_idle: CPU空闲率(单位:%)
cpu_usage_active: CPU使用率(单位:%)
cpu_usage_system: CPU内核态时间占比(单位:%)
cpu_usage_user: CPU用户态时间占比(单位:%)
cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%)
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%)
cpu_usage_irq: CPU处理硬中断的时间占比(单位:%)
cpu_usage_softirq: CPU处理软中断的时间占比(单位:%)
cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%)
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%)
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%)
disk_free: 硬盘分区剩余量(单位:byte)
disk_used: 硬盘分区使用量(单位:byte)
disk_used_percent: 硬盘分区使用率(单位:%)
disk_total: 硬盘分区总量(单位:byte)
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型
diskio_merged_reads: 相邻读请求merge读的次数,counter类型
diskio_merged_writes: 相邻写请求merge写的次数,counter类型
diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒)
diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 应用程序可用内存数
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache )
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存数
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存)
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和available的内存)
mem_low_free: 未被使用的低位大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab)
mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_total: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
net_bytes_recv: 网卡收包总数(bytes)
net_bytes_sent: 网卡发包总数(bytes)
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
swap_used_percent: Swap空间使用率(单位:%)
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统已运行时长
nginx_accepts: 自nginx启动起,与客户端建立过的连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active - (reading + writing),意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果,0为正常,否则url无法访问
# [mysqld_exporter]
mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge)
mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge)
mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter)
mysql_global_status_threads_connected: The number of currently open connections.(Gauge)
mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter)
mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge)
mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge)
mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter)
mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge)
mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. The cache miss rate can be calculated as Threads_created/Connections.(Counter)
mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter)
mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter)
mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter)
mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter)
mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter)
mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter)
mysql_global_status_sort_rows: The number of sorted rows.(Counter)
mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter)
mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter)
mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter)
mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter)
mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter)
mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter)
mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter)
mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter)
mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter)
mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter)
mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge)
mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge)
mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter)
mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. This is in fact the layer between the Storage Engine and MySQL.(Counter)
mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter)
mysql_global_status_open_tables: The number of tables that are open.(Gauge)
mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter)
mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter)
mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge)
mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter)
mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter)
mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter)
mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge)
mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge)
mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. The default value is 134217728 bytes (128MB).(Gauge)
mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge)
mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge)
mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge)
mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge)
mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge)
# [redis_exporter]
redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize.
redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation.
redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory.
redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes.
redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio).
redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting).
redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active.
redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS.
redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any.
redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation.
redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation.
redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds.
redis_aof_last_write_status: Status of the last write operation to the AOF.
redis_aof_rewrite_in_progress: Flag indicating an AOF rewrite operation is on-going.
redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete.
redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX).
redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections.
redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections.
redis_cluster_enabled: Indicate Redis cluster is enabled.
redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter)
redis_commands_processed_total: Total number of commands processed by the server.(Counter)
redis_commands_total: The number of calls that reached command execution (not rejected).(Counter)
redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections.
redis_config_maxmemory: The value of the maxmemory configuration directive.
redis_connected_clients: Number of client connections (excluding connections from replicas).
redis_connected_slaves: Number of connected replicas.
redis_connections_received_total: Total number of connections accepted by the server.(Counter)
redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter)
redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter)
redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_db_keys: Total number of keys by DB.
redis_db_keys_expiring: Total number of expiring keys by DB
redis_defrag_hits: Number of value reallocations performed by the active defragmentation process.
redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process.
redis_defrag_key_hits: Number of keys that were actively defragmented.
redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process.
redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter)
redis_expired_keys_total: Total number of key expiration events.(Counter)
redis_expired_stale_percentage: The percentage of keys probably expired.
redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early.
redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape.
redis_exporter_last_scrape_duration_seconds: The last scrape duration.
redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter
redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter
redis_exporter_scrapes_total: Current total redis scrapes.(Counter)
redis_instance_info: Information about the Redis instance.
redis_keyspace_hits_total: Hits total.(Counter)
redis_keyspace_misses_total: Misses total.(Counter)
redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds.
redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds.
redis_latest_fork_seconds: The amount of time needed for last fork, in seconds.
redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option).
redis_master_repl_offset: The server's current replication offset.
redis_mem_clients_normal: Memory used by normal clients.(Gauge)
redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage.
redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue.
redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only include fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc.
redis_mem_not_counted_for_eviction_bytes: Used memory that is not counted for key eviction. This is basically transient replica and AOF buffers.(Gauge)
redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc)
redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory)
redis_memory_used_lua_bytes: Number of bytes used by the Lua engine.
redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures.
redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes)
redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1)
redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts
redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes
redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes
redis_net_input_bytes_total: Total input bytes(Counter)
redis_net_output_bytes_total: Total output bytes(Counter)
redis_process_id: Process ID
redis_pubsub_channels: Global number of pub/sub channels with client subscriptions
redis_pubsub_patterns: Global number of pub/sub patterns with client subscriptions
redis_rdb_bgsave_in_progress: Flag indicating an RDB save is on-going
redis_rdb_changes_since_last_save: Number of changes since the last dump
redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any
redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds
redis_rdb_last_bgsave_status: Status of the last RDB save operation
redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation
redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save
redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter)
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer
redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer
redis_repl_backlog_is_active: Flag indicating replication backlog is active
redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge)
redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge)
redis_replica_resyncs_full: The number of full resyncs with replicas
redis_replication_backlog_bytes: Memory used by replication backlog
redis_second_repl_offset: The offset up to which replication IDs are accepted.
redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge)
redis_slowlog_last_id: Last id of slowlog
redis_slowlog_length: Number of entries in the slowlog
redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds.
redis_target_scrape_request_errors_total: Errors in requests to the exporter
redis_up: Flag indicating redis instance is up
redis_uptime_in_seconds: Number of seconds since Redis server start
# [windows_exporter]
windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter)
windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge)
windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter)
windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter)
windows_cpu_idle_break_events_total: Total number of times the processor was woken from idle(counter)
windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter)
windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge)
windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the processor. On some processors, Processor Performance may exceed 100%(gauge)
windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter)
windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge)
windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge)
windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge)
windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge)
windows_exporter_collector_duration_seconds: Duration of a collection.(gauge)
windows_exporter_collector_success: Whether the collector was successful.(gauge)
windows_exporter_collector_timeout: Whether the collector timed out.(gauge)
windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge)
windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge)
windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter)
windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter)
windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter)
windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter)
windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter)
windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter)
windows_logical_disk_requests_queued: The number of requests queued to the disk (LogicalDisk.CurrentDiskQueueLength)(gauge)
windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge)
windows_logical_disk_split_ios_total: The number of I/Os to the disk that were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter)
windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter)
windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter)
windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter)
windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter)
windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter)
windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter)
windows_net_bytes_total: (Network.BytesTotalPerSec)(counter)
windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge)
windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter)
windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter)
windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter)
windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter)
windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter)
windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter)
windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter)
windows_net_packets_total: (Network.PacketsPerSec)(counter)
windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge)
windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge)
windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge)
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge)
windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge)
windows_os_processes: OperatingSystem.NumberOfProcesses(gauge)
windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge)
windows_os_time: OperatingSystem.LocalDateTime(gauge)
windows_os_timezone: OperatingSystem.LocalDateTime(gauge)
windows_os_users: OperatingSystem.NumberOfUsers(gauge)
windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge)
windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge)
windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge)
windows_service_info: A metric with a constant '1' value labeled with service information(gauge)
windows_service_start_mode: The start mode of the service (StartMode)(gauge)
windows_service_state: The state of the service (State)(gauge)
windows_service_status: The status of the service (Status)(gauge)
windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter)
windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter)
windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge)
windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter)
windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge)
windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge)
# [node_exporter]
# SYSTEM
# CPU context switch 次数
node_context_switches_total: context_switches
# Interrupts 次数
node_intr_total: Interrupts
# 运行的进程数
node_procs_running: Processes in runnable state
# 熵池大小
node_entropy_available_bits: Entropy available to random number generators
node_time_seconds: System time in seconds since epoch (1970)
node_boot_time_seconds: Node boot time, in unixtime
# CPU
node_cpu_seconds_total: Seconds the CPUs spent in each mode
node_load1: cpu load 1m
node_load5: cpu load 5m
node_load15: cpu load 15m
# MEM
# 内核态
# 用户追踪已从交换区获取但尚未修改的页面的内存
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 内核用于缓存数据结构供自己使用的内存
node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use
# slab中可回收的部分
node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches
# slab中不可回收的部分
node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure
# Vmalloc内存区的大小
node_memory_VmallocTotal_bytes: Total size of vmalloc memory area
# vmalloc已分配的内存,虚拟地址空间上的连续的内存
node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used
# vmalloc区可用的连续最大块的大小,通过此指标可以知道vmalloc可分配连续内存的最大值
node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free
# 内存的硬件故障删除掉的内存页的总大小
node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working
# 用于在虚拟和物理内存地址之间映射的内存
node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge)
# 内核栈内存,常驻内存,不可回收
node_memory_KernelStack_bytes: Kernel memory stack. This is not reclaimable
# 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能
node_memory_Bounce_bytes: Memory used for block device bounce buffers
#用户态
# 单个巨页大小
node_memory_Hugepagesize_bytes: Huge Page size
# 系统分配的常驻巨页数
node_memory_HugePages_Total: Total size of the pool of huge pages
# 系统空闲的巨页数
node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated
# 进程已申请但未使用的巨页数
node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation
# 超过系统设定的常驻HugePages数量的个数
node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages
# 透明巨页 Transparent HugePages (THP)
node_memory_AnonHugePages_bytes: Memory in anonymous huge pages
# inactivelist中的File-backed内存
node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list
# inactivelist中的Anonymous内存
node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)
# activelist中的File-backed内存
node_memory_Active_file_bytes: File-backed memory on active LRU list
# activelist中的Anonymous内存
node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs
# 禁止换出的页,对应 Unevictable 链表
node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons
# 共享内存
node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks)
# 匿名页内存大小
node_memory_AnonPages_bytes: Memory in user pages not backed by files
# 被关联的内存页大小
node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries
# file-backed内存页缓存大小
node_memory_Cached_bytes: Parked file data (file content) cache
# 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 被mlock()系统调用锁定的内存大小
node_memory_Mlocked_bytes: Size of pages locked to memory using the mlock() system call
# 块设备(block device)所占用的缓存页
node_memory_Buffers_bytes: Block device (e.g. harddisk) cache
node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes
node_memory_SwapFree_bytes: Memory information field SwapFree_bytes
# DISK
node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes
node_filesystem_free_bytes: Filesystem free space in bytes
node_filesystem_size_bytes: Filesystem size in bytes
node_filesystem_files_free: Filesystem total free file nodes
node_filesystem_files: Filesystem total file nodes
node_filefd_maximum: Max open files
node_filefd_allocated: Open files
node_filesystem_readonly: Filesystem read-only status
node_filesystem_device_error: Whether an error occurred while getting statistics for the given device
node_disk_reads_completed_total: The total number of reads completed successfully
node_disk_writes_completed_total: The total number of writes completed successfully
node_disk_reads_merged_total: The number of reads merged
node_disk_writes_merged_total: The number of writes merged
node_disk_read_bytes_total: The total number of bytes read successfully
node_disk_written_bytes_total: The total number of bytes written successfully
node_disk_io_time_seconds_total: Total seconds spent doing I/Os
node_disk_read_time_seconds_total: The total number of seconds spent by all reads
node_disk_write_time_seconds_total: The total number of seconds spent by all writes
node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os
# NET
node_network_receive_bytes_total: Network device statistic receive_bytes (counter)
node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter)
node_network_receive_packets_total: Network device statistic receive_packets
node_network_transmit_packets_total: Network device statistic transmit_packets
node_network_receive_errs_total: Network device statistic receive_errs
node_network_transmit_errs_total: Network device statistic transmit_errs
node_network_receive_drop_total: Network device statistic receive_drop
node_network_transmit_drop_total: Network device statistic transmit_drop
node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking
node_sockstat_TCP_alloc: Number of TCP sockets in state alloc
node_sockstat_TCP_inuse: Number of TCP sockets in state inuse
node_sockstat_TCP_orphan: Number of TCP sockets in state orphan
node_sockstat_TCP_tw: Number of TCP sockets in state tw
node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab
node_sockstat_sockets_used: Number of IPv4 sockets in use
# [kafka_exporter]
kafka_brokers: count of kafka_brokers (gauge)
kafka_topic_partitions: Number of partitions for this Topic (gauge)
kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge)
kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge)
kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge)
kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated
# [zookeeper_exporter]
zk_znode_count: The total count of znodes stored
zk_ephemerals_count: The number of ephemeral nodes
zk_watch_count: The number of watchers setup over Zookeeper nodes.
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree
zk_outstanding_requests: Number of currently executing requests
zk_packets_sent: Count of the number of zookeeper packets sent from a server
zk_packets_received: Count of the number of zookeeper packets received by a server
zk_num_alive_connections: Number of active clients connected to a zookeeper server
zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open
zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open
zk_avg_latency: Average time in milliseconds for requests to be processed
zk_min_latency: Minimum time in milliseconds for a request to be processed
zk_max_latency: Maximum time in milliseconds for a request to be processed
================================================
FILE: docker/compose-postgres/prometc_vm/prometheus.yml
================================================
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Scrape configurations: VictoriaMetrics itself (static target) and the
# n9e server (targets read from targets.json via file_sd_configs).
scrape_configs:
# The job name is added as a label `job=` to any timeseries scraped from this config.
- job_name: 'victoriametrics'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['victoriametrics:8428']
- job_name: 'n9e'
# static_configs:
# - targets: ['n9e:17000']
file_sd_configs:
- files:
- targets.json
remote_write:
- url: 'http://n9e:17000/prometheus/v1/write'
================================================
FILE: docker/compose-postgres/prometc_vm/targets.json
================================================
[
{
"targets": [
"n9e:17000"
]
}
]
================================================
FILE: docker/initsql/a-n9e.sql
================================================
-- Use utf8mb4 as the character set for this client session.
set names utf8mb4;
-- drop database if exists n9e_v6;
-- Create the n9e_v6 database and make it the default for the statements that follow.
-- Note: without the (commented-out) drop above, this fails if n9e_v6 already exists.
create database n9e_v6;
use n9e_v6;
-- users: one row per login account; `username` is unique.
-- `roles` is a space-separated list of role names and `contacts` is a JSON object (see the column comments).
CREATE TABLE `users` (
`id` bigint unsigned not null auto_increment,
`username` varchar(64) not null comment 'login name, cannot rename',
`nickname` varchar(64) not null comment 'display name, chinese name',
`password` varchar(128) not null default '',
`phone` varchar(16) not null default '',
`email` varchar(64) not null default '',
`portrait` varchar(255) not null default '' comment 'portrait image url',
`roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space',
`contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}',
`maintainer` tinyint(1) not null default 0,
`belong` varchar(191) DEFAULT '' COMMENT 'belong',
`last_active_time` bigint DEFAULT 0 COMMENT 'last_active_time',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`username`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed the built-in 'root' account (id 1) with the Admin role.
-- NOTE(review): the seed password is inserted as a literal value; presumably the application hashes or rotates it later — confirm.
insert into `users`(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', unix_timestamp(now()), 'system', unix_timestamp(now()), 'system');
-- user_group: named groups of users, indexed by creator and by last update time.
CREATE TABLE `user_group` (
`id` bigint unsigned not null auto_increment,
`name` varchar(128) not null default '',
`note` varchar(255) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_by`),
KEY (`update_at`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed the demo group with id 1, owned by 'root'.
insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', unix_timestamp(now()), 'root', unix_timestamp(now()), 'root');
-- user_group_member: membership rows pairing a `group_id` with a `user_id`.
-- No foreign keys or uniqueness constraint are declared; presumably the ids refer to user_group.id and users.id — confirm.
CREATE TABLE `user_group_member` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint unsigned not null,
`user_id` bigint unsigned not null,
KEY (`group_id`),
KEY (`user_id`),
PRIMARY KEY(`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed: put id 1 into group 1 (the ids used by the seeded root user and demo group).
insert into user_group_member(group_id, user_id) values(1, 1);
-- configs: key/value settings (`ckey` -> `cval`) with audit columns.
-- `external` and `encrypted` are 0/1 flags as described in their column comments.
-- Only `id` is keyed; `ckey` has no unique or secondary index in this definition.
CREATE TABLE `configs` (
`id` bigint unsigned not null auto_increment,
`ckey` varchar(191) not null,
`note` varchar(1024) NOT NULL DEFAULT '' COMMENT 'note',
`cval` text COMMENT 'config value',
`external` bigint DEFAULT 0 COMMENT '0\\:built-in 1\\:external',
`encrypted` bigint DEFAULT 0 COMMENT '0\\:plaintext 1\\:ciphertext',
`create_at` bigint DEFAULT 0 COMMENT 'create_at',
`create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'create_by',
`update_at` bigint DEFAULT 0 COMMENT 'update_at',
`update_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'update_by',
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- role: catalogue of roles; `name` is unique.
CREATE TABLE `role` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null default '',
`note` varchar(255) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed the three built-in roles; these names match the values listed in the users.roles column comment.
insert into `role`(name, note) values('Admin', 'Administrator role');
insert into `role`(name, note) values('Standard', 'Ordinary user role');
insert into `role`(name, note) values('Guest', 'Readonly user role');
-- role_operation: one row per (role_name, operation) grant.
-- Each column is indexed on its own; no uniqueness constraint is declared on the pair.
CREATE TABLE `role_operation`(
`id` bigint unsigned not null auto_increment,
`role_name` varchar(128) not null,
`operation` varchar(191) not null,
KEY (`role_name`),
KEY (`operation`),
PRIMARY KEY(`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Admin is special: it has no explicit operations listed here but is allowed to do anything.
-- Grants for the Guest role: explorer and help pages only.
insert into `role_operation`(role_name, operation) values('Guest', '/metric/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/object/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/log/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/trace/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/help/version');
insert into `role_operation`(role_name, operation) values('Guest', '/help/contact');
-- Grants for the Standard role: the Guest set plus the operation paths listed below.
insert into `role_operation`(role_name, operation) values('Standard', '/metric/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/object/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/log/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/help/version');
insert into `role_operation`(role_name, operation) values('Standard', '/help/contact');
insert into `role_operation`(role_name, operation) values('Standard', '/help/servers');
insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies');
insert into `role_operation`(role_name, operation) values('Standard', '/users');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/add');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/put');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/del');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/add');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/put');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/del');
insert into `role_operation`(role_name, operation) values('Standard', '/targets');
insert into `role_operation`(role_name, operation) values('Standard', '/targets/add');
insert into `role_operation`(role_name, operation) values('Standard', '/targets/put');
insert into `role_operation`(role_name, operation) values('Standard', '/targets/del');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/add');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/put');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/add');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/put');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/add');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/add');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/put');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-his-events');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/add');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/put');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/del');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/add');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/put');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/add');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/put');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/del');
-- busi_group: business groups used to group alert_rule, collect_rule, mute and dashboard records.
-- `name` is unique; `label_value` must be non-blank when `label_enable` is set (see the column comment).
CREATE TABLE `busi_group` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null,
`label_enable` tinyint(1) not null default 0,
`label_value` varchar(191) not null default '' comment 'if label_enable: label_value can not be blank',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed the default business group with id 1.
insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', unix_timestamp(now()), 'root', unix_timestamp(now()), 'root');
-- busi_group_member: attaches a user group to a business group with a permission flag.
-- `perm_flag` is 'ro' or 'rw' (see the column comment).
CREATE TABLE `busi_group_member` (
`id` bigint unsigned not null auto_increment,
`busi_group_id` bigint not null comment 'busi group id',
`user_group_id` bigint not null comment 'user group id',
`perm_flag` char(2) not null comment 'ro | rw',
PRIMARY KEY (`id`),
KEY (`busi_group_id`),
KEY (`user_group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed: give user group 1 read-write access to business group 1.
insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw');
-- board: dashboard metadata (new dashboard version), scoped to a busi group.
-- A board name is unique within its group; `ident` is indexed for lookup.
-- `public_cate` selects the sharing scope (0 anonymous, 1 login, 2 busi) per its column comment.
CREATE TABLE `board` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`name` varchar(191) not null,
`ident` varchar(200) not null default '',
`tags` varchar(255) not null comment 'split by space',
`public` tinyint(1) not null default 0 comment '0:false 1:true',
`built_in` tinyint(1) not null default 0 comment '0:false 1:true',
`hide` tinyint(1) not null default 0 comment '0:false 1:true',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
`note` varchar(1024) not null default '' comment 'note',
`public_cate` bigint NOT NULL DEFAULT 0 COMMENT '0 anonymous 1 login 2 busi',
PRIMARY KEY (`id`),
UNIQUE KEY (`group_id`, `name`),
KEY(`ident`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- board_payload: dashboard body (new dashboard version), stored separately from the board metadata.
-- `id` is the dashboard id (see the column comment) and is unique; no auto_increment or primary key is declared.
CREATE TABLE `board_payload` (
`id` bigint unsigned not null comment 'dashboard id',
`payload` mediumtext not null,
UNIQUE KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- dashboard: deprecated dashboard table.
-- A dashboard name is unique within its busi group.
CREATE TABLE `dashboard` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`name` varchar(191) not null,
`tags` varchar(255) not null comment 'split by space',
`configs` varchar(8192) comment 'dashboard variables',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`group_id`, `name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- chart_group: deprecated; groups of charts inside a dashboard, ordered by `weight`.
-- The first group, 'Default chart group', is created automatically for a dashboard.
CREATE TABLE `chart_group` (
`id` bigint unsigned not null auto_increment,
`dashboard_id` bigint unsigned not null,
`name` varchar(255) not null,
`weight` int not null default 0,
PRIMARY KEY (`id`),
KEY (`dashboard_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- chart: deprecated; individual charts belonging to a chart group.
-- Here `group_id` is a chart group id, not a busi group id (see the column comment).
CREATE TABLE `chart` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint unsigned not null comment 'chart group id',
`configs` text,
`weight` int not null default 0,
PRIMARY KEY (`id`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- chart_share: stored chart configurations, each tied to a cluster / datasource.
-- Rows are indexed by creation time.
CREATE TABLE `chart_share` (
`id` bigint unsigned not null auto_increment,
`cluster` varchar(128) not null,
`datasource_id` bigint NOT NULL DEFAULT 0 COMMENT 'datasource id',
`configs` text,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
primary key (`id`),
key (`create_at`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- alert_rule: alert rule definitions, scoped to a busi group via `group_id`.
-- Several list-valued columns (notify_channels, notify_groups, callbacks, append_tags,
-- enable_days_of_week) are stored as space-separated strings, as stated in their column comments.
-- `severity` is documented here on a 1..3 scale (1:Emergency 2:Warning 3:Notice).
CREATE TABLE `alert_rule` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`cate` varchar(128) not null,
`datasource_ids` varchar(255) not null default '' comment 'datasource ids',
`cluster` varchar(128) not null,
`name` varchar(255) not null,
`note` varchar(1024) not null default '',
`prod` varchar(255) not null default '',
`algorithm` varchar(255) not null default '',
`algo_params` varchar(255),
`delay` int not null default 0,
`severity` tinyint(1) not null comment '1:Emergency 2:Warning 3:Notice',
`disabled` tinyint(1) not null comment '0:enabled 1:disabled',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`rule_config` text not null comment 'rule_config',
`prom_ql` text not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`enable_stime` varchar(255) not null default '00:00',
`enable_etime` varchar(255) not null default '23:59',
`enable_days_of_week` varchar(255) not null default '' comment 'split by space: 0 1 2 3 4 5 6',
`enable_in_bg` tinyint(1) not null default 0 comment '1: only this bg 0: global',
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
`notify_repeat_step` int not null default 0 comment 'unit: min',
`notify_max_number` int not null default 0 comment '',
`recover_duration` int not null default 0 comment 'unit: s',
`callbacks` varchar(4096) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(4096),
`append_tags` varchar(255) not null default '' comment 'split by space: service=n9e mod=api',
`annotations` text not null comment 'annotations',
`extra_config` text,
`notify_rule_ids` varchar(1024) DEFAULT '',
`notify_version` int DEFAULT 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
`cron_pattern` varchar(64),
`time_zone` varchar(64) not null default '',
`datasource_queries` text,
PRIMARY KEY (`id`),
KEY (`group_id`),
KEY (`update_at`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
-- alert_mute: mute rules scoped to a busi group.
-- `btime`/`etime` are the begin and end times of the mute window; `tags` holds JSON matchers
-- (see the column comments). Indexed by creation time and by group.
CREATE TABLE `alert_mute` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`prod` varchar(255) not null default '',
`note` varchar(1024) not null default '',
`cate` varchar(128) not null,
`cluster` varchar(128) not null,
`datasource_ids` varchar(255) not null default '' comment 'datasource ids',
`tags` varchar(4096) default '[]' comment 'json,map,tagkey->regexp|value',
`cause` varchar(255) not null default '',
`btime` bigint not null default 0 comment 'begin time',
`etime` bigint not null default 0 comment 'end time',
`disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
`mute_time_type` tinyint(1) not null default 0,
`periodic_mutes` varchar(4096) not null default '',
`severities` varchar(32) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_at`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- alert_subscribe: alert subscription rules scoped to a busi group.
-- The redefine_* flags indicate whether the matching new_* / webhooks values override the event's own
-- (see the 'is redefine ...?' column comments).
-- NOTE(review): `new_severity` documents a 0-based scale (0:Emergency 1:Warning 2:Notice) while
-- alert_rule.severity documents 1:Emergency 2:Warning 3:Notice — confirm which scale is actually stored.
CREATE TABLE `alert_subscribe` (
`id` bigint unsigned not null auto_increment,
`name` varchar(255) not null default '',
`disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
`group_id` bigint not null default 0 comment 'busi group id',
`prod` varchar(255) not null default '',
`cate` varchar(128) not null,
`datasource_ids` varchar(255) not null default '' comment 'datasource ids',
`cluster` varchar(128) not null,
`rule_id` bigint not null default 0,
`rule_ids` varchar(1024),
`severities` varchar(32) not null default '',
`tags` varchar(4096) not null default '' comment 'json,map,tagkey->regexp|value',
`redefine_severity` tinyint(1) default 0 comment 'is redefine severity?',
`new_severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`redefine_channels` tinyint(1) default 0 comment 'is redefine channels?',
`new_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`user_group_ids` varchar(250) not null comment 'split by space 1 34 5, notify cc to user_group_ids',
`busi_groups` varchar(4096),
`note` VARCHAR(1024) DEFAULT '' COMMENT 'note',
`webhooks` text not null,
`extra_config` text,
`redefine_webhooks` tinyint(1) default 0,
`for_duration` bigint not null default 0,
`notify_rule_ids` varchar(1024) DEFAULT '',
`notify_version` int DEFAULT 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`update_at`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- target: one row per target, identified by the unique `ident`.
-- `tags` is a space-separated list appended to series data and `note` is appended to alert events
-- (see the column comments). `host_ip` is sized for an IPv4 string only.
CREATE TABLE `target` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`ident` varchar(191) not null comment 'target id',
`note` varchar(255) not null default '' comment 'append to alert event as field',
`tags` varchar(512) not null default '' comment 'append to series data as tags, split by space, append external space at suffix',
`host_tags` text COMMENT 'global labels set in conf file',
`host_ip` varchar(15) default '' COMMENT 'IPv4 string',
`agent_version` varchar(255) default '' COMMENT 'agent version',
`engine_name` varchar(255) DEFAULT '' COMMENT 'engine name',
`os` VARCHAR(31) DEFAULT '' COMMENT 'os type',
`update_at` bigint not null default 0,
PRIMARY KEY (`id`),
UNIQUE KEY (`ident`),
KEY (`group_id`),
INDEX `idx_host_ip` (`host_ip`),
INDEX `idx_agent_version` (`agent_version`),
INDEX `idx_engine_name` (`engine_name`),
INDEX `idx_os` (`os`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- metric_view: saved metric views; `cate` distinguishes preset (0) from custom (1) views.
-- Unlike most tables in this script, `create_by` is a numeric user id rather than a username.
CREATE TABLE `metric_view` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null default '',
`cate` tinyint(1) not null comment '0: preset 1: custom',
`configs` varchar(8192) not null default '',
`create_at` bigint not null default 0,
`create_by` bigint not null default 0 comment 'user id',
`update_at` bigint not null default 0,
PRIMARY KEY (`id`),
KEY (`create_by`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed one preset view ('Host View', cate 0) whose configs JSON filters on __name__ = cpu_usage_idle.
insert into metric_view(name, cate, configs) values('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}');
-- recording_rule: recording rules per group; `name` is the new metric name and `prom_ql` the
-- PromQL expression (see the column comments).
-- Numeric defaults on group_id/create_at/update_at are written as quoted strings ('0') in this definition.
CREATE TABLE `recording_rule` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default '0' comment 'group_id',
`datasource_ids` varchar(255) not null default '' comment 'datasource ids',
`cluster` varchar(128) not null,
`name` varchar(255) not null comment 'new metric name',
`note` varchar(255) not null comment 'rule note',
`disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`cron_pattern` varchar(255) default '' comment 'cron pattern',
`append_tags` varchar(255) default '' comment 'split by space: service=n9e mod=api',
`query_configs` text NOT NULL,
`create_at` bigint default '0',
`create_by` varchar(64) default '',
`update_at` bigint default '0',
`update_by` varchar(64) default '',
`datasource_queries` text,
PRIMARY KEY (`id`),
KEY `group_id` (`group_id`),
KEY `update_at` (`update_at`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
-- alert_aggr_view: saved aggregation views for alerts; `cate` distinguishes preset (0) from custom (1).
-- `create_by` is a numeric user id (see the column comment).
CREATE TABLE `alert_aggr_view` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null default '',
`rule` varchar(2048) not null default '',
`cate` tinyint(1) not null comment '0: preset 1: custom',
`create_at` bigint not null default 0,
`create_by` bigint not null default 0 comment 'user id',
`update_at` bigint not null default 0,
PRIMARY KEY (`id`),
KEY (`create_by`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
-- Seed two preset views (cate 0); the first rule joins two 'field:' terms with '::'.
insert into alert_aggr_view(name, rule, cate) values('By BusiGroup, Severity', 'field:group_name::field:severity', 0);
insert into alert_aggr_view(name, rule, cate) values('By RuleName', 'field:rule_name', 0);
-- alert_cur_event: alert events, carrying a copy of rule fields (name, note, promql, notify settings).
-- `id` is not auto-generated: it reuses the id of the matching alert_his_event row (see the column comment).
-- `hash` is built from rule_id + vector_pk; `tags` is a ',,'-separated merge of data and rule tags.
-- NOTE(review): `severity` documents a 0-based scale here while alert_rule.severity documents
-- 1:Emergency 2:Warning 3:Notice — confirm which scale is actually stored.
CREATE TABLE `alert_cur_event` (
`id` bigint unsigned not null comment 'use alert_his_event.id',
`cate` varchar(128) not null,
`datasource_id` bigint not null default 0 comment 'datasource id',
`cluster` varchar(128) not null,
`group_id` bigint unsigned not null comment 'busi group id of rule',
`group_name` varchar(255) not null default '' comment 'busi group name',
`hash` varchar(64) not null comment 'rule_id + vector_pk',
`rule_id` bigint unsigned not null,
`rule_name` varchar(255) not null,
`rule_note` varchar(2048) not null default 'alert rule note',
`rule_prod` varchar(255) not null default '',
`rule_algo` varchar(255) not null default '',
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(2048) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
`notify_repeat_next` bigint not null default 0 comment 'next timestamp to notify, get repeat settings from rule',
`notify_cur_number` int not null default 0 comment '',
`target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
`target_note` varchar(191) not null default '' comment 'target note',
`first_trigger_time` bigint,
`trigger_time` bigint not null,
`trigger_value` text not null,
`annotations` text not null comment 'annotations',
`rule_config` text not null comment 'rule_config',
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
`original_tags` text comment 'labels key=val,,k2=v2',
`notify_rule_ids` text COMMENT 'notify rule ids',
PRIMARY KEY (`id`),
KEY (`hash`),
KEY (`rule_id`),
KEY (`trigger_time`, `group_id`),
KEY (`notify_repeat_next`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
-- alert_his_event: alert event history, including recovery state (`is_recovered`, `recover_time`).
-- `id` is auto-generated here; alert_cur_event.id is documented as reusing it.
-- `last_eval_time` is indexed because it is used as a time filter (see the column comment).
-- NOTE(review): `severity` documents a 0-based scale here while alert_rule.severity documents
-- 1:Emergency 2:Warning 3:Notice — confirm which scale is actually stored.
CREATE TABLE `alert_his_event` (
`id` bigint unsigned not null AUTO_INCREMENT,
`is_recovered` tinyint(1) not null,
`cate` varchar(128) not null,
`datasource_id` bigint not null default 0 comment 'datasource id',
`cluster` varchar(128) not null,
`group_id` bigint unsigned not null comment 'busi group id of rule',
`group_name` varchar(255) not null default '' comment 'busi group name',
`hash` varchar(64) not null comment 'rule_id + vector_pk',
`rule_id` bigint unsigned not null,
`rule_name` varchar(255) not null,
`rule_note` varchar(2048) not null default 'alert rule note',
`rule_prod` varchar(255) not null default '',
`rule_algo` varchar(255) not null default '',
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(8192) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(2048) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
`notify_cur_number` int not null default 0 comment '',
`target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
`target_note` varchar(191) not null default '' comment 'target note',
`first_trigger_time` bigint,
`trigger_time` bigint not null,
`trigger_value` text not null,
`recover_time` bigint not null default 0,
`last_eval_time` bigint not null default 0 comment 'for time filter',
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
`original_tags` text comment 'labels key=val,,k2=v2',
`annotations` text not null comment 'annotations',
`rule_config` text not null comment 'rule_config',
`notify_rule_ids` text COMMENT 'notify rule ids',
PRIMARY KEY (`id`),
INDEX `idx_last_eval_time` (`last_eval_time`),
KEY (`hash`),
KEY (`rule_id`),
KEY (`trigger_time`, `group_id`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
-- board_busigroup: link table pairing a busi group id with a board id.
-- The composite primary key makes each (busi_group_id, board_id) pair unique.
CREATE TABLE `board_busigroup` (
`busi_group_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'busi group id',
`board_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'board id',
PRIMARY KEY (`busi_group_id`, `board_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- builtin_components: built-in components with a logo and readme, looked up by `ident`.
-- Most COMMENT literals below use doubled quotes ('''...'''), so the stored comment text itself
-- includes the surrounding single quotes.
CREATE TABLE `builtin_components` (
`id` bigint UNSIGNED NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
`ident` varchar(191) NOT NULL,
`logo` mediumtext COMMENT '''logo of component''',
`readme` text NOT NULL COMMENT '''readme of component''',
`created_at` bigint NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
`disabled` int NOT NULL DEFAULT 0 COMMENT '''is disabled or not''',
PRIMARY KEY (`id`),
KEY (`ident`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- builtin_payloads: payloads attached to a component, classified by type and category.
-- `uuid` is indexed but not declared unique. COMMENT literals use doubled quotes, so the stored
-- comment text includes the surrounding single quotes.
CREATE TABLE `builtin_payloads` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`component_id` bigint NOT NULL DEFAULT 0 COMMENT '''component_id of payload''',
`uuid` bigint(20) NOT NULL COMMENT '''uuid of payload''',
`type` varchar(191) NOT NULL COMMENT '''type of payload''',
`component` varchar(191) NOT NULL COMMENT '''component of payload''',
`cate` varchar(191) NOT NULL COMMENT '''category of payload''',
`name` varchar(191) NOT NULL COMMENT '''name of payload''',
`tags` varchar(191) NOT NULL DEFAULT '' COMMENT '''tags of payload''',
`content` longtext NOT NULL COMMENT '''content of payload''',
`note` varchar(1024) NOT NULL DEFAULT '' COMMENT '''note of payload''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_component` (`component`),
KEY `idx_name` (`name`),
KEY `idx_cate` (`cate`),
KEY `idx_uuid` (`uuid`),
KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- notification_record: one row per notification attempt, with channel, target and status.
-- `event_id` is the event history id (see the column comment) and is indexed.
-- The primary key is declared inline on `id` as a signed BIGINT.
CREATE TABLE notification_record (
`id` BIGINT PRIMARY KEY AUTO_INCREMENT,
`notify_rule_id` BIGINT NOT NULL DEFAULT 0,
`event_id` bigint NOT NULL COMMENT 'event history id',
`sub_id` bigint COMMENT 'subscribed rule id',
`channel` varchar(255) NOT NULL COMMENT 'notification channel name',
`status` bigint COMMENT 'notification status',
`target` varchar(1024) NOT NULL COMMENT 'notification target',
`details` varchar(2048) DEFAULT '' COMMENT 'notification other info',
`created_at` bigint NOT NULL COMMENT 'create time',
INDEX idx_evt (event_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_tpl: task templates scoped to a busi group, holding a script, its args and run
-- settings (batch, tolerance, timeout, pause).
CREATE TABLE `task_tpl`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,
`group_id` int unsigned not null comment 'busi group id',
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`tags` varchar(255) not null default '' comment 'split by space',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- task_tpl_host: hosts attached to a task template.
-- `ii` is the surrogate primary key; `id` is the task template id (see the column comment), not the row id.
CREATE TABLE `task_tpl_host`
(
`ii` int unsigned NOT NULL AUTO_INCREMENT,
`id` int unsigned not null comment 'task tpl id',
`host` varchar(128) not null comment 'ip or hostname',
PRIMARY KEY (`ii`),
KEY (`id`, `host`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- task_record: records of tasks, keyed by the ibex task id (`id` is not auto-generated).
-- Stores the ibex address and auth fields alongside the script parameters; `event_id` is indexed.
CREATE TABLE `task_record`
(
`id` bigint unsigned not null comment 'ibex task id',
`event_id` bigint not null comment 'event id' default 0,
`group_id` bigint not null comment 'busi group id',
`ibex_address` varchar(128) not null,
`ibex_auth_user` varchar(128) not null default '',
`ibex_auth_pass` varchar(128) not null default '',
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_at`, `group_id`),
KEY (`create_by`),
INDEX `idx_event_id` (`event_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- alerting_engines: alert engine instances and the datasource / engine cluster each is associated with.
-- `clock` is a required bigint and is indexed; presumably a last-seen timestamp — confirm.
CREATE TABLE `alerting_engines`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,
`instance` varchar(128) not null default '' comment 'instance identification, e.g. 10.9.0.9:9090',
`datasource_id` bigint not null default 0 comment 'datasource id',
`engine_cluster` varchar(128) not null default '' comment 'n9e-alert cluster',
`clock` bigint not null,
PRIMARY KEY (`id`),
INDEX `idx_inst` (`instance`),
INDEX `idx_clock` (`clock`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- datasource: configured datasources; `name` is unique.
-- `settings`, `http` and `auth` hold per-datasource configuration as text.
-- `id` is int unsigned here, while the datasource_id columns in other tables of this script are bigint.
CREATE TABLE `datasource`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(191) not null default '',
`identifier` varchar(255) not null default '',
`description` varchar(255) not null default '',
`category` varchar(255) not null default '',
`plugin_id` int unsigned not null default 0,
`plugin_type` varchar(255) not null default '',
`plugin_type_name` varchar(255) not null default '',
`cluster_name` varchar(255) not null default '',
`settings` text not null,
`status` varchar(255) not null default '',
`http` varchar(4096) not null default '',
`auth` varchar(8192) not null default '',
`is_default` boolean COMMENT 'is default datasource',
`weight` int not null default 0,
`created_at` bigint not null default 0,
`created_by` varchar(64) not null default '',
`updated_at` bigint not null default 0,
`updated_by` varchar(64) not null default '',
UNIQUE KEY (`name`),
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- builtin_cate: (name, user_id) rows; no secondary index or uniqueness constraint is declared.
CREATE TABLE `builtin_cate` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null,
`user_id` bigint not null default 0,
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- notify_tpl: notification templates; the unique key on `channel` allows at most one template per channel.
CREATE TABLE `notify_tpl` (
`id` bigint unsigned not null auto_increment,
`channel` varchar(32) not null,
`name` varchar(255) not null,
`content` text not null,
`create_at` bigint DEFAULT 0 COMMENT 'create_at',
`create_by` varchar(64) DEFAULT '' COMMENT 'create_by',
`update_at` bigint DEFAULT 0 COMMENT 'update_at',
`update_by` varchar(64) DEFAULT '' COMMENT 'update_by',
PRIMARY KEY (`id`),
UNIQUE KEY (`channel`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- sso_config: one text content value per unique name, with an update_at column.
CREATE TABLE `sso_config` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(191) NOT NULL,
    `content` TEXT NOT NULL,
    `update_at` BIGINT DEFAULT 0 COMMENT 'update_at',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- es_index_pattern: index pattern definitions scoped to a datasource.
-- The (datasource_id, name) pair is unique, and time_field defaults to @timestamp.
CREATE TABLE `es_index_pattern` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `datasource_id` BIGINT NOT NULL DEFAULT 0 COMMENT 'datasource id',
    `name` VARCHAR(191) NOT NULL,
    `time_field` VARCHAR(128) NOT NULL DEFAULT '@timestamp',
    `allow_hide_system_indices` TINYINT(1) NOT NULL DEFAULT 0,
    `fields_format` VARCHAR(4096) NOT NULL DEFAULT '',
    `cross_cluster_enabled` INT NOT NULL DEFAULT 0,
    `note` VARCHAR(1024) NOT NULL DEFAULT '',
    `create_at` BIGINT DEFAULT '0',
    `create_by` VARCHAR(64) DEFAULT '',
    `update_at` BIGINT DEFAULT '0',
    `update_by` VARCHAR(64) DEFAULT '',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`datasource_id`, `name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- builtin_metrics: metric definitions (collector, typ, name, unit, lang, expression and notes).
-- Secondary indexes cover uuid, collector, typ, name and lang.
-- The column comment literals carry embedded quote characters and are kept exactly as written.
CREATE TABLE `builtin_metrics` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
    `collector` VARCHAR(191) NOT NULL COMMENT '''type of collector''',
    `typ` VARCHAR(191) NOT NULL COMMENT '''type of metric''',
    `name` VARCHAR(191) NOT NULL COMMENT '''name of metric''',
    `unit` VARCHAR(191) NOT NULL COMMENT '''unit of metric''',
    `lang` VARCHAR(191) NOT NULL DEFAULT 'zh' COMMENT '''language''',
    `note` VARCHAR(4096) NOT NULL COMMENT '''description of metric''',
    `expression` VARCHAR(4096) NOT NULL COMMENT '''expression of metric''',
    `expression_type` VARCHAR(32) NOT NULL DEFAULT 'promql' COMMENT '''expression type: metric_name or promql''',
    `metric_type` VARCHAR(191) NOT NULL DEFAULT '' COMMENT '''metric type like counter/gauge''',
    `extra_fields` TEXT COMMENT '''custom extra fields''',
    `created_at` BIGINT NOT NULL DEFAULT 0 COMMENT '''create time''',
    `created_by` VARCHAR(191) NOT NULL DEFAULT '' COMMENT '''creator''',
    `updated_at` BIGINT NOT NULL DEFAULT 0 COMMENT '''update time''',
    `updated_by` VARCHAR(191) NOT NULL DEFAULT '' COMMENT '''updater''',
    `uuid` BIGINT NOT NULL DEFAULT 0 COMMENT '''uuid''',
    PRIMARY KEY (`id`),
    KEY `idx_uuid` (`uuid`),
    KEY `idx_collector` (`collector`),
    KEY `idx_typ` (`typ`),
    KEY `idx_builtinmetric_name` (`name` ASC),
    KEY `idx_lang` (`lang`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- metric_filter: named metric filter configurations with a groups_perm text column.
-- Rows are indexed by name (non-unique).
CREATE TABLE `metric_filter` (
    `id` BIGINT NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
    `name` VARCHAR(191) NOT NULL COMMENT '''name of metric filter''',
    `configs` VARCHAR(4096) NOT NULL COMMENT '''configuration of metric filter''',
    `groups_perm` TEXT,
    `create_at` BIGINT NOT NULL DEFAULT 0 COMMENT '''create time''',
    `create_by` VARCHAR(191) NOT NULL DEFAULT '' COMMENT '''creator''',
    `update_at` BIGINT NOT NULL DEFAULT 0 COMMENT '''update time''',
    `update_by` VARCHAR(191) NOT NULL DEFAULT '' COMMENT '''updater''',
    PRIMARY KEY (`id`),
    KEY `idx_metricfilter_name` (`name` ASC)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- target_busi_group: links a target_ident to a group_id.
-- idx_target_group keeps each (target_ident, group_id) pair unique.
CREATE TABLE `target_busi_group` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `target_ident` VARCHAR(191) NOT NULL,
    `group_id` BIGINT NOT NULL,
    `update_at` BIGINT NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_target_group` (`target_ident`, `group_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- dash_annotation: annotations attached to a dashboard panel over a time range
-- (time_start to time_end). Rows are indexed by dashboard_id.
CREATE TABLE `dash_annotation` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `dashboard_id` BIGINT NOT NULL COMMENT 'dashboard id',
    `panel_id` VARCHAR(191) NOT NULL COMMENT 'panel id',
    `tags` TEXT COMMENT 'tags array json string',
    `description` TEXT COMMENT 'annotation description',
    `config` TEXT COMMENT 'annotation config',
    `time_start` BIGINT NOT NULL DEFAULT 0 COMMENT 'start timestamp',
    `time_end` BIGINT NOT NULL DEFAULT 0 COMMENT 'end timestamp',
    `create_at` BIGINT NOT NULL DEFAULT 0 COMMENT 'create time',
    `create_by` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'creator',
    `update_at` BIGINT NOT NULL DEFAULT 0 COMMENT 'update time',
    `update_by` VARCHAR(64) NOT NULL DEFAULT '' COMMENT 'updater',
    PRIMARY KEY (`id`),
    KEY `idx_dashboard_id` (`dashboard_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- user_token: named tokens per username, with create_at and last_used columns.
-- Only the primary key is indexed.
CREATE TABLE `user_token` (
    `id` BIGINT NOT NULL AUTO_INCREMENT,
    `username` VARCHAR(255) NOT NULL DEFAULT '',
    `token_name` VARCHAR(255) NOT NULL DEFAULT '',
    `token` VARCHAR(255) NOT NULL DEFAULT '',
    `create_at` BIGINT NOT NULL DEFAULT 0,
    `last_used` BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- notify_rule: notification rules with an enable flag, user_group_ids,
-- and notify_configs / pipeline_configs stored as text.
CREATE TABLE `notify_rule` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(255) NOT NULL,
    `description` TEXT,
    `enable` TINYINT(1) NOT NULL DEFAULT 0,
    `user_group_ids` VARCHAR(255) NOT NULL DEFAULT '',
    `notify_configs` TEXT,
    `pipeline_configs` TEXT,
    `create_at` BIGINT NOT NULL DEFAULT 0,
    `create_by` VARCHAR(64) NOT NULL DEFAULT '',
    `update_at` BIGINT NOT NULL DEFAULT 0,
    `update_by` VARCHAR(64) NOT NULL DEFAULT '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- notify_channel: notification channels identified by name and ident, with an enable flag,
-- a request_type, and param_config / request_config stored as text.
CREATE TABLE `notify_channel` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(255) NOT NULL,
    `ident` VARCHAR(255) NOT NULL,
    `description` TEXT,
    `enable` TINYINT(1) NOT NULL DEFAULT 0,
    `param_config` TEXT,
    `request_type` VARCHAR(50) NOT NULL,
    `request_config` TEXT,
    `weight` INT NOT NULL DEFAULT 0,
    `create_at` BIGINT NOT NULL DEFAULT 0,
    `create_by` VARCHAR(64) NOT NULL DEFAULT '',
    `update_at` BIGINT NOT NULL DEFAULT 0,
    `update_by` VARCHAR(64) NOT NULL DEFAULT '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- message_template: message templates identified by name and ident, tied to a
-- notify_channel_ident, with private and weight integer columns.
CREATE TABLE `message_template` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(64) NOT NULL,
    `ident` VARCHAR(64) NOT NULL,
    `content` TEXT,
    `user_group_ids` VARCHAR(64),
    `notify_channel_ident` VARCHAR(64) NOT NULL DEFAULT '',
    `private` INT NOT NULL DEFAULT 0,
    `weight` INT NOT NULL DEFAULT 0,
    `create_at` BIGINT NOT NULL DEFAULT 0,
    `create_by` VARCHAR(64) NOT NULL DEFAULT '',
    `update_at` BIGINT NOT NULL DEFAULT 0,
    `update_by` VARCHAR(64) NOT NULL DEFAULT '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- event_pipeline: named pipelines with team_ids, a filter_enable flag,
-- and label_filters / attr_filters / processor_configs stored as text.
CREATE TABLE `event_pipeline` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(128) NOT NULL,
    `team_ids` TEXT,
    `description` VARCHAR(255) NOT NULL DEFAULT '',
    `filter_enable` TINYINT(1) NOT NULL DEFAULT 0,
    `label_filters` TEXT,
    `attr_filters` TEXT,
    `processor_configs` TEXT,
    `create_at` BIGINT NOT NULL DEFAULT 0,
    `create_by` VARCHAR(64) NOT NULL DEFAULT '',
    `update_at` BIGINT NOT NULL DEFAULT 0,
    `update_by` VARCHAR(64) NOT NULL DEFAULT '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- embedded_product: name and url entries with an is_private flag and team_ids.
-- name, url and is_private are nullable and default to NULL.
CREATE TABLE `embedded_product` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(255) DEFAULT NULL,
    `url` VARCHAR(255) DEFAULT NULL,
    `is_private` BOOLEAN DEFAULT NULL,
    `team_ids` VARCHAR(255),
    `create_at` BIGINT NOT NULL DEFAULT 0,
    `create_by` VARCHAR(64) NOT NULL DEFAULT '',
    `update_at` BIGINT NOT NULL DEFAULT 0,
    `update_by` VARCHAR(64) NOT NULL DEFAULT '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_meta: task definitions (title, account, batch, tolerance, timeout, pause,
-- script, args, stdin). created defaults to the insert time, and rows are indexed
-- by creator and by created.
CREATE TABLE `task_meta` (
    `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `title` VARCHAR(255) NOT NULL DEFAULT '',
    `account` VARCHAR(64) NOT NULL,
    `batch` BIGINT NOT NULL DEFAULT 0,
    `tolerance` BIGINT NOT NULL DEFAULT 0,
    `timeout` BIGINT NOT NULL DEFAULT 0,
    `pause` VARCHAR(255) NOT NULL DEFAULT '',
    `script` TEXT NOT NULL,
    `args` VARCHAR(512) NOT NULL DEFAULT '',
    `stdin` VARCHAR(1024) NOT NULL DEFAULT '',
    `creator` VARCHAR(64) NOT NULL DEFAULT '',
    `created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`),
    KEY `idx_task_meta_creator` (`creator`),
    KEY `idx_task_meta_created` (`created`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* start|cancel|kill|pause */
-- task_action: one action string and clock value per id. id is the primary key
-- and is not auto-increment, so callers supply it.
CREATE TABLE `task_action` (
    `id` BIGINT UNSIGNED NOT NULL,
    `action` VARCHAR(32) NOT NULL,
    `clock` BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_scheduler: (id, scheduler) rows. The table has no primary key, only a
-- non-unique composite index over both columns.
CREATE TABLE `task_scheduler` (
    `id` BIGINT UNSIGNED NOT NULL,
    `scheduler` VARCHAR(128) NOT NULL DEFAULT '',
    KEY (`id`, `scheduler`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_scheduler_health: one clock value per scheduler (scheduler is unique),
-- with a secondary index on clock.
CREATE TABLE `task_scheduler_health` (
    `scheduler` VARCHAR(128) NOT NULL,
    `clock` BIGINT NOT NULL,
    UNIQUE KEY `idx_task_scheduler_health_scheduler` (`scheduler`),
    KEY (`clock`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_doing: (id, host, clock, action) rows with no primary key.
-- Lookups are served by separate non-unique indexes on id and on host.
CREATE TABLE `task_host_doing` (
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `clock` BIGINT NOT NULL DEFAULT 0,
    `action` VARCHAR(16) NOT NULL,
    KEY `idx_task_host_doing_id` (`id`),
    KEY `idx_task_host_doing_host` (`host`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_0: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_0` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_1: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_1` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_2: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_2` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_3: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_3` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_4: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_4` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_5: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_5` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_6: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_6` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_7: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_7` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_8: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_8` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_9: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_9` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_10: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_10` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_11: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_11` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_12: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_12` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_13: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_13` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_14: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_14` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_15: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_15` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_16: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_16` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_17: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_17` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_18: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_18` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_19: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_19` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_20: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_20` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_21: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_21` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_22: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_22` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_23: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_23` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_24: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_24` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_25: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_25` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_26: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_26` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_27: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_27` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_28: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_28` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_29: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_29` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_30: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_30` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_31: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_31` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_32: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_32` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_33: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_33` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_34: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_34` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_35: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_35` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_36: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_36` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_37: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_37` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_38: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_38` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_39: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_39` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_40: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_40` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_41: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_41` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_42: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_42` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_43: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_43` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_44: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_44` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_45: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_45` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_46: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_46` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_47: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_47` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_48: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_48` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_49: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_49` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_50: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_50` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_51: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_51` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_52: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_52` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_53: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_53` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_54: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_54` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_55: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_55` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_56: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_56` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_57: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_57` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_58: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_58` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_59: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_59` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_60: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_60` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_61: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_61` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_62: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_62` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_63: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_63` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_64: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_64` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_65: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_65` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_66: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_66` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_67: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_67` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_68: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_68` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_69: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_69` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_70: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_70` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_71: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_71` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_72: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_72` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_73: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_73` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_74: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_74` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_75: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_75` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_76: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_76` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_77: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_77` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- task_host_78: same column layout as the other numbered task_host_* tables.
-- ii is the auto-increment primary key, and idx_id_host keeps each (id, host) pair unique.
CREATE TABLE `task_host_78` (
    `ii` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `id` BIGINT UNSIGNED NOT NULL,
    `host` VARCHAR(128) NOT NULL,
    `status` VARCHAR(32) NOT NULL,
    `stdout` TEXT,
    `stderr` TEXT,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE task_host_79
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_80
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_81
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_82
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_83
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_84
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_85
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_86
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_87
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_88
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_89
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_90
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_91
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_92
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_93
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_94
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_95
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_96
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_97
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_98
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_99
(
`ii` bigint unsigned NOT NULL AUTO_INCREMENT,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
UNIQUE KEY `idx_id_host` (`id`, `host`),
PRIMARY KEY (`ii`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8mb4;
-- source_token: access tokens attached to a (source_type, source_id) pair.
-- `expire_at` and `create_at` are bigint timestamps (unit not stated here —
-- TODO confirm seconds vs. milliseconds). The composite index is ordered
-- source_type, source_id, token.
CREATE TABLE `source_token` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT,
`source_type` varchar(64) NOT NULL DEFAULT '' COMMENT 'source type',
`source_id` varchar(255) NOT NULL DEFAULT '' COMMENT 'source identifier',
`token` varchar(255) NOT NULL DEFAULT '' COMMENT 'access token',
`expire_at` bigint NOT NULL DEFAULT 0 COMMENT 'expire timestamp',
`create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp',
`create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'creator',
PRIMARY KEY (`id`),
KEY `idx_source_type_id_token` (`source_type`, `source_id`, `token`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
================================================
FILE: docker/initsql/c-init.sql
================================================
-- Creates (if missing) a `root` account for 127.0.0.1, localhost and any host
-- ('%'), each with the password '1234', and grants every privilege on every
-- database WITH GRANT OPTION.
-- NOTE(review): the file path suggests this is only a Docker bootstrap script.
-- A fixed, trivial password on 'root'@'%' must not reach a deployment whose
-- MySQL port is reachable from outside the container network.
CREATE USER IF NOT EXISTS 'root'@'127.0.0.1' IDENTIFIED BY '1234';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'127.0.0.1' WITH GRANT OPTION;
CREATE USER IF NOT EXISTS 'root'@'localhost' IDENTIFIED BY '1234';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'localhost' WITH GRANT OPTION;
CREATE USER IF NOT EXISTS 'root'@'%' IDENTIFIED BY '1234';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION;
FLUSH PRIVILEGES;
================================================
FILE: docker/migratesql/migrate.sql
================================================
/* v7.0.0-beta.3 */
-- builtin_metrics: one row per metric definition (collector, typ, name, unit,
-- lang, note, expression) plus created_*/updated_* audit columns.
-- Single-column indexes exist on collector, typ, name and lang.
-- NOTE(review): the `note` column comment says "in Chinese" although a `lang`
-- column is present — confirm whether the comment is stale.
CREATE TABLE `builtin_metrics` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
`collector` varchar(191) NOT NULL COMMENT 'type of collector',
`typ` varchar(191) NOT NULL COMMENT 'type of metric',
`name` varchar(191) NOT NULL COMMENT 'name of metric',
`unit` varchar(191) NOT NULL COMMENT 'unit of metric',
`lang` varchar(191) NOT NULL DEFAULT '' COMMENT 'language of metric',
`note` varchar(4096) NOT NULL COMMENT 'description of metric in Chinese',
`expression` varchar(4096) NOT NULL COMMENT 'expression of metric',
`created_at` bigint NOT NULL DEFAULT 0 COMMENT 'create time',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'creator',
`updated_at` bigint NOT NULL DEFAULT 0 COMMENT 'update time',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'updater',
PRIMARY KEY (`id`),
INDEX `idx_collector` (`collector`),
INDEX `idx_typ` (`typ`),
INDEX `idx_name` (`name`),
INDEX `idx_lang` (`lang`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- metric_filter: named metric filters. `configs` holds the filter definition
-- as a string of up to 4096 characters; `groups_perm` is nullable free text.
-- The bigint defaults are written as quoted '0'. Indexed by `name`.
CREATE TABLE `metric_filter` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
`name` varchar(191) NOT NULL COMMENT 'name of metric filter',
`configs` varchar(4096) NOT NULL COMMENT 'configuration of metric filter',
`groups_perm` text,
`create_at` bigint NOT NULL DEFAULT '0' COMMENT 'create time',
`create_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'creator',
`update_at` bigint NOT NULL DEFAULT '0' COMMENT 'update time',
`update_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'updater',
PRIMARY KEY (`id`),
KEY `idx_name` (`name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- board_busigroup: link table between busi groups and boards. The composite
-- primary key (busi_group_id, board_id) allows each pair at most once.
CREATE TABLE `board_busigroup` (
`busi_group_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'busi group id',
`board_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'board id',
PRIMARY KEY (`busi_group_id`, `board_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* v7.0.0-beta.6 */
-- builtin_components: one row per component (ident, logo, readme) plus
-- created_*/updated_* audit columns; indexed by `ident`.
-- The COMMENT literals are written as '''text''': inside a quoted string ''
-- is an escaped quote, so the stored column comment includes the surrounding
-- single quotes (e.g. 'unique identifier' with the quotes).
CREATE TABLE `builtin_components` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`ident` varchar(191) NOT NULL COMMENT '''identifier of component''',
`logo` varchar(191) NOT NULL COMMENT '''logo of component''',
`readme` text NOT NULL COMMENT '''readme of component''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_ident` (`ident`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- builtin_payloads: payload rows classified by type, component, cate and name,
-- with the body in `content` (longtext). Note that `uuid` is a bigint, not a
-- textual UUID. Single-column indexes exist on component, name, cate, uuid
-- and type. As in builtin_components, the '''text''' COMMENT literals store
-- the quotes as part of the comment.
CREATE TABLE `builtin_payloads` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
`uuid` bigint(20) NOT NULL COMMENT '''uuid of payload''',
`type` varchar(191) NOT NULL COMMENT '''type of payload''',
`component` varchar(191) NOT NULL COMMENT '''component of payload''',
`cate` varchar(191) NOT NULL COMMENT '''category of payload''',
`name` varchar(191) NOT NULL COMMENT '''name of payload''',
`tags` varchar(191) NOT NULL DEFAULT '' COMMENT '''tags of payload''',
`content` longtext NOT NULL COMMENT '''content of payload''',
`created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
`created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
`updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
`updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
PRIMARY KEY (`id`),
KEY `idx_component` (`component`),
KEY `idx_name` (`name`),
KEY `idx_cate` (`cate`),
KEY `idx_uuid` (`uuid`),
KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Column additions for v7.0.0-beta.7 through v7.1.0. Each statement is a plain
-- ADD COLUMN without an existence guard, so re-running it against a schema
-- that already has the column fails with a duplicate-column error.
/* v7.0.0-beta.7 */
ALTER TABLE users ADD COLUMN last_active_time BIGINT NOT NULL DEFAULT 0;
/* v7.0.0-beta.13 */
ALTER TABLE recording_rule ADD COLUMN cron_pattern VARCHAR(255) DEFAULT '' COMMENT 'cron pattern';
/* v7.0.0-beta.14 */
-- original_tags is added to both the current and the historical event table.
ALTER TABLE alert_cur_event ADD COLUMN original_tags TEXT COMMENT 'labels key=val,,k2=v2';
ALTER TABLE alert_his_event ADD COLUMN original_tags TEXT COMMENT 'labels key=val,,k2=v2';
/* v7.1.0 */
ALTER TABLE target ADD COLUMN os VARCHAR(31) DEFAULT '' COMMENT 'os type';
/* v7.2.0 */
-- notification_record: one row per notification attempt, identified by
-- event_id / sub_id / channel, with a tinyint `status` (default 0), the
-- `target` it was sent to and optional `details`. Indexed by event_id only.
-- NOTE(review): the meaning of the individual `status` values is not defined
-- in this file — confirm against the writer.
CREATE TABLE notification_record (
`id` BIGINT PRIMARY KEY AUTO_INCREMENT,
`event_id` BIGINT NOT NULL,
`sub_id` BIGINT NOT NULL,
`channel` VARCHAR(255) NOT NULL,
`status` TINYINT NOT NULL DEFAULT 0,
`target` VARCHAR(1024) NOT NULL,
`details` VARCHAR(2048),
`created_at` BIGINT NOT NULL,
INDEX idx_evt (event_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- v7.3.x column additions (unguarded ADD COLUMN; not re-runnable).
/* v7.3.0 2024-08-26 */
ALTER TABLE `target` ADD COLUMN `host_tags` TEXT COMMENT 'global labels set in conf file';
/* v7.3.4 2024-08-28 */
-- component_id defaults to 0 for rows that existed before this migration.
ALTER TABLE `builtin_payloads` ADD COLUMN `component_id` bigint(20) NOT NULL DEFAULT 0 COMMENT 'component_id';
/* v7.4.0 2024-09-20 */
-- target_busi_group: associates a target (by `target_ident`) with a group id.
-- The unique key on (target_ident, group_id) prevents duplicate associations.
CREATE TABLE `target_busi_group` (
`id` bigint NOT NULL AUTO_INCREMENT,
`target_ident` varchar(191) NOT NULL,
`group_id` bigint NOT NULL,
`update_at` bigint NOT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `idx_target_group` (`target_ident`,`group_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Column changes for v7.7.0 through v8.0.0-beta.3.
/* v7.7.0 2024-11-13 */
ALTER TABLE `recording_rule` ADD COLUMN `datasource_queries` TEXT;
ALTER TABLE `alert_rule` ADD COLUMN `datasource_queries` TEXT;
/* v7.7.2 2024-12-02 */
-- MODIFY COLUMN replaces the whole column definition: only the type is given
-- here, so the columns become nullable with no DEFAULT and no COMMENT.
-- NOTE(review): confirm that dropping any earlier NOT NULL/DEFAULT is intended.
ALTER TABLE alert_subscribe MODIFY COLUMN rule_ids varchar(1024);
ALTER TABLE alert_subscribe MODIFY COLUMN busi_groups varchar(4096);
/* v8.0.0-beta.1 2024-12-13 */
ALTER TABLE `alert_rule` ADD COLUMN `cron_pattern` VARCHAR(64);
-- Widens `logo` from varchar(191) to mediumtext; the new definition omits
-- NOT NULL, so the column becomes nullable.
ALTER TABLE `builtin_components` MODIFY COLUMN `logo` mediumtext COMMENT '''logo of component''';
/* v8.0.0-beta.2 2024-12-26 */
ALTER TABLE `es_index_pattern` ADD COLUMN `cross_cluster_enabled` int not null default 0;
/* v8.0.0-beta.3 2025-01-03 */
ALTER TABLE `builtin_components` ADD COLUMN `disabled` INT NOT NULL DEFAULT 0 COMMENT 'is disabled or not';
-- dash_annotation: annotations attached to a dashboard panel. `tags` and
-- `config` are stored as text (the column comment describes `tags` as a JSON
-- array string); `time_start` / `time_end` bound the annotated interval.
-- Indexed by dashboard_id only.
CREATE TABLE `dash_annotation` (
`id` bigint unsigned not null auto_increment,
`dashboard_id` bigint not null comment 'dashboard id',
`panel_id` varchar(191) not null comment 'panel id',
`tags` text comment 'tags array json string',
`description` text comment 'annotation description',
`config` text comment 'annotation config',
`time_start` bigint not null default 0 comment 'start timestamp',
`time_end` bigint not null default 0 comment 'end timestamp',
`create_at` bigint not null default 0 comment 'create time',
`create_by` varchar(64) not null default '' comment 'creator',
`update_at` bigint not null default 0 comment 'update time',
`update_by` varchar(64) not null default '' comment 'updater',
PRIMARY KEY (`id`),
KEY `idx_dashboard_id` (`dashboard_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* v8.0.0-beta.5 2025-02-05 */
-- user_token: named tokens per username, with creation and last-used
-- timestamps. Only the primary key is indexed; there is no secondary index or
-- uniqueness constraint on `token` or `username`.
CREATE TABLE `user_token` (
`id` bigint NOT NULL AUTO_INCREMENT,
`username` varchar(255) NOT NULL DEFAULT '',
`token_name` varchar(255) NOT NULL DEFAULT '',
`token` varchar(255) NOT NULL DEFAULT '',
`create_at` bigint NOT NULL DEFAULT 0,
`last_used` bigint NOT NULL DEFAULT 0,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* v8.0.0-beta.7 2025-03-01 */
-- notify_rule: named notification rules. `enable` is a 0/1 flag defaulting to
-- 0; `user_group_ids` is kept in a varchar(255) and `notify_configs` in text.
-- NOTE(review): the serialization of user_group_ids / notify_configs is not
-- visible here — confirm with the model code.
CREATE TABLE `notify_rule` (
`id` bigint unsigned not null auto_increment,
`name` varchar(255) not null,
`description` text,
`enable` tinyint(1) not null default 0,
`user_group_ids` varchar(255) not null default '',
`notify_configs` text,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- notify_channel: notification channel definitions. `request_type` is
-- mandatory; `param_config` and `request_config` are free-form text.
-- `ident` carries no unique constraint in this definition.
CREATE TABLE `notify_channel` (
`id` bigint unsigned not null auto_increment,
`name` varchar(255) not null,
`ident` varchar(255) not null,
`description` text,
`enable` tinyint(1) not null default 0,
`param_config` text,
`request_type` varchar(50) not null,
`request_config` text,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- message_template: message templates, tied to a channel through
-- `notify_channel_ident`. `private` is an int flag defaulting to 0.
-- `user_group_ids` is limited to 64 characters here, whereas notify_rule
-- stores the same-named column in varchar(255).
CREATE TABLE `message_template` (
`id` bigint unsigned not null auto_increment,
`name` varchar(64) not null,
`ident` varchar(64) not null,
`content` text,
`user_group_ids` varchar(64),
`notify_channel_ident` varchar(64) not null default '',
`private` int not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Remaining v8.0.0-beta.7 changes: alert_rule and alert_subscribe each gain
-- `notify_rule_ids` and `notify_version`, and notification_record gains
-- `notify_rule_id`. All are unguarded ADD COLUMN statements.
ALTER TABLE `alert_rule` ADD COLUMN `notify_rule_ids` varchar(1024) DEFAULT '';
ALTER TABLE `alert_rule` ADD COLUMN `notify_version` int DEFAULT 0;
ALTER TABLE `alert_subscribe` ADD COLUMN `notify_rule_ids` varchar(1024) DEFAULT '';
ALTER TABLE `alert_subscribe` ADD COLUMN `notify_version` int DEFAULT 0;
ALTER TABLE `notification_record` ADD COLUMN `notify_rule_id` BIGINT NOT NULL DEFAULT 0;
/* v8.0.0-beta.9 2025-03-17 */
ALTER TABLE `message_template` ADD COLUMN `weight` int not null default 0;
ALTER TABLE `notify_channel` ADD COLUMN `weight` int not null default 0;
/* v8.0.0-beta.11 2025-04-10 */
ALTER TABLE `es_index_pattern` ADD COLUMN `note` varchar(1024) not null default '';
ALTER TABLE `datasource` ADD COLUMN `identifier` varchar(255) not null default '';
/* v8.0.0-beta.11 2025-05-15 */
ALTER TABLE `notify_rule` ADD COLUMN `pipeline_configs` text;
-- event_pipeline: named pipelines. `filter_enable` is a 0/1 flag defaulting
-- to 0; `team_ids`, `attr_filters` and `processor_configs` are nullable text.
-- Only the primary key is indexed.
CREATE TABLE `event_pipeline` (
`id` bigint unsigned not null auto_increment,
`name` varchar(128) not null,
`team_ids` text,
`description` varchar(255) not null default '',
`filter_enable` tinyint(1) not null default 0,
`attr_filters` text,
`processor_configs` text,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
/* v8.0.0 2025-05-15 */
-- embedded_product: named URLs with a privacy flag and a `team_ids` string.
-- `name`, `url` and `is_private` are all nullable (DEFAULT NULL), unlike the
-- audit columns, which are NOT NULL with defaults.
CREATE TABLE `embedded_product` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(255) DEFAULT NULL,
`url` varchar(255) DEFAULT NULL,
`is_private` boolean DEFAULT NULL,
`team_ids` varchar(255),
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* v8.0.0 2025-05-29 */
-- source_token: access tokens attached to a (source_type, source_id) pair,
-- with bigint expiry and creation timestamps. The composite index is ordered
-- source_type, source_id, token. The statement has no IF NOT EXISTS guard, so
-- it fails if the table is already present.
CREATE TABLE `source_token` (
`id` bigint unsigned NOT NULL AUTO_INCREMENT,
`source_type` varchar(64) NOT NULL DEFAULT '' COMMENT 'source type',
`source_id` varchar(255) NOT NULL DEFAULT '' COMMENT 'source identifier',
`token` varchar(255) NOT NULL DEFAULT '' COMMENT 'access token',
`expire_at` bigint NOT NULL DEFAULT 0 COMMENT 'expire timestamp',
`create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp',
`create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'creator',
PRIMARY KEY (`id`),
KEY `idx_source_type_id_token` (`source_type`, `source_id`, `token`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
/* v8.0.0-beta.12 2025-06-03 */
-- Adds a nullable `notify_rule_ids` text column to both the historical and
-- the current alert event table.
ALTER TABLE `alert_his_event` ADD COLUMN `notify_rule_ids` text COMMENT 'notify rule ids';
ALTER TABLE `alert_cur_event` ADD COLUMN `notify_rule_ids` text COMMENT 'notify rule ids';
/* v8.0.0-beta.13 */
-- Drop the idx_collector_typ_name unique index from builtin_metrics, if present.
-- `DROP INDEX IF EXISTS` is MariaDB-only syntax and is a parse error on MySQL,
-- so the existence check is done against information_schema and the DDL is run
-- through a prepared statement, which both servers accept. When the index is
-- absent a no-op `DO 0` is executed instead, keeping the script re-runnable.
SET @n9e_idx_exists = (
    SELECT COUNT(*)
    FROM information_schema.statistics
    WHERE table_schema = DATABASE()
      AND table_name = 'builtin_metrics'
      AND index_name = 'idx_collector_typ_name'
);
SET @n9e_drop_idx_sql = IF(
    @n9e_idx_exists > 0,
    'ALTER TABLE `builtin_metrics` DROP INDEX `idx_collector_typ_name`',
    'DO 0'
);
PREPARE n9e_drop_idx_stmt FROM @n9e_drop_idx_sql;
EXECUTE n9e_drop_idx_stmt;
DEALLOCATE PREPARE n9e_drop_idx_stmt;
-- Column additions for v8.0.0 (2025-07-03) through v9 (2026-01-09). All are
-- unguarded ADD COLUMN statements and fail if the column already exists.
/* v8.0.0 2025-07-03 */
-- AFTER `lang` places the new column directly behind `lang` in column order.
ALTER TABLE `builtin_metrics` ADD COLUMN `translation` TEXT COMMENT 'translation of metric' AFTER `lang`;
/* v8.4.0 2025-10-15 */
ALTER TABLE `notify_rule` ADD COLUMN `extra_config` text COMMENT 'extra config';
/* v8.4.1 2025-11-10 */
ALTER TABLE `alert_rule` ADD COLUMN `pipeline_configs` text COMMENT 'pipeline configs';
/* v8.4.2 2025-11-13 */
ALTER TABLE `board` ADD COLUMN `note` varchar(1024) not null default '' comment 'note';
ALTER TABLE `builtin_payloads` ADD COLUMN `note` varchar(1024) not null default '' comment 'note of payload';
/* v9 2026-01-09 */
-- event_pipeline gains workflow-related columns. Existing rows receive the
-- defaults: typ '' , use_case '', trigger_mode 'event', disabled 0; the text
-- columns (described as JSON in their comments) start out NULL.
ALTER TABLE `event_pipeline` ADD COLUMN `typ` varchar(128) NOT NULL DEFAULT '' COMMENT 'pipeline type: builtin, user-defined';
ALTER TABLE `event_pipeline` ADD COLUMN `use_case` varchar(128) NOT NULL DEFAULT '' COMMENT 'use case: metric_explorer, event_summary, event_pipeline';
ALTER TABLE `event_pipeline` ADD COLUMN `trigger_mode` varchar(128) NOT NULL DEFAULT 'event' COMMENT 'trigger mode: event, api, cron';
ALTER TABLE `event_pipeline` ADD COLUMN `disabled` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'disabled flag';
ALTER TABLE `event_pipeline` ADD COLUMN `nodes` text COMMENT 'workflow nodes (JSON)';
ALTER TABLE `event_pipeline` ADD COLUMN `connections` text COMMENT 'node connections (JSON)';
ALTER TABLE `event_pipeline` ADD COLUMN `input_variables` text COMMENT 'input variables (JSON)';
ALTER TABLE `event_pipeline` ADD COLUMN `label_filters` text COMMENT 'label filters (JSON)';
-- event_pipeline_execution: one row per pipeline run. The primary key is a
-- string id of up to 36 characters (NOTE(review): that length matches a
-- canonical UUID string — confirm the generator). `mode` defaults to 'event'
-- and `status` to 'running'; per-node results are kept in a mediumtext column.
-- Secondary indexes: pipeline_id, event_id, mode, status, created_at.
CREATE TABLE `event_pipeline_execution` (
`id` varchar(36) NOT NULL COMMENT 'execution id',
`pipeline_id` bigint NOT NULL COMMENT 'pipeline id',
`pipeline_name` varchar(128) DEFAULT '' COMMENT 'pipeline name snapshot',
`event_id` bigint DEFAULT 0 COMMENT 'related alert event id',
`mode` varchar(16) NOT NULL DEFAULT 'event' COMMENT 'trigger mode: event/api/cron',
`status` varchar(16) NOT NULL DEFAULT 'running' COMMENT 'status: running/success/failed',
`node_results` mediumtext COMMENT 'node execution results (JSON)',
`error_message` varchar(1024) DEFAULT '' COMMENT 'error message',
`error_node` varchar(36) DEFAULT '' COMMENT 'error node id',
`created_at` bigint NOT NULL DEFAULT 0 COMMENT 'start timestamp',
`finished_at` bigint DEFAULT 0 COMMENT 'finish timestamp',
`duration_ms` bigint DEFAULT 0 COMMENT 'duration in milliseconds',
`trigger_by` varchar(64) DEFAULT '' COMMENT 'trigger by',
`inputs_snapshot` text COMMENT 'inputs snapshot',
PRIMARY KEY (`id`),
KEY `idx_pipeline_id` (`pipeline_id`),
KEY `idx_event_id` (`event_id`),
KEY `idx_mode` (`mode`),
KEY `idx_status` (`status`),
KEY `idx_created_at` (`created_at`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='event pipeline execution records';
/* v8.5.0 builtin_metrics new fields */
-- Existing rows receive expression_type 'promql' and metric_type ''.
-- `extra_fields` is nullable text. This v8.5.0 section sits after the first
-- "v9" section in the file, so file order is not strictly version order.
ALTER TABLE `builtin_metrics` ADD COLUMN `expression_type` varchar(32) NOT NULL DEFAULT 'promql' COMMENT 'expression type: metric_name or promql';
ALTER TABLE `builtin_metrics` ADD COLUMN `metric_type` varchar(191) NOT NULL DEFAULT '' COMMENT 'metric type like counter/gauge';
ALTER TABLE `builtin_metrics` ADD COLUMN `extra_fields` text COMMENT 'custom extra fields';
/* v9 2026-01-16 saved_view */
-- saved_view: named, per-page saved filter configurations. `public_cate`
-- encodes visibility (0 self, 1 team, 2 all, per its column comment) and
-- `gids` lists the team group ids as text. Indexed by page and by create_by.
CREATE TABLE `saved_view` (
`id` bigint NOT NULL AUTO_INCREMENT,
`name` varchar(255) NOT NULL COMMENT 'view name',
`page` varchar(64) NOT NULL COMMENT 'page identifier',
`filter` text COMMENT 'filter config (JSON)',
`public_cate` int NOT NULL DEFAULT 0 COMMENT 'public category: 0-self, 1-team, 2-all',
`gids` text COMMENT 'team group ids (JSON)',
`create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp',
`create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'creator',
`update_at` bigint NOT NULL DEFAULT 0 COMMENT 'update timestamp',
`update_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'updater',
PRIMARY KEY (`id`),
KEY `idx_page` (`page`),
KEY `idx_create_by` (`create_by`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='saved views for pages';
-- user_view_favorite: records that a user marked a saved view as favorite.
-- view_id and user_id are indexed separately; there is no unique constraint
-- on the (view_id, user_id) pair, so the schema itself allows duplicates.
CREATE TABLE `user_view_favorite` (
`id` bigint NOT NULL AUTO_INCREMENT,
`view_id` bigint NOT NULL COMMENT 'saved view id',
`user_id` bigint NOT NULL COMMENT 'user id',
`create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp',
PRIMARY KEY (`id`),
KEY `idx_view_id` (`view_id`),
KEY `idx_user_id` (`user_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='user favorite views';
/* v9 2026-01-20 datasource weight */
-- Existing datasource rows get weight 0.
ALTER TABLE `datasource` ADD COLUMN `weight` int not null default 0 COMMENT 'weight for sorting';
/* v9 2026-01-20 alert_rule time_zone support */
-- Existing alert rules get an empty time_zone string.
-- NOTE(review): how an empty value is interpreted is decided by the
-- application, not by this file.
ALTER TABLE `alert_rule` ADD COLUMN `time_zone` varchar(64) not null default '';
================================================
FILE: docker/sqlite.sql
================================================
-- users (SQLite): account table. `username` is declared `unique` on the
-- column, and idx_users_username below adds a second unique index on the same
-- column.
CREATE TABLE `users` (
`id` integer primary key autoincrement,
`username` varchar(64) not null unique,
`nickname` varchar(64) not null,
`password` varchar(128) not null default '',
`phone` varchar(16) not null default '',
`email` varchar(64) not null default '',
`portrait` varchar(255) not null default '',
`roles` varchar(255) not null,
`contacts` varchar(1024),
`maintainer` tinyint(1) not null default 0,
`belong` varchar(16) not null default '',
`last_active_time` bigint not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
CREATE UNIQUE INDEX idx_users_username ON `users` (username);
-- Seeds user id 1, `root`, with role 'Admin'. The password column receives the
-- literal string 'root.2020'; strftime('%s', 'now') yields the current Unix
-- time in seconds.
-- NOTE(review): whether the application hashes or rotates this default
-- password is not visible here — confirm, and change it after first login.
insert into `users`(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', strftime('%s', 'now'), 'system', strftime('%s', 'now'), 'system');
-- user_group (SQLite): named user groups with audit columns, indexed by
-- create_by and update_at.
CREATE TABLE `user_group` (
`id` integer primary key autoincrement,
`name` varchar(128) not null default '',
`note` varchar(255) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
CREATE INDEX `idx_user_group_create_by` ON `user_group` (`create_by` asc);
CREATE INDEX `idx_user_group_update_at` ON `user_group` (`update_at` asc);
-- Seeds group id 1, 'demo-root-group', created by 'root' at the current Unix time.
insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', strftime('%s', 'now'), 'root', strftime('%s', 'now'), 'root');
-- user_group_member (SQLite): membership rows linking a user id to a group
-- id, indexed on each column separately. No uniqueness is enforced on the
-- (group_id, user_id) pair.
CREATE TABLE `user_group_member` (
`id` integer primary key autoincrement,
`group_id` bigint unsigned not null,
`user_id` bigint unsigned not null
);
CREATE INDEX `idx_user_group_member_group_id` ON `user_group_member` (`group_id` asc);
CREATE INDEX `idx_user_group_member_user_id` ON `user_group_member` (`user_id` asc);
-- Puts the seeded user 1 into the seeded group 1.
insert into user_group_member(group_id, user_id) values(1, 1);
-- configs (SQLite): key/value settings (`ckey` -> `cval`) with a note and two
-- 0/1 flags, `external` and `encrypted`, both defaulting to 0. No index or
-- unique constraint is declared on `ckey` here.
CREATE TABLE `configs` (
`id` integer primary key autoincrement,
`ckey` varchar(191) not null,
`cval` text not null,
`note` varchar(1024) not null default '',
`external` tinyint(1) not null default 0,
`encrypted` tinyint(1) not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
-- role (SQLite): role names (unique) with a free-text note.
CREATE TABLE `role` (
`id` integer primary key autoincrement,
`name` varchar(191) not null unique default '',
`note` varchar(255) not null default ''
);
-- Seeds the three built-in roles: Admin, Standard and Guest.
insert into `role`(name, note) values('Admin', 'Administrator role');
insert into `role`(name, note) values('Standard', 'Ordinary user role');
insert into `role`(name, note) values('Guest', 'Readonly user role');
-- role_operation (SQLite): maps a role name to an operation string (a
-- path-like identifier such as '/alert-rules/add'). Indexed on role_name and
-- on operation; no uniqueness is enforced on the pair.
CREATE TABLE `role_operation`(
`id` integer primary key autoincrement,
`role_name` varchar(128) not null,
`operation` varchar(191) not null
);
CREATE INDEX `idx_role_operation_role_name` ON `role_operation` (`role_name` asc);
CREATE INDEX `idx_role_operation_operation` ON `role_operation` (`operation` asc);
-- Admin is a special role: it needs no concrete operation rows and may do
-- anything. NOTE(review): four Admin rows for /help/* are nevertheless seeded
-- below — confirm whether they are still required.
-- Guest: read-only explorer and help pages.
insert into `role_operation`(role_name, operation) values('Guest', '/metric/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/object/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/log/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/trace/explorer');
insert into `role_operation`(role_name, operation) values('Guest', '/help/version');
insert into `role_operation`(role_name, operation) values('Guest', '/help/contact');
-- Standard: everything Guest has, plus the operations listed from here on.
insert into `role_operation`(role_name, operation) values('Standard', '/metric/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/object/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/log/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/explorer');
insert into `role_operation`(role_name, operation) values('Standard', '/help/version');
insert into `role_operation`(role_name, operation) values('Standard', '/help/contact');
insert into `role_operation`(role_name, operation) values('Standard', '/help/servers');
insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in');
insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies');
insert into `role_operation`(role_name, operation) values('Admin', '/help/source');
insert into `role_operation`(role_name, operation) values('Admin', '/help/sso');
insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-tpls');
insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-settings');
insert into `role_operation`(role_name, operation) values('Standard', '/users');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/add');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/put');
insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/del');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/add');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/put');
insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/del');
insert into `role_operation`(role_name, operation) values('Standard', '/targets');
insert into `role_operation`(role_name, operation) values('Standard', '/targets/add');
insert into `role_operation`(role_name, operation) values('Standard', '/targets/put');
insert into `role_operation`(role_name, operation) values('Standard', '/targets/del');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/add');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/put');
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/add');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/put');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/del');
-- Note: alert-mutes has add/del but no put row.
insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/add');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/add');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/put');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events/del');
insert into `role_operation`(role_name, operation) values('Standard', '/alert-his-events');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/add');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/put');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/del');
-- Note: job-tasks has add/put but no del row.
insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/add');
insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/put');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/add');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/put');
insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/del');
-- busi_group: grouping entity used for alert_rule, collect_rule, mute and dashboard records.
-- `name` is unique. One row ('Default Busi Group', id 1) is seeded below with
-- create_at/update_at set to the current Unix time in seconds.
CREATE TABLE `busi_group` (
`id` integer primary key autoincrement,
`name` varchar(191) not null unique,
`label_enable` tinyint(1) not null default 0,
`label_value` varchar(191) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', strftime('%s', 'now'), 'root', strftime('%s', 'now'), 'root');
-- busi_group_member: links a busi_group_id to a user_group_id with a two-character perm_flag.
-- Both id columns are indexed. The seed row links busi group 1 to user group 1 with 'rw'
-- (presumably read-write — confirm against the permission checks in the application).
CREATE TABLE `busi_group_member` (
`id` integer primary key autoincrement,
`busi_group_id` bigint not null,
`user_group_id` bigint not null,
`perm_flag` char(2) not null
);
CREATE INDEX `idx_busi_group_member_busi_group_id` ON `busi_group_member` (`busi_group_id` asc);
CREATE INDEX `idx_busi_group_member_user_group_id` ON `busi_group_member` (`user_group_id` asc);
insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw');
-- board: dashboard metadata for the new dashboard version.
-- The (group_id, name) pair is unique; `ident` has its own non-unique index.
CREATE TABLE `board` (
`id` integer primary key autoincrement,
`group_id` bigint not null default 0,
`name` varchar(191) not null,
`ident` varchar(200) not null default '',
`tags` varchar(255) not null,
`public` tinyint(1) not null default 0,
`built_in` tinyint(1) not null default 0,
`hide` tinyint(1) not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
`note` varchar(1024) not null default '',
`public_cate` bigint not null default 0
);
CREATE UNIQUE INDEX idx_board_group_id_name ON `board` (group_id, name);
CREATE INDEX `idx_board_ident` ON `board` (`ident` asc);
-- board_payload: payload text for the new dashboard version, keyed by a unique `id`.
-- NOTE(review): `id` is not autoincrement; presumably it carries board.id — confirm.
CREATE TABLE `board_payload` (
`id` bigint unsigned not null unique,
`payload` mediumtext not null
);
-- chart: chart `configs` text belonging to a group_id, with an integer `weight`.
-- Indexed by group_id.
CREATE TABLE `chart` (
`id` integer primary key autoincrement,
`group_id` integer not null,
`configs` text,
`weight` integer not null default 0
);
CREATE INDEX idx_chart_group_id ON `chart` (group_id);
-- chart_share: chart `configs` text recorded with its cluster, datasource_id and creator.
-- Indexed by create_at.
CREATE TABLE `chart_share` (
`id` integer primary key autoincrement,
`cluster` varchar(128) not null,
`datasource_id` bigint unsigned not null default 0,
`configs` text,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default ''
);
CREATE INDEX `idx_chart_share_create_at` ON `chart_share` (`create_at` asc);
-- alert_rule: alert rule definitions (query, severity, evaluation interval, enable window,
-- notification settings, callbacks, annotations and audit columns).
-- Columns without a default (cate, cluster, name, severity, disabled, prom_for_duration,
-- rule_config, prom_ql, prom_eval_interval, notify_recovered, annotations, extra_config)
-- must be supplied on insert. Indexed by group_id and update_at.
CREATE TABLE `alert_rule` (
`id` integer primary key autoincrement,
`group_id` bigint not null default 0,
`cate` varchar(128) not null,
`datasource_ids` varchar(255) not null default '',
`cluster` varchar(128) not null,
`name` varchar(255) not null,
`note` varchar(1024) not null default '',
`prod` varchar(255) not null default '',
`algorithm` varchar(255) not null default '',
`algo_params` varchar(255),
`delay` int not null default 0,
`severity` tinyint(1) not null,
`disabled` tinyint(1) not null,
`prom_for_duration` int not null,
`rule_config` text not null,
`prom_ql` text not null,
`prom_eval_interval` int not null,
`enable_stime` varchar(255) not null default '00:00',
`enable_etime` varchar(255) not null default '23:59',
`enable_days_of_week` varchar(255) not null default '',
`enable_in_bg` tinyint(1) not null default 0,
`notify_recovered` tinyint(1) not null,
`notify_channels` varchar(255) not null default '',
`notify_groups` varchar(255) not null default '',
`notify_repeat_step` int not null default 0,
`notify_max_number` int not null default 0,
`recover_duration` int not null default 0 ,
`callbacks` varchar(4096) not null default '',
`runbook_url` varchar(4096),
`append_tags` varchar(255) not null default '',
`annotations` text not null,
`extra_config` text not null,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
`cron_pattern` varchar(64),
`time_zone` varchar(64) not null default '',
`datasource_queries` text
);
CREATE INDEX `idx_alert_rule_group_id` ON `alert_rule` (`group_id` asc);
CREATE INDEX `idx_alert_rule_update_at` ON `alert_rule` (`update_at` asc);
-- alert_mute: mute definitions with a btime/etime window, periodic_mutes, tag and severity
-- filters, and audit columns. `tags` defaults to the string '[]'.
-- Indexed by create_at and group_id.
CREATE TABLE `alert_mute` (
`id` integer primary key autoincrement,
`group_id` bigint not null default 0,
`prod` varchar(255) not null default '',
`note` varchar(1024) not null default '',
`cate` varchar(128) not null,
`cluster` varchar(128) not null,
`datasource_ids` varchar(255) not null default '',
`tags` varchar(4096) default '[]',
`cause` varchar(255) not null default '',
`btime` bigint not null default 0,
`etime` bigint not null default 0,
`disabled` tinyint(1) not null default 0,
`mute_time_type` tinyint(1) not null default 0,
`periodic_mutes` varchar(4096) not null default '',
`severities` varchar(32) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
CREATE INDEX `idx_alert_mute_create_at` ON `alert_mute` (`create_at` asc);
CREATE INDEX `idx_alert_mute_group_id` ON `alert_mute` (`group_id` asc);
-- alert_subscribe: alert subscriptions with rule/tag/severity filters, optional redefinition
-- of severity, channels and webhooks, target user groups, and audit columns.
-- `busi_groups` defaults to the string '[]'. Indexed by update_at and group_id.
CREATE TABLE `alert_subscribe` (
`id` integer primary key autoincrement,
`name` varchar(255) not null default '',
`disabled` tinyint(1) not null default 0,
`group_id` bigint not null default 0,
`prod` varchar(255) not null default '',
`cate` varchar(128) not null,
`datasource_ids` varchar(255) not null default '',
`cluster` varchar(128) not null,
`rule_id` bigint not null default 0,
`severities` varchar(32) not null default '',
`tags` varchar(4096) not null default '',
`redefine_severity` tinyint(1) default 0,
`new_severity` tinyint(1) not null,
`redefine_channels` tinyint(1) default 0,
`new_channels` varchar(255) not null default '',
`user_group_ids` varchar(250) not null,
`busi_groups` VARCHAR(4096) NOT NULL DEFAULT '[]',
`note` VARCHAR(1024) DEFAULT '',
`rule_ids` VARCHAR(1024) DEFAULT '',
`webhooks` text not null,
`extra_config` text not null,
`redefine_webhooks` tinyint(1) default 0,
`for_duration` bigint not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
CREATE INDEX `idx_alert_subscribe_update_at` ON `alert_subscribe` (`update_at` asc);
CREATE INDEX `idx_alert_subscribe_group_id` ON `alert_subscribe` (`group_id` asc);
-- target: monitored targets identified by a unique `ident`, with note, tags, host IP,
-- agent version, host tags, engine name and OS.
-- Indexed by group_id, ident (unique), host_ip, agent_version, engine_name and os.
-- NOTE(review): `ident` is declared unique on the column and again by idx_target_ident.
CREATE TABLE `target` (
`id` integer primary key autoincrement,
`group_id` bigint not null default 0,
`ident` varchar(191) not null unique,
`note` varchar(255) not null default '',
`tags` varchar(512) not null default '',
`host_ip` varchar(15) default '',
`agent_version` varchar(255) default '',
`host_tags` text,
`engine_name` varchar(255) default '',
`os` varchar(31) default '',
`update_at` bigint not null default 0
);
CREATE INDEX `idx_target_group_id` ON `target` (`group_id` asc);
CREATE UNIQUE INDEX idx_target_ident ON `target` (ident);
CREATE INDEX idx_host_ip ON `target` (host_ip);
CREATE INDEX idx_agent_version ON `target` (agent_version);
CREATE INDEX idx_engine_name ON `target` (engine_name);
CREATE INDEX idx_os ON `target` (os);
-- metric_view: named metric views whose `configs` column holds a JSON document.
-- Indexed by create_by. The seeded 'Host View' row (cate 0) filters on
-- __name__ = cpu_usage_idle with `ident` as its dimension label; create_by keeps its default 0.
CREATE TABLE `metric_view` (
`id` integer primary key autoincrement,
`name` varchar(191) not null default '',
`cate` tinyint(1) not null,
`configs` varchar(8192) not null default '',
`create_at` bigint not null default 0,
`create_by` bigint not null default 0,
`update_at` bigint not null default 0
);
CREATE INDEX `idx_metric_view_create_by` ON `metric_view` (`create_by` asc);
insert into metric_view(name, cate, configs) values('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}');
-- recording_rule: recording rule definitions (prom_ql, evaluation interval, cron pattern,
-- appended tags, query configs) with audit columns.
-- Indexed by group_id and update_at.
-- NOTE(review): several bigint defaults are written as the quoted string '0'.
CREATE TABLE `recording_rule` (
`id` integer primary key autoincrement,
`group_id` bigint not null default '0',
`datasource_ids` varchar(255) not null default '',
`cluster` varchar(128) not null,
`name` varchar(255) not null,
`note` varchar(255) not null,
`disabled` tinyint(1) not null default 0,
`prom_ql` varchar(8192) not null,
`prom_eval_interval` int not null,
`cron_pattern` varchar(255) default '',
`append_tags` varchar(255) default '',
`query_configs` text not null,
`create_at` bigint default '0',
`create_by` varchar(64) default '',
`update_at` bigint default '0',
`update_by` varchar(64) default '',
`datasource_queries` text
);
CREATE INDEX `idx_recording_rule_group_id` ON `recording_rule` (`group_id` asc);
CREATE INDEX `idx_recording_rule_update_at` ON `recording_rule` (`update_at` asc);
-- alert_aggr_view: named aggregation views over alerts; `rule` is a string such as
-- 'field:group_name::field:severity'. Indexed by create_by.
-- Two cate-0 views are seeded: 'By BusiGroup, Severity' and 'By RuleName'.
CREATE TABLE `alert_aggr_view` (
`id` integer primary key autoincrement,
`name` varchar(191) not null default '',
`rule` varchar(2048) not null default '',
`cate` tinyint(1) not null,
`create_at` bigint not null default 0,
`create_by` bigint not null default 0,
`update_at` bigint not null default 0
);
CREATE INDEX `idx_alert_aggr_view_create_by` ON `alert_aggr_view` (`create_by` asc);
insert into alert_aggr_view(name, rule, cate) values('By BusiGroup, Severity', 'field:group_name::field:severity', 0);
insert into alert_aggr_view(name, rule, cate) values('By RuleName', 'field:rule_name', 0);
-- alert_cur_event: alert events, each carrying a copy of rule fields (name, note, prod,
-- algo, severity, prom_ql, ...), notification state, target info, trigger time/value and tags.
-- Indexed by hash, rule_id, (trigger_time, group_id) and notify_repeat_next.
-- NOTE(review): presumably holds only currently-firing events — confirm in the application code.
CREATE TABLE `alert_cur_event` (
`id` integer primary key autoincrement,
`cate` varchar(128) not null,
`datasource_id` bigint not null default 0,
`cluster` varchar(128) not null,
`group_id` bigint unsigned not null,
`group_name` varchar(255) not null default '',
`hash` varchar(64) not null,
`rule_id` bigint unsigned not null,
`rule_name` varchar(255) not null,
`rule_note` varchar(2048) not null default 'alert rule note',
`rule_prod` varchar(255) not null default '',
`rule_algo` varchar(255) not null default '',
`severity` tinyint(1) not null,
`prom_for_duration` int not null,
`prom_ql` varchar(8192) not null,
`prom_eval_interval` int not null,
`callbacks` varchar(255) not null default '',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null,
`notify_channels` varchar(255) not null default '',
`notify_groups` varchar(255) not null default '',
`notify_repeat_next` bigint not null default 0,
`notify_cur_number` int not null default 0,
`target_ident` varchar(191) not null default '',
`target_note` varchar(191) not null default '',
`first_trigger_time` bigint,
`trigger_time` bigint not null,
`trigger_value` varchar(2048) not null,
`annotations` text not null,
`rule_config` text not null,
`tags` varchar(1024) not null default ''
);
CREATE INDEX `idx_alert_cur_event_hash` ON `alert_cur_event` (`hash` asc);
CREATE INDEX `idx_alert_cur_event_rule_id` ON `alert_cur_event` (`rule_id` asc);
CREATE INDEX `idx_alert_cur_event_trigger_time_group_id` ON `alert_cur_event` (`trigger_time`, `group_id` asc);
CREATE INDEX `idx_alert_cur_event_notify_repeat_next` ON `alert_cur_event` (`notify_repeat_next` asc);
-- alert_his_event: alert event records with an is_recovered flag, recover_time,
-- last_eval_time and original_tags, alongside rule, notification, target and trigger columns.
-- Indexed by last_eval_time, hash, rule_id and (trigger_time, group_id).
-- NOTE(review): presumably the event history counterpart of alert_cur_event — confirm.
CREATE TABLE `alert_his_event` (
`id` integer primary key autoincrement,
`is_recovered` tinyint(1) not null,
`cate` varchar(128) not null,
`datasource_id` bigint not null default 0,
`cluster` varchar(128) not null,
`group_id` bigint unsigned not null,
`group_name` varchar(255) not null default '',
`hash` varchar(64) not null,
`rule_id` bigint unsigned not null,
`rule_name` varchar(255) not null,
`rule_note` varchar(2048) not null default 'alert rule note',
`rule_prod` varchar(255) not null default '',
`rule_algo` varchar(255) not null default '',
`severity` tinyint(1) not null,
`prom_for_duration` int not null,
`prom_ql` varchar(8192) not null,
`prom_eval_interval` int not null,
`callbacks` varchar(255) not null default '',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null,
`notify_channels` varchar(255) not null default '',
`notify_groups` varchar(255) not null default '',
`notify_cur_number` int not null default 0,
`target_ident` varchar(191) not null default '',
`target_note` varchar(191) not null default '',
`first_trigger_time` bigint,
`trigger_time` bigint not null,
`trigger_value` varchar(2048) not null,
`recover_time` bigint not null default 0,
`last_eval_time` bigint not null default 0,
`original_tags` varchar(8192),
`tags` varchar(1024) not null default '',
`annotations` text not null,
`rule_config` text not null
);
CREATE INDEX `idx_alert_his_event_last_eval_time` ON `alert_his_event` (`last_eval_time` asc);
CREATE INDEX `idx_alert_his_event_hash` ON `alert_his_event` (`hash` asc);
CREATE INDEX `idx_alert_his_event_rule_id` ON `alert_his_event` (`rule_id` asc);
CREATE INDEX `idx_alert_his_event_trigger_time_group_id` ON `alert_his_event` (`trigger_time`, `group_id` asc);
-- board_busigroup: association between busi_group_id and board_id.
-- The pair is the composite primary key, so each combination appears at most once.
CREATE TABLE `board_busigroup` (
`busi_group_id` bigint(20) NOT NULL DEFAULT '0',
`board_id` bigint(20) NOT NULL DEFAULT '0',
primary key (`busi_group_id`, `board_id`)
);
-- builtin_components: components described by ident, logo and readme text, with audit
-- columns. `ident` is indexed but not declared unique.
CREATE TABLE `builtin_components` (
`id` integer primary key autoincrement,
`ident` varchar(191) not null,
`logo` varchar(191) not null,
`readme` text not null,
`created_at` bigint(20) not null default 0,
`created_by` varchar(191) not null default '',
`updated_at` bigint(20) not null default 0,
`updated_by` varchar(191) not null default ''
);
CREATE INDEX `idx_builtin_components_ident` ON `builtin_components` (`ident` asc);
-- builtin_payloads: payload `content` classified by type, component, cate and name,
-- carrying a component_id and an integer uuid, with audit columns.
-- Indexed by component, name, cate, type and uuid (none of them unique).
CREATE TABLE `builtin_payloads` (
`id` integer primary key autoincrement,
`component_id` integer not null default 0,
`uuid` integer not null,
`type` varchar(191) not null,
`component` varchar(191) not null,
`cate` varchar(191) not null,
`name` varchar(191) not null,
`tags` varchar(191) not null default '',
`content` longtext not null,
`note` varchar(1024) not null default '',
`created_at` bigint(20) not null default 0,
`created_by` varchar(191) not null default '',
`updated_at` bigint(20) not null default 0,
`updated_by` varchar(191) not null default ''
);
CREATE INDEX `idx_builtin_payloads_component` ON `builtin_payloads` (`component` asc);
CREATE INDEX `idx_builtin_payloads_name` ON `builtin_payloads` (`name` asc);
CREATE INDEX `idx_builtin_payloads_cate` ON `builtin_payloads` (`cate` asc);
CREATE INDEX `idx_builtin_payloads_type` ON `builtin_payloads` (`type` asc);
CREATE INDEX idx_uuid ON `builtin_payloads` (uuid);
-- notification_record: one row per notification, recording event_id, optional sub_id,
-- channel, status, target and details. Indexed by event_id.
CREATE TABLE `notification_record` (
`id` integer primary key autoincrement,
`event_id` integer not null,
`sub_id` integer,
`channel` varchar(255) not null,
`status` integer,
`target` varchar(1024) not null,
`details` varchar(2048) default '',
`created_at` integer not null
);
CREATE INDEX idx_evt ON notification_record (event_id);
-- task_tpl: task templates holding account, batch, tolerance, timeout, pause, script,
-- args and tags, with audit columns. Indexed by group_id.
CREATE TABLE `task_tpl` (
`id` integer primary key autoincrement,
`group_id` int unsigned not null,
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`tags` varchar(255) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
CREATE INDEX `idx_task_tpl_group_id` ON `task_tpl` (`group_id` asc);
-- task_tpl_host: (id, host) pairs. The surrogate key is `ii`; `id` is a separate,
-- non-unique column (presumably task_tpl.id — confirm). Indexed by (id, host).
CREATE TABLE `task_tpl_host` (
`ii` integer primary key autoincrement,
`id` int unsigned not null,
`host` varchar(128) not null
);
CREATE INDEX `idx_task_tpl_host_id_host` ON `task_tpl_host` (`id`, `host` asc);
-- task_record: task records holding event_id, group_id, ibex address and credentials,
-- and the task parameters (account, batch, tolerance, timeout, pause, script, args).
-- Indexed by (create_at, group_id), create_by and event_id.
-- NOTE(review): ibex_auth_pass is a plain varchar column; whether the value is
-- encrypted before storage cannot be determined from this schema.
CREATE TABLE `task_record` (
`id` integer primary key autoincrement,
`event_id` bigint not null default 0,
`group_id` bigint not null,
`ibex_address` varchar(128) not null,
`ibex_auth_user` varchar(128) not null default '',
`ibex_auth_pass` varchar(128) not null default '',
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default ''
);
CREATE INDEX `idx_task_record_create_at_group_id` ON `task_record` (`create_at`, `group_id` asc);
CREATE INDEX `idx_task_record_create_by` ON `task_record` (`create_by` asc);
CREATE INDEX `idx_task_record_event_id` ON `task_record` (`event_id` asc);
-- alerting_engines: rows of instance, datasource_id, engine_cluster and a `clock` value.
-- NOTE(review): `clock` is presumably a heartbeat timestamp — confirm against the writer.
CREATE TABLE `alerting_engines` (
`id` integer primary key autoincrement,
`instance` varchar(128) not null default '',
`datasource_id` bigint not null default 0,
`engine_cluster` varchar(128) not null default '',
`clock` bigint not null
);
-- datasource: datasource definitions (category, plugin info, cluster name, settings,
-- status, http and auth configuration, default flag, weight) with audit columns.
-- NOTE(review): `name` is declared unique on the column and again by idx_datasource_name.
CREATE TABLE `datasource`
(
`id` integer primary key autoincrement,
`name` varchar(191) not null default '' unique,
`description` varchar(255) not null default '',
`category` varchar(255) not null default '',
`plugin_id` int unsigned not null default 0,
`plugin_type` varchar(255) not null default '',
`plugin_type_name` varchar(255) not null default '',
`cluster_name` varchar(255) not null default '',
`settings` text not null,
`status` varchar(255) not null default '',
`http` varchar(4096) not null default '',
`auth` varchar(8192) not null default '',
`is_default` tinyint not null default 0,
`weight` int not null default 0,
`created_at` bigint not null default 0,
`created_by` varchar(64) not null default '',
`updated_at` bigint not null default 0,
`updated_by` varchar(64) not null default ''
);
CREATE UNIQUE INDEX idx_datasource_name ON datasource (name);
-- builtin_cate: category names, each stored with a user_id (default 0).
CREATE TABLE `builtin_cate` (
`id` integer primary key autoincrement,
`name` varchar(191) not null,
`user_id` bigint not null default 0
);
-- notify_tpl: notification templates, one per `channel`, with name, content and audit columns.
-- NOTE(review): `channel` is declared unique on the column and again by idx_notify_tpl_channel.
CREATE TABLE `notify_tpl` (
`id` integer primary key autoincrement,
`channel` varchar(32) not null unique,
`name` varchar(255) not null,
`content` text not null,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
CREATE UNIQUE INDEX idx_notify_tpl_channel ON notify_tpl (channel);
-- sso_config: named SSO configuration entries; `content` holds the configuration text.
-- NOTE(review): `name` is declared unique on the column and again by idx_sso_config_name.
CREATE TABLE `sso_config` (
`id` integer primary key autoincrement,
`name` varchar(191) not null unique,
`content` text not null,
`update_at` bigint not null default 0
);
CREATE UNIQUE INDEX idx_sso_config_name ON sso_config (name);
-- es_index_pattern: index patterns per datasource_id; time_field defaults to '@timestamp'.
-- NOTE(review): (datasource_id, name) uniqueness is enforced by the table constraint and
-- again by idx_es_index_pattern_datasource_id_name.
CREATE TABLE `es_index_pattern` (
`id` integer primary key autoincrement,
`datasource_id` bigint not null default 0,
`name` varchar(191) not null,
`time_field` varchar(128) not null default '@timestamp',
`allow_hide_system_indices` tinyint(1) not null default 0,
`fields_format` varchar(4096) not null default '',
`cross_cluster_enabled` int not null default 0,
`create_at` bigint default '0',
`create_by` varchar(64) default '',
`update_at` bigint default '0',
`update_by` varchar(64) default '',
unique (`datasource_id`, `name`)
);
CREATE UNIQUE INDEX idx_es_index_pattern_datasource_id_name ON es_index_pattern (datasource_id, name);
-- builtin_metrics: metric definitions described by collector, typ, name, unit, lang, note
-- and expression (expression_type defaults to 'promql'), with audit columns and an
-- integer uuid. Indexed by collector, typ, name and lang.
-- The uuid column must be quoted as `uuid` with the type outside the quotes: quoting
-- `uuid integer` as a single identifier would create a column literally named
-- "uuid integer" with no declared type, and no `uuid` column at all.
CREATE TABLE `builtin_metrics` (
`id` integer primary key autoincrement,
`collector` varchar(191) NOT NULL,
`typ` varchar(191) NOT NULL,
`name` varchar(191) NOT NULL,
`unit` varchar(191) NOT NULL,
`lang` varchar(191) NOT NULL DEFAULT '',
`note` varchar(4096) NOT NULL,
`expression` varchar(4096) NOT NULL,
`expression_type` varchar(32) NOT NULL DEFAULT 'promql',
`metric_type` varchar(191) NOT NULL DEFAULT '',
`extra_fields` text,
`created_at` bigint NOT NULL DEFAULT 0,
`created_by` varchar(191) NOT NULL DEFAULT '',
`updated_at` bigint NOT NULL DEFAULT 0,
`updated_by` varchar(191) NOT NULL DEFAULT '',
`uuid` integer not null default 0
);
CREATE INDEX idx_collector ON builtin_metrics (collector);
CREATE INDEX idx_typ ON builtin_metrics (typ);
CREATE INDEX idx_builtinmetric_name ON builtin_metrics (name);
CREATE INDEX idx_lang ON builtin_metrics (lang);
-- metric_filter: named metric filters with `configs` and `groups_perm` text, plus audit
-- columns. Indexed by name (not unique).
CREATE TABLE `metric_filter` (
`id` integer primary key autoincrement,
`name` varchar(191) NOT NULL,
`configs` varchar(4096) NOT NULL,
`groups_perm` text,
`create_at` bigint NOT NULL DEFAULT '0',
`create_by` varchar(191) NOT NULL DEFAULT '',
`update_at` bigint NOT NULL DEFAULT '0',
`update_by` varchar(191) NOT NULL DEFAULT ''
);
CREATE INDEX `idx_metric_filter_name` ON `metric_filter` (`name` asc);
-- target_busi_group: association between target_ident and group_id.
-- The unique index allows each (target_ident, group_id) pair only once.
CREATE TABLE `target_busi_group` (
`id` integer primary key autoincrement,
`target_ident` varchar(191) not null,
`group_id` integer not null,
`update_at` integer not null
);
CREATE UNIQUE INDEX idx_target_busi_group ON target_busi_group (target_ident, group_id);
-- dash_annotation: annotations attached to a dashboard_id and panel_id, with tags,
-- description, config, a time_start/time_end range and audit columns.
-- No secondary indexes are defined for this table here.
CREATE TABLE `dash_annotation` (
`id` integer primary key autoincrement,
`dashboard_id` bigint not null,
`panel_id` varchar(191) not null,
`tags` text,
`description` text,
`config` text,
`time_start` bigint not null default 0,
`time_end` bigint not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default ''
);
-- task_meta: task definitions (title, account, batch, tolerance, timeout, pause, script,
-- args, stdin) with `creator` and a `created` timestamp defaulting to CURRENT_TIMESTAMP.
-- Indexed by creator and created.
CREATE TABLE `task_meta`
(
`id` integer primary key autoincrement,
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`stdin` varchar(1024) not null default '',
`creator` varchar(64) not null default '',
`created` timestamp not null default CURRENT_TIMESTAMP
);
CREATE INDEX `idx_task_meta_creator` ON `task_meta` (`creator` asc);
CREATE INDEX `idx_task_meta_created` ON `task_meta` (`created` asc);
-- task_action: one `action` string and a `clock` value per id.
-- Action values: start | cancel | kill | pause.
CREATE TABLE `task_action`
(
`id` integer primary key autoincrement,
`action` varchar(32) not null,
`clock` bigint not null default 0
);
-- task_scheduler: (id, scheduler) pairs with no primary key or unique constraint.
-- Indexed by (id, scheduler).
CREATE TABLE `task_scheduler`
(
`id` bigint unsigned not null,
`scheduler` varchar(128) not null default ''
);
CREATE INDEX `idx_task_scheduler_id_scheduler` ON `task_scheduler` (`id`, `scheduler` asc);
-- task_scheduler_health: one row per unique `scheduler` with a `clock` value.
-- Indexed by clock.
-- NOTE(review): `clock` is presumably the last heartbeat time — confirm against the writer.
CREATE TABLE `task_scheduler_health`
(
`scheduler` varchar(128) not null unique,
`clock` bigint not null
);
CREATE INDEX `idx_task_scheduler_health_clock` ON `task_scheduler_health` (`clock` asc);
-- task_host_doing: rows of id, host, clock and action, with no primary key or unique
-- constraint. Indexed separately by id and by host.
CREATE TABLE `task_host_doing`
(
`id` bigint unsigned not null,
`host` varchar(128) not null,
`clock` bigint not null default 0,
`action` varchar(16) not null
);
CREATE INDEX `idx_task_host_doing_id` ON `task_host_doing` (`id` asc);
CREATE INDEX `idx_task_host_doing_host` ON `task_host_doing` (`host` asc);
-- task_host_0: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_0
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_1: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_1
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_2: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_2
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_3: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_3
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_4: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_4
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_5: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_5
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_6: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_6
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_7: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_7
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_8: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_8
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_9: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_9
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_10: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_10
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_11: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_11
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_12: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_12
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_13: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_13
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_14: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_14
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_15: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_15
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_16: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_16
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_17: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_17
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_18: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_18
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_19: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_19
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_20: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_20
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_21: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_21
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_22: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_22
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_23: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_23
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_24: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_24
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_25: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_25
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_26: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_26
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_27: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_27
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_28: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_28
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_29: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_29
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_30: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_30
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_31: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_31
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_32: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_32
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_33: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_33
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_34: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_34
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_35: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_35
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_36: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_36
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_37: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_37
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_38: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_38
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_39: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_39
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_40: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_40
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_41: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_41
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_42: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_42
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
-- task_host_43: status, stdout and stderr per (id, host); the pair is unique.
-- NOTE(review): same schema as the other task_host_N tables — presumably shards keyed by task id; confirm.
CREATE TABLE task_host_43
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_44
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_45
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_46
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_47
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_48
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_49
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_50
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_51
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_52
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_53
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_54
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_55
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_56
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_57
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_58
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_59
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_60
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_61
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_62
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_63
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_64
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_65
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_66
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_67
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_68
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_69
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_70
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_71
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_72
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_73
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_74
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_75
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_76
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_77
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_78
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_79
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_80
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_81
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_82
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_83
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_84
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_85
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_86
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_87
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_88
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_89
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_90
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_91
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_92
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_93
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_94
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_95
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_96
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_97
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_98
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
CREATE TABLE task_host_99
(
`ii` integer primary key autoincrement,
`id` bigint unsigned not null,
`host` varchar(128) not null,
`status` varchar(32) not null,
`stdout` text,
`stderr` text,
unique (`id`, `host`)
);
================================================
FILE: dscache/cache.go
================================================
package dscache
import (
"sync"
"github.com/ccfos/nightingale/v6/datasource"
"github.com/toolkits/pkg/logger"
)
// Cache is an in-memory registry of datasource instances, grouped first by
// category and then by datasource ID.
type Cache struct {
// datas maps category -> datasource ID -> datasource instance.
datas map[string]map[int64]datasource.Datasource
// mutex guards every access to datas.
mutex *sync.RWMutex
}
// DsCache is the package-level Cache instance. It starts empty and is filled
// by PutDatasources.
var DsCache = Cache{
datas: make(map[string]map[int64]datasource.Datasource),
mutex: new(sync.RWMutex),
}
// Put initializes ds and stores it in the cache under (cate, dsId).
//
// If an entry that reports Equal(ds) is already cached, Put returns without
// touching anything. If ds.InitClient fails, the error is logged and the
// cache is left unchanged.
func (cs *Cache) Put(cate string, dsId int64, ds datasource.Datasource) {
	// Phase 1 (locked): make sure the category bucket exists and check whether
	// an equal datasource is already cached. The lock is released via defer so
	// that a panic inside Equal cannot leave the mutex held; callers recover
	// from panics around Put, and a stuck lock would block every later
	// Get/Put/Delete.
	unchanged := func() bool {
		cs.mutex.Lock()
		defer cs.mutex.Unlock()

		bucket, ok := cs.datas[cate]
		if !ok {
			bucket = make(map[int64]datasource.Datasource)
			cs.datas[cate] = bucket
		}

		cached, ok := bucket[dsId]
		return ok && cached.Equal(ds)
	}()
	if unchanged {
		return
	}

	// Phase 2 (unlocked): InitClient can be very slow when the user
	// configuration is wrong or the remote end is unreachable. Holding the
	// mutex here would make Get time out, so the lock is deliberately not held.
	if err := ds.InitClient(); err != nil {
		logger.Errorf("init plugin:%s %d %+v client fail: %v", cate, dsId, ds, err)
		return
	}
	logger.Debugf("init plugin:%s %d %+v client success", cate, dsId, ds)

	// Phase 3 (locked): publish the initialized datasource. The category bucket
	// created in phase 1 is never removed (Delete only removes IDs), so it is
	// still present here.
	cs.mutex.Lock()
	defer cs.mutex.Unlock()
	cs.datas[cate][dsId] = ds
}
// Get looks up the datasource cached under (cate, dsId). The boolean result
// reports whether an entry was found; when it is false the returned
// datasource is nil.
func (cs *Cache) Get(cate string, dsId int64) (datasource.Datasource, bool) {
	cs.mutex.RLock()
	defer cs.mutex.RUnlock()

	// Indexing a missing category yields a nil inner map, and reading from a
	// nil map simply reports "not found", so a single lookup covers both the
	// unknown-category and the unknown-ID case.
	cached, ok := cs.datas[cate][dsId]
	if !ok {
		return nil, false
	}
	return cached, true
}
// Delete removes the entry for (cate, dsId) from the cache. It is a no-op when
// the category is unknown. When the category exists, the debug line is logged
// even if dsId itself was not present.
func (cs *Cache) Delete(cate string, dsId int64) {
	cs.mutex.Lock()
	defer cs.mutex.Unlock()

	bucket, known := cs.datas[cate]
	if !known {
		return
	}

	delete(bucket, dsId)
	logger.Debugf("delete plugin:%s %d from cache", cate, dsId)
}
// GetAllIds returns the IDs of every cached datasource, grouped by category.
// The returned map and slices are freshly allocated snapshots; the order of
// IDs within a category is unspecified because it follows map iteration.
func (cs *Cache) GetAllIds() map[string][]int64 {
	cs.mutex.RLock()
	defer cs.mutex.RUnlock()

	grouped := make(map[string][]int64)
	for cate := range cs.datas {
		bucket := cs.datas[cate]
		collected := make([]int64, 0, len(bucket))
		for id := range bucket {
			collected = append(collected, id)
		}
		grouped[cate] = collected
	}
	return grouped
}
================================================
FILE: dscache/sync.go
================================================
package dscache
import (
"context"
"encoding/base64"
"strings"
"sync/atomic"
"time"
"github.com/ccfos/nightingale/v6/datasource"
_ "github.com/ccfos/nightingale/v6/datasource/ck"
_ "github.com/ccfos/nightingale/v6/datasource/doris"
"github.com/ccfos/nightingale/v6/datasource/es"
_ "github.com/ccfos/nightingale/v6/datasource/mysql"
_ "github.com/ccfos/nightingale/v6/datasource/opensearch"
_ "github.com/ccfos/nightingale/v6/datasource/postgresql"
_ "github.com/ccfos/nightingale/v6/datasource/victorialogs"
"github.com/ccfos/nightingale/v6/dskit/tdengine"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
// FromAPIHook is invoked once per iteration of the sync loop when the loop was
// started with fromAPI set to true, in place of reading datasources from the
// database.
var FromAPIHook func()
// DatasourceProcessHook, when non-nil, receives the datasource list built from
// the database and returns the list that is handed to PutDatasources.
var DatasourceProcessHook func(items []datasource.DatasourceInfo) []datasource.DatasourceInfo
// Init starts the background datasource sync loop.
//
// On a non-center node it first fetches the datasource RSA configuration from
// the center, decodes the base64 private key when RSA is enabled, and installs
// the result via models.SetRsaConfig. A fetch or decode failure is reported
// through logger.Fatalf.
func Init(ctx *ctx.Context, fromAPI bool) {
	if !ctx.IsCenter {
		// Pull the key material from the center.
		rsaConfig, err := poster.GetByUrls[*models.RsaConfig](ctx, "/v1/n9e/datasource-rsa-config")
		if err != nil || rsaConfig == nil {
			logger.Fatalf("failed to get datasource rsa-config, error: %v", err)
		}

		if rsaConfig.OpenRSA {
			logger.Infof("datasource rsa is open in n9e-plus")

			rsaConfig.PrivateKeyBytes, err = base64.StdEncoding.DecodeString(rsaConfig.RSAPrivateKey)
			if err != nil {
				logger.Fatalf("failed to decode rsa-config, error: %v", err)
			}
		}

		models.SetRsaConfig(rsaConfig)
	}

	go getDatasourcesFromDBLoop(ctx, fromAPI)
}
// ListInput carries paging and filter fields for a datasource list request,
// as described by its JSON tags.
type ListInput struct {
Page int `json:"p"`
Limit int `json:"limit"`
Category string `json:"category"`
PluginType string `json:"plugin_type"` // e.g. prometheus
Status string `json:"status"`
}
// DSReply is the JSON shape of a datasource list reply: a request ID plus the
// datasource items nested under the "data" key.
type DSReply struct {
RequestID string `json:"request_id"`
Data struct {
Items []datasource.DatasourceInfo `json:"items"`
} `json:"data"`
}
// DSReplyEncrypt has the same envelope as DSReply, but "data" is a single
// string instead of a structured object.
// NOTE(review): presumably the string is an encrypted payload — confirm against the producer.
type DSReplyEncrypt struct {
RequestID string `json:"request_id"`
Data string `json:"data"`
}
// PromDefaultDatasourceId holds the ID of the prometheus datasource flagged as
// default, or 0 when none is. The sync loop updates it through sync/atomic.
var PromDefaultDatasourceId int64
// getDatasourcesFromDBLoop refreshes the datasource cache forever, pausing two
// seconds between rounds.
//
// When fromAPI is true each round only calls FromAPIHook. Otherwise a round
// loads every datasource from the database, converts it to
// datasource.DatasourceInfo, maintains PromDefaultDatasourceId, applies
// DatasourceProcessHook when set, and hands the result to PutDatasources.
func getDatasourcesFromDBLoop(ctx *ctx.Context, fromAPI bool) {
	// Ensures the "hook missing" problem is reported once, not every two seconds.
	missingHookLogged := false

	for {
		if fromAPI {
			// Calling a nil func value panics and would kill this background
			// goroutine, so guard it the same way DatasourceProcessHook is guarded.
			if FromAPIHook != nil {
				FromAPIHook()
			} else if !missingHookLogged {
				logger.Errorf("datasource sync from api is enabled but FromAPIHook is not set")
				missingHookLogged = true
			}
			time.Sleep(time.Second * 2)
			continue
		}

		items, err := models.GetDatasources(ctx)
		if err != nil {
			logger.Errorf("get datasource from database fail: %v", err)
			time.Sleep(time.Second * 2)
			continue
		}

		foundDefaultDatasource := false
		var dss []datasource.DatasourceInfo
		for _, item := range items {
			if item.PluginType == "prometheus" && item.IsDefault {
				atomic.StoreInt64(&PromDefaultDatasourceId, item.Id)
				foundDefaultDatasource = true
			}

			// Settings is intentionally not set here: every branch of the
			// switch below assigns it.
			ds := datasource.DatasourceInfo{
				Id:             item.Id,
				Name:           item.Name,
				Description:    item.Description,
				Category:       item.Category,
				PluginId:       item.PluginId,
				Type:           item.PluginType,
				PluginTypeName: item.PluginTypeName,
				HTTPJson:       item.HTTPJson,
				AuthJson:       item.AuthJson,
				Status:         item.Status,
				IsDefault:      item.IsDefault,
				Weight:         item.Weight,
			}

			switch item.PluginType {
			case "elasticsearch":
				esN9eToDatasourceInfo(&ds, item)
			case "tdengine":
				tdN9eToDatasourceInfo(&ds, item)
			default:
				// Copy the settings so the cached info does not alias the
				// map owned by the database model.
				ds.Settings = make(map[string]interface{})
				for k, v := range item.SettingsJson {
					ds.Settings[k] = v
				}
			}

			dss = append(dss, ds)
		}

		// The previously recorded default prometheus datasource is gone.
		if !foundDefaultDatasource && atomic.LoadInt64(&PromDefaultDatasourceId) != 0 {
			logger.Debugf("no default datasource found")
			atomic.StoreInt64(&PromDefaultDatasourceId, 0)
		}

		if DatasourceProcessHook != nil {
			dss = DatasourceProcessHook(dss)
		}

		PutDatasources(dss)

		time.Sleep(time.Second * 2)
	}
}
// tdN9eToDatasourceInfo replaces ds.Settings with the "tdengine.*" keyed
// settings derived from the stored datasource: its name, HTTP options and
// basic-auth credentials.
func tdN9eToDatasourceInfo(ds *datasource.DatasourceInfo, item models.Datasource) {
	basicAuth := tdengine.TDengineBasicAuth{
		User:     item.AuthJson.BasicAuthUser,
		Password: item.AuthJson.BasicAuthPassword,
	}

	ds.Settings = map[string]interface{}{
		"tdengine.cluster_name":            item.Name,
		"tdengine.addr":                    item.HTTPJson.Url,
		"tdengine.timeout":                 item.HTTPJson.Timeout,
		"tdengine.dial_timeout":            item.HTTPJson.DialTimeout,
		"tdengine.max_idle_conns_per_host": item.HTTPJson.MaxIdleConnsPerHost,
		"tdengine.headers":                 item.HTTPJson.Headers,
		"tdengine.basic":                   basicAuth,
	}
}
// esN9eToDatasourceInfo replaces ds.Settings with the "es.*" keyed settings
// derived from the stored datasource. The node list is HTTPJson.Urls when that
// is non-empty, otherwise the single HTTPJson.Url.
func esN9eToDatasourceInfo(ds *datasource.DatasourceInfo, item models.Datasource) {
	var nodes interface{} = []string{item.HTTPJson.Url}
	if len(item.HTTPJson.Urls) > 0 {
		nodes = item.HTTPJson.Urls
	}

	ds.Settings = map[string]interface{}{
		"es.nodes":   nodes,
		"es.timeout": item.HTTPJson.Timeout,
		"es.basic": es.BasicAuth{
			Username: item.AuthJson.BasicAuthUser,
			Password: item.AuthJson.BasicAuthPassword,
		},
		"es.tls": es.TLS{
			SkipTlsVerify: item.HTTPJson.TLS.SkipTlsVerify,
		},
		"es.version":      item.SettingsJson["version"],
		"es.headers":      item.HTTPJson.Headers,
		"es.min_interval": item.SettingsJson["min_interval"],
		"es.max_shard":    item.SettingsJson["max_shard"],
		"es.enable_write": item.SettingsJson["enable_write"],
	}
}
// PutDatasources reconciles DsCache with items.
//
// Items of type "prometheus" or "loki", items with an empty name, and items
// whose plugin cannot be built or fails Validate are ignored. Every remaining
// item is handed to DsCache.Put in its own goroutine, because client
// initialization can be slow and would otherwise stall the whole sync round.
// Finally, every cached datasource that is not among the valid items of this
// round is removed from DsCache.
func PutDatasources(items []datasource.DatasourceInfo) {
	// IDs that are valid in this round, grouped by plugin type.
	validIds := make(map[string]map[int64]struct{})

	for _, item := range items {
		if item.Type == "prometheus" || item.Type == "loki" {
			continue
		}

		if item.Name == "" {
			logger.Warningf("cluster name is empty, ignore %+v", item)
			continue
		}

		typ := strings.ReplaceAll(item.Type, ".logging", "")

		ds, err := datasource.GetDatasourceByType(typ, item.Settings)
		if err != nil {
			logger.Debugf("get plugin:%+v fail: %v", item, err)
			continue
		}

		if err = ds.Validate(context.Background()); err != nil {
			logger.Warningf("get plugin:%+v fail: %v", item, err)
			continue
		}

		if _, ok := validIds[typ]; !ok {
			validIds[typ] = make(map[int64]struct{})
		}
		validIds[typ][item.Id] = struct{}{}

		// Pass the per-item values as arguments instead of capturing the range
		// variable: before Go 1.22 all iterations share one `item`, so the
		// goroutine could otherwise store ds under a later item's ID.
		go func(item datasource.DatasourceInfo, typ string, ds datasource.Datasource) {
			defer func() {
				if r := recover(); r != nil {
					logger.Errorf("panic in datasource item: %+v panic:%v", item, r)
				}
			}()
			DsCache.Put(typ, item.Id, ds)
		}(item, typ, ds)
	}

	// Drop cached datasources that are no longer valid. Looking up a type that
	// is absent from validIds reads a nil inner map and reports "missing", so
	// one check covers both "type gone entirely" and "ID gone".
	for cate, dsIds := range DsCache.GetAllIds() {
		for _, dsId := range dsIds {
			if _, ok := validIds[cate][dsId]; !ok {
				DsCache.Delete(cate, dsId)
			}
		}
	}
}
================================================
FILE: dskit/clickhouse/clickhouse.go
================================================
package clickhouse
import (
"context"
"crypto/tls"
"database/sql"
"errors"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/mitchellh/mapstructure"
"github.com/toolkits/pkg/logger"
ckDriver "gorm.io/driver/clickhouse"
"gorm.io/gorm"
)
const (
// ckDataSource is the DSN template for the native driver; its verbs are
// filled with user, password and address, in that order.
ckDataSource = "clickhouse://%s:%s@%s?read_timeout=10s"
// DefaultLimit is the MaxQueryRows value InitCli applies when none is configured.
DefaultLimit = 500
)
// Clickhouse holds the connection settings of one ClickHouse datasource and
// the client handles created by InitCli. Exactly one of Client (native, via
// gorm) and ClientByHTTP (HTTP, via database/sql) is set on success.
type Clickhouse struct {
Nodes []string `json:"ck.nodes" mapstructure:"ck.nodes"`
User string `json:"ck.user" mapstructure:"ck.user"`
Password string `json:"ck.password" mapstructure:"ck.password"`
Timeout int `json:"ck.timeout" mapstructure:"ck.timeout"`
MaxQueryRows int `json:"ck.max_query_rows" mapstructure:"ck.max_query_rows"`
Protocol string `json:"ck.protocol" mapstructure:"ck.protocol"`
SkipSSLVerify bool `json:"ck.skip_ssl_verify" mapstructure:"ck.skip_ssl_verify"`
SecureConnection bool `json:"ck.secure_connection" mapstructure:"ck.secure_connection"`
// Connection pool settings (optional); a value is only applied when it is > 0.
MaxIdleConns int `json:"ck.max_idle_conns" mapstructure:"ck.max_idle_conns"` // maximum number of idle connections
MaxOpenConns int `json:"ck.max_open_conns" mapstructure:"ck.max_open_conns"` // maximum number of open connections
ConnMaxLifetime int `json:"ck.conn_max_lifetime" mapstructure:"ck.conn_max_lifetime"` // maximum connection lifetime, in seconds
Client *gorm.DB `json:"-"`
ClientByHTTP *sql.DB `json:"-"`
}
// InitCli creates the ClickHouse client for the first configured node.
//
// MaxQueryRows defaults to DefaultLimit when it is zero. The transport is
// selected by Protocol (case-insensitive, surrounding spaces ignored):
//
//   - "http":   an HTTP client is stored in ClientByHTTP; TLS is enabled when
//     SecureConnection is set.
//   - "native": a gorm client over the native driver is stored in Client.
//   - "":       HTTP is probed with a Ping first; if that fails the native
//     driver is used as a fallback.
//
// Any other protocol value, or an empty Nodes list, yields an error.
func (c *Clickhouse) InitCli() error {
	if c.MaxQueryRows == 0 {
		c.MaxQueryRows = DefaultLimit
	}

	if len(c.Nodes) == 0 {
		return fmt.Errorf("not found ck shard, please check datasource config")
	}

	// The frontend only allows host:port, so the first node is used as-is.
	addr := c.Nodes[0]

	switch strings.ToLower(strings.TrimSpace(c.Protocol)) {
	case "http":
		ckconn := c.openHTTPConn(addr, c.SecureConnection)
		if ckconn == nil {
			return errors.New("db conn failed")
		}
		c.applyPoolConfig(ckconn)
		c.ClientByHTTP = ckconn
		return nil
	case "native":
		return c.openNativeClient(addr)
	case "":
		// No explicit protocol: auto-detect below.
	default:
		// Only http and native are accepted; TLS is a separate switch, so the
		// message must not advertise an "https" protocol value.
		return fmt.Errorf("unsupported clickhouse protocol: %s, only `http` or `native` allowed (use ck.secure_connection to enable TLS)", c.Protocol)
	}

	// Auto-detect: try HTTP and verify it with a Ping so that a native port is
	// not mistaken for an HTTP endpoint.
	// NOTE(review): this probe never enables TLS, even when SecureConnection is
	// set — confirm whether HTTPS endpoints are expected to work without an
	// explicit protocol.
	if ckconn := c.openHTTPConn(addr, false); ckconn != nil {
		err := ckconn.Ping()
		if err == nil {
			c.applyPoolConfig(ckconn)
			c.ClientByHTTP = ckconn
			return nil
		}
		logger.Debugf("clickhouse http ping failed for %s, fallback to native: %v", addr, err)
		_ = ckconn.Close() // best effort: the probe connection is being discarded
	}

	// Last resort: native connection, tolerating a URL-style address.
	host := strings.TrimPrefix(strings.TrimPrefix(addr, "http://"), "https://")
	return c.openNativeClient(host)
}

// openHTTPConn opens a database/sql handle that talks to addr over the
// ClickHouse HTTP protocol, optionally with TLS.
func (c *Clickhouse) openHTTPConn(addr string, withTLS bool) *sql.DB {
	opts := &clickhouse.Options{
		Addr:        []string{addr},
		Auth:        clickhouse.Auth{Username: c.User, Password: c.Password},
		Settings:    clickhouse.Settings{"max_execution_time": 60},
		DialTimeout: 10 * time.Second,
		Protocol:    clickhouse.HTTP,
	}
	if withTLS {
		opts.TLS = &tls.Config{InsecureSkipVerify: c.SkipSSLVerify}
	}
	return clickhouse.OpenDB(opts)
}

// openNativeClient opens a gorm client over the native driver for host and
// stores it in c.Client.
func (c *Clickhouse) openNativeClient(host string) error {
	dsn := fmt.Sprintf(ckDataSource, c.User, c.Password, host)
	// SecureConnection turns TLS on; SkipSSLVerify additionally disables
	// certificate verification.
	if c.SecureConnection {
		dsn += "&secure=true"
		if c.SkipSSLVerify {
			dsn += "&skip_verify=true"
		}
	}

	db, err := gorm.Open(
		ckDriver.New(
			ckDriver.Config{
				DSN:                       dsn,
				DisableDatetimePrecision:  true,
				DontSupportRenameColumn:   true,
				SkipInitializeWithVersion: false,
			}),
	)
	if err != nil {
		return err
	}

	// Pool settings are optional, so failing to reach the underlying *sql.DB
	// is logged rather than treated as a connection failure.
	if sqlDB, derr := db.DB(); derr == nil {
		c.applyPoolConfig(sqlDB)
	} else {
		logger.Debugf("clickhouse: get native sql DB failed: %v", derr)
	}

	c.Client = db
	return nil
}

// applyPoolConfig applies the optional connection pool settings to db; values
// that are not positive are left at the driver defaults.
func (c *Clickhouse) applyPoolConfig(db *sql.DB) {
	if c.MaxIdleConns > 0 {
		db.SetMaxIdleConns(c.MaxIdleConns)
	}
	if c.MaxOpenConns > 0 {
		db.SetMaxOpenConns(c.MaxOpenConns)
	}
	if c.ConnMaxLifetime > 0 {
		db.SetConnMaxLifetime(time.Duration(c.ConnMaxLifetime) * time.Second)
	}
}
// Metadata query templates. The %s verbs are filled via fmt.Sprintf by
// ShowTables (database) and DescribeTable (database, table).
const (
ShowDatabases = "SHOW DATABASES"
ShowTables = "SELECT name FROM system.tables WHERE database = '%s'"
DescTable = "SELECT name,type FROM system.columns WHERE database='%s' AND table = '%s';"
)
// QueryRows runs query on whichever client InitCli created and returns the
// raw result set. The HTTP client is preferred when both are present.
//
// ctx is forwarded to the driver so cancellation and deadlines abort the
// query. The caller owns the returned rows and must Close them. An error is
// returned when no client has been initialized.
func (c *Clickhouse) QueryRows(ctx context.Context, query string) (*sql.Rows, error) {
	// Tolerate a nil context from older call sites instead of panicking in the driver.
	if ctx == nil {
		ctx = context.Background()
	}

	var (
		rows *sql.Rows
		err  error
	)

	switch {
	case c.ClientByHTTP != nil:
		rows, err = c.ClientByHTTP.QueryContext(ctx, query)
	case c.Client != nil:
		rows, err = c.Client.WithContext(ctx).Raw(query).Rows()
	default:
		return nil, fmt.Errorf("clickhouse client is nil")
	}
	if err != nil {
		return nil, err
	}

	return rows, nil
}
// ShowDatabases lists all databases in Clickhouse.
//
// It returns a non-nil (possibly empty) slice on success, and an error when
// the query, a row scan, or the row iteration fails.
func (c *Clickhouse) ShowDatabases(ctx context.Context) ([]string, error) {
	rows, err := c.QueryRows(ctx, ShowDatabases)
	if err != nil {
		return nil, err
	}
	// Release the underlying connection on every return path.
	defer rows.Close()

	res := make([]string, 0)
	for rows.Next() {
		var name string
		if err := rows.Scan(&name); err != nil {
			return nil, err
		}
		res = append(res, name)
	}
	// Next returning false can also mean the iteration failed midway.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return res, nil
}
// ShowTables lists all tables in a given database.
//
// It returns a non-nil (possibly empty) slice on success, and an error when
// the query, a row scan, or the row iteration fails.
func (c *Clickhouse) ShowTables(ctx context.Context, database string) ([]string, error) {
	// database is interpolated into a single-quoted SQL string literal, so
	// backslashes and quotes are escaped to keep the value inside the literal.
	escaped := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(database)

	rows, err := c.QueryRows(ctx, fmt.Sprintf(ShowTables, escaped))
	if err != nil {
		return nil, err
	}
	// Release the underlying connection on every return path.
	defer rows.Close()

	res := make([]string, 0)
	for rows.Next() {
		var name string
		if err := rows.Scan(&name); err != nil {
			return nil, err
		}
		res = append(res, name)
	}
	// Next returning false can also mean the iteration failed midway.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return res, nil
}
// DescribeTable describes the schema of a specified table in Clickhouse.
//
// query is decoded into a QueryParam with mapstructure; its Database and Table
// fields select the table. One ColumnProperty (Field, Type) is returned per
// column. An error is returned when decoding, the query, a row scan, or the
// row iteration fails.
func (c *Clickhouse) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) {
	ckQueryParam := new(QueryParam)
	if err := mapstructure.Decode(query, ckQueryParam); err != nil {
		return nil, err
	}

	// Both names end up inside single-quoted SQL string literals, so
	// backslashes and quotes are escaped to keep the values inside the literals.
	escaper := strings.NewReplacer(`\`, `\\`, `'`, `\'`)
	descTable := fmt.Sprintf(DescTable, escaper.Replace(ckQueryParam.Database), escaper.Replace(ckQueryParam.Table))

	rows, err := c.QueryRows(ctx, descTable)
	if err != nil {
		return nil, err
	}
	// Release the underlying connection on every return path.
	defer rows.Close()

	var ret []*types.ColumnProperty
	for rows.Next() {
		var column types.ColumnProperty
		if err := rows.Scan(&column.Field, &column.Type); err != nil {
			return nil, err
		}
		ret = append(ret, &column)
	}
	// Next returning false can also mean the iteration failed midway.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return ret, nil
}
// ExecQueryBySqlDB runs sql through QueryRows and returns each row as a map
// from column name to value. Values the driver returns as []byte are converted
// to string; all other values are passed through unchanged.
//
// Rows that fail to scan are skipped (best effort). A failure of the row
// iteration itself is returned as an error instead of a silently truncated
// result.
func (c *Clickhouse) ExecQueryBySqlDB(ctx context.Context, sql string) ([]map[string]interface{}, error) {
	rows, err := c.QueryRows(ctx, sql)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	columns, err := rows.Columns()
	if err != nil {
		return nil, err
	}

	var results []map[string]interface{}
	for rows.Next() {
		// Scan needs one destination pointer per column; point each one at a
		// slot of columnValues so the raw values can be read back afterwards.
		columnValues := make([]interface{}, len(columns))
		columnPointers := make([]interface{}, len(columns))
		for i := range columnValues {
			columnPointers[i] = &columnValues[i]
		}

		if err := rows.Scan(columnPointers...); err != nil {
			// Deliberately best effort: an unscannable row is dropped.
			continue
		}

		rowMap := make(map[string]interface{}, len(columns))
		for i, colName := range columns {
			if raw, ok := columnValues[i].([]byte); ok {
				rowMap[colName] = string(raw)
			} else {
				rowMap[colName] = columnValues[i]
			}
		}
		results = append(results, rowMap)
	}
	// Next returning false can also mean the iteration failed midway.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return results, nil
}
// Query decodes query into a QueryParam and executes its Sql as a read-only
// statement, returning one map per row.
//
// The statement is rejected when any whitespace-delimited token matches a
// keyword in ckBannedOp, or when CheckMaxQueryRows reports that the result
// would exceed MaxQueryRows.
func (c *Clickhouse) Query(ctx context.Context, query interface{}) ([]map[string]interface{}, error) {
	ckQuery := new(QueryParam)
	if err := mapstructure.Decode(query, ckQuery); err != nil {
		return nil, err
	}

	// Validate the SQL and filter out write requests. strings.Fields splits on
	// any run of whitespace, so keywords separated by tabs or newlines are
	// caught as well as those separated by single spaces.
	for _, item := range strings.Fields(strings.ToUpper(ckQuery.Sql)) {
		if _, ok := ckBannedOp[item]; ok {
			return nil, fmt.Errorf("operation %s is forbid, only read db, please check your sql", item)
		}
	}

	// Check the number of matching rows up front to guard against oversized results.
	err := c.CheckMaxQueryRows(ctx, ckQuery.Sql)
	if err != nil {
		return nil, err
	}

	dbRows := make([]map[string]interface{}, 0)
	if c.ClientByHTTP != nil {
		dbRows, err = c.ExecQueryBySqlDB(ctx, ckQuery.Sql)
	} else {
		err = c.Client.Raw(ckQuery.Sql).Find(&dbRows).Error
	}
	if err != nil {
		return nil, fmt.Errorf("fetch data failed, sql is %s, err is %w", ckQuery.Sql, err)
	}

	return dbRows, nil
}
// CheckMaxQueryRows counts the rows sql would return by wrapping it, with all
// semicolons removed, in a SELECT COUNT(*) subquery. It returns an error when
// the count query fails, when the count cannot be parsed, or when the count
// is greater than c.MaxQueryRows. A result without a "count" value passes.
func (c *Clickhouse) CheckMaxQueryRows(ctx context.Context, sql string) error {
	countSql := fmt.Sprintf("SELECT COUNT(*) as count FROM (%s) AS subquery;", strings.ReplaceAll(sql, ";", ""))

	dbRows, err := c.ExecQueryBySqlDB(ctx, countSql)
	if err != nil {
		return fmt.Errorf("fetch data failed, sql is %s, err is %s", countSql, err.Error())
	}
	if len(dbRows) == 0 {
		return nil
	}

	rawCount, exists := dbRows[0]["count"]
	if !exists {
		return nil
	}

	total, err := sqlbase.ParseFloat64Value(rawCount)
	if err != nil {
		return err
	}
	if total > float64(c.MaxQueryRows) {
		return fmt.Errorf("query result rows count %d exceeds the maximum limit %d", int(total), c.MaxQueryRows)
	}

	return nil
}
================================================
FILE: dskit/clickhouse/clickhouse_test.go
================================================
package clickhouse
import (
"context"
"encoding/json"
"fmt"
"testing"
"time"
"github.com/ccfos/nightingale/v6/dskit/types"
)
// Test_Timeseries connects to 127.0.0.1:8123, runs a time-series query against
// default.student and prints the JSON-encoded result.
//
// NOTE(review): Keys.ValueKey is left empty here while QueryTimeseries rejects
// an empty ValueKey, so this looks like it always fails at that check —
// confirm the intended value column.
func Test_Timeseries(t *testing.T) {
	ck := &Clickhouse{
		Nodes:    []string{"127.0.0.1:8123"},
		User:     "default",
		Password: "123456",
	}
	if err := ck.InitCli(); err != nil {
		t.Fatal(err)
	}

	// Query the last five minutes.
	param := &QueryParam{
		Sql:        `select * from default.student limit 20`,
		From:       time.Now().Unix() - 300,
		To:         time.Now().Unix(),
		TimeField:  "created_at",
		TimeFormat: "datetime",
		Keys: types.Keys{
			LabelKey: "age",
		},
	}

	data, err := ck.QueryTimeseries(context.TODO(), param)
	if err != nil {
		t.Fatal(err)
	}

	encoded, err := json.Marshal(data)
	if err != nil {
		t.Fatal(err)
	}

	fmt.Println(string(encoded))
}
================================================
FILE: dskit/clickhouse/timeseries.go
================================================
package clickhouse
import (
"context"
"fmt"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
)
// Identifiers for time field formats.
// NOTE(review): presumably matched against QueryParam.TimeFormat — confirm at the call sites.
const (
TimeFieldFormatEpochMilli = "epoch_millis"
TimeFieldFormatEpochSecond = "epoch_second"
)
// Time-series related API.

// QueryParam is the decoded form of a ClickHouse query request. Query reads
// Sql, DescribeTable reads Database and Table, and QueryTimeseries reads Keys.
type QueryParam struct {
Limit int `json:"limit" mapstructure:"limit"`
Sql string `json:"sql" mapstructure:"sql"`
Ref string `json:"ref" mapstructure:"ref"`
From int64 `json:"from" mapstructure:"from"`
To int64 `json:"to" mapstructure:"to"`
TimeField string `json:"time_field" mapstructure:"time_field"`
TimeFormat string `json:"time_format" mapstructure:"time_format"`
Keys types.Keys `json:"keys" mapstructure:"keys"`
Database string `json:"database" mapstructure:"database"`
Table string `json:"table" mapstructure:"table"`
}
var (
// ckBannedOp is the set of upper-case SQL keywords that Query refuses to
// execute; a statement containing any of them as a token is rejected.
ckBannedOp = map[string]struct{}{
"CREATE": {},
"INSERT": {},
"ALTER": {},
"REVOKE": {},
"DROP": {},
"RENAME": {},
"ATTACH": {},
"DETACH": {},
"OPTIMIZE": {},
"TRUNCATE": {},
"SET": {},
}
)
// QueryTimeseries runs query through c.Query and turns the returned rows into
// metric series via sqlbase.FormatMetricValues.
//
// It returns an error when query.Keys.ValueKey is empty or when the
// underlying query fails.
func (c *Clickhouse) QueryTimeseries(ctx context.Context, query *QueryParam) ([]types.MetricValues, error) {
	if len(query.Keys.ValueKey) == 0 {
		return nil, fmt.Errorf("valueKey is required")
	}

	records, queryErr := c.Query(ctx, query)
	if queryErr != nil {
		return nil, queryErr
	}

	// Shape the raw rows into time-series data.
	series := sqlbase.FormatMetricValues(query.Keys, records, true)
	return series, nil
}
================================================
FILE: dskit/doris/doris.go
================================================
package doris
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"reflect"
"strings"
"time"
"unicode"
"github.com/ccfos/nightingale/v6/dskit/pool"
"github.com/ccfos/nightingale/v6/dskit/types"
_ "github.com/go-sql-driver/mysql" // MySQL driver
"github.com/mitchellh/mapstructure"
)
// Constants used by ShowIndexes: the lower-cased result column names it looks
// for in the SHOW INDEX output, and the statement prefix it sends.
const (
ShowIndexFieldIndexType = "index_type"
ShowIndexFieldColumnName = "column_name"
ShowIndexKeyName = "key_name"
SQLShowIndex = "SHOW INDEX FROM "
)
// Doris holds the connection settings of a Doris data source. Fields are
// decoded from the "doris.*" keys named in their struct tags. Zero values of
// Timeout, MaxIdleConns, MaxOpenConns, ConnMaxLifetime and MaxQueryRows are
// replaced with defaults by NewConn / NewWriteConn.
type Doris struct {
Addr string `json:"doris.addr" mapstructure:"doris.addr"` // fe mysql endpoint
FeAddr string `json:"doris.fe_addr" mapstructure:"doris.fe_addr"` // fe http endpoint
User string `json:"doris.user" mapstructure:"doris.user"` // user for read connections
Password string `json:"doris.password" mapstructure:"doris.password"` // password for User
Timeout int `json:"doris.timeout" mapstructure:"doris.timeout"` // ms
MaxIdleConns int `json:"doris.max_idle_conns" mapstructure:"doris.max_idle_conns"`
MaxOpenConns int `json:"doris.max_open_conns" mapstructure:"doris.max_open_conns"`
ConnMaxLifetime int `json:"doris.conn_max_lifetime" mapstructure:"doris.conn_max_lifetime"` // seconds
MaxQueryRows int `json:"doris.max_query_rows" mapstructure:"doris.max_query_rows"`
ClusterName string `json:"doris.cluster_name" mapstructure:"doris.cluster_name"`
EnableWrite bool `json:"doris.enable_write" mapstructure:"doris.enable_write"`
// Write user: lets a single data source carry separate read and write
// credentials, which reduces the number of data sources needed.
UserWrite string `json:"doris.user_write" mapstructure:"doris.user_write"`
PasswordWrite string `json:"doris.password_write" mapstructure:"doris.password_write"`
}
// NewDorisWithSettings builds a Doris instance from settings.
//
// settings must be either a JSON document held in a string (any type whose
// kind is string) or a map[string]interface{}; both are decoded into the
// struct with mapstructure. A nil or otherwise unsupported settings value
// yields a "settings type invalid" error; malformed JSON and decode failures
// are returned as-is. ctx is currently unused.
func NewDorisWithSettings(ctx context.Context, settings interface{}) (*Doris, error) {
	// reflect.TypeOf(nil) returns a nil Type and calling Kind on it panics,
	// so reject nil settings up front.
	if settings == nil {
		return nil, errors.New("settings type invalid")
	}

	var settingsMap map[string]interface{}
	if reflect.TypeOf(settings).Kind() == reflect.String {
		// Read the text through reflection so that named string types
		// (which a plain settings.(string) assertion would panic on) work too.
		raw := reflect.ValueOf(settings).String()
		settingsMap = map[string]interface{}{}
		if err := json.Unmarshal([]byte(raw), &settingsMap); err != nil {
			return nil, err
		}
	} else {
		m, ok := settings.(map[string]interface{})
		if !ok {
			return nil, errors.New("settings type invalid")
		}
		settingsMap = m
	}

	newest := new(Doris)
	if err := mapstructure.Decode(settingsMap, newest); err != nil {
		return nil, err
	}
	return newest, nil
}
// NewConn returns a *sql.DB for the given database, opened with the read
// credentials (User / Password) through the "mysql" driver.
//
// Zero-valued Timeout, MaxIdleConns, MaxOpenConns, ConnMaxLifetime and
// MaxQueryRows on d are replaced with defaults first. Handles are cached in
// pool.PoolClient under a key built from the address, user, password and
// (when non-empty) database; a cached handle is returned without reopening.
// An empty Addr or a failing sql.Open produces an error.
func (d *Doris) NewConn(ctx context.Context, database string) (*sql.DB, error) {
	if d.Addr == "" {
		return nil, errors.New("empty fe-node addr")
	}

	// Fill in defaults, similar to the postgres implementation.
	for _, def := range []struct {
		field *int
		value int
	}{
		{&d.Timeout, 60000},
		{&d.MaxIdleConns, 10},
		{&d.MaxOpenConns, 100},
		{&d.ConnMaxLifetime, 14400},
		{&d.MaxQueryRows, 500},
	} {
		if *def.field == 0 {
			*def.field = def.value
		}
	}

	// The cache key distinguishes connections per database.
	parts := []string{d.Addr, d.User, d.Password}
	if database != "" {
		parts = append(parts, database)
	}
	cacheKey := strings.Join(parts, ":")

	if cached, found := pool.PoolClient.Load(cacheKey); found {
		return cached.(*sql.DB), nil
	}

	// Doris is reached through the MySQL driver.
	handle, err := sql.Open("mysql", fmt.Sprintf("%s:%s@tcp(%s)/%s?charset=utf8", d.User, d.Password, d.Addr, database))
	if err != nil {
		return nil, err
	}

	// Apply the connection pool configuration before publishing the handle.
	handle.SetMaxIdleConns(d.MaxIdleConns)
	handle.SetMaxOpenConns(d.MaxOpenConns)
	handle.SetConnMaxLifetime(time.Duration(d.ConnMaxLifetime) * time.Second)

	pool.PoolClient.Store(cacheKey, handle)
	return handle, nil
}
// NewWriteConn returns a *sql.DB intended for write operations.
//
// When EnableWrite is false or UserWrite is empty it simply delegates to
// NewConn. Otherwise it opens (or fetches from pool.PoolClient) a handle that
// authenticates with UserWrite / PasswordWrite. Write handles get smaller
// pool limits than read handles: idle = max(MaxIdleConns/5, 2) and
// open = max(MaxOpenConns/10, 5).
func (d *Doris) NewWriteConn(ctx context.Context, database string) (*sql.DB, error) {
	// Without a dedicated write user the read connection is reused.
	if !d.EnableWrite || d.UserWrite == "" {
		return d.NewConn(ctx, database)
	}

	if d.Addr == "" {
		return nil, errors.New("empty fe-node addr")
	}

	// Fill in defaults, similar to the postgres implementation.
	for _, def := range []struct {
		field *int
		value int
	}{
		{&d.Timeout, 60000},
		{&d.MaxIdleConns, 10},
		{&d.MaxOpenConns, 100},
		{&d.ConnMaxLifetime, 14400},
		{&d.MaxQueryRows, 500},
	} {
		if *def.field == 0 {
			*def.field = def.value
		}
	}

	// Authenticate with the write credentials.
	writeUser, writePassword := d.UserWrite, d.PasswordWrite

	// The cache key distinguishes connections per database.
	parts := []string{d.Addr, writeUser, writePassword}
	if database != "" {
		parts = append(parts, database)
	}
	cacheKey := strings.Join(parts, ":")

	if cached, found := pool.PoolClient.Load(cacheKey); found {
		return cached.(*sql.DB), nil
	}

	// Doris is reached through the MySQL driver.
	handle, err := sql.Open("mysql", fmt.Sprintf("%s:%s@tcp(%s)/%s?charset=utf8", writeUser, writePassword, d.Addr, database))
	if err != nil {
		return nil, err
	}

	// Writes are typically less frequent, so use more conservative limits.
	handle.SetMaxIdleConns(max(d.MaxIdleConns/5, 2))
	handle.SetMaxOpenConns(max(d.MaxOpenConns/10, 5))
	handle.SetConnMaxLifetime(time.Duration(d.ConnMaxLifetime) * time.Second)

	pool.PoolClient.Store(cacheKey, handle)
	return handle, nil
}
// createTimeoutContext derives a context from ctx that expires after
// d.Timeout milliseconds, falling back to 60000 ms when Timeout is zero.
// The caller must invoke the returned cancel function.
func (d *Doris) createTimeoutContext(ctx context.Context) (context.Context, context.CancelFunc) {
	const defaultTimeoutMs = 60000

	ms := d.Timeout
	if ms == 0 {
		ms = defaultTimeoutMs
	}
	deadline := time.Duration(ms) * time.Millisecond
	return context.WithTimeout(ctx, deadline)
}
// ShowDatabases lists all databases in Doris by running SHOW DATABASES under
// the configured timeout.
//
// Rows that cannot be scanned are skipped (best effort). An error is returned
// when the connection cannot be obtained, the query fails, or row iteration
// stops early (for example because the timeout expired mid-result), so a
// truncated list is never reported as success.
func (d *Doris) ShowDatabases(ctx context.Context) ([]string, error) {
	timeoutCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	db, err := d.NewConn(timeoutCtx, "")
	if err != nil {
		return []string{}, err
	}

	rows, err := db.QueryContext(timeoutCtx, "SHOW DATABASES")
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	databases := make([]string, 0)
	for rows.Next() {
		var dbName string
		if err := rows.Scan(&dbName); err != nil {
			// best effort: skip rows that cannot be decoded
			continue
		}
		databases = append(databases, dbName)
	}
	// rows.Next returns false both at end-of-data and on failure; only
	// rows.Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating rows: %w", err)
	}

	return databases, nil
}
// ShowResources lists the distinct names of all Doris resources whose
// RESOURCETYPE equals resourceType.
//
// The name is taken from the first column of every SHOW RESOURCES row and
// de-duplicated; the order of the returned names is unspecified. Errors from
// connecting, querying, scanning or iterating are returned.
func (d *Doris) ShowResources(ctx context.Context, resourceType string) ([]string, error) {
	timeoutCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	db, err := d.NewConn(timeoutCtx, "")
	if err != nil {
		return []string{}, err
	}

	// resourceType ends up inside a single-quoted SQL literal; escape
	// backslashes and quotes so the value cannot terminate the literal early.
	escapedType := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(resourceType)

	// Use the SHOW RESOURCES command.
	query := fmt.Sprintf("SHOW RESOURCES WHERE RESOURCETYPE = '%s'", escapedType)
	rows, err := db.QueryContext(timeoutCtx, query)
	if err != nil {
		return nil, fmt.Errorf("failed to execute query: %w", err)
	}
	defer rows.Close()

	distinctName := make(map[string]struct{})

	// Fetch the column metadata.
	columns, err := rows.Columns()
	if err != nil {
		return nil, fmt.Errorf("failed to get columns: %w", err)
	}

	// Prepare the scan destinations, one per column.
	values := make([]interface{}, len(columns))
	valuePtrs := make([]interface{}, len(columns))
	for i := range values {
		valuePtrs[i] = &values[i]
	}

	// Walk the result set.
	for rows.Next() {
		err := rows.Scan(valuePtrs...)
		if err != nil {
			return nil, fmt.Errorf("error scanning row: %w", err)
		}
		// Extract the resource name and add it to the set (de-duplicates).
		if name, ok := values[0].([]byte); ok {
			distinctName[string(name)] = struct{}{}
		} else if nameStr, ok := values[0].(string); ok {
			distinctName[nameStr] = struct{}{}
		}
	}

	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating rows: %w", err)
	}

	// Convert the set into a slice.
	resources := make([]string, 0, len(distinctName))
	for name := range distinctName {
		resources = append(resources, name)
	}

	return resources, nil
}
// ShowTables lists all tables in the given database by running
// "SHOW TABLES IN <database>" under the configured timeout.
//
// Rows that cannot be scanned are skipped (best effort). An error is returned
// when the connection cannot be obtained, the query fails, or row iteration
// stops early, so a truncated list is never reported as success.
//
// NOTE(review): database is interpolated into the statement unquoted — callers
// are presumably expected to pass a trusted identifier; confirm.
func (d *Doris) ShowTables(ctx context.Context, database string) ([]string, error) {
	timeoutCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	db, err := d.NewConn(timeoutCtx, database)
	if err != nil {
		return nil, err
	}

	query := fmt.Sprintf("SHOW TABLES IN %s", database)
	rows, err := db.QueryContext(timeoutCtx, query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	tables := make([]string, 0)
	for rows.Next() {
		var tableName string
		if err := rows.Scan(&tableName); err != nil {
			// best effort: skip rows that cannot be decoded
			continue
		}
		tables = append(tables, tableName)
	}
	// rows.Next returns false both at end-of-data and on failure; only
	// rows.Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating rows: %w", err)
	}

	return tables, nil
}
// DescTable describes the schema of database.table by running DESCRIBE and
// returns one ColumnProperty per column.
//
// Type carries the raw Doris type; Type2 and Indexable come from an internal
// mapping of Doris types onto the LogExtractValueType* categories (unmapped
// types keep their original name and are not indexable). Rows that cannot be
// scanned are skipped (best effort). An error is returned when the connection
// cannot be obtained, the query fails, or row iteration stops early.
func (d *Doris) DescTable(ctx context.Context, database, table string) ([]*types.ColumnProperty, error) {
	timeoutCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	db, err := d.NewConn(timeoutCtx, database)
	if err != nil {
		return nil, err
	}

	query := fmt.Sprintf("DESCRIBE %s.%s", database, table)
	rows, err := db.QueryContext(timeoutCtx, query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	// Log reports need the column type converted into an internal type.
	// TODO: are there composite types (Array/JSON/Tuple/Nested) or further
	// types that need handling?
	convertDorisType := func(origin string) (string, bool) {
		lower := strings.ToLower(origin)
		switch lower {
		case "double":
			return types.LogExtractValueTypeFloat, true
		case "datetime", "date":
			return types.LogExtractValueTypeDate, false
		case "text":
			return types.LogExtractValueTypeText, true
		default:
			if strings.Contains(lower, "int") {
				return types.LogExtractValueTypeLong, true
			}
			// All date-like types are handled as the date category.
			if strings.HasPrefix(lower, "date") {
				return types.LogExtractValueTypeDate, false
			}
			if strings.HasPrefix(lower, "varchar") || strings.HasPrefix(lower, "char") {
				return types.LogExtractValueTypeText, true
			}
			if strings.HasPrefix(lower, "decimal") {
				return types.LogExtractValueTypeFloat, true
			}
		}
		return origin, false
	}

	var columns []*types.ColumnProperty
	for rows.Next() {
		var (
			field        string
			typ          string
			null         string
			key          string
			defaultValue sql.NullString
			extra        string
		)
		if err := rows.Scan(&field, &typ, &null, &key, &defaultValue, &extra); err != nil {
			// best effort: skip rows that cannot be decoded
			continue
		}
		type2, indexable := convertDorisType(typ)
		columns = append(columns, &types.ColumnProperty{
			Field:     field,
			Type:      typ, // raw Doris type as reported by DESCRIBE
			Type2:     type2,
			Indexable: indexable,
		})
	}
	// rows.Next returns false both at end-of-data and on failure; only
	// rows.Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating rows: %w", err)
	}

	return columns, nil
}
// TableIndexInfo is one row of index information returned by ShowIndexes:
// the indexed column, the index (key) name and the index type.
type TableIndexInfo struct {
ColumnName string `json:"column_name"`
IndexName string `json:"index_name"`
IndexType string `json:"index_type"`
}
// ShowIndexes returns the index information of database.table, obtained from
// "SHOW INDEX FROM `database`.`table`".
//
// The key_name, column_name and index_type result columns are located by a
// case-insensitive name match; columns that are absent leave the matching
// field empty. Rows whose column name is empty are dropped. Both database and
// table must be non-empty.
func (d *Doris) ShowIndexes(ctx context.Context, database, table string) ([]TableIndexInfo, error) {
	if database == "" || table == "" {
		return nil, fmt.Errorf("database and table names cannot be empty")
	}

	tCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	conn, err := d.NewConn(tCtx, database)
	if err != nil {
		return nil, err
	}

	stmt := fmt.Sprintf("%s `%s`.`%s`", SQLShowIndex, database, table)
	rows, err := conn.QueryContext(tCtx, stmt)
	if err != nil {
		return nil, fmt.Errorf("failed to query indexes: %w", err)
	}
	defer rows.Close()

	names, err := rows.Columns()
	if err != nil {
		return nil, fmt.Errorf("failed to get columns: %w", err)
	}
	width := len(names)

	// Resolve the positions of the columns of interest once, up front.
	keyPos, columnPos, typePos := -1, -1, -1
	for pos, name := range names {
		switch strings.ToLower(name) {
		case ShowIndexKeyName:
			keyPos = pos
		case ShowIndexFieldColumnName:
			columnPos = pos
		case ShowIndexFieldIndexType:
			typePos = pos
		}
	}

	// sql.RawBytes accepts any column type and converts cleanly to string,
	// which avoids per-type assertions. Each cell is copied into a string
	// right after Scan, so the buffers can be shared across rows.
	cells := make([]sql.RawBytes, width)
	targets := make([]interface{}, width)
	for pos := range cells {
		targets[pos] = &cells[pos]
	}

	var result []TableIndexInfo
	for rows.Next() {
		if err = rows.Scan(targets...); err != nil {
			return nil, err
		}

		var info TableIndexInfo
		if columnPos != -1 && columnPos < width {
			info.ColumnName = string(cells[columnPos])
		}
		if keyPos != -1 && keyPos < width {
			info.IndexName = string(cells[keyPos])
		}
		if typePos != -1 && typePos < width {
			info.IndexType = string(cells[typePos])
		}

		if info.ColumnName == "" {
			continue
		}
		result = append(result, info)
	}

	if err = rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating rows: %w", err)
	}

	return result, nil
}
// SelectRows runs "SELECT * FROM database.table", followed by query when it
// is non-empty, and returns the rows as maps. Before executing, the statement
// is passed to CheckMaxQueryRows and any error from that check is returned.
func (d *Doris) SelectRows(ctx context.Context, database, table, query string) ([]map[string]interface{}, error) {
	stmt := fmt.Sprintf("SELECT * FROM %s.%s", database, table)
	if query != "" {
		stmt = stmt + " " + query
	}

	// Verify the result row count first.
	if err := d.CheckMaxQueryRows(ctx, database, stmt); err != nil {
		return nil, err
	}

	return d.ExecQuery(ctx, database, stmt)
}
// ExecQuery executes sql against database under the configured timeout and
// returns every row as a map from column name to value. []byte values are
// converted to string; all other values are kept as delivered by the driver.
//
// Rows that cannot be scanned are skipped (best effort). An error is returned
// when the connection cannot be obtained, the query fails, or row iteration
// stops early (for example because the timeout expired mid-result), so a
// truncated result is never reported as success. With no rows the returned
// slice is nil.
func (d *Doris) ExecQuery(ctx context.Context, database string, sql string) ([]map[string]interface{}, error) {
	timeoutCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	db, err := d.NewConn(timeoutCtx, database)
	if err != nil {
		return nil, err
	}

	rows, err := db.QueryContext(timeoutCtx, sql)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	columns, err := rows.Columns()
	if err != nil {
		return nil, err
	}

	var results []map[string]interface{}

	for rows.Next() {
		columnValues := make([]interface{}, len(columns))
		columnPointers := make([]interface{}, len(columns))
		for i := range columnValues {
			columnPointers[i] = &columnValues[i]
		}

		if err := rows.Scan(columnPointers...); err != nil {
			// best effort: skip rows that cannot be decoded
			continue
		}

		rowMap := make(map[string]interface{}, len(columns))
		for i, colName := range columns {
			val := columnValues[i]
			if bytes, ok := val.([]byte); ok {
				rowMap[colName] = string(bytes)
			} else {
				rowMap[colName] = val
			}
		}
		results = append(results, rowMap)
	}
	// rows.Next returns false both at end-of-data and on failure; only
	// rows.Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating rows: %w", err)
	}

	return results, nil
}
// ExecContext executes a single SQL statement against database on the write
// connection (see NewWriteConn) under the configured timeout. Only the error,
// if any, is reported; the statement result is discarded.
func (d *Doris) ExecContext(ctx context.Context, database string, sql string) error {
	timeoutCtx, cancel := d.createTimeoutContext(ctx)
	defer cancel()

	conn, err := d.NewWriteConn(timeoutCtx, database)
	if err != nil {
		return err
	}

	if _, err = conn.ExecContext(timeoutCtx, sql); err != nil {
		return err
	}
	return nil
}
// ExecBatchSQL splits sqlBatch into individual statements with
// SplitSQLStatements and executes them one after another through ExecContext,
// stopping at the first failure.
//
// Statements starting with "CREATE DATABASE" are executed with an empty
// database name; every other statement runs against database.
//
// NOTE(review): the returned error embeds the whole batch rather than the
// statement that failed — confirm whether that is intended.
func (d *Doris) ExecBatchSQL(ctx context.Context, database string, sqlBatch string) error {
	for _, stmt := range SplitSQLStatements(sqlBatch) {
		// Ignore empty statements.
		stmt = strings.TrimSpace(stmt)
		if stmt == "" {
			continue
		}

		// CREATE DATABASE must connect without selecting a database.
		// CREATE SCHEMA is not supported for now.
		targetDB := database
		if strings.HasPrefix(strings.ToUpper(stmt), "CREATE DATABASE") {
			targetDB = ""
		}

		// ExecContext already applies the timeout.
		if err := d.ExecContext(ctx, targetDB, stmt); err != nil {
			return fmt.Errorf("exec sql failed, sql:%s, err:%w", sqlBatch, err)
		}
	}

	return nil
}
// SplitSQLStatements splits a batch of SQL text into individual statements.
//
// A statement ends at a ';' that is outside a single-quoted string, a "--"
// line comment and a "/* */" block comment. Backslash escapes are honoured
// inside single-quoted strings. Whitespace following the ';' and, if present,
// one "--" comment up to the end of its line are kept with the statement that
// precedes them. Every returned statement is whitespace-trimmed and keeps its
// terminating ';'; trailing text without a ';' becomes the last statement
// when it is not blank.
func SplitSQLStatements(sqlBatch string) []string {
	var (
		out            []string
		buf            strings.Builder
		inQuote        bool // inside a single-quoted string
		inLineComment  bool // inside a "--" comment
		inBlockComment bool // inside a "/* */" comment
		afterBackslash bool // previous byte was an unpaired backslash
	)

	total := len(sqlBatch)
	for i := 0; i < total; i++ {
		c := sqlBatch[i]
		buf.WriteByte(c)

		switch {
		case inQuote && c == '\\':
			// escape character inside a string
			afterBackslash = !afterBackslash
			continue

		case c == '\'' && !inLineComment && !inBlockComment:
			// string delimiter, unless it was escaped
			if !afterBackslash {
				inQuote = !inQuote
			}
			afterBackslash = false
			continue

		case !inQuote && !inBlockComment && !inLineComment && c == '-' && i+1 < total && sqlBatch[i+1] == '-':
			// start of a line comment; consume the second '-'
			inLineComment = true
			buf.WriteByte(sqlBatch[i+1])
			i++
			continue

		case !inQuote && !inLineComment && c == '/' && i+1 < total && sqlBatch[i+1] == '*':
			// start of a block comment; consume the '*'
			inBlockComment = true
			buf.WriteByte(sqlBatch[i+1])
			i++
			continue

		case inBlockComment && c == '*' && i+1 < total && sqlBatch[i+1] == '/':
			// end of a block comment; consume the '/'
			inBlockComment = false
			buf.WriteByte(sqlBatch[i+1])
			i++
			continue

		case inLineComment && (c == '\n' || c == '\r'):
			// a line break terminates a line comment
			inLineComment = false

		case c == ';' && !inQuote && !inBlockComment && !inLineComment:
			// statement boundary
			i = absorbStatementTail(&buf, sqlBatch, i)
			out = append(out, strings.TrimSpace(buf.String()))
			buf.Reset()
			continue
		}

		afterBackslash = false
	}

	// The final statement may lack a terminating semicolon.
	if tail := strings.TrimSpace(buf.String()); tail != "" {
		out = append(out, tail)
	}

	return out
}

// absorbStatementTail appends to buf the whitespace that follows the ';' at
// index semi in src and, if one starts there, a single "--" comment through
// its line break. It returns the index of the last byte consumed (semi when
// nothing was consumed).
func absorbStatementTail(buf *strings.Builder, src string, semi int) int {
	last := semi
	for j := semi + 1; j < len(src); j++ {
		c := src[j]

		if c == '-' && j+1 < len(src) && src[j+1] == '-' {
			// trailing comment: copy it up to and including the line break
			buf.WriteByte(c)
			buf.WriteByte(src[j+1])
			last = j + 1
			for k := j + 2; k < len(src); k++ {
				buf.WriteByte(src[k])
				last = k
				if src[k] == '\n' || src[k] == '\r' {
					break
				}
			}
			return last
		}

		if !isWhitespace(c) {
			// start of the next statement
			return last
		}

		buf.WriteByte(c)
		last = j
	}
	return last
}

// isWhitespace reports whether the byte c, interpreted as a rune, is a
// Unicode space character.
func isWhitespace(c byte) bool {
	return unicode.IsSpace(rune(c))
}
================================================
FILE: dskit/doris/logs.go
================================================
package doris
import (
"context"
"sort"
)
// Log-related operations.

// TimeseriesAggregationTimestamp is the literal "__ts__".
// NOTE(review): presumably the column alias of the time bucket in aggregated
// time-series queries — it is not referenced in this chunk, confirm.
const (
TimeseriesAggregationTimestamp = "__ts__"
)
// QueryLogs returns the rows matching query as maps; it delegates directly to
// Query.
//
// TODO: still to be tested — can MAP/ARRAY/STRUCT/JSON and similar types be
// handled?
func (d *Doris) QueryLogs(ctx context.Context, query *QueryParam) ([]map[string]interface{}, error) {
// Equivalent to Query().
return d.Query(ctx, query)
}
// QueryHistogram returns the points of the first series produced by
// QueryTimeseries for query, sorted ascending by the first element of each
// point. In essence this is a time-series query of which only the first
// group is used; the SQL is assembled by the caller and is neither parsed nor
// truncated here.
//
// When the query yields no series or no points an empty, non-nil slice is
// returned. A failure of the underlying query is returned to the caller
// together with an empty slice, so an error can be told apart from a
// legitimately empty histogram.
func (d *Doris) QueryHistogram(ctx context.Context, query *QueryParam) ([][]float64, error) {
	values, err := d.QueryTimeseries(ctx, query)
	if err != nil {
		return [][]float64{}, err
	}
	if len(values) == 0 || len(values[0].Values) == 0 {
		return [][]float64{}, nil
	}

	items := values[0].Values
	sort.Slice(items, func(i, j int) bool {
		// points without a leading element are never ordered before others
		if len(items[i]) > 0 && len(items[j]) > 0 {
			return items[i][0] < items[j][0]
		}
		return false
	})
	return items, nil
}
================================================
FILE: dskit/doris/sql_analyzer.go
================================================
package doris
import (
"regexp"
"strings"
"github.com/pingcap/tidb/pkg/parser"
"github.com/pingcap/tidb/pkg/parser/ast"
_ "github.com/pingcap/tidb/pkg/parser/test_driver" // required for parser
)
// mapAccessPattern matches Doris map/array access syntax like `col['key']` or
// col["key"]: a bracketed, quoted key made of word characters only.
var mapAccessPattern = regexp.MustCompile(`\[['"]\w+['"]\]`)
// castStringPattern matches the "AS STRING" part of Doris CAST(... AS STRING)
// syntax, case-insensitively.
var castStringPattern = regexp.MustCompile(`(?i)\bAS\s+STRING\b`)
// Macro patterns: $__timeGroup(...) and $__timeFilter(...) up to the first
// closing parenthesis, and the bare $__interval token.
var timeGroupPattern = regexp.MustCompile(`\$__timeGroup\([^)]+\)`)
var timeFilterPattern = regexp.MustCompile(`\$__timeFilter\([^)]+\)`)
var intervalPattern = regexp.MustCompile(`\$__interval`)
// SQLAnalyzeResult holds the analysis result of a SQL statement as produced
// by AnalyzeSQL.
type SQLAnalyzeResult struct {
IsSelectLike bool // whether the statement is a SELECT-like query (SELECT or a set operation)
HasTopAgg bool // whether the top-level query (or one of its CTEs) has aggregate functions
LimitConst *int64 // top-level LIMIT constant value (nil if no LIMIT or non-constant)
}
// AnalyzeSQL parses sql and reports the top-level features of its first
// statement.
//
// The text is first passed through preprocessDorisSQL so that Doris-specific
// syntax the TiDB parser rejects does not cause a parse failure. Parse errors
// are returned as-is. Only SELECT statements and set operations
// (UNION / INTERSECT / EXCEPT) are marked as select-like and analysed
// further; an empty input yields a zero-valued result.
func AnalyzeSQL(sql string) (*SQLAnalyzeResult, error) {
	nodes, _, err := parser.New().Parse(preprocessDorisSQL(sql), "", "")
	if err != nil {
		return nil, err
	}

	analysis := &SQLAnalyzeResult{}
	if len(nodes) == 0 {
		return analysis, nil
	}

	switch first := nodes[0].(type) {
	case *ast.SelectStmt:
		analysis.IsSelectLike = true
		analyzeSelectStmt(first, analysis)
	case *ast.SetOprStmt:
		// UNION / INTERSECT / EXCEPT
		analysis.IsSelectLike = true
		analyzeSetOprStmt(first, analysis)
	}
	// Any other statement kind keeps IsSelectLike at its zero value, false.

	return analysis, nil
}
// analyzeSelectStmt fills result from a SELECT statement: HasTopAgg is set
// when a select-list expression or any CTE query contains an aggregate
// function, and LimitConst is set when the statement has a constant LIMIT.
func analyzeSelectStmt(sel *ast.SelectStmt, result *SQLAnalyzeResult) {
	// Aggregates in the top-level select list.
	if sel.Fields != nil {
		for _, item := range sel.Fields.Fields {
			if item.Expr == nil {
				continue
			}
			if hasAggregateFunc(item.Expr) {
				result.HasTopAgg = true
				break
			}
		}
	}

	// Aggregates inside any CTE, only consulted when none was found so far.
	if sel.With != nil && !result.HasTopAgg {
		for _, cte := range sel.With.CTEs {
			if selectHasAggregate(cte.Query) {
				result.HasTopAgg = true
				break
			}
		}
	}

	// Top-level LIMIT, when it is a constant.
	if sel.Limit == nil || sel.Limit.Count == nil {
		return
	}
	if limit, isConst := extractConstValue(sel.Limit.Count); isConst {
		result.LimitConst = &limit
	}
}
// selectHasAggregate reports whether node contains an aggregate function in a
// select list. For a SELECT its own select list is inspected; for a set
// operation any branch qualifies; a SubqueryExpr (the wrapper of a CTE query)
// is unwrapped. Every other node kind yields false.
func selectHasAggregate(node ast.Node) bool {
	switch typed := node.(type) {
	case *ast.SubqueryExpr:
		// A CTE query arrives wrapped in a SubqueryExpr.
		if typed.Query == nil {
			return false
		}
		return selectHasAggregate(typed.Query)

	case *ast.SetOprStmt:
		// For UNION and friends, look at every branch.
		if typed.SelectList == nil {
			return false
		}
		for _, branch := range typed.SelectList.Selects {
			if selectHasAggregate(branch) {
				return true
			}
		}
		return false

	case *ast.SelectStmt:
		if typed.Fields == nil {
			return false
		}
		for _, item := range typed.Fields.Fields {
			if item.Expr != nil && hasAggregateFunc(item.Expr) {
				return true
			}
		}
		return false
	}
	return false
}
// analyzeSetOprStmt fills result from a UNION/INTERSECT/EXCEPT statement.
//
// LimitConst is taken from the constant LIMIT of the set operation itself.
// HasTopAgg is set to false as soon as one plain SELECT branch with a select
// list has no aggregate function, and to true otherwise; branches that are
// not plain SELECT statements, or that have no select list, are not
// inspected. When there are no branches HasTopAgg is left untouched.
func analyzeSetOprStmt(setOpr *ast.SetOprStmt, result *SQLAnalyzeResult) {
	// The LIMIT of a set operation sits at the outermost level.
	if setOpr.Limit != nil && setOpr.Limit.Count != nil {
		if limit, isConst := extractConstValue(setOpr.Limit.Count); isConst {
			result.LimitConst = &limit
		}
	}

	if setOpr.SelectList == nil || len(setOpr.SelectList.Selects) == 0 {
		return
	}

	// Conservative rule: a single non-aggregate branch disables the shortcut.
	for _, branch := range setOpr.SelectList.Selects {
		stmt, isSelect := branch.(*ast.SelectStmt)
		if !isSelect || stmt.Fields == nil {
			continue
		}

		branchHasAgg := false
		for _, item := range stmt.Fields.Fields {
			if item.Expr != nil && hasAggregateFunc(item.Expr) {
				branchHasAgg = true
				break
			}
		}
		if !branchHasAgg {
			result.HasTopAgg = false
			return
		}
	}

	result.HasTopAgg = true
}
// hasAggregateFunc reports whether expr contains an aggregate function. The
// expression is walked with an aggregateChecker, which does not descend into
// subqueries.
func hasAggregateFunc(expr ast.ExprNode) bool {
	var checker aggregateChecker
	expr.Accept(&checker)
	return checker.found
}
// aggregateChecker implements ast.Visitor to find aggregate functions; found
// is set by Enter once an aggregate function has been seen.
type aggregateChecker struct {
found bool
}
// Enter is invoked for every node before its children are visited. It
// records in c.found whether an aggregate function was seen, and returns true
// as second value — meaning the node's children are skipped — once a match
// exists, for subqueries (which are deliberately not entered), and for the
// matching node itself.
func (c *aggregateChecker) Enter(n ast.Node) (ast.Node, bool) {
	// Nothing more to learn once an aggregate has been found.
	if c.found {
		return n, true
	}

	// Aggregates inside a subquery do not count for the enclosing query.
	if _, isSubquery := n.(*ast.SubqueryExpr); isSubquery {
		return n, true
	}

	if _, isAggregate := n.(*ast.AggregateFuncExpr); isAggregate {
		c.found = true
		return n, true
	}

	// Doris-specific aggregate/statistic functions parse as ordinary calls.
	if call, isCall := n.(*ast.FuncCallExpr); isCall && isDorisAggregateFunc(strings.ToUpper(call.FnName.L)) {
		c.found = true
		return n, true
	}

	// Keep descending.
	return n, false
}
// Leave is the second half of the ast.Visitor interface; it returns the node
// unchanged together with true.
func (c *aggregateChecker) Leave(n ast.Node) (ast.Node, bool) {
return n, true
}
// dorisAggregateFuncNames is the set of upper-case function names that
// isDorisAggregateFunc treats as aggregate/statistic functions. It is built
// once at package initialisation rather than on every lookup.
var dorisAggregateFuncNames = map[string]struct{}{
	// Standard aggregates (in case parser doesn't recognize them)
	"COUNT":     {},
	"SUM":       {},
	"AVG":       {},
	"MIN":       {},
	"MAX":       {},
	"ANY":       {},
	"ANY_VALUE": {},
	// HLL related
	"HLL_UNION_AGG":   {},
	"HLL_RAW_AGG":     {},
	"HLL_CARDINALITY": {},
	"HLL_UNION":       {},
	"HLL_HASH":        {},
	// Bitmap related
	"BITMAP_UNION":         {},
	"BITMAP_UNION_COUNT":   {},
	"BITMAP_INTERSECT":     {},
	"BITMAP_COUNT":         {},
	"BITMAP_AND_COUNT":     {},
	"BITMAP_OR_COUNT":      {},
	"BITMAP_XOR_COUNT":     {},
	"BITMAP_AND_NOT_COUNT": {},
	// Other aggregates
	"PERCENTILE":            {},
	"PERCENTILE_APPROX":     {},
	"APPROX_COUNT_DISTINCT": {},
	"NDV":                   {},
	"COLLECT_LIST":          {},
	"COLLECT_SET":           {},
	"GROUP_CONCAT":          {},
	"GROUP_BIT_AND":         {},
	"GROUP_BIT_OR":          {},
	"GROUP_BIT_XOR":         {},
	"GROUPING":              {},
	"GROUPING_ID":           {},
	// Statistical functions
	"STDDEV":      {},
	"STDDEV_POP":  {},
	"STDDEV_SAMP": {},
	"STD":         {},
	"VARIANCE":    {},
	"VAR_POP":     {},
	"VAR_SAMP":    {},
	"COVAR_POP":   {},
	"COVAR_SAMP":  {},
	"CORR":        {},
	// Window functions that are also aggregates
	"FIRST_VALUE":  {},
	"LAST_VALUE":   {},
	"LAG":          {},
	"LEAD":         {},
	"ROW_NUMBER":   {},
	"RANK":         {},
	"DENSE_RANK":   {},
	"NTILE":        {},
	"CUME_DIST":    {},
	"PERCENT_RANK": {},
}

// isDorisAggregateFunc reports whether funcName is one of the Doris
// aggregate/statistic function names known to this package. The comparison is
// exact, so funcName must already be upper-case.
func isDorisAggregateFunc(funcName string) bool {
	_, known := dorisAggregateFuncNames[funcName]
	return known
}
// extractConstValue returns the integer held by expr when expr is a value
// expression whose value is an int64, uint64, float64 or int (converted to
// int64). The boolean is false for every other expression or value type.
func extractConstValue(expr ast.ExprNode) (int64, bool) {
	literal, isValue := expr.(ast.ValueExpr)
	if !isValue {
		return 0, false
	}

	switch raw := literal.GetValue().(type) {
	case int64:
		return raw, true
	case uint64:
		return int64(raw), true
	case float64:
		return int64(raw), true
	case int:
		return int64(raw), true
	default:
		return 0, false
	}
}
// preprocessDorisSQL rewrites Doris-specific syntax that the TiDB parser does
// not support into text it accepts. The rewrites are applied in this order:
//
//  1. map/array access such as ['key'] or ["key"] is removed (used in Doris
//     to reach into map/variant/json fields);
//  2. CAST(... AS STRING) becomes CAST(... AS CHAR);
//  3. $__timeGroup(...) becomes ts;
//  4. $__timeFilter(...) becomes 1=1;
//  5. $__interval becomes 60.
func preprocessDorisSQL(sql string) string {
	type rewrite struct {
		apply       func(src, repl string) string
		replacement string
	}

	steps := []rewrite{
		{mapAccessPattern.ReplaceAllString, ""},
		{castStringPattern.ReplaceAllString, "AS CHAR"},
		// macros are swapped for valid SQL equivalents
		{timeGroupPattern.ReplaceAllString, "ts"},
		{timeFilterPattern.ReplaceAllString, "1=1"},
		{intervalPattern.ReplaceAllString, "60"},
	}

	for _, step := range steps {
		sql = step.apply(sql, step.replacement)
	}
	return sql
}
// NeedsRowCountCheck decides whether a SQL query needs a row count probe
// before it is executed.
//
// Returns: needsCheck bool, directReject bool, rejectReason string. In the
// current implementation directReject is always false and rejectReason is
// always empty. needsCheck is false for statements that are not select-like,
// for queries with top-level aggregates, and for queries whose constant
// top-level LIMIT does not exceed maxQueryRows; it is true in every other
// case, including when the SQL cannot be parsed.
func NeedsRowCountCheck(sql string, maxQueryRows int) (bool, bool, string) {
	analysis, err := AnalyzeSQL(sql)

	switch {
	case err != nil:
		// Parsing failed: fall back to the probe check.
		return true, false, ""
	case !analysis.IsSelectLike:
		// Not a SELECT query: nothing to check.
		return false, false, ""
	case analysis.HasTopAgg:
		// Rule 1: top-level aggregate functions -> skip the check.
		return false, false, ""
	case analysis.LimitConst != nil && *analysis.LimitConst <= int64(maxQueryRows):
		// Rule 2: top-level LIMIT <= maxRows -> skip the check.
		return false, false, ""
	}

	// Everything else needs the probe, including LIMIT > maxRows, since the
	// actual result may still be smaller.
	return true, false, ""
}
================================================
FILE: dskit/doris/sql_analyzer_test.go
================================================
package doris
import (
"testing"
)
// TestAnalyzeSQL_AggregateQueries is a table-driven test of AnalyzeSQL. For
// every SQL text it checks the reported HasTopAgg and IsSelectLike flags,
// covering plain aggregates, macro-based queries, CTEs whose queries
// aggregate, and non-aggregate selects.
func TestAnalyzeSQL_AggregateQueries(t *testing.T) {
tests := []struct {
name string
sql string
wantHasAgg bool
wantIsSelect bool
}{
// Standard aggregate functions - should skip check
{
name: "COUNT(*)",
sql: "SELECT COUNT(*) AS `cnt`, FLOOR(UNIX_TIMESTAMP(event_date) DIV 10) * 10 AS `time`, CAST(`labels`['event'] AS STRING) AS `labels.event` FROM `db_insight_doris`.`ewall_event` WHERE `event_date` BETWEEN FROM_UNIXTIME(1768965669) AND FROM_UNIXTIME(1768965969) GROUP BY `time`, `labels.event` ORDER BY `time` ASC",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "COUNT with column",
sql: "SELECT COUNT(id) FROM users",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "SUM function",
sql: "SELECT SUM(amount) FROM orders",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "AVG function",
sql: "SELECT AVG(price) FROM products",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "MIN function",
sql: "SELECT MIN(created_at) FROM logs",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "MAX function",
sql: "SELECT MAX(score) FROM results",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "Multiple aggregates",
sql: "SELECT COUNT(*), SUM(amount), AVG(price) FROM orders",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "Aggregate with GROUP BY",
sql: "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "Aggregate with WHERE and GROUP BY",
sql: "SELECT category, SUM(sales) FROM products WHERE status = 'active' GROUP BY category",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "Aggregate with HAVING",
sql: "SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id HAVING cnt > 10",
wantHasAgg: true,
wantIsSelect: true,
},
// macro queries with aggregates
{
name: "COUNT with timeGroup",
sql: "SELECT COUNT(*) AS `cnt`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE (`service_name` = 'demo-logic-server') AND $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "CTE with ratio calculation",
sql: "WITH `time_totals` AS (SELECT $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `total_count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time`), `time_counts` AS (SELECT ANY_VALUE(`service_name`) AS `service_name`, $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `count` FROM `apm`.`traces_span` WHERE (`service_name` = 'demo-logic-server') AND $__timeFilter(`timestamp`) GROUP BY `time`) SELECT tc.`service_name`, tc.`time`, ROUND(tc.`count` * 100.0 / tt.`total_count`, 2) AS `ratio` FROM `time_counts` tc JOIN `time_totals` tt ON tc.`time` = tt.`time` ORDER BY tc.`time` ASC",
wantHasAgg: true, // CTE has aggregate functions
wantIsSelect: true,
},
{
name: "CTE with top values and ratio",
sql: "WITH `top_values` AS (SELECT `service_name` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `service_name` ORDER BY COUNT(*) DESC LIMIT 5), `time_totals` AS (SELECT $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `total_count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time`), `time_counts` AS (SELECT `service_name`, $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) AND `service_name` IN (SELECT `service_name` FROM `top_values`) GROUP BY `service_name`, `time`) SELECT tc.`service_name`, tc.`time`, ROUND(tc.`count` * 100.0 / tt.`total_count`, 2) AS `ratio` FROM `time_counts` tc JOIN `time_totals` tt ON tc.`time` = tt.`time` ORDER BY tc.`time` ASC",
wantHasAgg: true, // CTE has aggregate functions
wantIsSelect: true,
},
{
name: "PERCENTILE_APPROX with timeGroup",
sql: "SELECT PERCENTILE_APPROX(`duration`, 0.95) AS `p95`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "COUNT DISTINCT with timeGroup",
sql: "SELECT COUNT(DISTINCT `duration`) AS `unique_count`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "CASE WHEN with COUNT and ROUND",
sql: "SELECT ROUND(COUNT(CASE WHEN `duration` IS NOT NULL THEN 1 END) * 100.0 / COUNT(*), 2) AS `exist_ratio`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "AVG with timeGroup",
sql: "SELECT AVG(`duration`) AS `avg`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "Simple COUNT with timeFilter",
sql: "SELECT COUNT(*) AS `cnt` FROM `apm`.`traces_span` WHERE (`span_name` = 'GET /backend/detail') AND $__timeFilter(`timestamp`)",
wantHasAgg: true,
wantIsSelect: true,
},
{
name: "CTE with CROSS JOIN ratio",
sql: "WITH `total` AS (SELECT COUNT(*) AS `total_count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`)), `value_counts` AS (SELECT ANY_VALUE(`span_kind`) AS `span_kind`, COUNT(*) AS `count` FROM `apm`.`traces_span` WHERE (`span_kind` = 'SPAN_KIND_SERVER') AND $__timeFilter(`timestamp`)) SELECT vc.`span_kind`, vc.`count` AS `count`, ROUND(vc.`count` * 100.0 / t.`total_count`, 2) AS `ratio` FROM `value_counts` vc CROSS JOIN `total` t ORDER BY vc.`count` DESC;",
wantHasAgg: true, // CTE has aggregate functions
wantIsSelect: true,
},
// Non-aggregate queries - should not skip check
{
name: "Simple SELECT *",
sql: "SELECT * FROM users",
wantHasAgg: false,
wantIsSelect: true,
},
{
name: "SELECT with columns",
sql: "SELECT id, name, email FROM users",
wantHasAgg: false,
wantIsSelect: true,
},
{
name: "SELECT with WHERE",
sql: "SELECT * FROM users WHERE status = 'active'",
wantHasAgg: false,
wantIsSelect: true,
},
{
name: "SELECT with JOIN",
sql: "SELECT u.name, o.amount FROM users u JOIN orders o ON u.id = o.user_id",
wantHasAgg: false,
wantIsSelect: true,
},
}
// Each case runs as its own subtest; a parse failure aborts that subtest.
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result, err := AnalyzeSQL(tt.sql)
if err != nil {
t.Fatalf("AnalyzeSQL() error = %v", err)
}
if result.HasTopAgg != tt.wantHasAgg {
t.Errorf("name: %s, HasTopAgg = %v, want %v", tt.name, result.HasTopAgg, tt.wantHasAgg)
}
if result.IsSelectLike != tt.wantIsSelect {
t.Errorf("IsSelectLike = %v, want %v", result.IsSelectLike, tt.wantIsSelect)
}
})
}
}
// TestAnalyzeSQL_SubqueryWithAggregate checks that HasTopAgg describes only the
// outermost query: an aggregate that lives solely inside a subquery must not
// flag the statement, while a top-level aggregate must.
func TestAnalyzeSQL_SubqueryWithAggregate(t *testing.T) {
	type testCase struct {
		name       string
		sql        string
		wantHasAgg bool
	}
	cases := []testCase{
		{
			name:       "Aggregate in subquery only",
			sql:        "SELECT * FROM (SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id) t",
			wantHasAgg: false, // outer SELECT has no aggregate
		},
		{
			name:       "Aggregate in WHERE subquery",
			sql:        "SELECT * FROM users WHERE id IN (SELECT user_id FROM orders GROUP BY user_id HAVING COUNT(*) > 5)",
			wantHasAgg: false, // outer SELECT has no aggregate
		},
		{
			name:       "Both top-level and subquery aggregates",
			sql:        "SELECT COUNT(*) FROM (SELECT user_id FROM orders GROUP BY user_id) t",
			wantHasAgg: true, // outer SELECT aggregates
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			analysis, err := AnalyzeSQL(tc.sql)
			if err != nil {
				t.Fatalf("AnalyzeSQL() error = %v", err)
			}
			if got := analysis.HasTopAgg; got != tc.wantHasAgg {
				t.Errorf("HasTopAgg = %v, want %v", got, tc.wantHasAgg)
			}
		})
	}
}
// TestAnalyzeSQL_LimitQueries checks the constant LIMIT value reported by
// AnalyzeSQL (LimitConst), including the no-LIMIT case where it must be nil.
func TestAnalyzeSQL_LimitQueries(t *testing.T) {
	type testCase struct {
		name         string
		sql          string
		wantLimit    *int64
		wantIsSelect bool
	}
	cases := []testCase{
		{
			name:         "LIMIT 10",
			sql:          "SELECT * FROM users LIMIT 10",
			wantLimit:    ptr(int64(10)),
			wantIsSelect: true,
		},
		{
			name:         "LIMIT 100",
			sql:          "SELECT * FROM users LIMIT 100",
			wantLimit:    ptr(int64(100)),
			wantIsSelect: true,
		},
		{
			name:         "LIMIT 1000",
			sql:          "SELECT * FROM users LIMIT 1000",
			wantLimit:    ptr(int64(1000)),
			wantIsSelect: true,
		},
		{
			name:         "LIMIT with OFFSET",
			sql:          "SELECT * FROM users LIMIT 50 OFFSET 100",
			wantLimit:    ptr(int64(50)),
			wantIsSelect: true,
		},
		{
			name:         "No LIMIT",
			sql:          "SELECT * FROM users",
			wantLimit:    nil,
			wantIsSelect: true,
		},
		{
			name:         "LIMIT 0",
			sql:          "SELECT * FROM users LIMIT 0",
			wantLimit:    ptr(int64(0)),
			wantIsSelect: true,
		},
		{
			name:         "LIMIT 1",
			sql:          "SELECT * FROM users LIMIT 1",
			wantLimit:    ptr(int64(1)),
			wantIsSelect: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			analysis, err := AnalyzeSQL(tc.sql)
			if err != nil {
				t.Fatalf("AnalyzeSQL() error = %v", err)
			}
			if analysis.IsSelectLike != tc.wantIsSelect {
				t.Errorf("IsSelectLike = %v, want %v", analysis.IsSelectLike, tc.wantIsSelect)
			}

			// Compare the optional limit: nil-ness first, then the value.
			switch {
			case tc.wantLimit == nil && analysis.LimitConst != nil:
				t.Errorf("LimitConst = %v, want nil", *analysis.LimitConst)
			case tc.wantLimit != nil && analysis.LimitConst == nil:
				t.Errorf("LimitConst = nil, want %v", *tc.wantLimit)
			case tc.wantLimit != nil && *analysis.LimitConst != *tc.wantLimit:
				t.Errorf("LimitConst = %v, want %v", *analysis.LimitConst, *tc.wantLimit)
			}
		})
	}
}
// TestAnalyzeSQL_UnionQueries checks HasTopAgg and LimitConst for UNION
// statements. The expectations encode that a UNION counts as aggregated only
// when every branch aggregates, and that a trailing LIMIT is reported.
func TestAnalyzeSQL_UnionQueries(t *testing.T) {
	type testCase struct {
		name       string
		sql        string
		wantHasAgg bool
		wantLimit  *int64
	}
	cases := []testCase{
		{
			name:       "UNION without aggregate",
			sql:        "SELECT id, name FROM users UNION SELECT id, name FROM admins",
			wantHasAgg: false,
			wantLimit:  nil,
		},
		{
			name:       "UNION ALL without aggregate",
			sql:        "SELECT * FROM users UNION ALL SELECT * FROM admins",
			wantHasAgg: false,
			wantLimit:  nil,
		},
		{
			name:       "UNION with aggregate in all branches",
			sql:        "SELECT COUNT(*) FROM users UNION SELECT COUNT(*) FROM admins",
			wantHasAgg: true,
			wantLimit:  nil,
		},
		{
			name:       "UNION with aggregate in one branch only",
			sql:        "SELECT COUNT(*) FROM users UNION SELECT id FROM admins",
			wantHasAgg: false, // one branch is a plain SELECT
			wantLimit:  nil,
		},
		{
			name:       "UNION with outer LIMIT",
			sql:        "SELECT * FROM users UNION SELECT * FROM admins LIMIT 100",
			wantHasAgg: false,
			wantLimit:  ptr(int64(100)),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			analysis, err := AnalyzeSQL(tc.sql)
			if err != nil {
				t.Fatalf("AnalyzeSQL() error = %v", err)
			}
			if analysis.HasTopAgg != tc.wantHasAgg {
				t.Errorf("HasTopAgg = %v, want %v", analysis.HasTopAgg, tc.wantHasAgg)
			}

			// Compare the optional limit: nil-ness first, then the value.
			switch {
			case tc.wantLimit == nil && analysis.LimitConst != nil:
				t.Errorf("LimitConst = %v, want nil", *analysis.LimitConst)
			case tc.wantLimit != nil && analysis.LimitConst == nil:
				t.Errorf("LimitConst = nil, want %v", *tc.wantLimit)
			case tc.wantLimit != nil && *analysis.LimitConst != *tc.wantLimit:
				t.Errorf("LimitConst = %v, want %v", *analysis.LimitConst, *tc.wantLimit)
			}
		})
	}
}
// TestAnalyzeSQL_NonSelectStatements checks that SHOW/DESCRIBE statements are
// not reported as select-like. A parse error is tolerated and ends the subtest
// without a failure, because such statements may not be parseable.
func TestAnalyzeSQL_NonSelectStatements(t *testing.T) {
	type testCase struct {
		name         string
		sql          string
		wantIsSelect bool
	}
	cases := []testCase{
		{
			name:         "SHOW DATABASES",
			sql:          "SHOW DATABASES",
			wantIsSelect: false,
		},
		{
			name:         "SHOW TABLES",
			sql:          "SHOW TABLES",
			wantIsSelect: false,
		},
		{
			name:         "DESCRIBE table",
			sql:          "DESCRIBE users",
			wantIsSelect: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			analysis, err := AnalyzeSQL(tc.sql)
			if err == nil && analysis.IsSelectLike != tc.wantIsSelect {
				t.Errorf("IsSelectLike = %v, want %v", analysis.IsSelectLike, tc.wantIsSelect)
			}
			// err != nil: statement could not be parsed, which is acceptable here.
		})
	}
}
// TestNeedsRowCountCheck exercises NeedsRowCountCheck with maxRows = 500 and
// verifies both the "needs probe" flag and the "direct reject" flag for
// aggregate queries, bounded LIMITs, oversized LIMITs and unbounded SELECTs.
func TestNeedsRowCountCheck(t *testing.T) {
	const maxRows = 500

	type testCase struct {
		name          string
		sql           string
		wantNeedCheck bool
		wantReject    bool
	}
	cases := []testCase{
		// Probe is skipped (needsCheck = false).
		{
			name:          "Aggregate COUNT(*)",
			sql:           "SELECT COUNT(*) FROM users",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "Aggregate SUM",
			sql:           "SELECT SUM(amount) FROM orders",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "Aggregate with GROUP BY",
			sql:           "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "LIMIT equal to max",
			sql:           "SELECT * FROM users LIMIT 500",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "LIMIT less than max",
			sql:           "SELECT * FROM users LIMIT 100",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "LIMIT 1",
			sql:           "SELECT * FROM users LIMIT 1",
			wantNeedCheck: false,
			wantReject:    false,
		},
		// LIMIT > maxRows is still probed: the real result may be smaller.
		{
			name:          "LIMIT exceeds max",
			sql:           "SELECT * FROM users LIMIT 1000",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "LIMIT much larger than max",
			sql:           "SELECT * FROM users LIMIT 10000",
			wantNeedCheck: true,
			wantReject:    false,
		},
		// Probe is required (needsCheck = true).
		{
			name:          "No LIMIT no aggregate",
			sql:           "SELECT * FROM users",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "SELECT with WHERE no LIMIT",
			sql:           "SELECT * FROM users WHERE status = 'active'",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "SELECT with JOIN no LIMIT",
			sql:           "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "Aggregate in subquery only",
			sql:           "SELECT * FROM (SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id) t",
			wantNeedCheck: true,
			wantReject:    false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotNeedsCheck, gotReject, _ := NeedsRowCountCheck(tc.sql, maxRows)
			if gotNeedsCheck != tc.wantNeedCheck {
				t.Errorf("needsCheck = %v, want %v", gotNeedsCheck, tc.wantNeedCheck)
			}
			if gotReject != tc.wantReject {
				t.Errorf("directReject = %v, want %v", gotReject, tc.wantReject)
			}
		})
	}
}
// TestNeedsRowCountCheck_DorisSpecificFunctions verifies that queries built on
// Doris-specific functions (HLL, bitmap, approximate aggregates) are expected
// to skip the row-count probe.
func TestNeedsRowCountCheck_DorisSpecificFunctions(t *testing.T) {
	const maxRows = 500

	type testCase struct {
		name          string
		sql           string
		wantNeedCheck bool
	}
	cases := []testCase{
		// HLL functions.
		{
			name:          "HLL_UNION_AGG",
			sql:           "SELECT HLL_UNION_AGG(hll_col) FROM user_stats",
			wantNeedCheck: false,
		},
		{
			name:          "HLL_CARDINALITY",
			sql:           "SELECT HLL_CARDINALITY(hll_col) FROM user_stats",
			wantNeedCheck: false,
		},
		// Bitmap functions.
		{
			name:          "BITMAP_UNION_COUNT",
			sql:           "SELECT BITMAP_UNION_COUNT(bitmap_col) FROM user_tags",
			wantNeedCheck: false,
		},
		{
			name:          "BITMAP_UNION",
			sql:           "SELECT BITMAP_UNION(bitmap_col) FROM user_tags GROUP BY category",
			wantNeedCheck: false,
		},
		// Other aggregate functions.
		{
			name:          "APPROX_COUNT_DISTINCT",
			sql:           "SELECT APPROX_COUNT_DISTINCT(user_id) FROM events",
			wantNeedCheck: false,
		},
		{
			name:          "GROUP_CONCAT",
			sql:           "SELECT GROUP_CONCAT(name) FROM users GROUP BY department",
			wantNeedCheck: false,
		},
		{
			name:          "PERCENTILE_APPROX",
			sql:           "SELECT PERCENTILE_APPROX(latency, 0.99) FROM requests",
			wantNeedCheck: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotNeedsCheck, _, _ := NeedsRowCountCheck(tc.sql, maxRows)
			if gotNeedsCheck == tc.wantNeedCheck {
				return
			}
			t.Errorf("needsCheck = %v, want %v (should skip check for Doris aggregate functions)", gotNeedsCheck, tc.wantNeedCheck)
		})
	}
}
// TestNeedsRowCountCheck_ComplexQueries covers CTEs, joins, nested subqueries,
// DISTINCT and ORDER BY ... LIMIT with maxRows = 500.
func TestNeedsRowCountCheck_ComplexQueries(t *testing.T) {
	const maxRows = 500

	type testCase struct {
		name          string
		sql           string
		wantNeedCheck bool
		wantReject    bool
	}
	cases := []testCase{
		{
			name:          "CTE with aggregate",
			sql:           "WITH user_counts AS (SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id) SELECT * FROM user_counts",
			wantNeedCheck: false, // the CTE aggregates, so the probe is skipped
			wantReject:    false,
		},
		{
			name:          "Complex JOIN with aggregate",
			sql:           "SELECT u.department, COUNT(*) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.department",
			wantNeedCheck: false, // aggregated
			wantReject:    false,
		},
		{
			name:          "Nested subquery",
			sql:           "SELECT * FROM users WHERE id IN (SELECT user_id FROM orders WHERE amount > 100)",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "DISTINCT query",
			sql:           "SELECT DISTINCT category FROM products",
			wantNeedCheck: true, // DISTINCT does not count as an aggregate
			wantReject:    false,
		},
		{
			name:          "ORDER BY with LIMIT",
			sql:           "SELECT * FROM users ORDER BY created_at DESC LIMIT 100",
			wantNeedCheck: false, // LIMIT is within maxRows
			wantReject:    false,
		},
		{
			name:          "Multiple aggregates in single query",
			sql:           "SELECT COUNT(*), SUM(amount), AVG(amount), MIN(amount), MAX(amount) FROM orders",
			wantNeedCheck: false,
			wantReject:    false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotNeedsCheck, gotReject, _ := NeedsRowCountCheck(tc.sql, maxRows)
			if gotNeedsCheck != tc.wantNeedCheck {
				t.Errorf("needsCheck = %v, want %v", gotNeedsCheck, tc.wantNeedCheck)
			}
			if gotReject != tc.wantReject {
				t.Errorf("directReject = %v, want %v", gotReject, tc.wantReject)
			}
		})
	}
}
// TestNeedsRowCountCheck_EdgeCases covers LIMIT 0, the maxRows+1 boundary,
// trailing semicolons, surrounding whitespace and keyword casing.
func TestNeedsRowCountCheck_EdgeCases(t *testing.T) {
	const maxRows = 500

	type testCase struct {
		name          string
		sql           string
		wantNeedCheck bool
		wantReject    bool
	}
	cases := []testCase{
		{
			name:          "Empty-ish LIMIT 0",
			sql:           "SELECT * FROM users LIMIT 0",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "LIMIT at boundary",
			sql:           "SELECT * FROM users LIMIT 501",
			wantNeedCheck: true, // 501 exceeds 500, so the probe must run
			wantReject:    false,
		},
		{
			name:          "SELECT with trailing semicolon",
			sql:           "SELECT * FROM users;",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "SELECT with extra whitespace",
			sql:           " SELECT * FROM users ",
			wantNeedCheck: true,
			wantReject:    false,
		},
		{
			name:          "Lowercase keywords",
			sql:           "select count(*) from users",
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "Mixed case keywords",
			sql:           "Select Count(*) From users",
			wantNeedCheck: false,
			wantReject:    false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotNeedsCheck, gotReject, _ := NeedsRowCountCheck(tc.sql, maxRows)
			if gotNeedsCheck != tc.wantNeedCheck {
				t.Errorf("needsCheck = %v, want %v", gotNeedsCheck, tc.wantNeedCheck)
			}
			if gotReject != tc.wantReject {
				t.Errorf("directReject = %v, want %v", gotReject, tc.wantReject)
			}
		})
	}
}
// TestNeedsRowCountCheck_DifferentMaxRows verifies that the decision depends on
// the maxRows argument: the same LIMIT is probed when it exceeds maxRows and
// skipped when it does not.
func TestNeedsRowCountCheck_DifferentMaxRows(t *testing.T) {
	type testCase struct {
		name          string
		sql           string
		maxRows       int
		wantNeedCheck bool
		wantReject    bool
	}
	cases := []testCase{
		{
			name:          "LIMIT 100 with maxRows 50",
			sql:           "SELECT * FROM users LIMIT 100",
			maxRows:       50,
			wantNeedCheck: true, // LIMIT is above maxRows, so the probe must run
			wantReject:    false,
		},
		{
			name:          "LIMIT 100 with maxRows 100",
			sql:           "SELECT * FROM users LIMIT 100",
			maxRows:       100,
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "LIMIT 100 with maxRows 200",
			sql:           "SELECT * FROM users LIMIT 100",
			maxRows:       200,
			wantNeedCheck: false,
			wantReject:    false,
		},
		{
			name:          "No LIMIT with maxRows 1000",
			sql:           "SELECT * FROM users",
			maxRows:       1000,
			wantNeedCheck: true,
			wantReject:    false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			gotNeedsCheck, gotReject, _ := NeedsRowCountCheck(tc.sql, tc.maxRows)
			if gotNeedsCheck != tc.wantNeedCheck {
				t.Errorf("needsCheck = %v, want %v", gotNeedsCheck, tc.wantNeedCheck)
			}
			if gotReject != tc.wantReject {
				t.Errorf("directReject = %v, want %v", gotReject, tc.wantReject)
			}
		})
	}
}
// TestSummary_SkipProbeCheck prints a summary of which SQL patterns skip the
// probe check and which require it (maxRows = 500).
//
// Each pattern is classified with NeedsRowCountCheck. A pattern that lands in
// the expected group is logged; a pattern that lands in the wrong group is
// reported through t.Errorf so the summary cannot silently drift from the
// implementation.
func TestSummary_SkipProbeCheck(t *testing.T) {
	maxRows := 500

	skipCheckCases := []struct {
		category string
		sql      string
	}{
		// Aggregate functions
		{"Aggregate - COUNT(*)", "SELECT COUNT(*) FROM users"},
		{"Aggregate - COUNT(col)", "SELECT COUNT(id) FROM users"},
		{"Aggregate - SUM", "SELECT SUM(amount) FROM orders"},
		{"Aggregate - AVG", "SELECT AVG(price) FROM products"},
		{"Aggregate - MIN", "SELECT MIN(created_at) FROM logs"},
		{"Aggregate - MAX", "SELECT MAX(score) FROM results"},
		{"Aggregate - GROUP BY", "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id"},
		{"Aggregate - HAVING", "SELECT user_id, SUM(amount) FROM orders GROUP BY user_id HAVING SUM(amount) > 1000"},
		// Doris specific aggregates
		{"Doris - HLL_UNION_AGG", "SELECT HLL_UNION_AGG(hll_col) FROM stats"},
		{"Doris - BITMAP_UNION_COUNT", "SELECT BITMAP_UNION_COUNT(bitmap_col) FROM tags"},
		{"Doris - APPROX_COUNT_DISTINCT", "SELECT APPROX_COUNT_DISTINCT(user_id) FROM events"},
		{"Doris - GROUP_CONCAT", "SELECT GROUP_CONCAT(name) FROM users GROUP BY dept"},
		// LIMIT <= maxRows
		{"LIMIT - Equal to max", "SELECT * FROM users LIMIT 500"},
		{"LIMIT - Less than max", "SELECT * FROM users LIMIT 100"},
		{"LIMIT - With OFFSET", "SELECT * FROM users LIMIT 100 OFFSET 50"},
		{"LIMIT - Value 1", "SELECT * FROM users LIMIT 1"},
		{"LIMIT - Value 0", "SELECT * FROM users LIMIT 0"},
	}

	t.Log("=== SQL patterns that SKIP probe check (no extra query needed) ===")
	for _, tc := range skipCheckCases {
		needsCheck, _, _ := NeedsRowCountCheck(tc.sql, maxRows)
		status := "✓ SKIP"
		report := t.Logf
		if needsCheck {
			// Wrong group: fail the test instead of only logging.
			status = "✗ NEEDS CHECK (unexpected)"
			report = t.Errorf
		}
		report(" %s: %s\n SQL: %s", status, tc.category, tc.sql)
	}

	needsCheckCases := []struct {
		category string
		sql      string
	}{
		{"No LIMIT - Simple SELECT", "SELECT * FROM users"},
		{"No LIMIT - With WHERE", "SELECT * FROM users WHERE status = 'active'"},
		{"No LIMIT - With JOIN", "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id"},
		{"No LIMIT - Subquery with agg", "SELECT * FROM (SELECT user_id, COUNT(*) FROM orders GROUP BY user_id) t"},
		{"No LIMIT - DISTINCT", "SELECT DISTINCT category FROM products"},
		{"LIMIT > max (actual may be smaller)", "SELECT * FROM users LIMIT 1000"},
		{"LIMIT >> max", "SELECT * FROM users LIMIT 10000"},
	}

	t.Log("\n=== SQL patterns that NEED probe check ===")
	for _, tc := range needsCheckCases {
		needsCheck, _, _ := NeedsRowCountCheck(tc.sql, maxRows)
		status := "✓ NEEDS CHECK"
		report := t.Logf
		if !needsCheck {
			// Wrong group: fail the test instead of only logging.
			status = "✗ SKIP (unexpected)"
			report = t.Errorf
		}
		report(" %s: %s\n SQL: %s", status, tc.category, tc.sql)
	}
}
// ptr returns a pointer to a freshly allocated int64 holding v.
func ptr(v int64) *int64 {
	p := new(int64)
	*p = v
	return p
}
================================================
FILE: dskit/doris/template.md
================================================
## SQL变量
| 字段名 | 含义 | 使用场景 |
| ---- | ---- | ---- |
|database|数据库|无|
|table|表名||
|time_field|时间戳的字段||
|query|查询条件|日志原文|
|from|开始时间||
|to|结束时间||
|aggregation|聚合算法|时序图|
|field|聚合的字段|时序图|
|limit|分页参数|日志原文|
|offset|分页参数|日志原文|
|interval|直方图的时间粒度|直方图|
## 日志原文
### 直方图
```
# 如何计算interval的值
max := 60 // 最多60个柱子
interval := ($to-$from) / max
interval = interval - interval%10
if interval <= 0 {
interval = 60
}
```
```
SELECT count() as cnt,
FLOOR(UNIX_TIMESTAMP($time_field) / $interval) * $interval AS __ts__
FROM $table
WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to)
GROUP BY __ts__;
```
```
{
"database":"$database",
"sql":"$sql",
"keys": {
"valueKey":"cnt",
"timeKey":"__ts__"
}
}
```
### 日志原文
```
SELECT * from $table
WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to)
ORDER by $time_field
LIMIT $limit OFFSET $offset;
```
```
{
"database":"$database",
"sql":"$sql"
}
```
## 时序图
### 日志行数
```
SELECT COUNT() AS cnt, DATE_FORMAT(date, '%Y-%m-%d %H:%i:00') AS __ts__
FROM nginx_access_log
WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to)
GROUP BY __ts__
```
```
{
"database":"$database",
"sql":"$sql",
"keys": {
"valueKey":"cnt",
"timeKey":"__ts__"
}
}
```
### max/min/avg/sum
```
SELECT $aggregation($field) AS series, DATE_FORMAT(date, '%Y-%m-%d %H:%i:00') AS __ts__
FROM nginx_access_log
WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to)
GROUP BY __ts__
```
```
{
"database":"$database",
"sql":"$sql",
"keys": {
"valueKey":"series",
"timeKey":"__ts__"
}
}
```
### 分位值
```
SELECT percentile($field, 0.95) AS series, DATE_FORMAT(date, '%Y-%m-%d %H:%i:00') AS __ts__
FROM nginx_access_log
WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to)
GROUP BY __ts__
```
```
{
"database":"$database",
"sql":"$sql",
"keys": {
"valueKey":"series",
"timeKey":"__ts__"
}
}
```
================================================
FILE: dskit/doris/timeseries.go
================================================
package doris
import (
"context"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
)
// String identifiers for time-field formats. Going by the names they denote
// epoch milliseconds, epoch seconds and a datetime value respectively; where
// they are consumed is not visible in this file.
const (
TimeFieldFormatEpochMilli = "epoch_millis"
TimeFieldFormatEpochSecond = "epoch_second"
TimeFieldFormatDateTime = "datetime"
)
// QueryParam describes a Doris query request.
// SQL is no longer assembled by this package; the user's input is trusted as-is.
type QueryParam struct {
// Database is the database name passed along with the SQL when it is checked and executed.
Database string `json:"database"`
// Sql is the raw SQL text supplied by the caller.
Sql string `json:"sql"`
// Keys is handed to sqlbase.FormatMetricValues by QueryTimeseries to shape rows into metric values.
Keys types.Keys `json:"keys" mapstructure:"keys"`
}
var (
// DorisBannedOp is the set of upper-case SQL keywords that Query refuses to
// run: if any token of the statement equals one of these keys, the query is
// rejected as a non-read operation.
DorisBannedOp = map[string]struct{}{
"CREATE": {},
"INSERT": {},
"ALTER": {},
"REVOKE": {},
"DROP": {},
"RENAME": {},
"ATTACH": {},
"DETACH": {},
"OPTIMIZE": {},
"TRUNCATE": {},
"SET": {},
}
)
// Query executes a given SQL query in Doris and returns the resulting rows.
//
// The statement is first screened for write operations: it is upper-cased and
// split on whitespace, and the call fails if any token is a key of
// DorisBannedOp. Next CheckMaxQueryRows enforces the row limit. Only when both
// checks pass is the statement run through ExecQuery.
//
// Any error from the screening, the row-count check or the execution is
// returned with a nil result.
func (d *Doris) Query(ctx context.Context, query *QueryParam) ([]map[string]interface{}, error) {
	// Validate the SQL and filter out write requests. strings.Fields splits on
	// any run of whitespace, so a banned keyword delimited by a tab or newline
	// (e.g. "DROP\nTABLE t") is detected as well, not only space-delimited ones.
	for _, token := range strings.Fields(strings.ToUpper(query.Sql)) {
		if _, banned := DorisBannedOp[token]; banned {
			return nil, fmt.Errorf("operation %s is forbid, only read db, please check your sql", token)
		}
	}

	// Check the number of rows the query would return.
	if err := d.CheckMaxQueryRows(ctx, query.Database, query.Sql); err != nil {
		return nil, err
	}

	rows, err := d.ExecQuery(ctx, query.Database, query.Sql)
	if err != nil {
		return nil, err
	}
	return rows, nil
}
// QueryTimeseries runs the query through Query (which already performs the
// MaxQueryRows check) and converts the returned rows into metric values with
// sqlbase.FormatMetricValues, using query.Keys.
func (d *Doris) QueryTimeseries(ctx context.Context, query *QueryParam) ([]types.MetricValues, error) {
	records, queryErr := d.Query(ctx, query)
	if queryErr != nil {
		return nil, queryErr
	}

	series := sqlbase.FormatMetricValues(query.Keys, records)
	return series, nil
}
// CheckMaxQueryRows reports an error when the query result would exceed the
// configured row limit (d.MaxQueryRows, or 500 when that is zero).
//
// The SQL is trimmed of surrounding whitespace and one trailing semicolon, then
// NeedsRowCountCheck decides whether a probe is required at all. When it is,
// the decision is delegated to probeRowCount, which issues a LIMIT maxRows+1
// probe rather than a COUNT(*).
func (d *Doris) CheckMaxQueryRows(ctx context.Context, database, sql string) error {
	limit := d.MaxQueryRows
	if limit == 0 {
		limit = 500
	}

	trimmed := strings.TrimSpace(sql)
	trimmed = strings.TrimSuffix(trimmed, ";")
	trimmed = strings.TrimSpace(trimmed)

	// Step 1: static analysis; skip the probe when it is unnecessary.
	if needsCheck, _, _ := NeedsRowCountCheck(trimmed, limit); !needsCheck {
		return nil
	}

	// Step 2: run the threshold probe.
	return d.probeRowCount(ctx, database, trimmed, limit)
}
// probeRowCount checks the row count by threshold probing: it wraps sql in a
// subquery and selects at most maxRows+1 rows from it. Receiving more than
// maxRows rows means the limit is exceeded and an error is returned.
// The probe runs under the context produced by d.createTimeoutContext.
func (d *Doris) probeRowCount(ctx context.Context, database, sql string, maxRows int) error {
	probeCtx, release := d.createTimeoutContext(ctx)
	defer release()

	// Only the number of rows matters, so the probe selects a constant.
	threshold := maxRows + 1
	probeSQL := fmt.Sprintf("SELECT 1 FROM (%s) AS __probe_chk LIMIT %d", sql, threshold)

	rows, err := d.ExecQuery(probeCtx, database, probeSQL)
	if err != nil {
		return err
	}
	if len(rows) <= maxRows {
		return nil
	}
	return fmt.Errorf("query result rows count exceeds the maximum limit %d", maxRows)
}
================================================
FILE: dskit/mysql/mysql.go
================================================
// @Author: Ciusyan 5/10/24
package mysql
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/dskit/pool"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
_ "github.com/go-sql-driver/mysql" // MySQL driver
"github.com/mitchellh/mapstructure"
"gorm.io/driver/mysql"
"gorm.io/gorm"
)
// MySQL is the MySQL datasource configuration, decoded from settings under the
// "mysql.shards" key. NewConn and CheckMaxQueryRows only use the first shard.
type MySQL struct {
Shards []Shard `json:"mysql.shards" mapstructure:"mysql.shards"`
}
// Shard holds the connection settings of one MySQL endpoint.
type Shard struct {
// Addr is placed inside tcp(...) in the DSN built by NewConn; it must not be empty.
Addr string `json:"mysql.addr" mapstructure:"mysql.addr"`
DB string `json:"mysql.db" mapstructure:"mysql.db"`
User string `json:"mysql.user" mapstructure:"mysql.user"`
Password string `json:"mysql.password" mapstructure:"mysql.password"`
Timeout int `json:"mysql.timeout" mapstructure:"mysql.timeout"`
// MaxIdleConns defaults to 10 in NewConn when zero.
MaxIdleConns int `json:"mysql.max_idle_conns" mapstructure:"mysql.max_idle_conns"`
// MaxOpenConns defaults to 100 in NewConn when zero.
MaxOpenConns int `json:"mysql.max_open_conns" mapstructure:"mysql.max_open_conns"`
// ConnMaxLifetime is in seconds; defaults to 300 in NewConn when zero.
ConnMaxLifetime int `json:"mysql.conn_max_lifetime" mapstructure:"mysql.conn_max_lifetime"`
// MaxQueryRows is the row limit enforced by CheckMaxQueryRows, which falls back to 500 when zero.
MaxQueryRows int `json:"mysql.max_query_rows" mapstructure:"mysql.max_query_rows"`
}
// NewMySQLWithSettings builds a MySQL configuration from settings, which must
// be either a JSON string or a map[string]interface{}; any other type yields
// an "unsupported settings type" error. The resulting map is decoded into the
// struct with mapstructure. ctx is currently unused.
func NewMySQLWithSettings(ctx context.Context, settings interface{}) (*MySQL, error) {
	var raw map[string]interface{}

	switch v := settings.(type) {
	case map[string]interface{}:
		raw = v
	case string:
		raw = map[string]interface{}{}
		if err := json.Unmarshal([]byte(v), &raw); err != nil {
			return nil, err
		}
	default:
		return nil, errors.New("unsupported settings type")
	}

	instance := new(MySQL)
	if err := mapstructure.Decode(raw, instance); err != nil {
		return nil, err
	}
	return instance, nil
}
// NewConn returns a *gorm.DB for the first configured shard and the given
// database, creating it on first use and caching it in pool.PoolClient.
//
// The cache key is built from the shard address, password, user and, when
// non-empty, the database name, so each database gets its own cached handle.
// Zero-valued pool settings are replaced with defaults (10 idle connections,
// 100 open connections, 300 s connection lifetime) before the handle is
// created through sqlbase.NewDB.
//
// It returns an error when no shard is configured, when the shard address is
// empty, or when sqlbase.NewDB fails; a failed handle is not cached.
func (m *MySQL) NewConn(ctx context.Context, database string) (*gorm.DB, error) {
	if len(m.Shards) == 0 {
		return nil, errors.New("empty mysql shards")
	}

	// shard is a copy of the first entry; the defaults below do not modify m.Shards.
	shard := m.Shards[0]
	// NOTE(review): Timeout and MaxQueryRows are defaulted on this local copy
	// but are not read again in this function — confirm whether they are
	// meant to feed the DSN / row-limit check.
	if shard.Timeout == 0 {
		shard.Timeout = 300
	}
	if shard.MaxIdleConns == 0 {
		shard.MaxIdleConns = 10
	}
	if shard.MaxOpenConns == 0 {
		shard.MaxOpenConns = 100
	}
	if shard.ConnMaxLifetime == 0 {
		shard.ConnMaxLifetime = 300
	}
	if shard.MaxQueryRows == 0 {
		shard.MaxQueryRows = 100
	}

	if len(shard.Addr) == 0 {
		return nil, errors.New("empty addr")
	}

	// Cache connections per address/credentials/database.
	keys := []string{shard.Addr, shard.Password, shard.User}
	if len(database) > 0 {
		keys = append(keys, database)
	}
	cachedKey := strings.Join(keys, ":")

	if conn, ok := pool.PoolClient.Load(cachedKey); ok {
		return conn.(*gorm.DB), nil
	}

	dsn := fmt.Sprintf("%s:%s@tcp(%s)/%s?charset=utf8&parseTime=True", shard.User, shard.Password, shard.Addr, database)
	db, err := sqlbase.NewDB(
		ctx,
		mysql.Open(dsn),
		shard.MaxIdleConns,
		shard.MaxOpenConns,
		time.Duration(shard.ConnMaxLifetime)*time.Second,
	)
	// Only a successfully created handle is cached.
	if db != nil && err == nil {
		pool.PoolClient.Store(cachedKey, db)
	}
	return db, err
}
// ShowDatabases lists databases by running "SHOW DATABASES" on a connection
// opened without a database name.
func (m *MySQL) ShowDatabases(ctx context.Context) ([]string, error) {
	conn, connErr := m.NewConn(ctx, "")
	if connErr != nil {
		return nil, connErr
	}

	const stmt = "SHOW DATABASES"
	return sqlbase.ShowDatabases(ctx, conn, stmt)
}
// ShowTables lists tables by running "SHOW TABLES" on a connection bound to
// the given database.
func (m *MySQL) ShowTables(ctx context.Context, database string) ([]string, error) {
	conn, connErr := m.NewConn(ctx, database)
	if connErr != nil {
		return nil, connErr
	}

	const stmt = "SHOW TABLES"
	return sqlbase.ShowTables(ctx, conn, stmt)
}
// DescTable returns the column properties of table by running
// "DESCRIBE <table>" on a connection bound to the given database.
// The table name is inserted into the statement verbatim.
func (m *MySQL) DescTable(ctx context.Context, database, table string) ([]*types.ColumnProperty, error) {
	conn, connErr := m.NewConn(ctx, database)
	if connErr != nil {
		return nil, connErr
	}

	stmt := "DESCRIBE " + table
	return sqlbase.DescTable(ctx, conn, stmt)
}
// SelectRows opens (or reuses) a connection to database and delegates to
// sqlbase.SelectRows with the given table and query.
func (m *MySQL) SelectRows(ctx context.Context, database, table, query string) ([]map[string]interface{}, error) {
	conn, connErr := m.NewConn(ctx, database)
	if connErr != nil {
		return nil, connErr
	}

	return sqlbase.SelectRows(ctx, conn, table, query)
}
// ExecQuery opens (or reuses) a connection to database and runs sql through
// sqlbase.ExecQuery, returning its rows.
func (m *MySQL) ExecQuery(ctx context.Context, database string, sql string) ([]map[string]interface{}, error) {
	conn, connErr := m.NewConn(ctx, database)
	if connErr != nil {
		return nil, connErr
	}

	return sqlbase.ExecQuery(ctx, conn, sql)
}
================================================
FILE: dskit/mysql/mysql_test.go
================================================
// @Author: Ciusyan 5/11/24
package mysql
import (
"context"
"testing"
"github.com/stretchr/testify/require"
)
// TestNewMySQLWithSettings checks that a JSON string is accepted and that an
// unsupported settings type produces an error.
func TestNewMySQLWithSettings(t *testing.T) {
	type testCase struct {
		name     string
		settings interface{}
		wantErr  bool
	}
	cases := []testCase{
		{
			name:     "valid string settings",
			settings: `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`,
			wantErr:  false,
		},
		{
			name:     "invalid settings type",
			settings: 12345,
			wantErr:  true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := NewMySQLWithSettings(context.Background(), tc.settings)
			if failed := err != nil; failed != tc.wantErr {
				t.Errorf("NewMySQLWithSettings() error = %v, wantErr %v", err, tc.wantErr)
			}
			t.Log(got)
		})
	}
}
// TestNewConn builds a MySQL configuration from a JSON string and checks that
// NewConn's error outcome matches the expectation for each database name.
func TestNewConn(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	type testCase struct {
		name     string
		database string
		wantErr  bool
	}
	cases := []testCase{
		{
			name:     "valid connection",
			database: "db1",
			wantErr:  false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			_, connErr := client.NewConn(ctx, tc.database)
			if failed := connErr != nil; failed != tc.wantErr {
				t.Errorf("NewConn() error = %v, wantErr %v", connErr, tc.wantErr)
			}
		})
	}
}
// TestShowDatabases calls ShowDatabases on a configuration built from a JSON
// string and logs the returned database names.
func TestShowDatabases(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	names, err := client.ShowDatabases(ctx)
	require.NoError(t, err)
	t.Log(names)
}
// TestShowTables calls ShowTables for database "db1" and logs the returned
// table names.
func TestShowTables(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	names, err := client.ShowTables(ctx, "db1")
	require.NoError(t, err)
	t.Log(names)
}
// TestDescTable calls DescTable for db1.students and logs every returned
// column property.
func TestDescTable(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	columns, err := client.DescTable(ctx, "db1", "students")
	require.NoError(t, err)

	for i := range columns {
		t.Logf("%+v", *columns[i])
	}
}
// TestExecQuery runs a fixed SELECT against db1 through ExecQuery and logs
// each returned row.
func TestExecQuery(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	records, err := client.ExecQuery(ctx, "db1", "SELECT * FROM students WHERE id = 10008")
	require.NoError(t, err)

	for i := range records {
		t.Log(records[i])
	}
}
// TestSelectRows calls SelectRows on db1.students with the condition
// "id > 10008" and logs each returned row.
func TestSelectRows(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	records, err := client.SelectRows(ctx, "db1", "students", "id > 10008")
	require.NoError(t, err)

	for i := range records {
		t.Log(records[i])
	}
}
================================================
FILE: dskit/mysql/timeseries.go
================================================
package mysql
import (
"context"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
"gorm.io/gorm"
)
// Query runs the given SQL in MySQL and returns the resulting rows.
// It obtains a connection without a database name, applies the
// CheckMaxQueryRows guard, and then delegates to sqlbase.Query.
func (m *MySQL) Query(ctx context.Context, query *sqlbase.QueryParam) ([]map[string]interface{}, error) {
	conn, err := m.NewConn(ctx, "")
	if err != nil {
		return nil, err
	}

	if checkErr := m.CheckMaxQueryRows(conn, ctx, query); checkErr != nil {
		return nil, checkErr
	}

	return sqlbase.Query(ctx, conn, query)
}
// QueryTimeseries runs a time series query with the given parameters.
// It obtains a connection without a database name, applies the
// CheckMaxQueryRows guard, and then delegates to sqlbase.QueryTimeseries.
func (m *MySQL) QueryTimeseries(ctx context.Context, query *sqlbase.QueryParam) ([]types.MetricValues, error) {
	conn, err := m.NewConn(ctx, "")
	if err != nil {
		return nil, err
	}

	if checkErr := m.CheckMaxQueryRows(conn, ctx, query); checkErr != nil {
		return nil, checkErr
	}

	return sqlbase.QueryTimeseries(ctx, conn, query)
}
// CheckMaxQueryRows returns an error when query would produce more rows than
// the configured limit.
//
// It wraps the caller's SQL in "SELECT COUNT(*) ... FROM (<sql>) AS subquery"
// and compares the count with the first shard's MaxQueryRows, falling back to
// 500 when that value is zero or no shard is configured. If the count query
// returns no rows or no "count" column, the check passes.
//
// Errors from running the count query or from parsing the count value are
// returned unchanged.
func (m *MySQL) CheckMaxQueryRows(db *gorm.DB, ctx context.Context, query *sqlbase.QueryParam) error {
	// Strip only trailing semicolons/whitespace so the statement can be nested
	// as a subquery. Semicolons inside the statement (for example in a string
	// literal such as 'a;b') are kept, so the counted query is the real one.
	sql := strings.TrimRight(query.Sql, "; \t\r\n")
	checkQuery := &sqlbase.QueryParam{
		Sql: fmt.Sprintf("SELECT COUNT(*) as count FROM (%s) AS subquery;", sql),
	}

	res, err := sqlbase.Query(ctx, db, checkQuery)
	if err != nil {
		return err
	}
	if len(res) == 0 {
		return nil
	}

	count, exists := res[0]["count"]
	if !exists {
		return nil
	}
	v, err := sqlbase.ParseFloat64Value(count)
	if err != nil {
		return err
	}

	// Default limit; overridden by the first shard when it sets one.
	maxQueryRows := 500
	if len(m.Shards) > 0 && m.Shards[0].MaxQueryRows != 0 {
		maxQueryRows = m.Shards[0].MaxQueryRows
	}

	if v > float64(maxQueryRows) {
		return fmt.Errorf("query result rows count %d exceeds the maximum limit %d", int(v), maxQueryRows)
	}
	return nil
}
================================================
FILE: dskit/mysql/timeseries_test.go
================================================
// @Author: Ciusyan 5/11/24
package mysql
import (
"context"
"testing"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/stretchr/testify/require"
)
// TestQuery runs a plain SELECT through MySQL.Query with empty keys and logs
// each returned row.
func TestQuery(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	// All key fields are left empty for a raw row query.
	var keys types.Keys
	keys.ValueKey = ""
	keys.LabelKey = ""
	keys.TimeKey = ""
	keys.TimeFormat = ""

	param := &sqlbase.QueryParam{
		Sql:  "SELECT * FROM students WHERE id > 10900",
		Keys: keys,
	}

	records, err := client.Query(ctx, param)
	require.NoError(t, err)

	for i := range records {
		t.Log(records[i])
	}
}
// TestQueryTimeseries runs a SELECT through MySQL.QueryTimeseries with value,
// label and time keys configured, and logs each returned metric.
func TestQueryTimeseries(t *testing.T) {
	ctx := context.Background()

	const settings = `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`
	client, err := NewMySQLWithSettings(ctx, settings)
	require.NoError(t, err)

	// Query parameters: the SQL selects specific columns, and the keys name
	// which of those columns carry values, labels and the timestamp.
	keys := types.Keys{
		ValueKey:   "grade a_grade",                 // columns holding the metric values
		LabelKey:   "id student_name",               // columns holding the metric labels
		TimeKey:    "update_time",                   // column holding the timestamp
		TimeFormat: "2006-01-02 15:04:05 +0000 UTC", // layout of the timestamp column
	}
	param := &sqlbase.QueryParam{
		Sql:  "SELECT id, grade, student_name, a_grade, update_time FROM students WHERE grade > 20000",
		Keys: keys,
	}

	// Run the query and print the resulting time series.
	series, err := client.QueryTimeseries(ctx, param)
	require.NoError(t, err)

	for i := range series {
		t.Log(series[i])
	}
}
================================================
FILE: dskit/pool/pool.go
================================================
package pool
import (
"bytes"
"sync"
"time"
gc "github.com/patrickmn/go-cache"
)
var (
// PoolClient is a process-wide concurrent map used to cache client handles.
// For example, the postgres package stores *gorm.DB connections in it, keyed
// by a string built from address, credentials and database name.
PoolClient = new(sync.Map)
)
var (
// DefaultCache is the default cache instance; do not use it if you need to
// specify your own defaultExpiration. It is created with gc.New(24h, 1h).
DefaultCache = gc.New(time.Hour*24, time.Hour)
)
// bytesPool recycles *bytes.Buffer values between callers.
var bytesPool = sync.Pool{
	New: func() interface{} {
		return &bytes.Buffer{}
	},
}

// PoolGetBytesBuffer hands out an empty buffer taken from the shared pool.
// The buffer is reset before it is returned, so leftovers from a previous
// user are never visible.
func PoolGetBytesBuffer() *bytes.Buffer {
	b := bytesPool.Get().(*bytes.Buffer)
	b.Reset()
	return b
}

// PoolPutBytesBuffer gives a buffer back to the shared pool. A nil buffer is
// ignored.
func PoolPutBytesBuffer(buf *bytes.Buffer) {
	if buf != nil {
		bytesPool.Put(buf)
	}
}
================================================
FILE: dskit/postgres/postgres.go
================================================
// @Author: Ciusyan 5/20/24
package postgres
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/url"
"strings"
"time"
"github.com/ccfos/nightingale/v6/dskit/pool"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
_ "github.com/lib/pq" // PostgreSQL driver
"github.com/mitchellh/mapstructure"
"gorm.io/driver/postgres"
"gorm.io/gorm"
)
// PostgreSQL is a PostgreSQL datasource client. It embeds Shard, so the
// connection settings are decoded from (and encoded to) the same flat level
// as the struct itself.
type PostgreSQL struct {
Shard `json:",inline" mapstructure:",squash"`
}
// Shard holds the connection settings of one PostgreSQL endpoint.
// Zero values of Timeout, MaxIdleConns, MaxOpenConns and ConnMaxLifetime are
// replaced with defaults (60, 10, 100 and 14400) by NewConn.
type Shard struct {
// Addr is the server address placed in the host part of the DSN.
Addr string `json:"pgsql.addr" mapstructure:"pgsql.addr"`
// DB is the database used by methods that do not take an explicit database.
DB string `json:"pgsql.db" mapstructure:"pgsql.db"`
User string `json:"pgsql.user" mapstructure:"pgsql.user"`
Password string `json:"pgsql.password" mapstructure:"pgsql.password" `
Timeout int `json:"pgsql.timeout" mapstructure:"pgsql.timeout"`
MaxIdleConns int `json:"pgsql.max_idle_conns" mapstructure:"pgsql.max_idle_conns"`
MaxOpenConns int `json:"pgsql.max_open_conns" mapstructure:"pgsql.max_open_conns"`
// ConnMaxLifetime is interpreted in seconds by NewConn.
ConnMaxLifetime int `json:"pgsql.conn_max_lifetime" mapstructure:"pgsql.conn_max_lifetime"`
// MaxQueryRows caps the row count of a query; CheckMaxQueryRows uses 500 when it is 0.
MaxQueryRows int `json:"pgsql.max_query_rows" mapstructure:"pgsql.max_query_rows"`
}
// NewPostgreSQLWithSettings builds a PostgreSQL client from one of several
// settings representations:
//   - *PostgreSQL / PostgreSQL: returned as-is (by pointer);
//   - *Shard / Shard: wrapped in a new PostgreSQL;
//   - map[string]interface{}: decoded with mapstructure;
//   - string: parsed as JSON into a map, then decoded with mapstructure.
//
// Any other type yields an "unsupported settings type" error.
func NewPostgreSQLWithSettings(ctx context.Context, settings interface{}) (*PostgreSQL, error) {
	var raw map[string]interface{}

	switch v := settings.(type) {
	case *PostgreSQL:
		return v, nil
	case PostgreSQL:
		return &v, nil
	case *Shard:
		return &PostgreSQL{Shard: *v}, nil
	case Shard:
		return &PostgreSQL{Shard: v}, nil
	case map[string]interface{}:
		raw = v
	case string:
		raw = map[string]interface{}{}
		if err := json.Unmarshal([]byte(v), &raw); err != nil {
			return nil, err
		}
	default:
		return nil, errors.New("unsupported settings type")
	}

	client := new(PostgreSQL)
	if err := mapstructure.Decode(raw, client); err != nil {
		return nil, err
	}
	return client, nil
}
// NewConn returns a gorm connection to the given database, creating and
// caching it on first use.
//
// When database is empty the configured p.DB is used; if both are empty an
// error is returned (a database name is mandatory, e.g. for compatibility
// with Alibaba Hologres, which requires one at connect time). Zero-valued pool
// settings on the shard are replaced with defaults before connecting.
//
// Connections are cached in pool.PoolClient under a key made of address,
// password, user and database. On connection failure any partially opened
// handle is closed and the error is returned.
func (p *PostgreSQL) NewConn(ctx context.Context, database string) (*gorm.DB, error) {
	// Fall back to the configured database so the DSN never carries an empty
	// database name when p.DB is set.
	if len(database) == 0 {
		database = p.DB
	}
	if len(database) == 0 {
		return nil, errors.New("empty pgsql database")
	}

	if p.Shard.Timeout == 0 {
		p.Shard.Timeout = 60
	}
	if p.Shard.MaxIdleConns == 0 {
		p.Shard.MaxIdleConns = 10
	}
	if p.Shard.MaxOpenConns == 0 {
		p.Shard.MaxOpenConns = 100
	}
	if p.Shard.ConnMaxLifetime == 0 {
		p.Shard.ConnMaxLifetime = 14400
	}

	if len(p.Shard.Addr) == 0 {
		return nil, errors.New("empty pgsql addr")
	}

	// One cached connection pool per (addr, credentials, database).
	cachedKey := strings.Join([]string{p.Shard.Addr, p.Shard.Password, p.Shard.User, database}, ":")
	if conn, ok := pool.PoolClient.Load(cachedKey); ok {
		return conn.(*gorm.DB), nil
	}

	// Build the DSN through net/url so user, password and database name are
	// escaped with the rules of their URL component. url.QueryEscape is not
	// suitable for userinfo: it encodes a space as '+', which is not decoded
	// back to a space in that position.
	dsn := (&url.URL{
		Scheme:   "postgres",
		User:     url.UserPassword(p.Shard.User, p.Shard.Password),
		Host:     p.Shard.Addr,
		Path:     "/" + database,
		RawQuery: "sslmode=disable&TimeZone=Asia/Shanghai",
	}).String()

	db, err := sqlbase.NewDB(
		ctx,
		postgres.Open(dsn),
		p.Shard.MaxIdleConns,
		p.Shard.MaxOpenConns,
		time.Duration(p.Shard.ConnMaxLifetime)*time.Second,
	)
	if err != nil {
		// NewDB may hand back a handle together with an error (e.g. a failed
		// ping); release it so the underlying pool is not leaked. Errors while
		// fetching or closing it are ignored because the original error is
		// the one worth reporting.
		if db != nil {
			if sqlDB, _ := db.DB(); sqlDB != nil {
				sqlDB.Close()
			}
		}
		return nil, err
	}

	pool.PoolClient.Store(cachedKey, db)
	return db, nil
}
// ShowDatabases lists the non-template databases whose name contains
// searchKeyword. The lookup is executed against the "postgres" database.
func (p *PostgreSQL) ShowDatabases(ctx context.Context, searchKeyword string) ([]string, error) {
	db, err := p.NewConn(ctx, "postgres")
	if err != nil {
		return nil, err
	}

	// The keyword ends up inside a single-quoted SQL literal; double any
	// embedded quote so it cannot terminate the literal.
	keyword := strings.ReplaceAll(searchKeyword, "'", "''")
	sql := fmt.Sprintf("SELECT datname FROM pg_database WHERE datistemplate = false AND datname LIKE %s",
		"'%"+keyword+"%'")
	return sqlbase.ShowDatabases(ctx, db, sql)
}
// ShowTables lists the tables of the configured database whose name contains
// searchKeyword, grouped by schema. Tables in the information_schema and
// pg_catalog schemas are excluded. Rows whose schema or table name is not a
// string are skipped.
func (p *PostgreSQL) ShowTables(ctx context.Context, searchKeyword string) (map[string][]string, error) {
	db, err := p.NewConn(ctx, p.DB)
	if err != nil {
		return nil, err
	}

	// The keyword ends up inside a single-quoted SQL literal; double any
	// embedded quote so it cannot terminate the literal.
	keyword := strings.ReplaceAll(searchKeyword, "'", "''")
	sql := fmt.Sprintf("SELECT schemaname, tablename FROM pg_tables WHERE schemaname !='information_schema' and schemaname !='pg_catalog' and tablename LIKE %s",
		"'%"+keyword+"%'")
	rets, err := sqlbase.ExecQuery(ctx, db, sql)
	if err != nil {
		return nil, err
	}

	tabs := make(map[string][]string, 3)
	for _, row := range rets {
		schema, ok := row["schemaname"].(string)
		if !ok {
			continue
		}
		// Checked assertion: an unexpected value type must not panic.
		table, ok := row["tablename"].(string)
		if !ok {
			continue
		}
		tabs[schema] = append(tabs[schema], table)
	}
	return tabs, nil
}
// DescTable returns the column properties of a table, read from
// information_schema.columns of the configured database.
// scheme defaults to "public" when empty.
func (p *PostgreSQL) DescTable(ctx context.Context, scheme, table string) ([]*types.ColumnProperty, error) {
	db, err := p.NewConn(ctx, p.DB)
	if err != nil {
		return nil, err
	}
	if scheme == "" {
		scheme = "public"
	}

	// Both names are embedded in single-quoted SQL literals; double any
	// embedded quote so they cannot terminate the literal.
	quotedTable := strings.ReplaceAll(table, "'", "''")
	quotedScheme := strings.ReplaceAll(scheme, "'", "''")
	query := fmt.Sprintf("SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = '%s' AND table_schema = '%s'", quotedTable, quotedScheme)
	return sqlbase.DescTable(ctx, db, query)
}
// SelectRows fetches rows of table from the configured database, optionally
// restricted by the given WHERE condition, by delegating to
// sqlbase.SelectRows.
func (p *PostgreSQL) SelectRows(ctx context.Context, table, where string) ([]map[string]interface{}, error) {
	conn, connErr := p.NewConn(ctx, p.DB)
	if connErr != nil {
		return nil, connErr
	}

	return sqlbase.SelectRows(ctx, conn, table, where)
}
// ExecQuery runs the given SQL against the configured database and returns
// the result rows, by delegating to sqlbase.ExecQuery.
func (p *PostgreSQL) ExecQuery(ctx context.Context, sql string) ([]map[string]interface{}, error) {
	conn, connErr := p.NewConn(ctx, p.DB)
	if connErr != nil {
		return nil, connErr
	}

	return sqlbase.ExecQuery(ctx, conn, sql)
}
================================================
FILE: dskit/postgres/timeseries.go
================================================
package postgres
import (
"context"
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/dskit/sqlbase"
"github.com/ccfos/nightingale/v6/dskit/types"
"gorm.io/gorm"
)
// Query runs the SQL in query against the configured database and returns the
// raw rows. The row-count limit is verified first via CheckMaxQueryRows.
func (p *PostgreSQL) Query(ctx context.Context, query *sqlbase.QueryParam) ([]map[string]interface{}, error) {
	conn, err := p.NewConn(ctx, p.Shard.DB)
	if err != nil {
		return nil, err
	}

	if limitErr := p.CheckMaxQueryRows(conn, ctx, query); limitErr != nil {
		return nil, limitErr
	}

	return sqlbase.Query(ctx, conn, query)
}
// QueryTimeseries runs the SQL in query against the configured database and
// converts the rows into time series. The row-count limit is verified first
// via CheckMaxQueryRows. Unmapped columns are not turned into labels (the
// ignoreDefault flag is passed as true).
func (p *PostgreSQL) QueryTimeseries(ctx context.Context, query *sqlbase.QueryParam) ([]types.MetricValues, error) {
	conn, err := p.NewConn(ctx, p.Shard.DB)
	if err != nil {
		return nil, err
	}

	if limitErr := p.CheckMaxQueryRows(conn, ctx, query); limitErr != nil {
		return nil, limitErr
	}

	return sqlbase.QueryTimeseries(ctx, conn, query, true)
}
// CheckMaxQueryRows counts the rows the user query would return and rejects
// the query when the count exceeds p.Shard.MaxQueryRows (500 when unset).
//
// The user SQL has every ';' removed and is wrapped as a subquery of a
// SELECT COUNT(*) statement executed through sqlbase.Query. A missing count in
// the result is treated as "within the limit".
func (p *PostgreSQL) CheckMaxQueryRows(db *gorm.DB, ctx context.Context, query *sqlbase.QueryParam) error {
	inner := strings.ReplaceAll(query.Sql, ";", "")
	countParam := &sqlbase.QueryParam{
		Sql: fmt.Sprintf("SELECT COUNT(*) as count FROM (%s) AS subquery;", inner),
	}

	rows, err := sqlbase.Query(ctx, db, countParam)
	if err != nil {
		return err
	}
	if len(rows) == 0 {
		return nil
	}

	rawCount, found := rows[0]["count"]
	if !found {
		return nil
	}
	total, err := sqlbase.ParseFloat64Value(rawCount)
	if err != nil {
		return err
	}

	limit := p.Shard.MaxQueryRows
	if limit == 0 {
		limit = 500
	}
	if total <= float64(limit) {
		return nil
	}
	return fmt.Errorf("query result rows count %d exceeds the maximum limit %d", int(total), limit)
}
================================================
FILE: dskit/sqlbase/base.go
================================================
// @Author: Ciusyan 5/19/24
package sqlbase
import (
"context"
"database/sql"
"fmt"
"strings"
"time"
"gorm.io/gorm"
"github.com/ccfos/nightingale/v6/dskit/types"
)
// NewDB opens a gorm database with the given dialector, applies the
// connection-pool settings and pings the server.
//
// When gorm.Open fails, whatever handle it returned is passed back together
// with the error. When the ping fails, the context-bound handle is still
// returned alongside the ping error, so callers can close it.
func NewDB(ctx context.Context, dialector gorm.Dialector, maxIdleConns, maxOpenConns int, connMaxLifetime time.Duration) (*gorm.DB, error) {
	gormDB, openErr := gorm.Open(dialector, &gorm.Config{})
	if openErr != nil {
		return gormDB, openErr
	}

	stdDB, dbErr := gormDB.DB()
	if dbErr != nil {
		return nil, dbErr
	}

	// Connection pool tuning.
	stdDB.SetMaxIdleConns(maxIdleConns)
	stdDB.SetMaxOpenConns(maxOpenConns)
	stdDB.SetConnMaxLifetime(connMaxLifetime)

	bound := gormDB.WithContext(ctx)
	pingErr := stdDB.Ping()
	return bound, pingErr
}
// CloseDB closes the sql.DB behind a gorm handle. A nil handle is a no-op.
// It returns the error from obtaining or closing the underlying sql.DB.
func CloseDB(db *gorm.DB) error {
	if db == nil {
		return nil
	}

	stdDB, err := db.DB()
	if err != nil {
		return err
	}
	return stdDB.Close()
}
// ShowTables runs query and returns the first (and only) column of every row
// as a table name. The result is an empty, non-nil slice when there are no
// rows. Any query, scan or row-iteration error is returned.
func ShowTables(ctx context.Context, db *gorm.DB, query string) ([]string, error) {
	tables := make([]string, 0)

	rows, err := db.WithContext(ctx).Raw(query).Rows()
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	for rows.Next() {
		var table string
		if err := rows.Scan(&table); err != nil {
			return nil, err
		}
		tables = append(tables, table)
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; without this check a broken iteration would be reported as a
	// complete (truncated) result.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return tables, nil
}
// ShowDatabases runs query and returns the first (and only) column of every
// row as a database name. The result is nil when there are no rows. Any
// query, scan or row-iteration error is returned.
func ShowDatabases(ctx context.Context, db *gorm.DB, query string) ([]string, error) {
	var databases []string

	rows, err := db.WithContext(ctx).Raw(query).Rows()
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	for rows.Next() {
		var database string
		if err := rows.Scan(&database); err != nil {
			return nil, err
		}
		databases = append(databases, database)
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; without this check a broken iteration would be reported as a
	// complete (truncated) result.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return databases, nil
}
// DescTable runs a dialect-specific "describe" query and converts each result
// row into a ColumnProperty.
//
// The expected column layout depends on the dialect name reported by the
// connection: six columns for "mysql", four for "postgres"/"sqlserver" and
// three for "oracle"; the first two are always the field name and its type.
// Rows that fail to scan are skipped. For any other dialect no columns are
// scanned, so the appended properties carry empty field and type values.
//
// Query and row-iteration errors are returned.
func DescTable(ctx context.Context, db *gorm.DB, query string) ([]*types.ColumnProperty, error) {
	rows, err := db.WithContext(ctx).Raw(query).Rows()
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	// The dialect does not change between rows; resolve it once.
	dialect := db.Dialector.Name()

	var columns []*types.ColumnProperty
	for rows.Next() {
		var (
			field        string
			typ          string
			null         string
			key          sql.NullString
			defaultValue sql.NullString
			extra        sql.NullString
			scanErr      error
		)

		switch dialect {
		case "mysql":
			scanErr = rows.Scan(&field, &typ, &null, &key, &defaultValue, &extra)
		case "postgres", "sqlserver":
			scanErr = rows.Scan(&field, &typ, &null, &defaultValue)
		case "oracle":
			scanErr = rows.Scan(&field, &typ, &null)
		}
		if scanErr != nil {
			// Best effort: a row that cannot be scanned is skipped.
			continue
		}

		// Convert the database-specific type to the internal type.
		type2, indexable := ConvertDBType(dialect, typ)
		columns = append(columns, &types.ColumnProperty{
			Field:     field,
			Type:      typ,
			Type2:     type2,
			Indexable: indexable,
		})
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; report the latter instead of a truncated column list.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return columns, nil
}
// ExecQuery runs the given SQL and returns every row as a map from column
// name to value. Values that arrive as []byte are converted to string; all
// other values are kept as delivered by the driver.
//
// Rows that fail to scan are skipped. Query, column-lookup and row-iteration
// errors are returned. The result is nil when there are no rows.
func ExecQuery(ctx context.Context, db *gorm.DB, sql string) ([]map[string]interface{}, error) {
	rows, err := db.WithContext(ctx).Raw(sql).Rows()
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	columns, err := rows.Columns()
	if err != nil {
		return nil, err
	}

	var results []map[string]interface{}
	for rows.Next() {
		// Scan needs pointers; point each one at its slot in columnValues.
		columnValues := make([]interface{}, len(columns))
		columnPointers := make([]interface{}, len(columns))
		for i := range columnValues {
			columnPointers[i] = &columnValues[i]
		}

		if err := rows.Scan(columnPointers...); err != nil {
			// Best effort: a row that cannot be scanned is skipped.
			continue
		}

		rowMap := make(map[string]interface{}, len(columns))
		for i, colName := range columns {
			if raw, ok := columnValues[i].([]byte); ok {
				rowMap[colName] = string(raw)
			} else {
				rowMap[colName] = columnValues[i]
			}
		}
		results = append(results, rowMap)
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; report the latter instead of a silently truncated result.
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return results, nil
}
// SelectRows builds "SELECT * FROM <table>" (plus " WHERE <query>" when query
// is non-empty) and executes it through ExecQuery. Both arguments are inserted
// verbatim into the statement.
func SelectRows(ctx context.Context, db *gorm.DB, table, query string) ([]map[string]interface{}, error) {
	stmt := fmt.Sprintf("SELECT * FROM %s", table)
	if len(query) > 0 {
		stmt = stmt + " WHERE " + query
	}

	return ExecQuery(ctx, db, stmt)
}
// ConvertDBType maps a database column type to one of the internal
// LogExtractValueType* names and reports whether such a column is indexable.
//
// Matching is case-insensitive and prefix based, so parameterised types such
// as "varchar(255)" or "decimal(10,2)" are recognised. Integer, text and float
// types are indexable; date/time and boolean types are not. Types that match
// no rule are returned lower-cased with indexable=false.
//
// dialect is currently not consulted: no dialect-specific rule exists yet, so
// every dialect shares the common table below.
func ConvertDBType(dialect, dbType string) (string, bool) {
	typ := strings.ToLower(dbType)

	hasAnyPrefix := func(prefixes ...string) bool {
		for _, prefix := range prefixes {
			if strings.HasPrefix(typ, prefix) {
				return true
			}
		}
		return false
	}

	switch {
	case strings.HasPrefix(typ, "interval"):
		// "interval" starts with "int" but is a time span, not an integer;
		// treat it as an unmapped type instead of misreporting it as long.
		return typ, false
	case hasAnyPrefix("int", "tinyint", "smallint", "mediumint", "bigint", "serial", "bigserial"):
		return types.LogExtractValueTypeLong, true
	case hasAnyPrefix("varchar", "text", "char", "tinytext", "mediumtext", "longtext",
		"character varying", "nvarchar", "nchar", "bpchar"):
		return types.LogExtractValueTypeText, true
	case hasAnyPrefix("float", "double", "decimal", "numeric", "real", "double precision"):
		return types.LogExtractValueTypeFloat, true
	case hasAnyPrefix("date", "datetime", "timestamp", "timestamptz", "time", "smalldatetime"):
		return types.LogExtractValueTypeDate, false
	case hasAnyPrefix("boolean", "bit", "bool"):
		return types.LogExtractValueTypeBool, false
	}

	// Dialect-specific conversions can be added here when needed.
	return typ, false
}
================================================
FILE: dskit/sqlbase/timeseries.go
================================================
// @Author: Ciusyan 5/20/24
package sqlbase
import (
"context"
"crypto/md5"
"encoding/json"
"fmt"
"math"
"reflect"
"sort"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/prometheus/common/model"
"gorm.io/gorm"
)
// QueryParam describes one SQL query: the statement itself and the mapping of
// result columns to value/label/time roles used when formatting time series.
type QueryParam struct {
Sql string `json:"sql"`
Keys types.Keys `json:"keys" mapstructure:"keys"`
}
var (
// BannedOp is the set of upper-case SQL keywords that Query refuses to
// execute; a statement containing any of them as a token is rejected.
BannedOp = map[string]struct{}{
"CREATE": {},
"INSERT": {},
"UPDATE": {},
"DELETE": {},
"ALTER": {},
"REVOKE": {},
"DROP": {},
"RENAME": {},
"TRUNCATE": {},
"SET": {},
}
)
// Query executes a read-only SQL statement and returns the result rows.
//
// Before execution the statement is upper-cased and split into
// whitespace-separated tokens; if any token is listed in BannedOp the query is
// rejected with an error naming the offending operation. Otherwise the
// original statement is passed to ExecQuery.
func Query(ctx context.Context, db *gorm.DB, query *QueryParam) ([]map[string]interface{}, error) {
	// strings.Fields splits on every kind of whitespace, so a keyword cannot
	// slip past the check by being separated with a newline or tab instead of
	// a plain space.
	for _, token := range strings.Fields(strings.ToUpper(query.Sql)) {
		if _, banned := BannedOp[token]; banned {
			return nil, fmt.Errorf("operation %s is forbidden, only read operations are allowed, please check your SQL", token)
		}
	}

	return ExecQuery(ctx, db, query.Sql)
}
// QueryTimeseries runs the statement via Query and turns the resulting rows
// into time series with FormatMetricValues, forwarding the optional
// ignoreDefault flag unchanged.
func QueryTimeseries(ctx context.Context, db *gorm.DB, query *QueryParam, ignoreDefault ...bool) ([]types.MetricValues, error) {
	records, queryErr := Query(ctx, db, query)
	if queryErr != nil {
		return nil, queryErr
	}

	series := FormatMetricValues(query.Keys, records, ignoreDefault...)
	return series, nil
}
// FormatMetricValues converts raw result rows into time series.
//
// Each column is assigned a role from keys: names listed (space separated) in
// ValueKey are metric values, names in LabelKey are labels, and TimeKey (or,
// when it is empty, "__time__" / "time") is the timestamp. Every value column
// of a row produces one sample for the series identified by the row's labels
// plus a "__name__" label holding the value column's name. Samples of the same
// series are collected together and sorted by timestamp.
//
// When the first ignoreDefault argument is true, columns without a role are
// never turned into labels; otherwise they become labels only if LabelKey is
// empty. The order of the returned series follows map iteration and is
// therefore not fixed.
func FormatMetricValues(keys types.Keys, rows []map[string]interface{}, ignoreDefault ...bool) []types.MetricValues {
ignore := false
if len(ignoreDefault) > 0 {
ignore = ignoreDefault[0]
}
// keyMap maps a column name to its role: "value", "label" or "time".
keyMap := make(map[string]string)
for _, valueMetric := range strings.Split(keys.ValueKey, " ") {
keyMap[valueMetric] = "value"
}
for _, labelMetric := range strings.Split(keys.LabelKey, " ") {
keyMap[labelMetric] = "label"
}
if keys.TimeKey == "" {
// By default both __time__ and time are accepted as the time column.
// Users can write "AS __time__" to avoid clashing with an existing time column in the table.
keyMap["__time__"] = "time"
keyMap["time"] = "time"
} else {
keyMap[keys.TimeKey] = "time"
}
var dataResps []types.MetricValues
// dataMap groups samples by the hash of their label set.
dataMap := make(map[string]*types.MetricValues)
for _, row := range rows {
labels := make(map[string]string)
metricValue := make(map[string]float64)
metricTs := make(map[string]float64)
// Process each column based on its designated role (value, label, time)
for k, v := range row {
switch keyMap[k] {
case "value":
val, err := ParseFloat64Value(v)
if err != nil {
// Values that cannot be converted to float64 are skipped.
continue
}
metricValue[k] = val
case "label":
labels[k] = fmt.Sprintf("%v", v)
case "time":
ts, err := ParseTime(v, keys.TimeFormat)
if err != nil {
// Unparseable timestamps are skipped; the sample then falls back to the current time below.
continue
}
metricTs[k] = float64(ts.Unix())
default:
// Default to labels for any unrecognized columns
if !ignore && keys.LabelKey == "" {
// Remaining columns are used as labels only when labelKey is empty.
labels[k] = fmt.Sprintf("%v", v)
}
}
}
// Compile and store the metric values
for metricName, value := range metricValue {
// NaN cannot be encoded by json.Marshal(); the API would return an error.
if math.IsNaN(value) {
continue
}
metrics := make(model.Metric)
var labelsStr []string
for k1, v1 := range labels {
metrics[model.LabelName(k1)] = model.LabelValue(v1)
labelsStr = append(labelsStr, fmt.Sprintf("%s=%s", k1, v1))
}
metrics["__name__"] = model.LabelValue(metricName)
labelsStr = append(labelsStr, fmt.Sprintf("__name__=%s", metricName))
// Hash the labels to use as a key
sort.Strings(labelsStr)
labelsStrHash := fmt.Sprintf("%x", md5.Sum([]byte(strings.Join(labelsStr, ","))))
// Append new values to the existing metric, if present
var ts float64
var exists bool
if keys.TimeKey == "" {
// No timeKey configured: look up by priority, __time__ before time.
ts, exists = metricTs["__time__"]
if !exists {
ts, exists = metricTs["time"]
}
} else {
// A timeKey was configured by the user: use it.
ts, exists = metricTs[keys.TimeKey]
}
if !exists {
// Default to current time if not specified
// In most cases offset is zero.
// For recording rules that are evaluated with a delay, the timestamp needs
// to be shifted so that it lines up with the computed value.
ts = float64(time.Now().Unix()) - float64(keys.Offset)
}
valuePair := []float64{ts, value}
if existing, ok := dataMap[labelsStrHash]; ok {
existing.Values = append(existing.Values, valuePair)
} else {
dataResp := types.MetricValues{
Metric: metrics,
Values: [][]float64{valuePair},
}
dataMap[labelsStrHash] = &dataResp
}
}
}
// Convert the map to a slice for the response
for _, v := range dataMap {
sort.Slice(v.Values, func(i, j int) bool { return v.Values[i][0] < v.Values[j][0] }) // Sort by timestamp
dataResps = append(dataResps, *v)
}
return dataResps
}
// ParseFloat64Value converts a dynamically typed value to float64.
//
// Supported inputs are floats, signed and unsigned integers, strings and byte
// slices holding a decimal number, json.Number, and non-nil pointers to any of
// these. Everything else (including nil and nil pointers) yields an error that
// names the offending type; malformed numeric text yields the strconv error.
func ParseFloat64Value(val interface{}) (float64, error) {
	rv := reflect.ValueOf(val)

	switch rv.Kind() {
	case reflect.String:
		return strconv.ParseFloat(rv.String(), 64)

	case reflect.Float32, reflect.Float64:
		return rv.Float(), nil

	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return float64(rv.Int()), nil

	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return float64(rv.Uint()), nil

	case reflect.Slice:
		// Only byte slices are understood; they are parsed as text.
		if rv.Type().Elem().Kind() != reflect.Uint8 {
			break
		}
		return strconv.ParseFloat(string(rv.Bytes()), 64)

	case reflect.Ptr:
		if rv.IsNil() {
			break
		}
		return ParseFloat64Value(rv.Elem().Interface())

	case reflect.Interface:
		return ParseFloat64Value(rv.Interface())

	case reflect.Struct:
		if number, isNumber := val.(json.Number); isNumber {
			return number.Float64()
		}
	}

	return 0, fmt.Errorf("cannot convert type %T to float64", val)
}
// ParseTime converts a dynamically typed value to time.Time.
//
// Accepted inputs:
//   - strings and byte slices, parsed by parseTimeFromString using format
//     (or a list of well-known layouts when format is empty);
//   - signed/unsigned integers and floats of any width, interpreted as Unix
//     seconds (the fractional part of a float is dropped);
//   - time.Time values;
//   - non-nil pointers to any of the above.
//
// Any other value, including nil and nil pointers, yields an error.
func ParseTime(val interface{}, format string) (time.Time, error) {
	v := reflect.ValueOf(val)
	switch v.Kind() {
	case reflect.String:
		return parseTimeFromString(v.String(), format)
	case reflect.Slice:
		if v.Type().Elem().Kind() == reflect.Uint8 {
			return parseTimeFromString(string(v.Bytes()), format)
		}
	// Drivers may deliver narrower or unsigned numeric types depending on the
	// column type, so every integer and float width is accepted rather than
	// only int/int64/float64.
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return time.Unix(v.Int(), 0), nil
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return time.Unix(int64(v.Uint()), 0), nil
	case reflect.Float32, reflect.Float64:
		return time.Unix(int64(v.Float()), 0), nil
	case reflect.Interface:
		return ParseTime(v.Interface(), format)
	case reflect.Ptr:
		if !v.IsNil() {
			return ParseTime(v.Elem().Interface(), format)
		}
	case reflect.Struct:
		if t, ok := val.(time.Time); ok {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("invalid time value type: %v", val)
}

// parseTimeFromString parses a textual timestamp.
//
// With a non-empty format only that layout is tried. Otherwise the following
// are tried in order: RFC3339, "2006-01-02 15:04:05", the same with
// microseconds, RFC3339Nano, an integer Unix timestamp in seconds, and a
// floating-point Unix timestamp in seconds (fraction dropped).
func parseTimeFromString(str, format string) (time.Time, error) {
	// If a custom time format is provided, use it exclusively.
	if format != "" {
		parsedTime, err := time.Parse(format, str)
		if err != nil {
			return time.Time{}, fmt.Errorf("failed to parse time '%s' with format '%s': %w", str, format, err)
		}
		return parsedTime, nil
	}

	layouts := []string{
		time.RFC3339,
		time.DateTime,
		"2006-01-02 15:04:05.000000",
		time.RFC3339Nano,
	}
	for _, layout := range layouts {
		if parsedTime, err := time.Parse(layout, str); err == nil {
			return parsedTime, nil
		}
	}

	// Fall back to numeric Unix timestamps.
	if timestamp, err := strconv.ParseInt(str, 10, 64); err == nil {
		return time.Unix(timestamp, 0), nil
	}
	if timestamp, err := strconv.ParseFloat(str, 64); err == nil {
		return time.Unix(int64(timestamp), 0), nil
	}

	return time.Time{}, fmt.Errorf("failed to parse time '%s'", str)
}
================================================
FILE: dskit/sqlbase/timeseries_test.go
================================================
// @Author: Ciusyan 5/17/24
package sqlbase
import (
"encoding/json"
"testing"
"time"
"github.com/ccfos/nightingale/v6/dskit/types"
)
// TestFormatMetricValues feeds several row sets through FormatMetricValues and
// logs the produced series.
// NOTE(review): the want field is never populated or compared, so this test
// only exercises the code path and cannot fail on wrong output.
func TestFormatMetricValues(t *testing.T) {
tests := []struct {
name string
keys types.Keys
rows []map[string]interface{}
want []types.MetricValues
}{
{
name: "cases1",
keys: types.Keys{
ValueKey: "grade a_grade",
LabelKey: "id student_name",
TimeKey: "update_time",
TimeFormat: "2006-01-02 15:04:05",
},
rows: []map[string]interface{}{
{
"id": "10007",
"grade": 20003,
"student_name": "邵子韬",
"a_grade": 69,
"update_time": "2024-05-14 10:00:00",
},
{
"id": "10007",
"grade": 20003,
"student_name": "邵子韬",
"a_grade": 69,
"update_time": "2024-05-14 10:05:00",
},
{
"id": "10007",
"grade": 20003,
"student_name": "邵子韬",
"a_grade": 69,
"update_time": "2024-05-14 10:10:00",
},
{
"id": "10008",
"grade": 20004,
"student_name": "Ciusyan",
"a_grade": 100,
"update_time": "2024-05-14 12:00:00",
},
},
},
{
name: "test __time__ priority over time",
keys: types.Keys{
ValueKey: "value",
LabelKey: "host",
},
rows: []map[string]interface{}{
{
"host": "server1",
"value": 100,
"time": int64(1715642100), // this one should be ignored
"__time__": int64(1715642135), // this one should be used
},
},
},
{
name: "test fallback to time when __time__ not exists",
keys: types.Keys{
ValueKey: "value",
LabelKey: "host",
},
rows: []map[string]interface{}{
{
"host": "server2",
"value": 200,
"time": int64(1715642200), // this one should be used
},
},
},
{
name: "test __time__ alone without time field",
keys: types.Keys{
ValueKey: "value",
LabelKey: "host",
},
rows: []map[string]interface{}{
{
"host": "server3",
"value": 300,
"__time__": int64(1715642300), // this one should be used
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := FormatMetricValues(tt.keys, tt.rows)
for _, g := range got {
t.Log(g)
}
})
}
}
// TestParseFloat64Value checks ParseFloat64Value against every supported input
// kind plus two inputs that must be rejected.
func TestParseFloat64Value(t *testing.T) {
	floatPtr := func(f float64) *float64 { return &f }

	type testCase struct {
		name    string
		input   interface{}
		want    float64
		wantErr bool
	}
	cases := []testCase{
		{name: "float64", input: 1.23, want: 1.23, wantErr: false},
		{name: "float32", input: float32(1.23), want: float64(float32(1.23)), wantErr: false},
		{name: "int", input: 123, want: 123, wantErr: false},
		{name: "int64", input: int64(123), want: 123, wantErr: false},
		{name: "uint", input: uint(123), want: 123, wantErr: false},
		{name: "uint64", input: uint64(123), want: 123, wantErr: false},
		{name: "string", input: "1.23", want: 1.23, wantErr: false},
		{name: "[]byte", input: []byte("1.23"), want: 1.23, wantErr: false},
		{name: "json.Number", input: json.Number("1.23"), want: 1.23, wantErr: false},
		{name: "interface", input: interface{}(1.23), want: 1.23, wantErr: false},
		{name: "pointer", input: floatPtr(1.23), want: 1.23, wantErr: false},
		{name: "invalid string", input: "abc", want: 0, wantErr: true},
		{name: "invalid type", input: struct{}{}, want: 0, wantErr: true},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, err := ParseFloat64Value(tc.input)
			gotErr := err != nil
			if gotErr != tc.wantErr {
				t.Fatalf("parseFloat64Value() error = %v, wantErr %v", err, tc.wantErr)
			}
			if got != tc.want {
				t.Errorf("parseFloat64Value() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestParseTime checks ParseTime against textual, numeric, pointer and invalid
// inputs, with and without a custom layout.
func TestParseTime(t *testing.T) {
	timePtr := func(v time.Time) *time.Time { return &v }

	type testCase struct {
		name    string
		input   interface{}
		format  string
		want    time.Time
		wantErr bool
	}
	noon := time.Date(2024, 5, 14, 12, 34, 56, 0, time.UTC)
	cases := []testCase{
		{name: "RFC3339", input: "2024-05-14T12:34:56Z", format: "", want: noon, wantErr: false},
		{name: "RFC3339Nano", input: "2024-05-14T12:34:56.789Z", format: "", want: time.Date(2024, 5, 14, 12, 34, 56, 789000000, time.UTC), wantErr: false},
		{name: "Unix timestamp int", input: int64(1715642135), format: "", want: time.Unix(1715642135, 0), wantErr: false},
		{name: "Unix timestamp float64", input: 1715642135.0, format: "", want: time.Unix(int64(1715642135), 0), wantErr: false},
		{name: "custom format", input: "14/05/2024", format: "02/01/2006", want: time.Date(2024, 5, 14, 0, 0, 0, 0, time.UTC), wantErr: false},
		{name: "slice", input: []byte("2024-05-14T12:34:56Z"), format: "", want: noon, wantErr: false},
		{name: "interface", input: interface{}("2024-05-14T12:34:56Z"), format: "", want: noon, wantErr: false},
		{name: "pointer", input: timePtr(noon), format: "", want: noon, wantErr: false},
		{name: "invalid format", input: "14-05-2024", format: "02/01/2006", want: time.Time{}, wantErr: true},
		{name: "invalid type", input: struct{}{}, format: "", want: time.Time{}, wantErr: true},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, err := ParseTime(tc.input, tc.format)
			gotErr := err != nil
			if gotErr != tc.wantErr {
				t.Fatalf("ParseTime() error = %v, wantErr %v", err, tc.wantErr)
			}
			if !got.Equal(tc.want) {
				t.Errorf("ParseTime() = %v, want %v", got, tc.want)
			}
		})
	}
}
================================================
FILE: dskit/tdengine/tdengine.go
================================================
package tdengine
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"net"
"net/http"
"strings"
"time"
"github.com/ccfos/nightingale/v6/dskit/types"
"github.com/ccfos/nightingale/v6/pkg/tlsx"
"github.com/toolkits/pkg/logger"
)
// Tdengine is a TDengine datasource client that talks to the REST SQL
// endpoint (Addr + "/rest/sql"). Call InitCli before issuing queries so the
// HTTP client and request headers are prepared.
// NOTE(review): within the visible code, InitCli does not read Token, Timeout,
// DialTimeout, MaxIdleConnsPerHost, SkipTlsVerify or the embedded TLS config —
// confirm whether they are applied elsewhere.
type Tdengine struct {
// Addr is the base URL of the TDengine REST service.
Addr string `json:"tdengine.addr" mapstructure:"tdengine.addr"`
// Basic, when set, is sent as an HTTP Basic Authorization header.
Basic *TDengineBasicAuth `json:"tdengine.basic" mapstructure:"tdengine.basic"`
Token string `json:"tdengine.token" mapstructure:"tdengine.token"`
Timeout int64 `json:"tdengine.timeout" mapstructure:"tdengine.timeout"`
DialTimeout int64 `json:"tdengine.dial_timeout" mapstructure:"tdengine.dial_timeout"`
MaxIdleConnsPerHost int `json:"tdengine.max_idle_conns_per_host" mapstructure:"tdengine.max_idle_conns_per_host"`
// Headers are extra request headers; see InitCli for the filtering applied.
Headers map[string]string `json:"tdengine.headers" mapstructure:"tdengine.headers"`
SkipTlsVerify bool `json:"tdengine.skip_tls_verify" mapstructure:"tdengine.skip_tls_verify"`
tlsx.ClientConfig
// header holds the prepared request headers built by InitCli.
header map[string][]string `json:"-"`
// client is the HTTP client built by InitCli.
client *http.Client `json:"-"`
}
// TDengineBasicAuth carries the credentials for HTTP Basic authentication.
// InitCli encodes User and Password as "user:password" in base64.
// NOTE(review): IsEncrypt is not consulted by the visible code — confirm where
// it is handled.
type TDengineBasicAuth struct {
User string `json:"tdengine.user" mapstructure:"tdengine.user"`
Password string `json:"tdengine.password" mapstructure:"tdengine.password"`
IsEncrypt bool `json:"tdengine.is_encrypt" mapstructure:"tdengine.is_encrypt"`
}
// APIResponse is the JSON body decoded from the TDengine REST SQL endpoint.
type APIResponse struct {
Code int `json:"code"`
// ColumnMeta describes the result columns; DescribeTable reads element 0 as
// the column name and element 1 as its type (numeric code or string).
ColumnMeta [][]interface{} `json:"column_meta"`
// Data holds the result rows, one slice of cell values per row.
Data [][]interface{} `json:"data"`
Rows int `json:"rows"`
}
// QueryParam names a database and a table.
// NOTE(review): presumably the typed form of the database/table pair that
// DescribeTable currently receives as a map[string]string — confirm.
type QueryParam struct {
Database string `json:"database"`
Table string `json:"table"`
}
// InitCli prepares the HTTP client and the request headers used by
// QueryTable.
//
// The transport uses the environment proxy settings, a 30s dial timeout and
// keep-alive, a 90s idle connection timeout, a 10s TLS handshake timeout, and
// has compression disabled. Headers always contain "Connection: keep-alive";
// entries from tc.Headers are copied only when their value contains exactly
// one ':' (NOTE(review): this filter looks unusual for a name→value map —
// confirm the intended header format). When tc.Basic is set an HTTP Basic
// Authorization header is added last.
func (tc *Tdengine) InitCli() {
	dialer := &net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
	}
	transport := &http.Transport{
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		IdleConnTimeout:       90 * time.Second,
		TLSHandshakeTimeout:   10 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
		DisableCompression:    true,
	}
	tc.client = &http.Client{Transport: transport}

	headers := map[string][]string{
		"Connection": {"keep-alive"},
	}
	for name, value := range tc.Headers {
		if strings.Count(value, ":") == 1 {
			headers[name] = []string{value}
		}
	}
	if auth := tc.Basic; auth != nil {
		credentials := base64.StdEncoding.EncodeToString([]byte(auth.User + ":" + auth.Password))
		headers["Authorization"] = []string{fmt.Sprintf("Basic %s", credentials)}
	}
	tc.header = headers
}
// QueryTable posts the SQL text to the TDengine REST endpoint
// (tc.Addr + "/rest/sql") and decodes the JSON reply.
//
// The headers prepared by InitCli are attached to the request. A non-200
// status is reported as an error. The response body is limited to 10MB; a
// larger body yields a dedicated "exceeds 10MB limit" error.
func (tc *Tdengine) QueryTable(query string) (APIResponse, error) {
	var result APIResponse

	req, err := http.NewRequest(http.MethodPost, tc.Addr+"/rest/sql", strings.NewReader(query))
	if err != nil {
		return result, err
	}
	for name, values := range tc.header {
		req.Header[name] = values
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")

	resp, err := tc.client.Do(req)
	if err != nil {
		return result, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return result, fmt.Errorf("HTTP error, status: %s", resp.Status)
	}

	// Cap the amount of response data that will be read at 10MB.
	const maxBodyBytes = 10 * 1024 * 1024
	body := http.MaxBytesReader(nil, resp.Body, maxBodyBytes)

	if decodeErr := json.NewDecoder(body).Decode(&result); decodeErr != nil {
		if strings.Contains(decodeErr.Error(), "http: request body too large") {
			return result, fmt.Errorf("response body exceeds 10MB limit")
		}
		return result, decodeErr
	}
	return result, nil
}
// ShowDatabases runs "show databases" and returns the first cell of every
// result row as a database name. Rows that are empty or whose first cell is
// not a string are skipped. On query failure the (empty) list is returned
// together with the error.
func (tc *Tdengine) ShowDatabases(context.Context) ([]string, error) {
	databases := make([]string, 0)

	data, err := tc.QueryTable("show databases")
	if err != nil {
		return databases, err
	}

	for _, row := range data.Data {
		// Guard index and type: a malformed row must not panic.
		if len(row) == 0 {
			continue
		}
		name, ok := row[0].(string)
		if !ok {
			continue
		}
		databases = append(databases, name)
	}
	return databases, nil
}
// ShowTables runs "show <database>" and returns the first cell of every result
// row. Rows that are empty or whose first cell is not a string are skipped. On
// query failure the (empty) list is returned together with the error.
func (tc *Tdengine) ShowTables(ctx context.Context, database string) ([]string, error) {
	tables := make([]string, 0)

	sql := fmt.Sprintf("show %s", database)
	data, err := tc.QueryTable(sql)
	if err != nil {
		return tables, err
	}

	for _, row := range data.Data {
		// Guard index and type: a malformed row must not panic.
		if len(row) == 0 {
			continue
		}
		name, ok := row[0].(string)
		if !ok {
			continue
		}
		tables = append(tables, name)
	}
	return tables, nil
}
// DescribeTable returns the columns of a table by selecting one row from it
// and reading the column metadata of the response.
//
// query must be a map[string]string with the keys "database" and "table";
// any other type yields an "invalid query" error. Column types are reported
// either as numeric codes (mapped to type names below) or directly as strings.
// Metadata entries with fewer than two elements or a non-string column name
// are skipped.
func (tc *Tdengine) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) {
	var columns []*types.ColumnProperty

	queryMap, ok := query.(map[string]string)
	if !ok {
		return nil, fmt.Errorf("invalid query")
	}

	sql := fmt.Sprintf("select * from %s.%s limit 1", queryMap["database"], queryMap["table"])
	data, err := tc.QueryTable(sql)
	if err != nil {
		return columns, err
	}

	for _, row := range data.ColumnMeta {
		// Guard index and type: a malformed metadata entry must not panic.
		if len(row) < 2 {
			continue
		}
		field, ok := row[0].(string)
		if !ok {
			continue
		}

		var colType string
		switch t := row[1].(type) {
		case float64:
			// v2: the type is a numeric code.
			switch int(t) {
			case 1:
				colType = "BOOL"
			case 2:
				colType = "TINYINT"
			case 3:
				colType = "SMALLINT"
			case 4:
				colType = "INT"
			case 5:
				colType = "BIGINT"
			case 6:
				colType = "FLOAT"
			case 7:
				colType = "DOUBLE"
			case 8:
				colType = "BINARY"
			case 9:
				colType = "TIMESTAMP"
			case 10:
				colType = "NCHAR"
			default:
				colType = "UNKNOWN"
			}
		case string:
			// v3: the type is already a string and is used as-is.
			colType = t
		default:
			logger.Warningf("unexpected column type format: %v", row[1])
			colType = "UNKNOWN"
		}

		columns = append(columns, &types.ColumnProperty{
			Field: field,
			Type:  colType,
		})
	}
	return columns, nil
}
================================================
FILE: dskit/types/timeseries.go
================================================
package types
import (
"bytes"
"fmt"
"strconv"
"github.com/prometheus/common/model"
)
// MetricValues is one time series: a label set plus its samples.
// Each entry of Values is a [timestamp, value] pair (see
// sqlbase.FormatMetricValues, which stores Unix seconds as the timestamp).
type MetricValues struct {
Metric model.Metric `json:"metric"`
Values [][]float64 `json:"values"`
}
// HistogramValues carries a total count plus a list of float rows, serialized
// as "total" and "values".
// NOTE(review): the layout of each Values row is not visible here — confirm
// against the producers.
type HistogramValues struct {
Total int64 `json:"total"`
Values [][]float64 `json:"values"`
}
// AggregateValues holds instant (single point in time) values: a label set
// and a map of named float values.
type AggregateValues struct {
Labels map[string]string `json:"labels"`
Values map[string]float64 `json:"values"`
}
// String renders the series for debugging: the metric printed with %+v,
// followed by every row of Values as a bracketed, comma-separated list of
// floats in plain decimal notation.
func (m *MetricValues) String() string {
	var out bytes.Buffer
	fmt.Fprintf(&out, "Metric: %+v ", m.Metric)
	out.WriteString("Values: ")
	for _, point := range m.Values {
		out.WriteString(" [")
		for idx := range point {
			if idx != 0 {
				out.WriteString(", ")
			}
			out.WriteString(strconv.FormatFloat(point[idx], 'f', -1, 64))
		}
		out.WriteString("] ")
	}
	return out.String()
}
// Keys names the fields of a query result that hold values, labels and the
// timestamp.
type Keys struct {
ValueKey string `json:"valueKey" mapstructure:"valueKey"` // multiple keys are separated by spaces
LabelKey string `json:"labelKey" mapstructure:"labelKey"` // multiple keys are separated by spaces
TimeKey string `json:"timeKey" mapstructure:"timeKey"`
TimeFormat string `json:"timeFormat" mapstructure:"timeFormat"` // not used anymore
Offset int `json:"offset" mapstructure:"offset"`
}
================================================
FILE: dskit/types/types.go
================================================
package types
// LogExtractValueType* are the string names of the value types a log field
// can be extracted as.
const (
LogExtractValueTypeLong = "long"
LogExtractValueTypeFloat = "float"
LogExtractValueTypeText = "text"
LogExtractValueTypeDate = "date"
LogExtractValueTypeBool = "bool"
LogExtractValueTypeObject = "object"
LogExtractValueTypeArray = "array"
LogExtractValueTypeJSON = "json"
)
// ColumnProperty describes one column (field) of a table: its name and type
// information.
type ColumnProperty struct {
Field string `json:"field"`
Type string `json:"type"`
Type2 string `json:"type2,omitempty"` // field_property.Type
Indexable bool `json:"indexable"` // whether the column can be indexed
}
================================================
FILE: dskit/victorialogs/victorialogs.go
================================================
package victorialogs
import (
"bufio"
"context"
"crypto/tls"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"strings"
"time"
)
// VictoriaLogs is the configuration and HTTP client of a VictoriaLogs
// datasource. Field names in JSON/mapstructure use the "victorialogs." prefix.
type VictoriaLogs struct {
// VictorialogsAddr is the base URL; request paths are appended to it.
VictorialogsAddr string `json:"victorialogs.addr" mapstructure:"victorialogs.addr"`
// VictorialogsBasic holds basic-auth credentials; they are sent only when
// the user name is non-empty.
VictorialogsBasic struct {
VictorialogsUser string `json:"victorialogs.user" mapstructure:"victorialogs.user"`
VictorialogsPass string `json:"victorialogs.password" mapstructure:"victorialogs.password"`
IsEncrypt bool `json:"victorialogs.is_encrypt" mapstructure:"victorialogs.is_encrypt"`
} `json:"victorialogs.basic" mapstructure:"victorialogs.basic"`
// VictorialogsTls controls TLS certificate verification of the client.
VictorialogsTls struct {
SkipTlsVerify bool `json:"victorialogs.tls.skip_tls_verify" mapstructure:"victorialogs.tls.skip_tls_verify"`
} `json:"victorialogs.tls" mapstructure:"victorialogs.tls"`
// Headers are set on every outgoing request.
Headers map[string]string `json:"victorialogs.headers" mapstructure:"victorialogs.headers"`
// Timeout is the client timeout; InitHTTPClient uses 60s when it is 0.
Timeout int64 `json:"victorialogs.timeout" mapstructure:"victorialogs.timeout"` // millis
ClusterName string `json:"victorialogs.cluster_name" mapstructure:"victorialogs.cluster_name"`
// MaxQueryRows is the "limit" sent by Query when the caller passes limit <= 0.
MaxQueryRows int `json:"victorialogs.max_query_rows" mapstructure:"victorialogs.max_query_rows"`
// HTTPClient is created by InitHTTPClient and is never serialized.
HTTPClient *http.Client `json:"-" mapstructure:"-"`
}
// LogEntry is a single log entry, decoded from one JSON object into a generic
// field map.
type LogEntry map[string]interface{}
// PrometheusResponse is a response in the Prometheus format: a status, the
// data section and an optional error message.
type PrometheusResponse struct {
Status string `json:"status"`
Data PrometheusData `json:"data"`
Error string `json:"error,omitempty"`
}
// PrometheusData is the data section of a Prometheus-format response.
type PrometheusData struct {
ResultType string `json:"resultType"`
Result []PrometheusItem `json:"result"`
}
// PrometheusItem is one result item of a Prometheus-format response: a label
// set with either a single sample (Value) or a list of samples (Values).
type PrometheusItem struct {
Metric map[string]string `json:"metric"`
Value []interface{} `json:"value,omitempty"` // [timestamp, value]
Values [][]interface{} `json:"values,omitempty"` // [[timestamp, value], ...]
}
// HitsResult is the response of a hits query; each entry of Hits carries a
// total count.
type HitsResult struct {
Hits []struct {
Total int64 `json:"total"`
}
}
// InitHTTPClient creates the HTTP client used for all requests and stores it
// in vl.HTTPClient.
//
// The client timeout is vl.Timeout in milliseconds, or 60 seconds when that
// evaluates to zero. TLS certificate verification is skipped when
// vl.VictorialogsTls.SkipTlsVerify is set. It always returns nil.
func (vl *VictoriaLogs) InitHTTPClient() error {
	requestTimeout := time.Duration(vl.Timeout) * time.Millisecond
	if requestTimeout == 0 {
		requestTimeout = 60 * time.Second
	}

	tlsConf := &tls.Config{InsecureSkipVerify: vl.VictorialogsTls.SkipTlsVerify}

	vl.HTTPClient = &http.Client{
		Timeout: requestTimeout,
		Transport: &http.Transport{
			TLSClientConfig:     tlsConf,
			MaxIdleConns:        100,
			MaxIdleConnsPerHost: 10,
			IdleConnTimeout:     90 * time.Second,
		},
	}
	return nil
}
// Query runs a log query and returns the matching entries.
// POST /select/logsql/query with form fields query, start, end, limit.
//
// start and end are sent only when positive. When limit is not positive the
// configured vl.MaxQueryRows is sent as the limit instead.
//
// The response is NDJSON (one JSON object per line); blank lines are skipped.
// An error is returned when the request fails, the status is not 200, or a
// line cannot be decoded.
func (vl *VictoriaLogs) Query(ctx context.Context, query string, start, end int64, limit int) ([]LogEntry, error) {
	params := url.Values{}
	params.Set("query", query)
	if start > 0 {
		params.Set("start", strconv.FormatInt(start, 10))
	}
	if end > 0 {
		params.Set("end", strconv.FormatInt(end, 10))
	}
	if limit <= 0 {
		limit = vl.MaxQueryRows
	}
	params.Set("limit", strconv.Itoa(limit))

	endpoint := fmt.Sprintf("%s/select/logsql/query", vl.VictorialogsAddr)
	resp, err := vl.doRequest(ctx, "POST", endpoint, params)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response body failed: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("query failed: status=%d, body=%s", resp.StatusCode, string(body))
	}

	var logs []LogEntry
	scanner := bufio.NewScanner(strings.NewReader(string(body)))
	// The default Scanner rejects lines longer than bufio.MaxScanTokenSize
	// (64 KiB), which a single log entry can exceed. The body is already in
	// memory, so allow one line to be as long as the whole body.
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), len(body)+1)
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(line) == 0 {
			continue
		}
		var entry LogEntry
		if err := json.Unmarshal(line, &entry); err != nil {
			return nil, fmt.Errorf("decode log entry failed: %w, line=%s", err, line)
		}
		logs = append(logs, entry)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("scan response failed: %w", err)
	}
	return logs, nil
}
// StatsQuery runs a stats query evaluated at a single point in time.
// POST /select/logsql/stats_query with form fields query and time.
//
// time is sent only when positive. An error is returned when the request
// fails, the HTTP status is not 200, the body is not valid JSON, or the
// decoded status is not "success".
func (vl *VictoriaLogs) StatsQuery(ctx context.Context, query string, time int64) (*PrometheusResponse, error) {
	form := url.Values{"query": []string{query}}
	if time > 0 {
		form.Set("time", strconv.FormatInt(time, 10))
	}

	resp, err := vl.doRequest(ctx, "POST", vl.VictorialogsAddr+"/select/logsql/stats_query", form)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response body failed: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("stats query failed: status=%d, body=%s", resp.StatusCode, raw)
	}

	out := new(PrometheusResponse)
	if err := json.Unmarshal(raw, out); err != nil {
		return nil, fmt.Errorf("decode response failed: %w, body=%s", err, raw)
	}
	if out.Status != "success" {
		return nil, fmt.Errorf("query failed: %s", out.Error)
	}
	return out, nil
}
// StatsQueryRange runs a stats query over a time range.
// POST /select/logsql/stats_query_range with form fields query, start, end, step.
//
// start and end are sent only when positive, step only when non-empty. An
// error is returned when the request fails, the HTTP status is not 200, the
// body is not valid JSON, or the decoded status is not "success".
func (vl *VictoriaLogs) StatsQueryRange(ctx context.Context, query string, start, end int64, step string) (*PrometheusResponse, error) {
	form := url.Values{"query": []string{query}}
	if start > 0 {
		form.Set("start", strconv.FormatInt(start, 10))
	}
	if end > 0 {
		form.Set("end", strconv.FormatInt(end, 10))
	}
	if len(step) != 0 {
		form.Set("step", step)
	}

	resp, err := vl.doRequest(ctx, "POST", vl.VictorialogsAddr+"/select/logsql/stats_query_range", form)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response body failed: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("stats query range failed: status=%d, body=%s", resp.StatusCode, raw)
	}

	out := new(PrometheusResponse)
	if err := json.Unmarshal(raw, out); err != nil {
		return nil, fmt.Errorf("decode response failed: %w, body=%s", err, raw)
	}
	if out.Status != "success" {
		return nil, fmt.Errorf("query failed: %s", out.Error)
	}
	return out, nil
}
// HitsLogs returns the number of log entries matched by query, used to
// compute a total.
// POST /select/logsql/hits with form fields query, start, end.
//
// start and end are sent only when positive. The total of the first entry in
// the response's hits list is returned, or 0 when the list is empty.
func (vl *VictoriaLogs) HitsLogs(ctx context.Context, query string, start, end int64) (int64, error) {
	form := url.Values{"query": []string{query}}
	if start > 0 {
		form.Set("start", strconv.FormatInt(start, 10))
	}
	if end > 0 {
		form.Set("end", strconv.FormatInt(end, 10))
	}

	resp, err := vl.doRequest(ctx, "POST", vl.VictorialogsAddr+"/select/logsql/hits", form)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return 0, fmt.Errorf("read response body failed: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("hits query failed: status=%d, body=%s", resp.StatusCode, raw)
	}

	var parsed HitsResult
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return 0, fmt.Errorf("decode response failed: %w, body=%s", err, raw)
	}
	if len(parsed.Hits) > 0 {
		return parsed.Hits[0].Total, nil
	}
	return 0, nil
}
// doRequest builds and sends one HTTP request.
//
// For "GET" the params are appended to endpoint as a query string; for any
// other method they are sent as an application/x-www-form-urlencoded body.
// Basic auth is added when a user name is configured, then every entry of
// vl.Headers is set on the request.
//
// The caller owns the returned response and must close its body. An error is
// returned when the client has not been created yet (see InitHTTPClient), when
// the request cannot be built, or when sending it fails.
func (vl *VictoriaLogs) doRequest(ctx context.Context, method, endpoint string, params url.Values) (*http.Response, error) {
	// Without this guard a missing InitHTTPClient call ends in a nil-pointer
	// panic at Do.
	if vl.HTTPClient == nil {
		return nil, fmt.Errorf("http client is not initialized, call InitHTTPClient first")
	}

	var req *http.Request
	var err error
	if method == "GET" {
		fullURL := endpoint
		if len(params) > 0 {
			fullURL = fmt.Sprintf("%s?%s", endpoint, params.Encode())
		}
		req, err = http.NewRequestWithContext(ctx, method, fullURL, nil)
	} else {
		// POST with form data
		req, err = http.NewRequestWithContext(ctx, method, endpoint, strings.NewReader(params.Encode()))
		if err == nil {
			req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
		}
	}
	if err != nil {
		return nil, fmt.Errorf("create request failed: %w", err)
	}

	if vl.VictorialogsBasic.VictorialogsUser != "" {
		req.SetBasicAuth(vl.VictorialogsBasic.VictorialogsUser, vl.VictorialogsBasic.VictorialogsPass)
	}

	// Custom headers are applied last, so they can override the defaults above.
	for k, v := range vl.Headers {
		req.Header.Set(k, v)
	}

	return vl.HTTPClient.Do(req)
}
================================================
FILE: dskit/victorialogs/victorialogs_test.go
================================================
package victorialogs
import (
"context"
"testing"
"time"
)
// v is the fixture shared by all tests in this file; it points at a
// VictoriaLogs address on localhost, so the query tests need a server
// listening there.
var v = VictoriaLogs{
VictorialogsAddr: "http://127.0.0.1:9428",
Headers: make(map[string]string),
Timeout: 10000, // 10 seconds in milliseconds
}
// TestVictoriaLogs_InitHTTPClient checks that InitHTTPClient succeeds and
// leaves a non-nil HTTPClient on the shared fixture.
func TestVictoriaLogs_InitHTTPClient(t *testing.T) {
	err := v.InitHTTPClient()
	if err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	if v.HTTPClient != nil {
		return
	}
	t.Fatal("HTTPClient should not be nil after initialization")
}
// TestVictoriaLogs_Query queries up to 10 entries matching "*" from the last
// hour (nanosecond timestamps) and logs every returned entry.
func TestVictoriaLogs_Query(t *testing.T) {
	if err := v.InitHTTPClient(); err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	// Window: one hour back from now.
	end := time.Now().UnixNano()
	start := end - int64(time.Hour)

	entries, err := v.Query(context.Background(), "*", start, end, 10)
	if err != nil {
		t.Fatalf("Query failed: %v", err)
	}

	t.Logf("Query returned %d log entries", len(entries))
	for idx := range entries {
		t.Logf("Log[%d]: %v", idx, entries[idx])
	}
}
// TestVictoriaLogs_StatsQuery runs a count() stats query at the current time
// and logs the status, result type and every result item.
func TestVictoriaLogs_StatsQuery(t *testing.T) {
	if err := v.InitHTTPClient(); err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	at := time.Now().UnixNano()
	res, err := v.StatsQuery(context.Background(), "* | stats count() as total", at)
	if err != nil {
		t.Fatalf("StatsQuery failed: %v", err)
	}

	t.Logf("StatsQuery result: status=%s, resultType=%s", res.Status, res.Data.ResultType)
	for idx := range res.Data.Result {
		entry := res.Data.Result[idx]
		t.Logf("Result[%d]: metric=%v, value=%v", idx, entry.Metric, entry.Value)
	}
}
// TestVictoriaLogs_StatsQueryRange runs a count() stats query over the last
// hour with a "5m" step and logs how many samples each series has.
func TestVictoriaLogs_StatsQueryRange(t *testing.T) {
	if err := v.InitHTTPClient(); err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	// Window: one hour back from now.
	end := time.Now().UnixNano()
	start := end - int64(time.Hour)

	res, err := v.StatsQueryRange(context.Background(), "* | stats count() as total", start, end, "5m")
	if err != nil {
		t.Fatalf("StatsQueryRange failed: %v", err)
	}

	t.Logf("StatsQueryRange result: status=%s, resultType=%s", res.Status, res.Data.ResultType)
	for idx := range res.Data.Result {
		entry := res.Data.Result[idx]
		t.Logf("Result[%d]: metric=%v, values count=%d", idx, entry.Metric, len(entry.Values))
	}
}
// TestVictoriaLogs_HitsLogs fetches the hit count for "*" over the last hour
// and logs it.
func TestVictoriaLogs_HitsLogs(t *testing.T) {
	if err := v.InitHTTPClient(); err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	// Window: one hour back from now.
	end := time.Now().UnixNano()
	start := end - int64(time.Hour)

	total, err := v.HitsLogs(context.Background(), "*", start, end)
	if err != nil {
		t.Fatalf("HitsLogs failed: %v", err)
	}
	t.Logf("HitsLogs total count: %d", total)
}
// TestVictoriaLogs_QueryWithFilter queries up to 5 entries from the last hour
// using a stream filter and logs how many came back.
func TestVictoriaLogs_QueryWithFilter(t *testing.T) {
	if err := v.InitHTTPClient(); err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	// Window: one hour back from now.
	end := time.Now().UnixNano()
	start := end - int64(time.Hour)

	entries, err := v.Query(context.Background(), "_stream:{app=\"test\"}", start, end, 5)
	if err != nil {
		t.Fatalf("Query with filter failed: %v", err)
	}
	t.Logf("Query with filter returned %d log entries", len(entries))
}
// TestVictoriaLogs_StatsQueryByField runs a count() stats query grouped by
// the "level" field at the current time and logs every result item.
func TestVictoriaLogs_StatsQueryByField(t *testing.T) {
	if err := v.InitHTTPClient(); err != nil {
		t.Fatalf("InitHTTPClient failed: %v", err)
	}

	at := time.Now().UnixNano()
	res, err := v.StatsQuery(context.Background(), "* | stats by (level) count() as cnt", at)
	if err != nil {
		t.Fatalf("StatsQuery by field failed: %v", err)
	}

	t.Logf("StatsQuery by field result: status=%s", res.Status)
	for idx := range res.Data.Result {
		entry := res.Data.Result[idx]
		t.Logf("Result[%d]: metric=%v, value=%v", idx, entry.Metric, entry.Value)
	}
}
================================================
FILE: dumper/dumper.go
================================================
package dumper
import "github.com/gin-gonic/gin"
// package level functions

// ConfigRouter registers the routes of the package-level syncDumper on r.
func ConfigRouter(r *gin.Engine) {
syncDumper.ConfigRouter(r)
}
================================================
FILE: dumper/sync.go
================================================
package dumper
import (
"fmt"
"strings"
"sync"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/time"
)
// SyncRecord is one recorded sync run: a timestamp, a duration printed with
// an "ms" suffix, an item count and a free-form message.
// NOTE(review): the unit of Timestamp is whatever toolkits time.Format
// expects — presumably unix seconds; confirm.
type SyncRecord struct {
Timestamp int64
Mills int64
Count int
Message string
}
// String renders the record on one line in the form
// "timestamp: <formatted>, mills: <n>ms, count: <n>, message: <text>".
func (sr *SyncRecord) String() string {
	return fmt.Sprintf(
		"timestamp: %s, mills: %dms, count: %d, message: %s",
		time.Format(sr.Timestamp),
		sr.Mills,
		sr.Count,
		sr.Message,
	)
}
// SyncRecords keeps the two most recent records of one key: Current is the
// latest one and Last the one before it (nil until a second record arrives).
type SyncRecords struct {
Current *SyncRecord
Last *SyncRecord
}
// SyncDumper stores the latest sync records per key. The embedded RWMutex
// guards records.
type SyncDumper struct {
sync.RWMutex
records map[string]*SyncRecords
}
// NewSyncDumper returns a SyncDumper with an empty, ready-to-use record map.
func NewSyncDumper() *SyncDumper {
	sd := new(SyncDumper)
	sd.records = map[string]*SyncRecords{}
	return sd
}
// syncDumper is the package-level instance behind ConfigRouter and
// PutSyncRecord.
var syncDumper = NewSyncDumper()
// Put stores a new record for key. The first record of a key becomes its
// Current; every later record moves the previous Current to Last and takes
// its place.
func (sd *SyncDumper) Put(key string, timestamp, mills int64, count int, message string) {
	latest := &SyncRecord{Timestamp: timestamp, Mills: mills, Count: count, Message: message}

	sd.Lock()
	defer sd.Unlock()

	if slot, found := sd.records[key]; found {
		slot.Last, slot.Current = slot.Current, latest
		return
	}
	sd.records[key] = &SyncRecords{Current: latest}
}
// Sprint renders every key with its records as text, for example:
//
//	busi_groups:
//	last: timestamp, mills, count
//	curr: timestamp, mills, count
//
// The "last" line is omitted while a key has only one record. Keys are
// emitted in map iteration order, which is not stable between calls.
func (sd *SyncDumper) Sprint() string {
	sd.RLock()
	defer sd.RUnlock()

	var out strings.Builder
	out.WriteString("\n")
	for name, rec := range sd.records {
		out.WriteString(name + ":\n")
		if prev := rec.Last; prev != nil {
			out.WriteString("last: " + prev.String() + "\n")
		}
		out.WriteString("curr: " + rec.Current.String() + "\n\n")
	}
	return out.String()
}
// ConfigRouter registers GET /dumper/sync on r. The endpoint answers with the
// Sprint output for clients whose IP is 127.0.0.1 or ::1 and with
// 403 "forbidden" for everyone else.
//
// NOTE(review): the check relies on gin's ClientIP, which may honor
// forwarding headers depending on the engine's trusted-proxy settings —
// confirm those are configured so the loopback check cannot be spoofed.
func (sd *SyncDumper) ConfigRouter(r *gin.Engine) {
	handler := func(c *gin.Context) {
		switch c.ClientIP() {
		case "127.0.0.1", "::1":
			c.String(200, sd.Sprint())
		default:
			c.String(403, "forbidden")
		}
	}
	r.GET("/dumper/sync", handler)
}
// PutSyncRecord stores a record for key in the package-level syncDumper; see
// SyncDumper.Put.
func PutSyncRecord(key string, timestamp, mills int64, count int, message string) {
syncDumper.Put(key, timestamp, mills, count, message)
}
================================================
FILE: etc/config.toml
================================================
[Global]
RunMode = "release"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = true
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[HTTP.ShowCaptcha]
Enable = false
[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]
[HTTP.TokenAuth]
Enable = true
[HTTP.RSA]
# open RSA
OpenRSA = false
[DB]
# mysql postgres sqlite
DBType = "sqlite"
# postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
# mysql: DSN="root:1234@tcp(localhost:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local"
DSN = "n9e.db"
# enable debug mode or not
Debug = false
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "127.0.0.1:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel miniredis
RedisType = "miniredis"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""
[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"
# [Alert.Alerting]
# NotifyConcurrency = 10
[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"
[Center.AnonymousAccess]
PromQuerier = true
AlertDetail = true
[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true
# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"
# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000
[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://127.0.0.1:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# extra HTTP headers; the timeout settings below use unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Pushgw.Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
# [[Pushgw.KafkaWriters]]
# Brokers = ["127.0.0.1:9092"]
# Topic = "n9e-metrics"
# [Pushgw.KafkaWriters.SASL]
# Enable = true
# User = "admin"
# Password = "admin"
# Mechanism = "PLAIN"
# Version = 1
# Handshake = true
# AuthIdentity = ""
[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"
================================================
FILE: etc/edge/edge.toml
================================================
[Global]
RunMode = "release"
[CenterApi]
Addrs = ["http://127.0.0.1:17000"]
BasicAuthUser = "user001"
BasicAuthPass = "ccc26da7b9aba533cbb263a36c07dcc5"
# unit: ms
Timeout = 9000
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 19000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[HTTP.APIForAgent]
Enable = true
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "edge"
# [Alert.Alerting]
# NotifyConcurrency = 10
[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
# # default busigroup key name
# BusiGroupLabelKey = "busigroup"
ForceUseServerTS = true
# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"
# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000
[[Pushgw.Writers]]
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://127.0.0.1:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Pushgw.Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"
[Ibex]
Enable = false
RPCListen = "0.0.0.0:20090"
# n9e-edge cannot directly reuse the redis that n9e relies on at the center.
# It needs to deploy a separate redis in the edge region for n9e-edge to use.
[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "127.0.0.1:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""
================================================
FILE: etc/metrics.yaml
================================================
zh:
ip_conntrack_count: 连接跟踪表条目总数(单位:int, count)
ip_conntrack_max: 连接跟踪表最大容量(单位:int, size)
cpu_usage_idle: CPU空闲率(单位:%)
cpu_usage_active: CPU使用率(单位:%)
cpu_usage_system: CPU内核态时间占比(单位:%)
cpu_usage_user: CPU用户态时间占比(单位:%)
cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%)
cpu_usage_iowait: CPU等待I/O的时间占比(单位:%)
cpu_usage_irq: CPU处理硬中断的时间占比(单位:%)
cpu_usage_softirq: CPU处理软中断的时间占比(单位:%)
cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%)
cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%)
cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%)
disk_free: 硬盘分区剩余量(单位:byte)
disk_used: 硬盘分区使用量(单位:byte)
disk_used_percent: 硬盘分区使用率(单位:%)
disk_total: 硬盘分区总量(单位:byte)
disk_inodes_free: 硬盘分区inode剩余量
disk_inodes_used: 硬盘分区inode使用量
disk_inodes_total: 硬盘分区inode总量
diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型
diskio_merged_reads: 相邻读请求merge读的次数,counter类型
diskio_merged_writes: 相邻写请求merge写的次数,counter类型
diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值
diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒)
diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值
diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值
diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值
kernel_boot_time: 内核启动时间
kernel_context_switches: 内核上下文切换次数
kernel_entropy_avail: linux系统内部的熵池
kernel_interrupts: 内核中断次数
kernel_processes_forked: fork的进程数
mem_active: 活跃使用的内存总数(包括cache和buffer内存)
mem_available: 可用内存大小(bytes)
mem_available_percent: 内存剩余百分比(0~100)
mem_buffered: 用来给文件做缓冲大小
mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache )
mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用
mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和
mem_dirty: 等待被写回到磁盘的内存大小
mem_free: 空闲内存大小(bytes)
mem_high_free: 未被使用的高位内存大小
mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存)
mem_huge_page_size: 每个大页的大小
mem_huge_pages_free: 池中尚未分配的 HugePages 数量
mem_huge_pages_total: 预留HugePages的总个数
mem_inactive: 空闲的内存数(包括free和available的内存)
mem_low_free: 未被使用的低位大小
mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构
mem_mapped: 设备和文件等映射的大小
mem_page_tables: 管理内存分页页面的索引表的大小
mem_shared: 多个进程共享的内存总额
mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗
mem_sreclaimable: 可收回Slab的大小
mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab)
mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口
mem_swap_free: 未被使用交换空间的大小
mem_swap_total: 交换空间的总大小
mem_total: 内存总数
mem_used: 已用内存数
mem_used_percent: 已用内存数百分比(0~100)
mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域
mem_vmalloc_total: 可以vmalloc虚拟内存大小
mem_vmalloc_used: vmalloc已使用的虚拟内存大小
mem_write_back: 正在被写回到磁盘的内存大小
mem_write_back_tmp: FUSE用于临时写回缓冲区的内存
net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数
net_drop_in: 网卡收丢包数量
net_drop_out: 网卡发丢包数量
net_err_in: 网卡收包错误数量
net_err_out: 网卡发包错误数量
net_packets_recv: 网卡收包数量
net_packets_sent: 网卡发包数量
net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数
net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数
netstat_tcp_established: ESTABLISHED状态的网络链接数
netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数
netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数
netstat_tcp_last_ack: LAST_ACK状态的网络链接数
netstat_tcp_listen: LISTEN状态的网络链接数
netstat_tcp_syn_recv: SYN_RECV状态的网络链接数
netstat_tcp_syn_sent: SYN_SENT状态的网络链接数
netstat_tcp_time_wait: TIME_WAIT状态的网络链接数
netstat_udp_socket: UDP状态的网络链接数
netstat_sockets_used: 已使用的所有协议套接字总量
netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量
netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数)
netstat_tcp_tw: TIME_WAIT状态的TCP连接数
netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量
netstat_tcp_mem: TCP套接字内存Page使用量
netstat_udp_inuse: 在使用的UDP套接字数量
netstat_udp_mem: UDP套接字内存Page使用量
netstat_udplite_inuse: 正在使用的 udp lite 数量
netstat_raw_inuse: 正在使用的 raw socket 数量
netstat_frag_inuse: ip fragment 数量
netstat_frag_memory: ip fragment 已经分配的内存(byte)
#[ping]
ping_percent_packet_loss: ping数据包丢失百分比(%)
ping_result_code: ping返回码('0','1')
net_response_result_code: 网络探测结果,0表示正常,非0表示异常
net_response_response_time: 网络探测时延,单位:秒
processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L')
processes_dead: 回收中的进程数('X')
processes_idle: 挂起的空闲进程数('I')
processes_paging: 分页进程数('P')
processes_running: 运行中的进程数('R')
processes_sleeping: 可中断进程数('S')
processes_stopped: 暂停状态进程数('T')
processes_total: 总进程数
processes_total_threads: 总线程数
processes_unknown: 未知状态进程数
processes_zombies: 僵尸态进程数('Z')
swap_used_percent: Swap空间换出数据量
system_load1: 1分钟平均load值
system_load5: 5分钟平均load值
system_load15: 15分钟平均load值
system_load_norm_1: 1分钟平均load值/逻辑CPU个数
system_load_norm_5: 5分钟平均load值/逻辑CPU个数
system_load_norm_15: 15分钟平均load值/逻辑CPU个数
system_n_users: 用户数
system_n_cpus: CPU核数
system_uptime: 系统启动时间
nginx_accepts: 自nginx启动起,与客户端建立过的连接总数
nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和
nginx_handled: 自nginx启动起,处理过的客户端连接总数
nginx_reading: 正在读取HTTP请求头部的连接总数
nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值
nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数
nginx_upstream_check_rise: upstream_check模块对后端的检测次数
nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0
nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接
nginx_writing: 正在向客户端发送响应的连接总数
http_response_content_length: HTTP消息实体的传输长度
http_response_http_response_code: http响应状态码
http_response_response_time: http响应用时
http_response_result_code: url探测结果0为正常否则url无法访问
# [aws cloudwatch rds]
cloudwatch_aws_rds_bin_log_disk_usage_average: rds 磁盘使用平均值
cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds 磁盘使用量最大值
cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低
cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数
cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和
cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值
cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值
cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低
cloudwatch_aws_rds_burst_balance_sample_count: rds 突发平衡样本计数
cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和
cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值
cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值
cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低
cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数
cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和
cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值
cloudwatch_aws_rds_database_connections_maximum: rds 数据库连接数最大值
cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小
cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数
cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和
cloudwatch_aws_rds_db_load_average: rds db 平均负载
cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值
cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值
cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值
cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 加载 CPU 样本数
cloudwatch_aws_rds_db_load_cpu_sum: rds db 加载cpu总和
cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值
cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值
cloudwatch_aws_rds_db_load_non_cpu_average: rds 加载非 CPU 平均值
cloudwatch_aws_rds_db_load_non_cpu_maximum: rds 加载非 cpu 最大值
cloudwatch_aws_rds_db_load_non_cpu_minimum: rds 加载非 cpu 最小值
cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds 加载非 cpu 样本计数
cloudwatch_aws_rds_db_load_non_cpu_sum: rds 加载非cpu总和
cloudwatch_aws_rds_db_load_sample_count: rds db 加载样本计数
cloudwatch_aws_rds_db_load_sum: rds db 负载总和
cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值
cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值
cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值
cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数
cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和
cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值
cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值
cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低
cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数
cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和
cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值
cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值
cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低
cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 平衡样本计数
cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和
cloudwatch_aws_rds_free_storage_space_average: rds 可用存储空间平均值
cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间
cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间
cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数
cloudwatch_aws_rds_free_storage_space_sum: rds 可用存储空间总和
cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值
cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存
cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存
cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数
cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和
cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值
cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值
cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低
cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数
cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和
cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值
cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值
cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低
cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数
cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和
cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均
cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值
cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值
cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数
cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和
cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值
cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大
cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值
cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数
cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和
cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值
cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops
cloudwatch_aws_rds_read_iops_minimum: rds 读取 iops 最低
cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数
cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和
cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值
cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值
cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟
cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数
cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和
cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值
cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量
cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量
cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数
cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和
cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值
cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值
cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低
cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用样本计数
cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和
cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值
cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值
cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低
cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数
cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和
cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值
cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟
cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值
cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数
cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和
cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值
cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量
cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值
cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数
cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和
en:
ip_conntrack_count: the number of entries in the conntrack table(unit:int, count)
ip_conntrack_max: the max capacity of the conntrack table(unit:int, size)
cpu_usage_idle: "CPU idle rate(unit:%)"
cpu_usage_active: "CPU usage rate(unit:%)"
cpu_usage_system: "CPU kernel mode time proportion(unit:%)"
cpu_usage_user: "CPU user mode time proportion(unit:%)"
cpu_usage_nice: "The proportion of low priority CPU time, that is, CPU time spent on processes whose NICE value has been adjusted to between 1 and 19. Note that the value range of NICE is -20 to 19; the larger the value, the lower the priority(unit:%)"
cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)"
cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)"
cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)"
cpu_usage_steal: "In the virtual machine environment, there is this indicator, which means that the CPU is used by other virtual machines for the proportion of time.(unit:%)"
cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)"
cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)"
disk_free: "The remaining amount of the hard disk partition (unit: byte)"
disk_used: "Used amount of the hard disk partition (unit: byte)"
disk_used_percent: "Usage rate of the hard disk partition (unit:%)"
disk_total: "Total amount of hard disk partition (unit: byte)"
disk_inodes_free: "Hard disk partition INODE remaining amount"
disk_inodes_used: "Hard disk partition INODE usage amount"
disk_inodes_total: "The total amount of hard disk partition INODE"
diskio_io_time: "From the device's perspective, the total time spent on I/O requests; time is counted whenever there are I/O requests in the queue (unit: millisecond). Counter type, you need to apply a function such as rate to get a meaningful value"
diskio_iops_in_progress: "I/O requests that have been dispatched to the device driver but not yet completed; requests still in the queue that have not been dispatched to the device driver are not included. Gauge type"
diskio_merged_reads: "The number of times of adjacent reading request Merge, the counter type"
diskio_merged_writes: "The number of times the request Merge writes, the counter type"
diskio_read_bytes: "The number of byte reads, the counter type, you need to use the function to find the Rate to use the value"
diskio_read_time: "The total time of reading request (unit: millisecond), the counter type, you need to use the function to find the Rate to have the value of use"
diskio_reads: "The number of read requests, the counter type, you need to apply a function such as rate to get a meaningful value"
diskio_weighted_io_time: "From the I/O requests' perspective, the total I/O wait time. If there are multiple I/O requests at the same time, their times are added together (unit: millisecond)"
diskio_write_bytes: "The number of bytes written, the counter type, you need to apply a function such as rate to get a meaningful value"
diskio_write_time: "The total time of write requests (unit: millisecond), the counter type, you need to apply a function such as rate to get a meaningful value"
diskio_writes: "The number of write requests, the counter type, you need to apply a function such as rate to get a meaningful value"
kernel_boot_time: "Kernel startup time"
kernel_context_switches: "Number of kernel context switches"
kernel_entropy_avail: "Available entropy in the Linux system entropy pool"
kernel_interrupts: "Number of kernel interrupts"
kernel_processes_forked: "Number of forked processes"
mem_active: "The total amount of actively used memory (including cache and buffer memory)"
mem_available: "The amount of memory available to applications"
mem_available_percent: "Memory remaining percentage (0 ~ 100)"
mem_buffered: "The size of memory used as buffers for files"
mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )"
mem_commit_limit: "According to the overcommit ratio ('vm.overcommit_ratio'), this is the total memory that can currently be allocated on the system."
mem_committed_as: "Currently allocated on the system. It is the sum of the memory of all process applications"
mem_dirty: "The size of memory waiting to be written back to disk"
mem_free: "The amount of free memory"
mem_high_free: "Unused high memory size"
mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )"
mem_huge_page_size: "The size of each big page"
mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated"
mem_huge_pages_total: "Reserve the total number of Huge Pages"
mem_inactive: "Free memory (including the memory of free and available)"
mem_low_free: "Unused low memory size"
mem_low_total: "The total size of the low memory memory can achieve the same role of high memory, and it can be used by the kernel to record some of its own data structure"
mem_mapped: "The size of the mapping of equipment and files"
mem_page_tables: "The size of the index table of the management of the memory paging page"
mem_shared: "The total memory shared by multiple processes"
mem_slab: "The size of the kernel data structure cache can reduce the consumption of application and release memory"
mem_sreclaimable: "The size of the SLAB can be recovered"
mem_sunreclaim: "The size of the SLAB cannot be recovered(SUnreclaim+SReclaimable=Slab)"
mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile. Used to be quickly replaced when needed without opening the I/O port again"
mem_swap_free: "The size of unused swap space"
mem_swap_total: "The total size of the swap space"
mem_total: "Total memory"
mem_used: "The amount of used memory"
mem_used_percent: "Percentage of memory used (0 ~ 100)"
mem_vmalloc_chunk: "The largest continuous unused vmalloc area"
mem_vmalloc_totalL: "You can vmalloc virtual memory size"
mem_vmalloc_used: "Vmalloc's virtual memory size"
mem_write_back: "The size of memory that is currently being written back to disk"
mem_write_back_tmp: "Memory used by FUSE for temporary writeback buffers"
net_bytes_recv: "Total inbound traffic(bytes) of network card"
net_bytes_sent: "Total outbound traffic(bytes) of network card"
net_bits_recv: "Total inbound traffic(bits) of network card"
net_bits_sent: "Total outbound traffic(bits) of network card"
net_drop_in: "The number of inbound packets dropped by the network card"
net_drop_out: "The number of outbound packets dropped by the network card"
net_err_in: "The number of inbound packets with errors on the network card"
net_err_out: "The number of outbound packets with errors on the network card"
net_packets_recv: "The number of packets received by the network card"
net_packets_sent: "The number of packets sent by the network card"
netstat_tcp_established: "Number of network connections in ESTABLISHED state"
netstat_tcp_fin_wait1: "Number of network connections in FIN_WAIT1 state"
netstat_tcp_fin_wait2: "Number of network connections in FIN_WAIT2 state"
netstat_tcp_last_ack: "Number of network connections in LAST_ACK state"
netstat_tcp_listen: "Number of network connections in LISTEN state"
netstat_tcp_syn_recv: "Number of network connections in SYN_RECV state"
netstat_tcp_syn_sent: "Number of network connections in SYN_SENT state"
netstat_tcp_time_wait: "Number of network connections in TIME_WAIT state"
netstat_udp_socket: "Number of network links in UDP status"
processes_blocked: "The number of processes in the uninterruptible sleep state('U','D','L')"
processes_dead: "Number of processes in recycling('X')"
processes_idle: "Number of idle processes hanging('I')"
processes_paging: "Number of paging processes('P')"
processes_running: "Number of processes during operation('R')"
processes_sleeping: "Number of processes in interruptible sleep('S')"
processes_stopped: "Number of stopped processes('T')"
processes_total: "Total process number"
processes_total_threads: "Number of threads"
processes_unknown: "Unknown status process number"
processes_zombies: "Number of zombies('Z')"
swap_used_percent: "Percentage of SWAP space used"
system_load1: "1 minute average load value"
system_load5: "5 minutes average load value"
system_load15: "15 minutes average load value"
system_load_norm_1: "1 minute average load value/logical CPU number"
system_load_norm_5: "5 minutes average load value/logical CPU number"
system_load_norm_15: "15 minutes average load value/logical CPU number"
system_n_users: "User number"
system_n_cpus: "Number of CPU cores"
system_uptime: "System startup time"
nginx_accepts: "Since Nginx started, the total number of connections has been established with the client"
nginx_active: "The current number of active connections being processed by Nginx, equal to Reading + Writing + Waiting"
nginx_handled: "Since Nginx started, the total number of client connections that have been handled"
nginx_reading: "The total number of connections where Nginx is reading the HTTP request header"
nginx_requests: "Since nginx is started, the total number of client requests processed, due to the existence of HTTP Keep-Alive requests, this value will be greater than the handled value"
nginx_upstream_check_fall: "Number of failed backend checks detected by the upstream_check module"
nginx_upstream_check_rise: "Number of successful backend checks detected by the upstream_check module"
nginx_upstream_check_status_code: "The state of the backend upstream: 1 for up, 0 for down"
nginx_waiting: "When keep-alive is enabled, this value is equal to active – (reading+writing), which means that Nginx has processed the resident connection that is waiting for the next request command"
nginx_writing: "The total number of connections to send a response to the client"
http_response_content_length: "HTTP message entity transmission length"
http_response_http_response_code: "http response status code"
http_response_response_time: "HTTP response time"
http_response_result_code: "URL detection result 0 is normal, otherwise the URL cannot be accessed"
# [mysqld_exporter]
mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge)
mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge)
mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter)
mysql_global_status_threads_connected: The number of currently open connections.(Gauge)
mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter)
mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge)
mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge)
mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter)
mysql_global_status_threads_cached: The number of threads in the thread cache.(Counter)
mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. The cache miss rate can be calculated as Threads_created/Connections.(Counter)
mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter)
mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter)
mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter)
mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter)
mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter)
mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter)
mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter)
mysql_global_status_sort_rows: The number of sorted rows.(Counter)
mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter)
mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter)
mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter)
mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter)
mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter)
mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter)
mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter)
mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter)
mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter)
mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter)
mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge)
mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge)
mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter)
mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. This is in fact the layer between the Storage Engine and MySQL.(Counter)
mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter)
mysql_global_status_open_tables: The number of tables that are open.(Gauge)
mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter)
mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter)
mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter)
mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge)
mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter)
mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter)
mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter)
mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge)
mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge)
mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. The default value is 134217728 bytes (128MB).(Gauge)
mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge)
mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge)
mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge)
mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge)
mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge)
# [redis_exporter]
redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize.
redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation.
redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory.
redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes.
redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio).
redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting).
redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active.
redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS.
redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any.
redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation.
redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation.
redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds.
redis_aof_last_write_status: Status of the last write operation to the AOF.
redis_aof_rewrite_in_progress: Flag indicating a AOF rewrite operation is on-going.
redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete.
redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX).
redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections.
redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections.
redis_cluster_enabled: Indicate Redis cluster is enabled.
redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter)
redis_commands_processed_total: Total number of commands processed by the server.(Counter)
redis_commands_total: The number of calls that reached command execution (not rejected).(Counter)
redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections.
redis_config_maxmemory: The value of the maxmemory configuration directive.
redis_connected_clients: Number of client connections (excluding connections from replicas).
redis_connected_slaves: Number of connected replicas.
redis_connections_received_total: Total number of connections accepted by the server.(Counter)
redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter)
redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter)
redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter)
redis_db_keys: Total number of keys by DB.
redis_db_keys_expiring: Total number of expiring keys by DB
redis_defrag_hits: Number of value reallocations performed by the active defragmentation process.
redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process.
redis_defrag_key_hits: Number of keys that were actively defragmented.
redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process.
redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter)
redis_expired_keys_total: Total number of key expiration events.(Counter)
redis_expired_stale_percentage: The percentage of keys probably expired.
redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early.
redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape.
redis_exporter_last_scrape_duration_seconds: The last scrape duration.
redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter
redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter
redis_exporter_scrapes_total: Current total redis scrapes.(Counter)
redis_instance_info: Information about the Redis instance.
redis_keyspace_hits_total: Hits total.(Counter)
redis_keyspace_misses_total: Misses total.(Counter)
redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds.
redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds.
redis_latest_fork_seconds: The amount of time needed for last fork, in seconds.
redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option).
redis_master_repl_offset: The server's current replication offset.
redis_mem_clients_normal: Memory used by normal clients.(Gauge)
redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage.
redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue.
redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only includes fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc.
redis_mem_not_counted_for_eviction_bytes: (Gauge)
redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc)
redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory)
redis_memory_used_lua_bytes: Number of bytes used by the Lua engine.
redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures.
redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes)
redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1)
redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts
redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes
redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes
redis_net_input_bytes_total: Total input bytes(Counter)
redis_net_output_bytes_total: Total output bytes(Counter)
redis_process_id: Process ID
redis_pubsub_channels: Global number of pub/sub channels with client subscriptions
redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions
redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going
redis_rdb_changes_since_last_save: Number of changes since the last dump
redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any
redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds
redis_rdb_last_bgsave_status: Status of the last RDB save operation
redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation
redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save
redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter)
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer
redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer
redis_repl_backlog_is_active: Flag indicating replication backlog is active
redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge)
redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge)
redis_replica_resyncs_full: The number of full resyncs with replicas
redis_replication_backlog_bytes: Memory used by replication backlog
redis_second_repl_offset: The offset up to which replication IDs are accepted.
redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge)
redis_slowlog_last_id: Last id of slowlog
redis_slowlog_length: Total slowlog
redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds.
redis_target_scrape_request_errors_total: Errors in requests to the exporter
redis_up: Flag indicating redis instance is up
redis_uptime_in_seconds: Number of seconds since Redis server start
# [windows_exporter]
windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter)
windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge)
windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter)
windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter)
windows_cpu_idle_break_events_total: Total number of times the processor was woken from idle(counter)
windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter)
windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge)
windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the processor. On some processors, Processor Performance may exceed 100%(gauge)
windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter)
windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge)
windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge)
windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge)
windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge)
windows_exporter_collector_duration_seconds: Duration of a collection.(gauge)
windows_exporter_collector_success: Whether the collector was successful.(gauge)
windows_exporter_collector_timeout: Whether the collector timed out.(gauge)
windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge)
windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge)
windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter)
windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter)
windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter)
windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter)
windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter)
windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter)
windows_logical_disk_requests_queued: The number of requests queued to the disk (LogicalDisk.CurrentDiskQueueLength)(gauge)
windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge)
windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter)
windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter)
windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter)
windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter)
windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter)
windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter)
windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter)
windows_net_bytes_total: (Network.BytesTotalPerSec)(counter)
windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge)
windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter)
windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter)
windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter)
windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter)
windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter)
windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter)
windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter)
windows_net_packets_total: (Network.PacketsPerSec)(counter)
windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge)
windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge)
windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge)
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge)
windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge)
windows_os_processes: OperatingSystem.NumberOfProcesses(gauge)
windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge)
windows_os_time: OperatingSystem.LocalDateTime(gauge)
windows_os_timezone: OperatingSystem.LocalDateTime(gauge)
windows_os_users: OperatingSystem.NumberOfUsers(gauge)
windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge)
windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge)
windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge)
windows_service_info: A metric with a constant '1' value labeled with service information(gauge)
windows_service_start_mode: The start mode of the service (StartMode)(gauge)
windows_service_state: The state of the service (State)(gauge)
windows_service_status: The status of the service (Status)(gauge)
windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter)
windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter)
windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge)
windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter)
windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge)
windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge)
# [node_exporter]
# SYSTEM
# CPU context switch 次数
node_context_switches_total: context_switches
# Interrupts 次数
node_intr_total: Interrupts
# 运行的进程数
node_procs_running: Processes in runnable state
# 熵池大小
node_entropy_available_bits: Entropy available to random number generators
node_time_seconds: System time in seconds since epoch (1970)
node_boot_time_seconds: Node boot time, in unixtime
# CPU
node_cpu_seconds_total: Seconds the CPUs spent in each mode
node_load1: cpu load 1m
node_load5: cpu load 5m
node_load15: cpu load 15m
# MEM
# 内核态
# 内核用于缓存数据结构供自己使用的内存
node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use
# slab中可回收的部分
node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches
# slab中不可回收的部分
node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure
# Vmalloc内存区的大小
node_memory_VmallocTotal_bytes: Total size of vmalloc memory area
# vmalloc已分配的内存,虚拟地址空间上的连续的内存
node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used
# vmalloc区可用的连续最大块的大小,通过此指标可以知道vmalloc可分配连续内存的最大值
node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free
# 内存的硬件故障删除掉的内存页的总大小
node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working
# 用于在虚拟和物理内存地址之间映射的内存
node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge)
# 内核栈内存,常驻内存,不可回收
node_memory_KernelStack_bytes: Kernel memory stack. This is not reclaimable
# 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能
node_memory_Bounce_bytes: Memory used for block device bounce buffers
#用户态
# 单个巨页大小
node_memory_Hugepagesize_bytes: Huge Page size
# 系统分配的常驻巨页数
node_memory_HugePages_Total: Total size of the pool of huge pages
# 系统空闲的巨页数
node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated
# 进程已申请但未使用的巨页数
node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation
# 超过系统设定的常驻HugePages数量的个数
node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages
# 透明巨页 Transparent HugePages (THP)
node_memory_AnonHugePages_bytes: Memory in anonymous huge pages
# inactivelist中的File-backed内存
node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list
# inactivelist中的Anonymous内存
node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)
# activelist中的File-backed内存
node_memory_Active_file_bytes: File-backed memory on active LRU list
# activelist中的Anonymous内存
node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs
# 禁止换出的页,对应 Unevictable 链表
node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons
# 共享内存
node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks)
# 匿名页内存大小
node_memory_AnonPages_bytes: Memory in user pages not backed by files
# 被关联的内存页大小
node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries
# file-backed内存页缓存大小
node_memory_Cached_bytes: Parked file data (file content) cache
# 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化
node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified
# 被mlock()系统调用锁定的内存大小
node_memory_Mlocked_bytes: Size of pages locked to memory using the mlock() system call
# 块设备(block device)所占用的缓存页
node_memory_Buffers_bytes: Block device (e.g. harddisk) cache
node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes
node_memory_SwapFree_bytes: Memory information field SwapFree_bytes
# DISK
node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes
node_filesystem_free_bytes: Filesystem free space in bytes
node_filesystem_size_bytes: Filesystem size in bytes
node_filesystem_files_free: Filesystem total free file nodes
node_filesystem_files: Filesystem total file nodes
node_filefd_maximum: Max open files
node_filefd_allocated: Open files
node_filesystem_readonly: Filesystem read-only status
node_filesystem_device_error: Whether an error occurred while getting statistics for the given device
node_disk_reads_completed_total: The total number of reads completed successfully
node_disk_writes_completed_total: The total number of writes completed successfully
node_disk_reads_merged_total: The number of reads merged
node_disk_writes_merged_total: The number of writes merged
node_disk_read_bytes_total: The total number of bytes read successfully
node_disk_written_bytes_total: The total number of bytes written successfully
node_disk_io_time_seconds_total: Total seconds spent doing I/Os
node_disk_read_time_seconds_total: The total number of seconds spent by all reads
node_disk_write_time_seconds_total: The total number of seconds spent by all writes
node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os
# NET
node_network_receive_bytes_total: Network device statistic receive_bytes (counter)
node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter)
node_network_receive_packets_total: Network device statistic receive_packets
node_network_transmit_packets_total: Network device statistic transmit_packets
node_network_receive_errs_total: Network device statistic receive_errs
node_network_transmit_errs_total: Network device statistic transmit_errs
node_network_receive_drop_total: Network device statistic receive_drop
node_network_transmit_drop_total: Network device statistic transmit_drop
node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking
node_sockstat_TCP_alloc: Number of TCP sockets in state alloc
node_sockstat_TCP_inuse: Number of TCP sockets in state inuse
node_sockstat_TCP_orphan: Number of TCP sockets in state orphan
node_sockstat_TCP_tw: Number of TCP sockets in state tw
node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab
node_sockstat_sockets_used: Number of IPv4 sockets in use
# [kafka_exporter]
kafka_brokers: count of kafka_brokers (gauge)
kafka_topic_partitions: Number of partitions for this Topic (gauge)
kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge)
kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge)
kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge)
kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated
# [zookeeper_exporter]
zk_znode_count: The total count of znodes stored
zk_ephemerals_count: The number of Ephemerals nodes
zk_watch_count: The number of watchers setup over Zookeeper nodes.
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree
zk_outstanding_requests: Number of currently executing requests
zk_packets_sent: Count of the number of zookeeper packets sent from a server
zk_packets_received: Count of the number of zookeeper packets received by a server
zk_num_alive_connections: Number of active clients connected to a zookeeper server
zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open
zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open
zk_avg_latency: Average time in milliseconds for requests to be processed
zk_min_latency: Minimum time in milliseconds for a request to be processed
zk_max_latency: Maximum time in milliseconds for a request to be processed
================================================
FILE: etc/script/notify.bak.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import urllib2
import smtplib
from email.mime.text import MIMEText
# Python 2 idiom: reload sys to regain setdefaultencoding, then make UTF-8 the
# default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf8')
# Maps a notify channel name from the payload to the suffix of the
# Sender.send_<suffix> method that main() looks up for it.
notify_channel_funcs = {
    "email":"email",
    "sms":"sms",
    "voice":"voice",
    "dingtalk":"dingtalk",
    "wecom":"wecom",
    "feishu":"feishu"
}
# SMTP settings used by Sender.send_email. send_email refuses to send while
# mail_user/mail_pass still hold the placeholder values "ulricqin"/"password".
mail_host = "smtp.163.com"
mail_port = 994
mail_user = "ulricqin"
mail_pass = "password"
mail_from = "ulricqin@163.com"
class Sender(object):
    """Channel senders for the legacy (Python 2) notify script.

    Every ``send_<channel>`` classmethod receives the decoded JSON payload.
    The methods read ``payload['event']['notify_users_obj']`` (a list of user
    dicts) and ``payload['tpls']`` (rendered templates keyed by template
    name). SMTP and HTTP errors are printed instead of raised so that one
    failing recipient does not stop the remaining ones.
    """

    @staticmethod
    def _users(payload):
        """Return the users to notify; an absent or null list becomes ``[]``."""
        return payload.get('event').get("notify_users_obj") or []

    @staticmethod
    def _phones(users):
        """Collect the distinct non-empty ``phone`` values (dict used as a set)."""
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        return phones

    @staticmethod
    def _robot_tokens(users, token_key):
        """Collect the distinct non-empty ``contacts[token_key]`` values."""
        tokens = {}
        for u in users:
            # "contacts" can be missing or null for users without IM contacts.
            contacts = u.get("contacts") or {}
            if contacts.get(token_key, ""):
                tokens[contacts.get(token_key, "")] = 1
        return tokens

    @staticmethod
    def _post_json(url, body):
        """POST ``body`` as JSON to ``url``; print the response body or the error."""
        opener = urllib2.build_opener(urllib2.HTTPHandler())
        request = urllib2.Request(url, data=json.dumps(body))
        request.add_header("Content-Type", 'application/json;charset=utf-8')
        request.get_method = lambda: "POST"
        try:
            connection = opener.open(request)
            print(connection.read())
        except urllib2.URLError as error:
            # URLError is the base class of HTTPError, so connection and DNS
            # failures are reported too instead of aborting the script.
            print(error)

    @classmethod
    def send_email(cls, payload):
        """Mail the rendered ``email.tpl`` to every user that has an email."""
        # The shipped credentials are placeholders; do nothing until configured.
        if mail_user == "ulricqin" and mail_pass == "password":
            print("invalid smtp configuration")
            return

        emails = {}
        for u in cls._users(payload):
            if u.get("email"):
                emails[u.get("email")] = 1

        if not emails:
            return

        recipients = list(emails.keys())
        mail_body = payload.get('tpls').get("email.tpl", "email.tpl not found")
        message = MIMEText(mail_body, 'html', 'utf-8')
        message['From'] = mail_from
        message['To'] = ", ".join(recipients)
        message["Subject"] = payload.get('tpls').get("subject.tpl", "subject.tpl not found")

        smtp = None
        try:
            smtp = smtplib.SMTP_SSL(mail_host, mail_port)
            smtp.login(mail_user, mail_pass)
            smtp.sendmail(mail_from, recipients, message.as_string())
        except smtplib.SMTPException as error:
            print(error)
        finally:
            # Release the connection even when login or sendmail failed.
            if smtp is not None:
                smtp.close()

    @classmethod
    def send_wecom(cls, payload):
        """Post the rendered ``wecom.tpl`` to every distinct WeCom robot token."""
        tokens = cls._robot_tokens(cls._users(payload), "wecom_robot_token")
        for t in tokens:
            url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found")
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_dingtalk(cls, payload):
        """Post the rendered ``dingtalk.tpl`` to every distinct DingTalk robot token.

        The users' phone numbers are appended to the text as ``@phone``
        mentions and listed in ``at.atMobiles``.
        """
        event = payload.get('event')
        users = event.get("notify_users_obj") or []

        rule_name = event.get("rule_name")
        event_state = "Triggered"
        if event.get("is_recovered"):
            event_state = "Recovered"

        phones = cls._phones(users)
        tokens = cls._robot_tokens(users, "dingtalk_robot_token")
        at_mobiles = list(phones.keys())

        for t in tokens:
            url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "title": "{} - {}".format(event_state, rule_name),
                    "text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@"+i for i in at_mobiles])
                },
                "at": {
                    "atMobiles": at_mobiles,
                    "isAtAll": False
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_feishu(cls, payload):
        """Post the rendered ``feishu.tpl`` to every distinct Feishu robot token."""
        users = cls._users(payload)
        phones = cls._phones(users)
        tokens = cls._robot_tokens(users, "feishu_robot_token")

        for t in tokens:
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": payload.get('tpls').get("feishu.tpl", "feishu.tpl not found")
                },
                "at": {
                    "atMobiles": list(phones.keys()),
                    "isAtAll": False
                }
            }
            cls._post_json(url, body)

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: only prints the phone numbers that would receive an SMS."""
        phones = cls._phones(cls._users(payload))
        if phones:
            print("send_sms not implemented, phones: {}".format(phones.keys()))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: only prints the phone numbers that would receive a call."""
        phones = cls._phones(cls._users(payload))
        if phones:
            print("send_voice not implemented, phones: {}".format(phones.keys()))
def main():
    """Read the alert payload from stdin and dispatch it to each notify channel.

    The payload is also dumped to ``.payload`` in the current working
    directory. For every entry of ``event.notify_channels`` the matching
    ``Sender.send_<name>`` classmethod is called; channels without a sender
    are reported on stdout and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    for ch in payload.get('event').get('notify_channels') or []:
        channel = ch.strip()
        # Fall back to the raw channel name so the message below names the
        # unknown channel instead of "send_None".
        send_func_name = "send_{}".format(notify_channel_funcs.get(channel, channel))
        send_func = getattr(Sender, send_func_name, None)
        if send_func is None:
            print("function: {} not found".format(send_func_name))
            continue
        send_func(payload)
def hello():
    """Print a fixed greeting; selected by the ``hello`` command-line argument."""
    greeting = "hello nightingale"
    print(greeting)
# Script entry point: no argument runs the notifier, "hello" prints a greeting,
# anything else is rejected with a message.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) != 1:
        if argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: etc/script/notify.py
================================================
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import sys
import json
class Sender(object):
    """Channel senders looked up by name as ``send_<channel>``.

    Most channels are no-ops here because, per the original comments, they
    are already handled in the Go code. SMS and voice are placeholders that
    only print the phone numbers they would notify.
    """

    @classmethod
    def send_email(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_wecom(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_dingtalk(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_feishu(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_mm(cls, payload):
        """No-op: already done in go code."""

    @classmethod
    def send_sms(cls, payload):
        """Print the distinct phone numbers of the users to notify, if any."""
        recipients = payload.get('event').get("notify_users_obj")
        # A dict keyed by phone number acts as an insertion-ordered set.
        phones = {person.get("phone"): 1 for person in recipients if person.get("phone")}
        if not phones:
            return
        print("send_sms not implemented, phones: {}".format(phones.keys()))

    @classmethod
    def send_voice(cls, payload):
        """Print the distinct phone numbers of the users to notify, if any."""
        recipients = payload.get('event').get("notify_users_obj")
        # A dict keyed by phone number acts as an insertion-ordered set.
        phones = {person.get("phone"): 1 for person in recipients if person.get("phone")}
        if not phones:
            return
        print("send_voice not implemented, phones: {}".format(phones.keys()))
def main():
    """Read the alert payload from stdin and dispatch it to each notify channel.

    The payload is also dumped to ``.payload`` in the current working
    directory. For every entry of ``event.notify_channels`` the
    ``Sender.send_<channel>`` classmethod is called; channels without a
    sender are reported on stdout and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        send_func = getattr(Sender, send_func_name, None)
        if send_func is None:
            # Format the name into the message instead of passing it as a second
            # print argument, which left the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func(payload)
def hello():
    """Print a fixed greeting; selected by the ``hello`` command-line argument."""
    greeting = "hello nightingale"
    print(greeting)
# Script entry point: no argument runs the notifier, "hello" prints a greeting,
# anything else is rejected with a message.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) != 1:
        if argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: etc/script/notify_feishu.py
================================================
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import json
import requests
class Sender(object):
    """Channel senders looked up by name as ``send_<channel>``.

    Only ``send_ifeishu`` does real work in this script; the other channels
    are no-ops (several are, per the original comments, already handled in
    the Go code).
    """

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_ifeishu(cls, payload):
        """Post the rendered ``feishu`` template to every distinct Feishu robot token.

        Tokens come from each user's ``contacts.feishu_robot_token``; the
        users' phone numbers are sent along in ``at.atMobiles``. The outcome
        of every request (status or error) is printed.
        """
        users = payload.get('event').get("notify_users_obj") or []
        tokens = {}
        phones = {}

        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1

            # "contacts" can be missing or null for users without IM contacts.
            contacts = u.get("contacts") or {}
            if contacts.get("feishu_robot_token", ""):
                tokens[contacts.get("feishu_robot_token", "")] = 1

        headers = {
            "Content-Type": "application/json;charset=utf-8",
            "Host": "open.feishu.cn"
        }

        for t in tokens:
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": payload.get('tpls').get("feishu", "feishu not found")
                },
                "at": {
                    "atMobiles": list(phones.keys()),
                    "isAtAll": False
                }
            }

            try:
                # Without a timeout a stalled connection would block the script forever.
                response = requests.post(url, headers=headers, data=json.dumps(body), timeout=10)
            except requests.RequestException as error:
                # Report the failure and keep going with the remaining tokens.
                print(f"notify_ifeishu: token={t} error={error}")
                continue

            print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}")

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_sms(cls, payload):
        pass

    @classmethod
    def send_voice(cls, payload):
        pass
def main():
    """Read the alert payload from stdin and dispatch it to each notify channel.

    The payload is also dumped to ``.payload`` in the current working
    directory. For every entry of ``event.notify_channels`` the
    ``Sender.send_<channel>`` classmethod is called; channels without a
    sender are reported on stdout and skipped.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        send_func = getattr(Sender, send_func_name, None)
        if send_func is None:
            # Format the name into the message instead of passing it as a second
            # print argument, which left the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func(payload)
def hello():
    """Print a fixed greeting; selected by the ``hello`` command-line argument."""
    greeting = "hello nightingale"
    print(greeting)
# Script entry point: no argument runs the notifier, "hello" prints a greeting,
# anything else is rejected with a message.
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) != 1:
        if argv[1] == "hello":
            hello()
        else:
            print("I am confused")
    else:
        main()
================================================
FILE: etc/script/rule_converter.py
================================================
import json
import yaml
'''
将prometheus/vmalert的rule转换为n9e中的rule
支持k8s的rule configmap
'''
# Path of the YAML rule file that main() opens and converts.
rule_file = 'rules.yaml'
def convert_interval(interval):
    """Convert a Prometheus-style duration into whole seconds.

    Accepts a bare number (``"30"`` or ``30``, taken as seconds) or one or
    more ``<number><unit>`` segments such as ``"15s"``, ``"10m"`` or
    ``"1h30m"``. Supported units are ``ms``, ``s``, ``m``, ``h``, ``d``,
    ``w`` and ``y`` (365 days); units are matched case-insensitively.
    Sub-second remainders are truncated.

    Raises:
        ValueError: if the value is neither a number nor a valid duration.
    """
    import re  # local import: only this function needs it

    unit_ms = {
        'ms': 1,
        's': 1000,
        'm': 60 * 1000,
        'h': 60 * 60 * 1000,
        'd': 24 * 60 * 60 * 1000,
        'w': 7 * 24 * 60 * 60 * 1000,
        'y': 365 * 24 * 60 * 60 * 1000,
    }
    # str() lets integer YAML values (e.g. "interval: 30") through as well.
    text = str(interval).strip().lower()
    if re.match(r'^(?:\d+(?:ms|[smhdwy]))+$', text):
        # Sum in integer milliseconds to avoid float rounding, then truncate.
        total_ms = sum(int(count) * unit_ms[unit]
                       for count, unit in re.findall(r'(\d+)(ms|[smhdwy])', text))
        return total_ms // 1000
    return int(text)
def convert_alert(rule, interval):
    """Build an n9e alert rule dict from one Prometheus/vmalert alerting rule.

    ``rule`` is the parsed mapping of a single alerting rule (``alert`` and
    ``expr`` are required; ``for``, ``labels`` and ``annotations`` are
    optional). ``interval`` is the evaluation interval of the enclosing
    group, as a duration string.
    """
    alert_name = rule['alert']
    expression = rule['expr']
    for_seconds = convert_interval(rule['for']) if 'for' in rule else 0
    eval_seconds = convert_interval(interval)

    # The note is the first annotation value in mapping order; all annotations
    # are additionally copied verbatim.
    first_note = ''
    copied_annotations = {}
    if 'annotations' in rule:
        first_note = next(iter(rule['annotations'].values()), '')
        copied_annotations = dict(rule['annotations'].items())

    # "severity" selects the n9e level (critical -> 1, info -> 3, anything
    # else keeps the default 2); every other label becomes a "key=value" tag.
    level = 2
    tags = []
    for label, value in (rule['labels'].items() if 'labels' in rule else ()):
        if label == 'severity':
            if value == 'critical':
                level = 1
            elif value == 'info':
                level = 3
        else:
            tags.append('{}={}'.format(label, value))

    return {
        "name": alert_name,
        "note": first_note,
        "severity": level,
        "disabled": 0,
        "prom_for_duration": for_seconds,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "enable_stime": "00:00",
        "enable_etime": "23:59",
        "enable_days_of_week": [str(day) for day in (1, 2, 3, 4, 5, 6, 0)],
        "enable_in_bg": 0,
        "notify_recovered": 1,
        "notify_channels": [],
        "notify_repeat_step": 60,
        "recover_duration": 0,
        "callbacks": [],
        "runbook_url": "",
        "append_tags": tags,
        "annotations": copied_annotations
    }
def convert_record(rule, interval):
    """Build an n9e recording rule dict from one Prometheus/vmalert recording rule.

    ``rule`` is the parsed mapping of a single recording rule (``record`` and
    ``expr`` are required, ``labels`` is optional). ``interval`` is the
    evaluation interval of the enclosing group, as a duration string.
    """
    record_name = rule['record']
    expression = rule['expr']
    eval_seconds = convert_interval(interval)

    # Every label becomes a "key=value" tag appended to the recorded series.
    tags = []
    if 'labels' in rule:
        tags = ['{}={}'.format(label, value) for label, value in rule['labels'].items()]

    return {
        "name": record_name,
        "note": '',
        "disabled": 0,
        "prom_ql": expression,
        "prom_eval_interval": eval_seconds,
        "append_tags": tags
    }
'''
example of rule group file
---
groups:
- name: example
rules:
- alert: HighRequestLatency
expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
for: 10m
labels:
severity: page
annotations:
summary: High request latency
'''
def deal_group(group):
    """
    Convert a parsed Prometheus/vmalert rule file into n9e rules.

    ``group`` is the mapping with a top-level ``groups`` list. Rules that
    contain an ``alert`` key become alert rules, all others become recording
    rules. Groups without an ``interval`` use '15s'.

    Returns a tuple ``(alert_rules, record_rules)``.
    """
    alerts, records = [], []
    for segment in group['groups']:
        step = segment.get('interval', '15s')
        for rule in segment['rules']:
            # Pick the converter and the destination list in one place.
            if 'alert' in rule:
                convert, bucket = convert_alert, alerts
            else:
                convert, bucket = convert_record, records
            bucket.append(convert(rule, step))
    return alerts, records
'''
example of k8s rule configmap
---
apiVersion: v1
kind: ConfigMap
metadata:
name: rulefiles-0
data:
etcdrules.yaml: |
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"})
by (job) + 1) / 2)
for: 3m
labels:
severity: critical
'''
def deal_configmap(rule_configmap):
    """
    Convert a parsed Kubernetes rule ConfigMap into n9e rules.

    Each value under ``data`` is a YAML document holding rule groups; it is
    parsed and handed to deal_group, and the results are concatenated.

    Returns a tuple ``(alert_rules, record_rules)``.
    """
    merged_alerts, merged_records = [], []
    for group_yaml in rule_configmap['data'].values():
        parsed_group = yaml.load(group_yaml, Loader=yaml.FullLoader)
        alerts, records = deal_group(parsed_group)
        merged_alerts += alerts
        merged_records += records
    return merged_alerts, merged_records
def main():
    """Convert ``rule_file`` and write the results as two JSON files.

    Alert rules go to ``alert-rules.json`` and recording rules go to
    ``record-rules.json``, both in the current working directory.
    """
    with open(rule_file, 'r') as source:
        rule_config = yaml.load(source, Loader=yaml.FullLoader)
        # If the input file is a Kubernetes ConfigMap, use this call instead:
        # alert_rules, record_rules = deal_configmap(rule_config)
        alert_rules, record_rules = deal_group(rule_config)

    outputs = (
        ("alert-rules.json", alert_rules),
        ("record-rules.json", record_rules),
    )
    for path, rules in outputs:
        with open(path, 'w') as target:
            json.dump(rules, target, indent=2, ensure_ascii=False)
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
================================================
FILE: fe.sh
================================================
#!/bin/bash
# Prepare the front-end assets for embedding into the Go binary:
#   1. copy the init SQL to ./n9e.sql
#   2. if ./pub is missing, download and unpack the latest n9e/fe release
#   3. run statik to generate ./front from ./pub
# Exit codes: 1 download failed, 2 untar failed, 3 release tag not found,
# 4 statik failed.

cp -f ./docker/initsql/a-n9e.sql n9e.sql

if [ ! -d "./pub" ]; then
    TAG=$(curl -sX GET https://api.github.com/repos/n9e/fe/releases/latest | awk '/tag_name/{print $4;exit}' FS='[""]')

    # An API error or rate limit yields no tag_name; stop before building a
    # malformed download URL.
    if [ -z "${TAG}" ]; then
        echo "failed to get the latest n9e-fe release tag!"
        exit 3
    fi

    if ! curl -o "n9e-fe-${TAG}.tar.gz" -L "https://github.com/n9e/fe/releases/download/${TAG}/n9e-fe-${TAG}.tar.gz"; then
        echo "failed to download n9e-fe-${TAG}.tar.gz!"
        exit 1
    fi

    if ! tar zxf "n9e-fe-${TAG}.tar.gz"; then
        echo "failed to untar n9e-fe-${TAG}.tar.gz!"
        exit 2
    fi
fi

GOPATH=$(go env GOPATH)
GOPATH=${GOPATH:-/home/runner/go}

# Embed files into a go binary
# go install github.com/rakyll/statik
if ! "$GOPATH/bin/statik" -src=./pub -dest=./front; then
    echo "failed to embed files into a go binary!"
    exit 4
fi
================================================
FILE: go.mod
================================================
module github.com/ccfos/nightingale/v6
go 1.24.0
require (
github.com/BurntSushi/toml v1.4.0
github.com/ClickHouse/clickhouse-go/v2 v2.23.2
github.com/IBM/sarama v1.45.0
github.com/VictoriaMetrics/metricsql v0.81.1
github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.13
github.com/alibabacloud-go/dingtalk v1.6.95
github.com/alibabacloud-go/gateway-dingtalk v1.0.2
github.com/alibabacloud-go/openapi-util v0.1.1
github.com/alibabacloud-go/tea v1.3.13
github.com/alibabacloud-go/tea-utils/v2 v2.0.7
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de
github.com/bitly/go-simplejson v0.5.1
github.com/coreos/go-oidc v2.2.1+incompatible
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/expr-lang/expr v1.16.1
github.com/flashcatcloud/ibex v1.3.6
github.com/gin-contrib/pprof v1.4.0
github.com/gin-gonic/gin v1.9.1
github.com/glebarez/sqlite v1.11.0
github.com/go-ldap/ldap/v3 v3.4.4
github.com/gogo/protobuf v1.3.2
github.com/golang-jwt/jwt v3.2.2+incompatible
github.com/golang/protobuf v1.5.4
github.com/golang/snappy v0.0.4
github.com/google/uuid v1.6.0
github.com/hashicorp/go-version v1.6.0
github.com/jinzhu/copier v0.4.0
github.com/json-iterator/go v1.1.12
github.com/koding/multiconfig v0.0.0-20171124222453-69c27309b2d7
github.com/larksuite/oapi-sdk-go/v3 v3.5.1
github.com/lib/pq v1.10.9
github.com/mailru/easyjson v0.7.7
github.com/mattn/go-isatty v0.0.19
github.com/mitchellh/mapstructure v1.5.0
github.com/mojocn/base64Captcha v1.3.6
github.com/olivere/elastic/v7 v7.0.32
github.com/opensearch-project/opensearch-go/v2 v2.3.0
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/pelletier/go-toml/v2 v2.0.8
github.com/pingcap/tidb/pkg/parser v0.0.0-20260120034856-e15515e804da
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.20.5
github.com/prometheus/common v0.60.1
github.com/prometheus/prometheus v0.47.1
github.com/rakyll/statik v0.1.7
github.com/redis/go-redis/v9 v9.0.2
github.com/spaolacci/murmur3 v1.1.0
github.com/stretchr/testify v1.10.0
github.com/tidwall/gjson v1.14.2
github.com/toolkits/pkg v1.3.8
golang.org/x/exp v0.0.0-20231006140011-7918f672742d
golang.org/x/oauth2 v0.27.0
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
gopkg.in/yaml.v2 v2.4.0
gorm.io/driver/clickhouse v0.6.1
gorm.io/driver/mysql v1.4.4
gorm.io/driver/postgres v1.5.11
gorm.io/driver/sqlite v1.5.5
gorm.io/gorm v1.25.10
)
require (
github.com/ClickHouse/ch-go v0.61.5 // indirect
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/go-faster/city v1.0.1 // indirect
github.com/go-faster/errors v0.7.1 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/paulmach/orb v0.11.1 // indirect
github.com/pierrec/lz4/v4 v4.1.22 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/segmentio/asm v1.2.0 // indirect
github.com/shopspring/decimal v1.4.0 // indirect
go.opentelemetry.io/otel v1.32.0 // indirect
go.opentelemetry.io/otel/trace v1.32.0 // indirect
)
require (
github.com/VictoriaMetrics/metrics v1.34.0 // indirect
github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5 // indirect
github.com/alibabacloud-go/debug v1.0.1 // indirect
github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a // indirect
github.com/aliyun/credentials-go v1.4.6 // indirect
github.com/clbanning/mxj/v2 v2.7.0 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/eapache/go-resiliency v1.7.0 // indirect
github.com/eapache/go-xerial-snappy v0.0.0-20230731223053-c322873962e3 // indirect
github.com/eapache/queue v1.1.0 // indirect
github.com/glebarez/go-sqlite v1.21.2 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/go-uuid v1.0.3 // indirect
github.com/jackc/pgx/v5 v5.7.1 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/jcmturner/aescts/v2 v2.0.0 // indirect
github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect
github.com/jcmturner/gofork v1.7.6 // indirect
github.com/jcmturner/gokrb5/v8 v8.4.4 // indirect
github.com/jcmturner/rpc/v2 v2.0.3 // indirect
github.com/pingcap/errors v0.11.5-0.20250523034308-74f78ae071ee // indirect
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 // indirect
github.com/pingcap/log v1.1.0 // indirect
github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect
github.com/tjfoc/gmsm v1.4.1 // indirect
github.com/valyala/fastrand v1.1.0 // indirect
github.com/valyala/histogram v1.2.0 // indirect
github.com/yuin/gopher-lua v1.1.1 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/sync v0.18.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
modernc.org/libc v1.22.5 // indirect
modernc.org/mathutil v1.6.0 // indirect
modernc.org/memory v1.5.0 // indirect
modernc.org/sqlite v1.23.1 // indirect
)
require (
github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e // indirect
github.com/alicebob/miniredis/v2 v2.33.0
github.com/beorn7/perks v1.0.1 // indirect
github.com/bytedance/sonic v1.9.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
github.com/dennwc/varint v1.0.0 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/fatih/camelcase v1.0.0 // indirect
github.com/fatih/structs v1.1.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-asn1-ber/asn1-ber v1.5.4 // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.6.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.14.0 // indirect
github.com/go-sql-driver/mysql v1.7.1
github.com/goccy/go-json v0.10.2 // indirect
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
github.com/grafana/regexp v0.0.0-20221122212121-6b5c0a4cb7fd // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
github.com/leodido/go-urn v1.2.4 // indirect
github.com/mattn/go-sqlite3 v1.14.17 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pquerna/cachecontrol v0.1.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/robfig/cron/v3 v3.0.1
github.com/tidwall/match v1.1.1
github.com/tidwall/pretty v1.2.0 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.11 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/automaxprocs v1.5.2 // indirect
golang.org/x/arch v0.3.0 // indirect
golang.org/x/crypto v0.45.0 // indirect
golang.org/x/image v0.18.0 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/text v0.31.0 // indirect
google.golang.org/protobuf v1.35.1 // indirect
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
gopkg.in/square/go-jose.v2 v2.6.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
replace golang.org/x/exp v0.0.0-20231006140011-7918f672742d => golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
replace github.com/olivere/elastic/v7 => github.com/n9e/elastic/v7 v7.0.33-0.20251031061708-f480a2dfcfa7
// replace github.com/flashcatcloud/ibex => ../github.com/flashcatcloud/ibex
================================================
FILE: go.sum
================================================
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/Azure/azure-sdk-for-go v65.0.0+incompatible h1:HzKLt3kIwMm4KeJYTdx9EbjRYTySD/t8i1Ee/W5EGXw=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.7.0 h1:8q4SaHjFsClSvuVne0ID/5Ka8u3fcIHyqkLjcFpNRHQ=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.7.0/go.mod h1:bjGvMhVMb+EEm3VRNQawDMUyMMjo+S5ewNjflkep/0Q=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.0 h1:vcYCAze6p19qBW7MhZybIsqD8sMV8js0NyQM8JDnVtg=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.0/go.mod h1:OQeznEEkTZ9OrhHJoDD8ZDq51FHgXjqtP9z6bEwBq9U=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.3.0 h1:sXr+ck84g/ZlZUOZiNELInmMgOsuGwdjjVkEIde0OtY=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.3.0/go.mod h1:okt5dMMTOFjX/aovMlrjvvXoPMBVSPzk9185BT0+eZM=
github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e h1:NeAW1fUYUEWhft7pkxDf6WoUvEZJ/uOKsvtpjLnn8MU=
github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e/go.mod h1:chxPXzSsl7ZWRAuOIE23GDNzjWuZquvFlgA8xmpunjU=
github.com/AzureAD/microsoft-authentication-library-for-go v1.0.0 h1:OBhqkivkhkMqLPymWEppkm7vgPQY2XsHoEkaMQ0AdZY=
github.com/AzureAD/microsoft-authentication-library-for-go v1.0.0/go.mod h1:kgDmCTgBzIEPFElEF+FK0SdjAor06dRq2Go927dnQ6o=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/ClickHouse/ch-go v0.61.5 h1:zwR8QbYI0tsMiEcze/uIMK+Tz1D3XZXLdNrlaOpeEI4=
github.com/ClickHouse/ch-go v0.61.5/go.mod h1:s1LJW/F/LcFs5HJnuogFMta50kKDO0lf9zzfrbl0RQg=
github.com/ClickHouse/clickhouse-go/v2 v2.23.2 h1:+DAKPMnxLS7pduQZsrJc8OhdLS2L9MfDEJ2TS+hpYDM=
github.com/ClickHouse/clickhouse-go/v2 v2.23.2/go.mod h1:aNap51J1OM3yxQJRgM+AlP/MPkGBCL8A74uQThoQhR0=
github.com/IBM/sarama v1.45.0 h1:IzeBevTn809IJ/dhNKhP5mpxEXTmELuezO2tgHD9G5E=
github.com/IBM/sarama v1.45.0/go.mod h1:EEay63m8EZkeumco9TDXf2JT3uDnZsZqFgV46n4yZdY=
github.com/VictoriaMetrics/metrics v1.34.0 h1:0i8k/gdOJdSoZB4Z9pikVnVQXfhcIvnG7M7h2WaQW2w=
github.com/VictoriaMetrics/metrics v1.34.0/go.mod h1:r7hveu6xMdUACXvB8TYdAj8WEsKzWB0EkpJN+RDtOf8=
github.com/VictoriaMetrics/metricsql v0.81.1 h1:1gpqI3Mwru1tCM8nZiKxBG0P+DNkjlRwLhRPII3cuho=
github.com/VictoriaMetrics/metricsql v0.81.1/go.mod h1:1g4hdCwlbJZ851PU9VN65xy9Rdlzupo6fx3SNZ8Z64U=
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc=
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
github.com/alibabacloud-go/alibabacloud-gateway-pop v0.0.6 h1:eIf+iGJxdU4U9ypaUfbtOWCsZSbTb8AUHvyPrxu6mAA=
github.com/alibabacloud-go/alibabacloud-gateway-pop v0.0.6/go.mod h1:4EUIoxs/do24zMOGGqYVWgw0s9NtiylnJglOeEB5UJo=
github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.4/go.mod h1:sCavSAvdzOjul4cEqeVtvlSaSScfNsTQ+46HwlTL1hc=
github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5 h1:zE8vH9C7JiZLNJJQ5OwjU9mSi4T9ef9u3BURT6LCLC8=
github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5/go.mod h1:tWnyE9AjF8J8qqLk645oUmVUnFybApTQWklQmi5tY6g=
github.com/alibabacloud-go/darabonba-array v0.1.0 h1:vR8s7b1fWAQIjEjWnuF0JiKsCvclSRTfDzZHTYqfufY=
github.com/alibabacloud-go/darabonba-array v0.1.0/go.mod h1:BLKxr0brnggqOJPqT09DFJ8g3fsDshapUD3C3aOEFaI=
github.com/alibabacloud-go/darabonba-encode-util v0.0.2 h1:1uJGrbsGEVqWcWxrS9MyC2NG0Ax+GpOM5gtupki31XE=
github.com/alibabacloud-go/darabonba-encode-util v0.0.2/go.mod h1:JiW9higWHYXm7F4PKuMgEUETNZasrDM6vqVr/Can7H8=
github.com/alibabacloud-go/darabonba-map v0.0.2 h1:qvPnGB4+dJbJIxOOfawxzF3hzMnIpjmafa0qOTp6udc=
github.com/alibabacloud-go/darabonba-map v0.0.2/go.mod h1:28AJaX8FOE/ym8OUFWga+MtEzBunJwQGceGQlvaPGPc=
github.com/alibabacloud-go/darabonba-openapi/v2 v2.0.12/go.mod h1:cgtLEj8i4ddXMcQgq4PnpVQvlzS+y5B+QtdSfmcLM3A=
github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.13 h1:Q00FU3H94Ts0ZIHDmY+fYGgB7dV9D/YX6FGsgorQPgw=
github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.13/go.mod h1:lxFGfobinVsQ49ntjpgWghXmIF0/Sm4+wvBJ1h5RtaE=
github.com/alibabacloud-go/darabonba-signature-util v0.0.7 h1:UzCnKvsjPFzApvODDNEYqBHMFt1w98wC7FOo0InLyxg=
github.com/alibabacloud-go/darabonba-signature-util v0.0.7/go.mod h1:oUzCYV2fcCH797xKdL6BDH8ADIHlzrtKVjeRtunBNTQ=
github.com/alibabacloud-go/darabonba-string v1.0.2 h1:E714wms5ibdzCqGeYJ9JCFywE5nDyvIXIIQbZVFkkqo=
github.com/alibabacloud-go/darabonba-string v1.0.2/go.mod h1:93cTfV3vuPhhEwGGpKKqhVW4jLe7tDpo3LUM0i0g6mA=
github.com/alibabacloud-go/debug v0.0.0-20190504072949-9472017b5c68/go.mod h1:6pb/Qy8c+lqua8cFpEy7g39NRRqOWc3rOwAy8m5Y2BY=
github.com/alibabacloud-go/debug v1.0.0/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc=
github.com/alibabacloud-go/debug v1.0.1 h1:MsW9SmUtbb1Fnt3ieC6NNZi6aEwrXfDksD4QA6GSbPg=
github.com/alibabacloud-go/debug v1.0.1/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc=
github.com/alibabacloud-go/dingtalk v1.6.95 h1:fUSo0CaMYI8AUYIKZ8+xxyElTqcisqA0EyvFAJ6wpgQ=
github.com/alibabacloud-go/dingtalk v1.6.95/go.mod h1:mUcgNRgMGQzABtiZtTK8a3b6LwQBQ8t9WsDKzklqVpg=
github.com/alibabacloud-go/endpoint-util v1.1.0 h1:r/4D3VSw888XGaeNpP994zDUaxdgTSHBbVfZlzf6b5Q=
github.com/alibabacloud-go/endpoint-util v1.1.0/go.mod h1:O5FuCALmCKs2Ff7JFJMudHs0I5EBgecXXxZRyswlEjE=
github.com/alibabacloud-go/gateway-dingtalk v1.0.2 h1:+etjmc64QTmYvHlc6eFkH9y2DOc3UPcyD2nF3IXsVqw=
github.com/alibabacloud-go/gateway-dingtalk v1.0.2/go.mod h1:JUvHpkJtlPFpgJcfXqc9Y4mk2JnoRn5XpKbRz38jJho=
github.com/alibabacloud-go/openapi-util v0.1.0/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws=
github.com/alibabacloud-go/openapi-util v0.1.1 h1:ujGErJjG8ncRW6XtBBMphzHTvCxn4DjrVw4m04HsS28=
github.com/alibabacloud-go/openapi-util v0.1.1/go.mod h1:/UehBSE2cf1gYT43GV4E+RxTdLRzURImCYY0aRmlXpw=
github.com/alibabacloud-go/tea v1.1.0/go.mod h1:IkGyUSX4Ba1V+k4pCtJUc6jDpZLFph9QMy2VUPTwukg=
github.com/alibabacloud-go/tea v1.1.7/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4=
github.com/alibabacloud-go/tea v1.1.8/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4=
github.com/alibabacloud-go/tea v1.1.11/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4=
github.com/alibabacloud-go/tea v1.1.17/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A=
github.com/alibabacloud-go/tea v1.1.20/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A=
github.com/alibabacloud-go/tea v1.2.2/go.mod h1:CF3vOzEMAG+bR4WOql8gc2G9H3EkH3ZLAQdpmpXMgwk=
github.com/alibabacloud-go/tea v1.3.13 h1:WhGy6LIXaMbBM6VBYcsDCz6K/TPsT1Ri2hPmmZffZ94=
github.com/alibabacloud-go/tea v1.3.13/go.mod h1:A560v/JTQ1n5zklt2BEpurJzZTI8TUT+Psg2drWlxRg=
github.com/alibabacloud-go/tea-utils v1.3.1/go.mod h1:EI/o33aBfj3hETm4RLiAxF/ThQdSngxrpF8rKUDJjPE=
github.com/alibabacloud-go/tea-utils/v2 v2.0.1/go.mod h1:U5MTY10WwlquGPS34DOeomUGBB0gXbLueiq5Trwu0C4=
github.com/alibabacloud-go/tea-utils/v2 v2.0.5/go.mod h1:dL6vbUT35E4F4bFTHL845eUloqaerYBYPsdWR2/jhe4=
github.com/alibabacloud-go/tea-utils/v2 v2.0.6/go.mod h1:qxn986l+q33J5VkialKMqT/TTs3E+U9MJpd001iWQ9I=
github.com/alibabacloud-go/tea-utils/v2 v2.0.7 h1:WDx5qW3Xa5ZgJ1c8NfqJkF6w+AU5wB8835UdhPr6Ax0=
github.com/alibabacloud-go/tea-utils/v2 v2.0.7/go.mod h1:qxn986l+q33J5VkialKMqT/TTs3E+U9MJpd001iWQ9I=
github.com/alibabacloud-go/tea-xml v1.1.3/go.mod h1:Rq08vgCcCAjHyRi/M7xlHKUykZCEtyBy9+DPF6GgEu8=
github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a h1:HbKu58rmZpUGpz5+4FfNmIU+FmZg2P3Xaj2v2bfNWmk=
github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a/go.mod h1:SGnFV6hVsYE877CKEZ6tDNTjaSXYUk6QqoIK6PrAtcc=
github.com/alicebob/miniredis/v2 v2.33.0 h1:uvTF0EDeu9RLnUEG27Db5I68ESoIxTiXbNUiji6lZrA=
github.com/alicebob/miniredis/v2 v2.33.0/go.mod h1:MhP4a3EU7aENRi9aO+tHfTBZicLqQevyi/DJpoj6mi0=
github.com/aliyun/credentials-go v1.1.2/go.mod h1:ozcZaMR5kLM7pwtCMEpVmQ242suV6qTJya2bDq4X1Tw=
github.com/aliyun/credentials-go v1.3.1/go.mod h1:8jKYhQuDawt8x2+fusqa1Y6mPxemTsBEN04dgcAcYz0=
github.com/aliyun/credentials-go v1.3.6/go.mod h1:1LxUuX7L5YrZUWzBrRyk0SwSdH4OmPrib8NVePL3fxM=
github.com/aliyun/credentials-go v1.4.5/go.mod h1:Jm6d+xIgwJVLVWT561vy67ZRP4lPTQxMbEYRuT2Ti1U=
github.com/aliyun/credentials-go v1.4.6 h1:CG8rc/nxCNKfXbZWpWDzI9GjF4Tuu3Es14qT8Y0ClOk=
github.com/aliyun/credentials-go v1.4.6/go.mod h1:Jm6d+xIgwJVLVWT561vy67ZRP4lPTQxMbEYRuT2Ti1U=
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
github.com/aws/aws-sdk-go v1.44.263/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/aws-sdk-go v1.44.302 h1:ST3ko6GrJKn3Xi+nAvxjG3uk/V1pW8KC52WLeIxqqNk=
github.com/aws/aws-sdk-go v1.44.302/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/aws-sdk-go-v2 v1.18.0/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw=
github.com/aws/aws-sdk-go-v2/config v1.18.25/go.mod h1:dZnYpD5wTW/dQF0rRNLVypB396zWCcPiBIvdvSWHEg4=
github.com/aws/aws-sdk-go-v2/credentials v1.13.24/go.mod h1:jYPYi99wUOPIFi0rhiOvXeSEReVOzBqFNOX5bXYoG2o=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.3/go.mod h1:4Q0UFP0YJf0NrsEuEYHpM9fTSEVnD16Z3uyEF7J9JGM=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33/go.mod h1:7i0PF1ME/2eUPFcjkVIwq+DOygHEoK92t5cDqNgYbIw=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27/go.mod h1:UrHnn3QV/d0pBZ6QBAEQcqFLf8FAzLmoUfPVIueOvoM=
github.com/aws/aws-sdk-go-v2/internal/ini v1.3.34/go.mod h1:Etz2dj6UHYuw+Xw830KfzCfWGMzqvUTCjUj5b76GVDc=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.27/go.mod h1:EOwBD4J4S5qYszS5/3DpkejfuK+Z5/1uzICfPaZLtqw=
github.com/aws/aws-sdk-go-v2/service/sso v1.12.10/go.mod h1:ouy2P4z6sJN70fR3ka3wD3Ro3KezSxU6eKGQI2+2fjI=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.10/go.mod h1:AFvkxc8xfBe8XA+5St5XIHHrQQtkxqrRincx4hmMHOk=
github.com/aws/aws-sdk-go-v2/service/sts v1.19.0/go.mod h1:BgQOMsg8av8jset59jelyPW7NoZcZXLVpDsXunGDrk8=
github.com/aws/smithy-go v1.13.5/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bitly/go-simplejson v0.5.1 h1:xgwPbetQScXt1gh9BmoJ6j9JMr3TElvuIyjR8pgdoow=
github.com/bitly/go-simplejson v0.5.1/go.mod h1:YOPVLzCfwK14b4Sff3oP1AmGhI9T9Vsg84etUnlyp+Q=
github.com/bsm/ginkgo/v2 v2.5.0 h1:aOAnND1T40wEdAtkGSkvSICWeQ8L3UASX7YVCqQx+eQ=
github.com/bsm/ginkgo/v2 v2.5.0/go.mod h1:AiKlXPm7ItEHNc/2+OkrNG4E0ITzojb9/xWzvQ9XZ9w=
github.com/bsm/gomega v1.20.0 h1:JhAwLmtRzXFTx2AkALSLa8ijZafntmhSoU63Ok18Uq8=
github.com/bsm/gomega v1.20.0/go.mod h1:JifAceMQ4crZIWYUKrlGcmbN3bqHogVTADMD2ATsbwk=
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/clbanning/mxj/v2 v2.5.5/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s=
github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME=
github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/coreos/go-oidc v2.2.1+incompatible h1:mh48q/BqXqgjVHpy2ZY7WnWAbenxRjsz9N1i1YxjHAk=
github.com/coreos/go-oidc v2.2.1+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHoZ1nMCKZlZ9V6mm3/LKc=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE=
github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA=
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/eapache/go-resiliency v1.7.0 h1:n3NRTnBn5N0Cbi/IeOHuQn9s2UwVUH7Ga0ZWcP+9JTA=
github.com/eapache/go-resiliency v1.7.0/go.mod h1:5yPzW0MIvSe0JDsv0v+DvcjEv2FyD6iZYSs1ZI+iQho=
github.com/eapache/go-xerial-snappy v0.0.0-20230731223053-c322873962e3 h1:Oy0F4ALJ04o5Qqpdz8XLIpNA3WM/iSIXqxtqo7UGVws=
github.com/eapache/go-xerial-snappy v0.0.0-20230731223053-c322873962e3/go.mod h1:YvSRo5mw33fLEx1+DlK6L2VV43tJt5Eyel9n9XBcR+0=
github.com/eapache/queue v1.1.0 h1:YOEu7KNc61ntiQlcEeUIoDTJ2o8mQznoNvUhiigpIqc=
github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/expr-lang/expr v1.16.1 h1:Na8CUcMdyGbnNpShY7kzcHCU7WqxuL+hnxgHZ4vaz/A=
github.com/expr-lang/expr v1.16.1/go.mod h1:uCkhfG+x7fcZ5A5sXHKuQ07jGZRl6J0FCAaf2k4PtVQ=
github.com/fatih/camelcase v1.0.0 h1:hxNvNX/xYBp0ovncs8WyWZrOrpBNub/JfaMvbURyft8=
github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc=
github.com/fatih/structs v1.1.0 h1:Q7juDM0QtcnhCpeyLGQKyg4TOIghuNXrkL32pHAUMxo=
github.com/fatih/structs v1.1.0/go.mod h1:9NiDSp5zOcgEDl+j00MP/WkGVPOlPRLejGD8Ga6PJ7M=
github.com/flashcatcloud/ibex v1.3.6 h1:lJShPFxcZksmkB0w99a3uROGB+Fie1NsqOlkAdar12A=
github.com/flashcatcloud/ibex v1.3.6/go.mod h1:iTU1dKT9TnDNllRPRHUOjXe+HDTQkPH2TeaucHtSuh4=
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/garyburd/redigo v1.6.2/go.mod h1:NR3MbYisc3/PwhQ00EMzDiPmrwpPxAn5GI05/YaO1SY=
github.com/gin-contrib/pprof v1.4.0 h1:XxiBSf5jWZ5i16lNOPbMTVdgHBdhfGRD5PZ1LWazzvg=
github.com/gin-contrib/pprof v1.4.0/go.mod h1:RrehPJasUVBPK6yTUwOl8/NP6i0vbUgmxtis+Z5KE90=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
github.com/gin-gonic/gin v1.8.1/go.mod h1:ji8BvRH1azfM+SYow9zQ6SZMvR8qOMZHmsCuWR9tTTk=
github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
github.com/glebarez/go-sqlite v1.21.2 h1:3a6LFC4sKahUunAmynQKLZceZCOzUthkRkEAl9gAXWo=
github.com/glebarez/go-sqlite v1.21.2/go.mod h1:sfxdZyhQjTM2Wry3gVYWaW072Ri1WMdWJi0k6+3382k=
github.com/glebarez/sqlite v1.11.0 h1:wSG0irqzP6VurnMEpFGer5Li19RpIRi2qvQz++w0GMw=
github.com/glebarez/sqlite v1.11.0/go.mod h1:h8/o8j5wiAsqSPoWELDUdJXhjAhsVliSn7bWZjOhrgQ=
github.com/go-asn1-ber/asn1-ber v1.5.4 h1:vXT6d/FNDiELJnLb6hGNa309LMsrCoYFvpwHDF0+Y1A=
github.com/go-asn1-ber/asn1-ber v1.5.4/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-faster/city v1.0.1 h1:4WAxSZ3V2Ws4QRDrscLEDcibJY8uf41H6AhXDrNDcGw=
github.com/go-faster/city v1.0.1/go.mod h1:jKcUJId49qdW3L1qKHH/3wPeUstCVpVSXTM6vO3VcTw=
github.com/go-faster/errors v0.7.1 h1:MkJTnDoEdi9pDabt1dpWf7AA8/BaSYZqibYyhZ20AYg=
github.com/go-faster/errors v0.7.1/go.mod h1:5ySTjWFiphBs07IKuiL69nxdfd5+fzh1u7FPGZP2quo=
github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU=
github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0=
github.com/go-ldap/ldap/v3 v3.4.4 h1:qPjipEpt+qDa6SI/h1fzuGWoRUY+qqQ9sOZq67/PYUs=
github.com/go-ldap/ldap/v3 v3.4.4/go.mod h1:fe1MsuN5eJJ1FeLT/LEBVdWfNWKh459R7aXgXtJC+aI=
github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4=
github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs=
github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.0/go.mod h1:UvRDBj+xPUEGrFYl+lu/H90nyDXpg0fqeB/AQUGNTVA=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.10.0/go.mod h1:74x4gJWsvQexRdW8Pn3dXSGrTK4nAUsbPlLADvpJkos=
github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg=
github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20230705174524-200ffdc848b8 h1:n6vlPhxsA+BW/XsS5+uqi7GyzaLa5MH7qlSLBZtRdiA=
github.com/google/pprof v0.0.0-20230705174524-200ffdc848b8/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gorilla/securecookie v1.1.1/go.mod h1:ra0sb63/xPlUeL+yeDciTfxMRAA+MP+HVt/4epWDjd4=
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grafana/regexp v0.0.0-20221122212121-6b5c0a4cb7fd h1:PpuIBO5P3e9hpqBD0O/HjhShYuM6XE0i/lbE6J94kww=
github.com/grafana/regexp v0.0.0-20221122212121-6b5c0a4cb7fd/go.mod h1:M5qHK+eWfAv8VR/265dIuEpL3fNfeC21tXXp9itM24A=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I=
github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/hashicorp/go-uuid v1.0.2/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8=
github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek=
github.com/hashicorp/go-version v1.6.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.1 h1:x7SYsPBYDkHDksogeSmZZ5xzThcTgRz++I5E+ePFUcs=
github.com/jackc/pgx/v5 v5.7.1/go.mod h1:e7O26IywZZ+naJtWWos6i6fvWK+29etgITqrqHLfoZA=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/jcmturner/aescts/v2 v2.0.0 h1:9YKLH6ey7H4eDBXW8khjYslgyqG2xZikXP0EQFKrle8=
github.com/jcmturner/aescts/v2 v2.0.0/go.mod h1:AiaICIRyfYg35RUkr8yESTqvSy7csK90qZ5xfvvsoNs=
github.com/jcmturner/dnsutils/v2 v2.0.0 h1:lltnkeZGL0wILNvrNiVCR6Ro5PGU/SeBvVO/8c/iPbo=
github.com/jcmturner/dnsutils/v2 v2.0.0/go.mod h1:b0TnjGOvI/n42bZa+hmXL+kFJZsFT7G4t3HTlQ184QM=
github.com/jcmturner/gofork v1.7.6 h1:QH0l3hzAU1tfT3rZCnW5zXl+orbkNMMRGJfdJjHVETg=
github.com/jcmturner/gofork v1.7.6/go.mod h1:1622LH6i/EZqLloHfE7IeZ0uEJwMSUyQ/nDd82IeqRo=
github.com/jcmturner/goidentity/v6 v6.0.1 h1:VKnZd2oEIMorCTsFBnJWbExfNN7yZr3EhJAxwOkZg6o=
github.com/jcmturner/goidentity/v6 v6.0.1/go.mod h1:X1YW3bgtvwAXju7V3LCIMpY0Gbxyjn/mY9zx4tFonSg=
github.com/jcmturner/gokrb5/v8 v8.4.4 h1:x1Sv4HaTpepFkXbt2IkL29DXRf8sOfZXo8eRKh687T8=
github.com/jcmturner/gokrb5/v8 v8.4.4/go.mod h1:1btQEpgT6k+unzCwX1KdWMEwPPkkgBtP+F6aCACiMrs=
github.com/jcmturner/rpc/v2 v2.0.3 h1:7FXXj8Ti1IaVFpSAziCZWNzbNuZmnvw/i6CqLNdWfZY=
github.com/jcmturner/rpc/v2 v2.0.3/go.mod h1:VUJYCIDm3PVOEHw8sgt091/20OJjskO/YJki3ELg/Hc=
github.com/jinzhu/copier v0.4.0 h1:w3ciUoD19shMCRargcpm0cm91ytaBhDvuRpz1ODO/U8=
github.com/jinzhu/copier v0.4.0/go.mod h1:DfbEm0FYsaqBcKcFuvmOZb218JkPGtvSHsKg8S8hyyg=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.4/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg=
github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/koding/multiconfig v0.0.0-20171124222453-69c27309b2d7 h1:SWlt7BoQNASbhTUD0Oy5yysI2seJ7vWuGUp///OM4TM=
github.com/koding/multiconfig v0.0.0-20171124222453-69c27309b2d7/go.mod h1:Y2SaZf2Rzd0pXkLVhLlCiAXFCLSXAIbTKDivVgff/AM=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/larksuite/oapi-sdk-go/v3 v3.5.1 h1:gX4dz92YU70inuIX+ug+PBe64eHToIN9rHB4Vupv5Eg=
github.com/larksuite/oapi-sdk-go/v3 v3.5.1/go.mod h1:ZEplY+kwuIrj/nqw5uSCINNATcH3KdxSN7y+UxYY5fI=
github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY=
github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/mojocn/base64Captcha v1.3.6 h1:gZEKu1nsKpttuIAQgWHO+4Mhhls8cAKyiV2Ew03H+Tw=
github.com/mojocn/base64Captcha v1.3.6/go.mod h1:i5CtHvm+oMbj1UzEPXaA8IH/xHFZ3DGY3Wh3dBpZ28E=
github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/n9e/elastic/v7 v7.0.33-0.20251031061708-f480a2dfcfa7 h1:fPs1GClmnQZ6E/nzrJCieQKJNM46eqMkHaBg3SoHcgY=
github.com/n9e/elastic/v7 v7.0.33-0.20251031061708-f480a2dfcfa7/go.mod h1:/kVskIy0Pd8nAiKtPtcI4XnzOM+pM6MWQ+zP6YqPVFI=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
github.com/opensearch-project/opensearch-go/v2 v2.3.0 h1:nQIEMr+A92CkhHrZgUhcfsrZjibvB3APXf2a1VwCmMQ=
github.com/opensearch-project/opensearch-go/v2 v2.3.0/go.mod h1:8LDr9FCgUTVoT+5ESjc2+iaZuldqE+23Iq0r1XeNue8=
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
github.com/paulmach/orb v0.11.1 h1:3koVegMC4X/WeiXYz9iswopaTwMem53NzTJuTF20JzU=
github.com/paulmach/orb v0.11.1/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU=
github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY=
github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU=
github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pingcap/errors v0.11.0/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
github.com/pingcap/errors v0.11.5-0.20250523034308-74f78ae071ee h1:/IDPbpzkzA97t1/Z1+C3KlxbevjMeaI6BQYxvivu4u8=
github.com/pingcap/errors v0.11.5-0.20250523034308-74f78ae071ee/go.mod h1:X2r9ueLEUZgtx2cIogM0v4Zj5uvvzhuuiu7Pn8HzMPg=
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 h1:tdMsjOqUR7YXHoBitzdebTvOjs/swniBTOLy5XiMtuE=
github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86/go.mod h1:exzhVYca3WRtd6gclGNErRWb1qEgff3LYta0LvRmON4=
github.com/pingcap/log v1.1.0 h1:ELiPxACz7vdo1qAvvaWJg1NrYFoY6gqAh/+Uo6aXdD8=
github.com/pingcap/log v1.1.0/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4=
github.com/pingcap/tidb/pkg/parser v0.0.0-20260120034856-e15515e804da h1:PhkRZgMWdq9kTsu7vtVbcDs+SBXjHfFj84027WVZCzI=
github.com/pingcap/tidb/pkg/parser v0.0.0-20260120034856-e15515e804da/go.mod h1:oHE+ub2QaDERd+UNHe4z2BhFV2jZrm7VNOe6atR9AF4=
github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 h1:KoWmjvw+nsYOo29YJK9vDA65RGE3NrOnUtO7a+RF9HU=
github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8/go.mod h1:HKlIX3XHQyzLZPlr7++PzdhaXEj94dEiJgZDTsxEqUI=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pquerna/cachecontrol v0.1.0 h1:yJMy84ti9h/+OEWa752kBTKv4XC30OtVVHYv/8cTqKc=
github.com/pquerna/cachecontrol v0.1.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI=
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPAaSc=
github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw=
github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4=
github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/prometheus/prometheus v0.47.1 h1:bd2LiZyxzHn9Oo2Ei4eK2D86vz/L/OiqR1qYo0XmMBo=
github.com/prometheus/prometheus v0.47.1/go.mod h1:J/bmOSjgH7lFxz2gZhrWEZs2i64vMS+HIuZfmYNhJ/M=
github.com/rakyll/statik v0.1.7 h1:OF3QCZUuyPxuGEP7B4ypUa7sB/iHtqOTDYZXGM8KOdQ=
github.com/rakyll/statik v0.1.7/go.mod h1:AlZONWzMtEnMs7W4e/1LURLiI49pIMmp6V9Unghqrcc=
github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM=
github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/redis/go-redis/v9 v9.0.2 h1:BA426Zqe/7r56kCcvxYLWe1mkaz71LKF77GwgFzSxfE=
github.com/redis/go-redis/v9 v9.0.2/go.mod h1:/xDTe9EF1LM61hek62Poq2nzQSGj0xSrEtEHbBQevps=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/robfig/go-cache v0.0.0-20130306151617-9fc39e0dbf62/go.mod h1:65XQgovT59RWatovFwnwocoUxiI/eENTnOY5GK3STuY=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys=
github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k=
github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/assertions v1.1.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo=
github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tidwall/gjson v1.14.2 h1:6BBkirS0rAHjumnjHF6qgy5d2YAJ1TLIaFE2lzfOLqo=
github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tjfoc/gmsm v1.3.2/go.mod h1:HaUcFuY0auTiaHB9MHFGCPx5IaLhTUd2atbCFBQXn9w=
github.com/tjfoc/gmsm v1.4.1 h1:aMe1GlZb+0bLjn+cKTPEvvn9oUEBlJitaZiiBwsbgho=
github.com/tjfoc/gmsm v1.4.1/go.mod h1:j4INPkHWMrhJb38G+J6W4Tw0AbuN8Thu3PbdVYhVcTE=
github.com/toolkits/pkg v1.3.8 h1:2yamC20c5mHRtbcGiLY99Lm/2mVitFn6onE8KKvMT1o=
github.com/toolkits/pkg v1.3.8/go.mod h1:M9ecwFGW1vxCTUFM9sr2ZjXSKb04N+1sTQ6SA3RNAIU=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=
github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/valyala/fastrand v1.1.0 h1:f+5HkLW4rsgzdNoleUOB69hyT9IlD2ZQh9GyDMfb5G8=
github.com/valyala/fastrand v1.1.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ=
github.com/valyala/histogram v1.2.0 h1:wyYGAZZt3CpwUiIb9AU/Zbllg1llXyrtApRS815OLoQ=
github.com/valyala/histogram v1.2.0/go.mod h1:Hb4kBwb4UxsaNbbbh+RRz8ZR6pdodR57tzWUS3BUzXY=
github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g=
github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8=
github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.30/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
go.mongodb.org/mongo-driver v1.11.4/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g=
go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U=
go.opentelemetry.io/otel v1.32.0/go.mod h1:00DCVSB0RQcnzlwyTfqtxSm+DRr9hpYrHjNGiBHVQIg=
go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM=
go.opentelemetry.io/otel/trace v1.32.0/go.mod h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8=
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/automaxprocs v1.4.0/go.mod h1:/mTEdr7LvHhs0v7mjdxDreTz1OG5zdZGqgOnhWiR/+Q=
go.uber.org/automaxprocs v1.5.2 h1:2LxUOGiR3O6tw8ui5sZa2LAaHnsviZdVOUZw4fvbnME=
go.uber.org/automaxprocs v1.5.2/go.mod h1:eRbA25aqJrxAbsLO0xy5jVwPt7FQnRgjW+efnwa1WM0=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.7.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191219195013-becbf705a915/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58=
golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU=
golang.org/x/crypto v0.9.0/go.mod h1:yrmDGqONDYtNj3tH8X9dzUun2m2lzPa9ngI6/RUPGR0=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 h1:MGwJjxBy0HJshjDNfLsYO8xppfqWlA5ZT9OhtUUhTNw=
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc=
golang.org/x/image v0.13.0/go.mod h1:6mmbMOeV28HuMTgA6OSRkdXKYw/t5W9Uwn2Yv1r3Yxk=
golang.org/x/image v0.18.0 h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ=
golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M=
golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200509044756-6aff5f38e54f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200509030707-2212a7e161a5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA=
google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc h1:2gGKlE2+asNV9m7xrywl36YYNnBG5ZQ0r/BOOxqPpmk=
gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc/go.mod h1:m7x9LTH6d71AHyAX77c9yqWCCa3UKHcVEj9y7hAtKDk=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df h1:n7WqCuqOuCbNr617RXOY0AWRXxgwEyPp2z+p0+hgMuE=
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df/go.mod h1:LRQQ+SO6ZHR7tOkpBDuZnXENFzX8qRjMDMyPD6BRkCw=
gopkg.in/ini.v1 v1.56.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k=
gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
gopkg.in/square/go-jose.v2 v2.6.0 h1:NGk74WTnPKBNUhNzQX7PYcTLUjoq7mzKk2OKbvwk2iI=
gopkg.in/square/go-jose.v2 v2.6.0/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gorm.io/driver/clickhouse v0.6.1 h1:t7JMB6sLBXxN8hEO6RdzCbJCwq/jAEVZdwXlmQs1Sd4=
gorm.io/driver/clickhouse v0.6.1/go.mod h1:riMYpJcGZ3sJ/OAZZ1rEP1j/Y0H6cByOAnwz7fo2AyM=
gorm.io/driver/mysql v1.4.4 h1:MX0K9Qvy0Na4o7qSC/YI7XxqUw5KDw01umqgID+svdQ=
gorm.io/driver/mysql v1.4.4/go.mod h1:BCg8cKI+R0j/rZRQxeKis/forqRwRSYOR8OM3Wo6hOM=
gorm.io/driver/postgres v1.5.11 h1:ubBVAfbKEUld/twyKZ0IYn9rSQh448EdelLYk9Mv314=
gorm.io/driver/postgres v1.5.11/go.mod h1:DX3GReXH+3FPWGrrgffdvCk3DQ1dwDPdmbenSkweRGI=
gorm.io/driver/sqlite v1.5.5 h1:7MDMtUZhV065SilG62E0MquljeArQZNfJnjd9i9gx3E=
gorm.io/driver/sqlite v1.5.5/go.mod h1:6NgQ7sQWAIFsPrJJl1lSNSu2TABh0ZZ/zm5fosATavE=
gorm.io/gorm v1.23.8/go.mod h1:l2lP/RyAtc1ynaTjFksBde/O8v9oOGIApu2/xRitmZk=
gorm.io/gorm v1.25.10 h1:dQpO+33KalOA+aFYGlK+EfxcI5MbO7EP2yYygwh9h+s=
gorm.io/gorm v1.25.10/go.mod h1:hbnx/Oo0ChWMn1BIhpy1oYozzpM15i4YPuHDmfYtwg8=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
modernc.org/libc v1.22.5 h1:91BNch/e5B0uPbJFgqbxXuOnxBQjlS//icfQEGmvyjE=
modernc.org/libc v1.22.5/go.mod h1:jj+Z7dTNX8fBScMVNRAYZ/jF91K8fdT2hYMThc3YjBY=
modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo=
modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds=
modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU=
modernc.org/sqlite v1.23.1 h1:nrSBg4aRQQwq59JpvGEQ15tNxoO5pX/kUjcRNwSAGQM=
modernc.org/sqlite v1.23.1/go.mod h1:OrDj17Mggn6MhE+iPbBNf7RGKODDE9NFT0f3EwDzJqk=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
================================================
FILE: integrations/AMD_ROCm_SMI/collect/amd_rocm_smi/rocm.toml
================================================
# Query statistics from AMD Graphics cards using rocm-smi binary
# bin_path = "/opt/rocm/bin/rocm-smi"
## Optional: timeout for GPU polling
# timeout = "5s"
================================================
FILE: integrations/AMD_ROCm_SMI/markdown/README.md
================================================
# AMD ROCm System Management Interface (SMI) Input Plugin
forked from [telegraf/amd_rocm_smi](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/amd_rocm_smi)
This plugin queries the [`rocm-smi`][1] binary to pull GPU stats,
including memory and GPU usage, temperatures, and other metrics.
[1]: https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools
## Global configuration options
In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and fields, or to create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
[CONFIGURATION.md]: https://github.com/influxdata/telegraf/blob/master/docs/CONFIGURATION.md#plugins
## Configuration
```toml
# Query statistics from AMD Graphics cards using rocm-smi binary
# bin_path = "/opt/rocm/bin/rocm-smi"
# bin_path 不设置 则不采集
## Optional: timeout for GPU polling
# timeout = "5s"
```
## Metrics
- measurement: `amd_rocm_smi`
- tags
- `name` (entry name assigned by rocm-smi executable)
- `gpu_id` (id of the GPU according to rocm-smi)
- `gpu_unique_id` (unique id of the GPU)
- fields
- `driver_version` (integer)
- `fan_speed` (integer)
- `memory_total` (integer B)
- `memory_used` (integer B)
- `memory_free` (integer B)
- `temperature_sensor_edge` (float, Celsius)
- `temperature_sensor_junction` (float, Celsius)
- `temperature_sensor_memory` (float, Celsius)
- `utilization_gpu` (integer, percentage)
- `utilization_memory` (integer, percentage)
- `clocks_current_sm` (integer, MHz)
- `clocks_current_memory` (integer, MHz)
- `power_draw` (float, Watt)
## Troubleshooting
Check the full output by running `rocm-smi` binary manually.
Linux:
```sh
rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json
```
Please include the output of this command if opening a GitHub issue, together
with ROCm version.
## Example Output
```text
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=28,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572551000000000
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=30,temperature_sensor_memory=91,utilization_gpu=0i 1630572701000000000
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572749000000000
```
## Limitations and notices
Please note that this plugin has been developed and tested on a limited number
of versions and a small set of GPUs. Currently the latest ROCm version tested is
4.3.0. Depending on the device and driver versions, the amount of
information provided by `rocm-smi` can vary, so some fields may start or stop
appearing in the metrics after updates. The `rocm-smi` JSON output is not
perfectly homogeneous and may change in the future, hence parsing and
unmarshaling can start failing after updating ROCm.
Inspired by the current state of the art of the `nvidia-smi` plugin.
================================================
FILE: integrations/AliYun/collect/aliyun/cloud.toml
================================================
# # collect interval
# interval = 60
[[instances]]
# # endpoint region 参考 https://help.aliyun.com/document_detail/28616.html#section-72p-xhs-6qt
# region="cn-beijing"
# endpoint="metrics.cn-hangzhou.aliyuncs.com"
# access_key_id="your-access-key-id"
# access_key_secret="your-access-key-secret"
# interval_times=4
# delay="10m"
# period="60s"
# # namespace 参考 https://help.aliyun.com/document_detail/163515.htm?spm=a2c4g.11186623.0.0.44d65c58mhgNw3
# namespaces=["acs_ecs_dashboard"]
# [[instances.metric_filters]]
# # metric name 参考 https://help.aliyun.com/document_detail/163515.htm?spm=a2c4g.11186623.0.0.401d15c73Z0dZh
# # 参考页面中的Metric Id 填入下面的metricName ,页面中包含中文的Metric Name对应接口中的Description
# metric_names=["cpu_cores","vm.TcpCount"]
# namespace=""
# ratelimit=25
# catch_ttl="1h"
# timeout="5s"
================================================
FILE: integrations/AliYun/dashboards/arms-api.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ARMS-API",
"ident": "",
"tags": "ARMS JVM",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "2309f230-83bc-4e48-8422-5d9556154af1",
"layout": {
"h": 1,
"i": "2309f230-83bc-4e48-8422-5d9556154af1",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "指标汇总",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"host"
],
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelValuesToRows",
"showHeader": true,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "展示选定时段内承载了接口R的机器指标",
"id": "76a4644d-7b96-4e4c-8486-7f93b6fda1e6",
"layout": {
"h": 6,
"i": "73025dc0-5732-4753-8eac-c56adf1e42b1",
"isResizable": true,
"w": 24,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "机器维度-指标列表(选定时段)",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "sum by (host) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))",
"legend": "请求总量",
"refId": "A"
},
{
"expr": "sum by (host) (sum_over_time(arms_app_requests_error_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))",
"legend": "错误总数",
"refId": "B"
},
{
"expr": "sum by (host) (sum_over_time(arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))/sum by (host) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))",
"legend": "平均耗时",
"refId": "C"
},
{
"expr": "sum by (host) (sum_over_time(arms_exception_requests_count{service=\"$service\",rpc=\"$rpc\"}[$__range])) or on (host) (group by (host) (max_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))-1)",
"legend": "异常数",
"refId": "G"
},
{
"expr": "max by (host) (max_over_time(arms_http_requests_latency_seconds{service=\"$service\",rpc=\"$rpc\",quantile=\"0.75\"}[$__range])) or on (host) (group by (host) (max_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))-1)",
"legend": "75分位延时",
"refId": "D"
},
{
"expr": "max by (host) (max_over_time(arms_http_requests_latency_seconds{service=\"$service\",rpc=\"$rpc\",quantile=\"0.9\"}[$__range])) or on (host) (group by (host) (max_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))-1)",
"legend": "90分位延时",
"refId": "E"
},
{
"expr": "max by (host) (max_over_time(arms_http_requests_latency_seconds{service=\"$service\",rpc=\"$rpc\",quantile=\"0.99\"}[$__range])) or on (host) (group by (host) (max_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))-1)",
"legend": "{{host}}:99分位",
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "66d6b812-fbc3-4a71-a9a8-0d6b72dafe04",
"layout": {
"h": 1,
"i": "66d6b812-fbc3-4a71-a9a8-0d6b72dafe04",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "概览",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "17655cb8-e070-48cc-ba16-cba9d5c3e984",
"layout": {
"h": 8,
"i": "17655cb8-e070-48cc-ba16-cba9d5c3e984",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "接口耗时堆叠图",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum by (callType) (arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",callType=~\"^dubbo_client$|^http_client$|^mongodb$|^oracle$|^client$|^redis$|^dmdb$|^thrift_client$|^dsf_client$|^db$|^mq_client$|^grpc_client$|^hsf_client$|^mysql$|^postgresql$|^memcached$|^consumer$\"}) or label_replace(sum(arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\"})- sum(arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",callType=~\"^dubbo_client$|^http_client$|^mongodb$|^oracle$|^client$|^redis$|^dmdb$|^thrift_client$|^dsf_client$|^db$|^mq_client$|^grpc_client$|^hsf_client$|^mysql$|^postgresql$|^memcached$|^consumer$\"}),\"callType\",\"other\",\"callType\",\".*\"))/ on () group_left() sum(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\"})",
"legend": "{{callType}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e2d00781-a62e-4e7e-8808-05b89c986141",
"layout": {
"h": 8,
"i": "e2d00781-a62e-4e7e-8808-05b89c986141",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "响应时间/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))/sum by (callType) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5df881f7-c6a0-49ff-8756-ad0fa079e7a4",
"layout": {
"h": 8,
"i": "5df881f7-c6a0-49ff-8756-ad0fa079e7a4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 16
},
"links": [],
"maxPerRow": 4,
"name": "请求数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "929ab59e-08fe-4665-beed-8327cc5e4a96",
"layout": {
"h": 8,
"i": "929ab59e-08fe-4665-beed-8327cc5e4a96",
"isResizable": true,
"w": 12,
"x": 12,
"y": 16
},
"links": [],
"maxPerRow": 4,
"name": "HTTP-状态码统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (status) (sum_over_time(arms_requests_by_status_count{service=\"$service\",callType=\"http\",rpc=\"$rpc\",host=~\"$host\"}[1m]))",
"legend": "{{status}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0d1843e3-d786-4763-9e2d-4e7b1621a372",
"layout": {
"h": 8,
"i": "0d1843e3-d786-4763-9e2d-4e7b1621a372",
"isResizable": true,
"w": 12,
"x": 0,
"y": 24
},
"links": [],
"maxPerRow": 4,
"name": "响应时间-分数位指标",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(arms_http_requests_latency_seconds{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",quantile=\"0.75\"})",
"legend": "p75",
"refId": "A"
},
{
"expr": "max(arms_http_requests_latency_seconds{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",quantile=\"0.9\"})",
"legend": "p90",
"refId": "B"
},
{
"expr": "max(arms_http_requests_latency_seconds{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",quantile=\"0.99\"})",
"legend": "p99",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "808cfbe5-57dd-44c5-82c7-1877c632519f",
"layout": {
"h": 8,
"i": "808cfbe5-57dd-44c5-82c7-1877c632519f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "错误数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_error_count{service=\"$service\",rpc=\"$rpc\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "dfb801f2-06b2-4cdf-824a-c48b6a579085",
"layout": {
"h": 1,
"i": "dfb801f2-06b2-4cdf-824a-c48b6a579085",
"isResizable": false,
"w": 24,
"x": 0,
"y": 33
},
"name": "链路上游",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"uniq"
],
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelValuesToRows",
"showHeader": true,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "展示选定时段内调用了部署在机器N(默认为All,表示所有机器)上的接口R的上游服务的相关指标",
"id": "2386728f-9a8f-406b-98d5-5d5f0eda2c21",
"layout": {
"h": 6,
"i": "2386728f-9a8f-406b-98d5-5d5f0eda2c21",
"isResizable": true,
"w": 24,
"x": 0,
"y": 34
},
"links": [],
"maxPerRow": 4,
"name": "链路上游",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "label_join(sum by (prpc,parent) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",prpc!=\"nil\",prpc!=\"\",host=~\"$host\",parent!=\"\",parent!=\"nil\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range])),\"uniq\",\"-\",\"parent\",\"prpc\")",
"legend": "",
"refId": "A"
},
{
"expr": "sum by (uniq) (label_join(sum_over_time(arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",prpc!=\"nil\",prpc!=\"\",parent!=\"\",parent!=\"nil\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range]),\"uniq\",\"-\",\"parent\",\"prpc\"))/sum by (uniq) (label_join(sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",prpc!=\"nil\",prpc!=\"\",parent!=\"\",parent!=\"nil\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range]),\"uniq\",\"-\",\"parent\",\"prpc\"))",
"legend": "耗时",
"refId": "B"
},
{
"expr": "sum by (uniq) (label_join(sum_over_time(arms_app_requests_error_count{service=\"$service\",rpc=\"$rpc\",prpc!=\"\",prpc!=\"nil\",parent!=\"nil\",parent!=\"\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range]),\"uniq\",\"-\",\"parent\",\"prpc\"))",
"legend": "错误数",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c7571f5a-9fde-4483-8d53-8841486a5351",
"layout": {
"h": 8,
"i": "c7571f5a-9fde-4483-8d53-8841486a5351",
"isResizable": true,
"w": 24,
"x": 0,
"y": 40
},
"links": [],
"maxPerRow": 4,
"name": "请求数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (parent,prpc) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",prpc!=\"\",prpc!=\"nil\",parent!=\"\",parent!=\"nil\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))",
"legend": "{{parent}}:{{prpc}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "853de6b3-5655-4011-bb2d-144b42f0ddc9",
"layout": {
"h": 9,
"i": "853de6b3-5655-4011-bb2d-144b42f0ddc9",
"isResizable": true,
"w": 24,
"x": 0,
"y": 48
},
"links": [],
"maxPerRow": 4,
"name": "错误数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (parent,prpc) (sum_over_time(arms_app_requests_error_count{service=\"$service\",rpc=\"$rpc\",prpc!=\"\",prpc!=\"nil\",parent!=\"\",parent!=\"nil\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))",
"legend": "{{parent}}:{{prpc}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5c5c26c1-d291-477f-bc9e-8d12eefc9536",
"layout": {
"h": 8,
"i": "5c5c26c1-d291-477f-bc9e-8d12eefc9536",
"isResizable": true,
"w": 24,
"x": 0,
"y": 57
},
"links": [],
"maxPerRow": 4,
"name": "耗时/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (parent,prpc) (sum_over_time(arms_app_requests_seconds{service=\"$service\",rpc=\"$rpc\",prpc!=\"\",prpc!=\"nil\",parent!=\"\",parent!=\"nil\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))/sum by (parent,prpc) (sum_over_time(arms_app_requests_count{service=\"$service\",rpc=\"$rpc\",prpc!=\"\",prpc!=\"nil\",parent!=\"\",parent!=\"nil\",host=~\"$host\",rpc!=\"__all__\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))",
"legend": "{{parent}}=\u003e{{prpc}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "077fbbac-ab18-4cd6-9898-6730733ae356",
"layout": {
"h": 1,
"i": "077fbbac-ab18-4cd6-9898-6730733ae356",
"isResizable": false,
"w": 24,
"x": 0,
"y": 65
},
"name": "链路下游",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"uniq"
],
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelValuesToRows",
"showHeader": true,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "展示选定时段内部署在机器N(默认为All,表示所有机器)上的接口R所调用的下游服务的相关指标",
"id": "ab72abb7-a807-4560-891c-449a223ac89d",
"layout": {
"h": 9,
"i": "ab72abb7-a807-4560-891c-449a223ac89d",
"isResizable": true,
"w": 24,
"x": 0,
"y": 66
},
"links": [],
"maxPerRow": 4,
"name": "链路下游",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "label_join(sum by (service,rpc) (sum_over_time(arms_app_requests_count{prpc=\"$rpc\",parent=\"$service\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range])),\"uniq\",\"-\",\"service\",\"rpc\")",
"legend": "",
"refId": "A"
},
{
"expr": "sum by (uniq) (label_join(sum_over_time(arms_app_requests_seconds{parent=\"$service\",prpc=\"$rpc\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range]),\"uniq\",\"-\",\"service\",\"rpc\"))/sum by (uniq) (label_join(sum_over_time(arms_app_requests_count{parent=\"$service\",prpc=\"$rpc\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range]),\"uniq\",\"-\",\"service\",\"rpc\"))",
"legend": "耗时",
"refId": "B"
},
{
"expr": "sum by (uniq) (label_join(sum_over_time(arms_app_requests_error_count{parent=\"$service\",prpc=\"$rpc\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[$__range]),\"uniq\",\"-\",\"service\",\"rpc\"))",
"legend": "错误数",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "edcbe109-e76f-4730-b3d3-64a649f93c3f",
"layout": {
"h": 9,
"i": "edcbe109-e76f-4730-b3d3-64a649f93c3f",
"isResizable": true,
"w": 24,
"x": 0,
"y": 75
},
"links": [],
"maxPerRow": 4,
"name": "请求数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (service,rpc) (sum_over_time(arms_app_requests_count{prpc=\"$rpc\",parent=\"$service\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))",
"legend": "{{service}}:{{rpc}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9091d039-6524-4c81-9573-17152a2f7970",
"layout": {
"h": 9,
"i": "9091d039-6524-4c81-9573-17152a2f7970",
"isResizable": true,
"w": 24,
"x": 0,
"y": 84
},
"links": [],
"maxPerRow": 4,
"name": "耗时/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (service,rpc) (sum_over_time(arms_app_requests_seconds{parent=\"$service\",prpc=\"$rpc\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))/sum by (service,rpc) (sum_over_time(arms_app_requests_count{parent=\"$service\",prpc=\"$rpc\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))",
"legend": "{{service}}:{{rpc}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e1681553-ba5c-431f-9a46-0ef56a2d62b8",
"layout": {
"h": 9,
"i": "e1681553-ba5c-431f-9a46-0ef56a2d62b8",
"isResizable": true,
"w": 24,
"x": 0,
"y": 93
},
"links": [],
"maxPerRow": 4,
"name": "错误数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (service,rpc) (sum_over_time(arms_app_requests_error_count{parent=\"$service\",prpc=\"$rpc\",rpc!=\"__all\",endpoint!=\"__all__\",destId!=\"__all__\"}[1m]))",
"legend": "{{service}}:{{rpc}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle,service)",
"multi": false,
"name": "service",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc!=\"__all__\"},rpc)",
"multi": false,
"name": "rpc",
"reg": "",
"type": "query"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle{service=\"$service\"},host)",
"multi": false,
"name": "host",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327083057000
}
================================================
FILE: integrations/AliYun/dashboards/arms-application.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ARMS-Application",
"ident": "",
"tags": "JVM ARMS",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "faaf0534-b773-4f49-85bf-fc9ea18b6323",
"layout": {
"h": 1,
"i": "faaf0534-b773-4f49-85bf-fc9ea18b6323",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "总计",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "取最后一个非0点的实例数",
"id": "6dd1e508-b12a-4180-a9e0-616b4cbcaaf8",
"layout": {
"h": 4,
"i": "6dd1e508-b12a-4180-a9e0-616b4cbcaaf8",
"isResizable": true,
"w": 3,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "实例数(实时)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(count by (host) (arms_system_cpu_idle{service=\"$service\"}))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "497d0189-6877-4d07-89a7-36b529d16750",
"layout": {
"h": 4,
"i": "497d0189-6877-4d07-89a7-36b529d16750",
"isResizable": true,
"w": 3,
"x": 3,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "累计请求量(选定时段)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(sum_over_time(arms_app_requests_count{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))",
"legend": "当前",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "选定时段的总耗时/选定时段的总请求数",
"id": "69247449-841c-4e5e-a7d0-4e68c7bee923",
"layout": {
"h": 4,
"i": "69247449-841c-4e5e-a7d0-4e68c7bee923",
"isResizable": true,
"w": 4,
"x": 6,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "平均耗时(选定时段)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(sum_over_time(arms_app_requests_seconds{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))/sum(sum_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))",
"legend": "当前",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ae5f63cc-f736-4e3e-8483-2aa49eaddd65",
"layout": {
"h": 4,
"i": "ae5f63cc-f736-4e3e-8483-2aa49eaddd65",
"isResizable": true,
"w": 3,
"x": 10,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "累计错误数(选定时段)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(sum_over_time(arms_app_requests_error_count{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[$__range]))",
"legend": "当前",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6fc2cab9-69f5-43ab-a988-bd559ce91d79",
"layout": {
"h": 4,
"i": "6fc2cab9-69f5-43ab-a988-bd559ce91d79",
"isResizable": true,
"w": 3,
"x": 13,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "累计异常数(选定时段)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(sum_over_time(arms_exception_requests_count{service=\"$service\",rpc=\"__all__\",exceptionId!=\"\",exceptionId!=\"nil\"}[$__range]))",
"legend": "当前",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "选定时间端内FullGC的次数",
"id": "c76990b4-0756-44ed-808a-62d1bf0e0b44",
"layout": {
"h": 4,
"i": "c76990b4-0756-44ed-808a-62d1bf0e0b44",
"isResizable": true,
"w": 4,
"x": 16,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "累计Full GC次数(选定时段)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(sum_over_time(arms_jvm_gc_delta{service=\"$service\",host=~\"$host\",gen=\"old\"}[$__range]))",
"legend": "当前",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "耗时大于5ms",
"id": "3f913af5-09ff-41f5-a9b6-3628dc28e951",
"layout": {
"h": 4,
"i": "3f913af5-09ff-41f5-a9b6-3628dc28e951",
"isResizable": true,
"w": 4,
"x": 20,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "累计慢SQL(选定时段)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(sum_over_time(arms_db_requests_slow_count{service=\"$service\",host=~\"$host\",rpc=\"__all__\"}[$__range]))",
"legend": "当前",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "4b534f0c-da5f-4e33-add7-7fe9af733656",
"layout": {
"h": 1,
"i": "4b534f0c-da5f-4e33-add7-7fe9af733656",
"isResizable": false,
"w": 24,
"x": 0,
"y": 5
},
"name": "概览",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"rpc"
],
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelValuesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "展示选定时段内服务S的接口指标",
"id": "834ea39f-f630-45d9-a4a0-0e506baee4f2",
"layout": {
"h": 8,
"i": "834ea39f-f630-45d9-a4a0-0e506baee4f2",
"isResizable": true,
"w": 24,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "接口指标汇总(选定时段)",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "sum by (rpc) (sum_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))",
"legend": "请求总量",
"refId": "A"
},
{
"expr": "sum by (rpc) (sum_over_time(arms_app_requests_error_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))",
"legend": "错误数",
"refId": "B"
},
{
"expr": "sum by (rpc) (sum_over_time(arms_app_requests_seconds{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))/sum by (rpc) (sum_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))",
"legend": "平均请求耗时",
"refId": "C"
},
{
"expr": "sum by (rpc) (sum_over_time(arms_exception_requests_count{service=\"$service\",exceptionId!=\"\",rpc!=\"__all__\"}[$__range])) or on (rpc) group by (rpc) (max_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))-1",
"legend": "异常数量",
"refId": "G"
},
{
"expr": "max by (rpc) (max_over_time(arms_http_requests_latency_seconds{service=\"$service\",quantile=\"0.75\"}[$__range])) or on (rpc) group by (rpc) (max_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))-1",
"legend": "75分位延时",
"refId": "D"
},
{
"expr": "max by (rpc) (max_over_time(arms_http_requests_latency_seconds{service=\"$service\",quantile=\"0.9\"}[$__range])) or on (rpc) group by (rpc) (max_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))-1",
"legend": "90分位延迟",
"refId": "E"
},
{
"expr": "max by (rpc) (max_over_time(arms_http_requests_latency_seconds{service=\"$service\",quantile=\"0.99\"}[$__range])) or on (rpc) group by (rpc) (max_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\",rpc!=\"__all__\"}[$__range]))-1",
"legend": "99分位延迟",
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "aa244cb8-b9a7-4f90-b383-403a1fa750bb",
"layout": {
"h": 8,
"i": "aa244cb8-b9a7-4f90-b383-403a1fa750bb",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "请求数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_count{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a681c948-c021-4f3f-a7ca-96740dba068d",
"layout": {
"h": 8,
"i": "a681c948-c021-4f3f-a7ca-96740dba068d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "响应时间/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (arms_app_requests_seconds{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"})/sum by (callType) (arms_app_requests_count{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"})",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9e75c715-5012-4698-aeda-f16223fd1214",
"layout": {
"h": 8,
"i": "9e75c715-5012-4698-aeda-f16223fd1214",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "错误数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_error_count{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f1476512-6731-446e-bb8d-c37652d3b045",
"layout": {
"h": 8,
"i": "f1476512-6731-446e-bb8d-c37652d3b045",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "HTTP-状态码统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (status) (sum_over_time(arms_requests_by_status_count{service=\"$service\",host=~\"$host\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\"}[1m]))",
"legend": "{{status}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "a57dc9af-ac9b-4d19-983e-e243bf2f4f06",
"layout": {
"h": 1,
"i": "a57dc9af-ac9b-4d19-983e-e243bf2f4f06",
"isResizable": false,
"w": 24,
"x": 0,
"y": 30
},
"name": "应用依赖服务",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "be7ee9e3-717b-4ac6-9c9c-66c7ddd7d135",
"layout": {
"h": 8,
"i": "be7ee9e3-717b-4ac6-9c9c-66c7ddd7d135",
"isResizable": true,
"w": 12,
"x": 0,
"y": 31
},
"links": [],
"maxPerRow": 4,
"name": "应用依赖服务请求量/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_count{callType=~\"^dubbo_client$|^http_client$|^mongodb$|^oracle$|^client$|^redis$|^dmdb$|^thrift_client$|^dsf_client$|^db$|^mq_client$|^grpc_client$|^hsf_client$|^mysql$|^postgresql$|^memcached$|^consumer$\",service=\"$service\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "调用{{callType}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0443a553-2398-4865-a121-807e24e412ea",
"layout": {
"h": 8,
"i": "0443a553-2398-4865-a121-807e24e412ea",
"isResizable": true,
"w": 12,
"x": 12,
"y": 31
},
"links": [],
"maxPerRow": 4,
"name": "应用依赖服务平均响应时间/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_seconds{service=\"$service\",callType=~\"^dubbo_client$|^http_client$|^mongodb$|^oracle$|^client$|^redis$|^dmdb$|^thrift_client$|^dsf_client$|^db$|^mq_client$|^grpc_client$|^hsf_client$|^mysql$|^postgresql$|^memcached$|^consumer$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))/sum by (callType) (sum_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dubbo_client$|^http_client$|^mongodb$|^oracle$|^client$|^redis$|^dmdb$|^thrift_client$|^dsf_client$|^db$|^mq_client$|^grpc_client$|^hsf_client$|^mysql$|^postgresql$|^memcached$|^consumer$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "调用{{callType}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "3919eb57-3713-4092-93ff-f302a6cfaee9",
"layout": {
"h": 1,
"i": "3919eb57-3713-4092-93ff-f302a6cfaee9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 39
},
"name": "应用提供服务",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e0c59484-6787-486d-aa58-6a13e17dfcc9",
"layout": {
"h": 8,
"i": "e0c59484-6787-486d-aa58-6a13e17dfcc9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 40
},
"links": [],
"maxPerRow": 4,
"name": "应用提供服务请求量 / 每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "48201d41-5145-412d-abea-7c1679ee444f",
"layout": {
"h": 8,
"i": "48201d41-5145-412d-abea-7c1679ee444f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 40
},
"links": [],
"maxPerRow": 4,
"name": "应用提供服务平均响应时间 / 每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_app_requests_seconds{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))/sum by (callType) (sum_over_time(arms_app_requests_count{service=\"$service\",callType=~\"^dsf$|^http$|^dubbo$|^mq$|^user_method$|^producer$|^thrift$|^hsf$|^server$|^grpc$\",rpc=\"__all__\",prpc=\"__all__\",ppid=\"__all__\",endpoint=\"__all__\",destId=\"__all__\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "10af00a6-3b12-4abe-90b2-9ae04ccd4bff",
"layout": {
"h": 1,
"i": "10af00a6-3b12-4abe-90b2-9ae04ccd4bff",
"isResizable": false,
"w": 24,
"x": 0,
"y": 48
},
"name": "JVM监控",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "bd95a6c2-1387-42b5-967c-82a266bf670e",
"layout": {
"h": 8,
"i": "bd95a6c2-1387-42b5-967c-82a266bf670e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 49
},
"links": [],
"maxPerRow": 4,
"name": "GC次数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(sum_over_time(arms_jvm_gc_delta{service=\"$service\",host=~\"$host\",gen=\"old\"}[1m]))",
"legend": "FullGC次数-瞬时值",
"refId": "A"
},
{
"expr": "sum(max_over_time(arms_jvm_gc_total{service=\"$service\",host=~\"$host\",gen=\"old\"}[1m]))",
"legend": "FullGC次数-累计值",
"refId": "B"
},
{
"expr": "sum(sum_over_time(arms_jvm_gc_delta{service=\"$service\",host=~\"$host\",gen=\"young\"}[1m]))",
"legend": "YoungGC次数-瞬时值",
"refId": "C"
},
{
"expr": "sum(max_over_time(arms_jvm_gc_total{service=\"$service\",host=~\"$host\",gen=\"young\"}[1m]))",
"legend": "YoungGC次数-累计值",
"refId": "E"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "1cb12b4a-66f2-45d6-bd48-70fc565374b0",
"layout": {
"h": 8,
"i": "1cb12b4a-66f2-45d6-bd48-70fc565374b0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 49
},
"links": [],
"maxPerRow": 4,
"name": "GC耗时/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(sum_over_time(arms_jvm_gc_seconds_delta{service=\"$service\",host=~\"$host\",gen=\"young\"}[1m]))",
"legend": "YoungGC耗时-瞬时值",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_jvm_gc_seconds_total{service=\"$service\",host=~\"$host\",gen=\"young\"}[1m]))",
"legend": "YoungGC耗时-累计值",
"refId": "B"
},
{
"expr": "sum(sum_over_time(arms_jvm_gc_seconds_delta{service=\"$service\",host=~\"$host\",gen=\"old\"}[1m]))",
"legend": "FullGC耗时-瞬时值",
"refId": "C"
},
{
"expr": "max(max_over_time(arms_jvm_gc_seconds_total{service=\"$service\",host=~\"$host\",gen=\"old\"}[1m]))",
"legend": "FullGC耗时-累计值",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "44f7673a-a614-425e-82f4-0dd92945040f",
"layout": {
"h": 8,
"i": "44f7673a-a614-425e-82f4-0dd92945040f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 57
},
"links": [],
"maxPerRow": 4,
"name": "堆内存详情/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max by (id) (max_over_time(arms_jvm_mem_used_bytes{service=\"$service\",host=~\"$host\",area=\"heap\"}[1m]))",
"legend": "{{id}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "60223b1e-8429-4c13-8421-131307645a39",
"layout": {
"h": 8,
"i": "60223b1e-8429-4c13-8421-131307645a39",
"isResizable": true,
"w": 12,
"x": 12,
"y": 57
},
"links": [],
"maxPerRow": 4,
"name": "元空间/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_jvm_mem_used_bytes{service=\"$service\",host=~\"$host\",area=\"nonheap\",id=\"metaspace\"}[1m]))",
"legend": "元空间",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8a40080f-738b-46ca-8a42-04be4860b4ee",
"layout": {
"h": 8,
"i": "8a40080f-738b-46ca-8a42-04be4860b4ee",
"isResizable": true,
"w": 12,
"x": 0,
"y": 65
},
"links": [],
"maxPerRow": 4,
"name": "非堆内存/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_jvm_mem_committed_bytes{service=\"$service\",host=~\"$host\",area=\"nonheap\"}[1m]))",
"legend": "提交字节数",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_jvm_mem_init_bytes{service=\"$service\",host=~\"$host\",area=\"nonheap\"}[1m]))",
"legend": "初始字节数",
"refId": "B"
},
{
"expr": "max(max_over_time(arms_jvm_mem_max_bytes{service=\"$service\",host=~\"$host\",area=\"nonheap\"}[1m]))",
"legend": "最大字节数",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4a99dcc2-d230-45bc-b0ad-b718bb21eeb6",
"layout": {
"h": 8,
"i": "4a99dcc2-d230-45bc-b0ad-b718bb21eeb6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 65
},
"links": [],
"maxPerRow": 4,
"name": "直接缓冲区/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_jvm_buffer_pool_total_bytes{service=\"$service\",host=~\"$host\",id=\"direct\"}[1m]))",
"legend": "DirectBuffer总大小",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_jvm_buffer_pool_used_bytes{service=\"$service\",host=~\"$host\",id=\"direct\"}[1m]))",
"legend": "DirectBuffer使用大小",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "34635d94-e8a2-44a8-a246-a984bd415a99",
"layout": {
"h": 8,
"i": "34635d94-e8a2-44a8-a246-a984bd415a99",
"isResizable": true,
"w": 24,
"x": 0,
"y": 73
},
"links": [],
"maxPerRow": 4,
"name": "JVM线程数",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (state) (arms_jvm_threads_count{service=\"$service\",host=~\"$host\"})",
"legend": "{{state}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "bd920da4-1bd1-4c73-81be-ec62a6c814fb",
"layout": {
"h": 1,
"i": "bd920da4-1bd1-4c73-81be-ec62a6c814fb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 81
},
"name": "主机监控",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8dcb1609-c1fc-423f-8b94-ac1e48a14c64",
"layout": {
"h": 9,
"i": "8dcb1609-c1fc-423f-8b94-ac1e48a14c64",
"isResizable": true,
"w": 6,
"x": 0,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "CPU/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_cpu_system{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "系统CPU使用率",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_cpu_user{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "用户CPU使用率",
"refId": "B"
},
{
"expr": "max(max_over_time(arms_system_cpu_io_wait{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "等待IO完成的CPU使用率",
"refId": "C"
},
{
"expr": "max(max_over_time(arms_system_cpu_system{service=\"$service\",host=~\"$host\"}[1m]))+max(max_over_time(arms_system_cpu_io_wait{service=\"$service\",host=~\"$host\"}[1m]))+max(max_over_time(arms_system_cpu_user{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "总和",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "037f2e66-80b4-4616-b31b-5a2ae62fd587",
"layout": {
"h": 9,
"i": "037f2e66-80b4-4616-b31b-5a2ae62fd587",
"isResizable": true,
"w": 6,
"x": 6,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "物理内存/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_mem_free_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "系统的空闲内存",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_mem_buffers_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "系统的BufferCache的内存数",
"refId": "E"
},
{
"expr": "max(max_over_time(arms_system_mem_cached_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "系统的PageCache里的内存数",
"refId": "B"
},
{
"expr": "max(max_over_time(arms_system_mem_total_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "总和",
"refId": "F"
},
{
"expr": "max(max_over_time(arms_system_mem_used_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "系统的已经使用的内存",
"refId": "G"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "409c5294-7b00-408f-8156-124504283162",
"layout": {
"h": 9,
"i": "409c5294-7b00-408f-8156-124504283162",
"isResizable": true,
"w": 6,
"x": 12,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "负载/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_load{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "load",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "182bc839-efd6-4c9c-b7ac-017fc2aa28c8",
"layout": {
"h": 9,
"i": "182bc839-efd6-4c9c-b7ac-017fc2aa28c8",
"isResizable": true,
"w": 6,
"x": 18,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "磁盘可用容量/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_disk_free_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "磁盘空闲数",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_disk_total_bytes{service=\"$service\",host=~\"$host\"}[1m]))-max(max_over_time(arms_system_disk_free_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "磁盘使用数",
"refId": "B"
},
{
"expr": "max(max_over_time(arms_system_disk_total_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "总和",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e36b674a-e3b9-4687-9a75-6e07c03815a8",
"layout": {
"h": 9,
"i": "e36b674a-e3b9-4687-9a75-6e07c03815a8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 91
},
"links": [],
"maxPerRow": 4,
"name": "网络流量(Byte)/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_net_in_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "网络接收的字节数",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_net_out_bytes{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "网络发送的字节数",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "95dad73c-8716-481e-a49c-72a7b3c5fa91",
"layout": {
"h": 9,
"i": "95dad73c-8716-481e-a49c-72a7b3c5fa91",
"isResizable": true,
"w": 12,
"x": 12,
"y": 91
},
"links": [],
"maxPerRow": 4,
"name": "网络数据包(个)/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_net_in_packets{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "网络接收的报文数",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_net_out_packets{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "网络发送的报文数",
"refId": "B"
},
{
"expr": "max(max_over_time(arms_system_net_in_errs{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "网络接收的错误数",
"refId": "C"
},
{
"expr": "max(max_over_time(arms_system_net_out_errs{service=\"$service\",host=~\"$host\"}[1m]))",
"legend": "网络丢弃报文数",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle,service)",
"multi": false,
"name": "service",
"reg": "",
"type": "query"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle{service=\"$service\"},host)",
"multi": false,
"name": "host",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327089363000
}
================================================
FILE: integrations/AliYun/dashboards/arms-db.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ARMS-DB",
"ident": "",
"tags": "ARMS",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "bd8c0aac-06df-4b2d-9456-cad8e7389499",
"layout": {
"h": 1,
"i": "bd8c0aac-06df-4b2d-9456-cad8e7389499",
"w": 24,
"x": 0,
"y": 0
},
"name": "概览(DB级别)",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8d2da301-e5e8-4b2f-9b31-59aa0835c312",
"layout": {
"h": 8,
"i": "8d2da301-e5e8-4b2f-9b31-59aa0835c312",
"w": 12,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "请求数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_db_requests_count{endpoint=\"${instance}\",destId=~\"${db}\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ab2b0969-50e7-4e4b-962a-58be133e6aef",
"layout": {
"h": 8,
"i": "ab2b0969-50e7-4e4b-962a-58be133e6aef",
"w": 12,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "响应时间/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_db_requests_seconds{endpoint=\"$instance\",destId=~\"^$db$\"}[1m]))/sum by (callType) (sum_over_time(arms_db_requests_count{endpoint=\"$instance\",destId=~\"^$db$\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6bd5d219-0a94-4f90-b2e0-93ed3eeca9f0",
"layout": {
"h": 8,
"i": "6bd5d219-0a94-4f90-b2e0-93ed3eeca9f0",
"w": 12,
"x": 0,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "错误数/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (callType) (sum_over_time(arms_db_requests_error_count{endpoint=\"$instance\",destId=~\"$db\"}[1m]))",
"legend": "{{callType}}入口",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "针对所有SQL的聚和指标",
"id": "d9093b86-5796-471a-a28c-fe1d8daf1721",
"layout": {
"h": 8,
"i": "d9093b86-5796-471a-a28c-fe1d8daf1721",
"w": 12,
"x": 12,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "性能一览/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(sum_over_time(arms_db_requests_count{endpoint=\"$instance\",destId=~\"$db\"}[1m]))",
"legend": "请求次数",
"refId": "A"
},
{
"expr": "sum(sum_over_time(arms_db_requests_seconds{endpoint=\"$instance\",destId=~\"$db\"}[1m]))/sum(sum_over_time(arms_db_requests_count{endpoint=\"$instance\",destId=~\"$db\"}[1m]))",
"legend": "平均耗时",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle,service)",
"multi": false,
"name": "service",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_db_requests_count{service=\"$service\"},endpoint)",
"multi": false,
"name": "instance",
"reg": "",
"type": "query"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_db_requests_count{endpoint=\"${instance}\"},destId)",
"multi": false,
"name": "db",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327092680000
}
================================================
FILE: integrations/AliYun/dashboards/arms-jvm-service.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云 ARMS-JVM-SERVICE",
"ident": "",
"tags": "JVM ARMS",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"layout": {
"h": 1,
"i": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2e921e92-069e-46fe-a0ef-d2f37dc22575",
"layout": {
"h": 6,
"i": "96c82fd7-ec94-473e-b2e0-ead52ab390fc",
"isResizable": true,
"w": 8,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "Threads Count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_threads_count{service=\"$service\",host=\"$host\"}",
"legend": "State {{state}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9d8055bd-4c57-4eaa-afc6-b0a727a238f6",
"layout": {
"h": 6,
"i": "afbd0af8-6c44-4c15-bd7f-c047ad41d0d7",
"isResizable": true,
"w": 8,
"x": 8,
"y": 1
},
"maxPerRow": 4,
"name": "Mem Committed Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_committed_bytes{service=\"$service\",host=\"$host\"}",
"legend": "area {{area}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8092fb90-1b91-4e57-82bf-e170bba9099f",
"layout": {
"h": 6,
"i": "84b76aba-c00a-49de-b9f4-1613d2d497af",
"isResizable": true,
"w": 8,
"x": 16,
"y": 1
},
"maxPerRow": 4,
"name": "Mem Init Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_init_bytes{service=\"$service\",host=\"$host\"}",
"legend": "area: {{area}} id:{{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"layout": {
"h": 1,
"i": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "JVM Memory",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c7741620-977b-417d-8db0-1c9f7f98d8f8",
"layout": {
"h": 7,
"i": "81090d87-c72c-4243-b016-000ccdd8a9e7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "JVM Memory(total)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_used_bytes{service=\"$service\",host=\"$host\", area=\"total\"}",
"legend": "Used {{id}}",
"refId": "A"
},
{
"expr": "arms_jvm_mem_max_bytes{service=\"$service\",host=\"$host\", area=\"total\"}",
"legend": "Max {{id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"layout": {
"h": 7,
"i": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "JVM Memory(heap)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_used_bytes{service=\"$service\",host=\"$host\", area=\"heap\"}",
"legend": "Used {{id}}",
"refId": "A"
},
{
"expr": "arms_jvm_mem_max_bytes{service=\"$service\",host=\"$host\", area=\"heap\"}",
"legend": "Max {{id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c268f732-f9c8-4a36-977b-7554f63d84db",
"layout": {
"h": 7,
"i": "7826a7a7-d2bf-474e-aec0-359470732007",
"isResizable": true,
"w": 12,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "JVM Memory(noheap)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_used_bytes{service=\"$service\",host=\"$host\", area=\"nonheap\"}",
"legend": "Used {{id}}",
"refId": "A"
},
{
"expr": "arms_jvm_mem_max_bytes{service=\"$service\",host=\"$host\", area=\"nonheap\"}",
"legend": "Max {{id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "66fc2b8f-9d66-4421-9acb-fe8af891ffe1",
"layout": {
"h": 7,
"i": "c01f392d-f225-4615-b49a-eee689295c53",
"isResizable": true,
"w": 12,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "JVM Buffer Pool",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_buffer_pool_count{service=\"$service\",host=\"$host\"}",
"legend": "id {{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ee48523f-948d-4828-a606-309ce683e694",
"layout": {
"h": 7,
"i": "ca327ef3-1b8a-4b9a-9b42-62fd41aefb31",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"maxPerRow": 4,
"name": "JVM Buffer Pool Total Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_buffer_pool_total_bytes{service=\"$service\",host=\"$host\"}",
"legend": "id {{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "22b24a2d-9976-47d9-ad4d-bef92880a53e",
"layout": {
"h": 7,
"i": "db4b7e4d-74b2-44fc-aca8-c0cb2635daad",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"maxPerRow": 4,
"name": "JVM Buffer Pool Used Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_buffer_pool_used_bytes{service=\"$service\",host=\"$host\"}",
"legend": "id {{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"layout": {
"h": 1,
"i": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 29
},
"name": "GC",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"layout": {
"h": 6,
"i": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 30
},
"maxPerRow": 4,
"name": "过去一分钟GC耗时(秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(arms_jvm_gc_seconds_total{service=\"$service\",host=\"$host\"}[1m])",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cf410459-b5df-4aca-a410-ecda091d6097",
"layout": {
"h": 6,
"i": "cf410459-b5df-4aca-a410-ecda091d6097",
"isResizable": true,
"w": 12,
"x": 12,
"y": 30
},
"maxPerRow": 4,
"name": "过去一分钟GC次数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(arms_jvm_gc_total{service=\"$service\",host=\"$host\"}[1m])",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(arms_jvm_buffer_pool_count, service)",
"label": "service",
"name": "service",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(arms_jvm_buffer_pool_count{service=\"$service\"}, host)",
"label": "host",
"name": "host",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327094704000
}
================================================
FILE: integrations/AliYun/dashboards/arms-machine.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ARMS-Machine",
"ident": "",
"tags": "ARMS",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "8865eacb-f0f6-45fa-912a-8494907c48d6",
"layout": {
"h": 1,
"i": "8865eacb-f0f6-45fa-912a-8494907c48d6",
"w": 24,
"x": 0,
"y": 0
},
"name": "系统信息",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7fd3186b-6190-44c7-ad05-1c81993f27c9",
"layout": {
"h": 9,
"i": "7fd3186b-6190-44c7-ad05-1c81993f27c9",
"w": 24,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "CPU",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(arms_system_cpu_system{host=~\"$host\"})",
"legend": "系统CPU使用率",
"refId": "A"
},
{
"expr": "max(arms_system_cpu_io_wait{host=~\"$host\"})",
"legend": "等待IO完成的CPU使用率",
"refId": "B"
},
{
"expr": "max(arms_system_cpu_user{host=~\"$host\"})",
"legend": "用户CPU使用率",
"refId": "C"
},
{
"expr": "max(arms_system_cpu_system{host=\"$host\"})+max(arms_system_cpu_io_wait{host=~\"$host\"})+max(arms_system_cpu_user{host=\"$host\"})",
"legend": "总和",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "60fc127b-b565-40de-9346-860062d5ea58",
"layout": {
"h": 9,
"i": "60fc127b-b565-40de-9346-860062d5ea58",
"w": 24,
"x": 0,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "内存",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(arms_system_mem_used_bytes{host=\"$host\"})",
"legend": "系统的已经使用的内存",
"refId": "A"
},
{
"expr": "max(arms_system_mem_total_bytes{host=\"$host\"})",
"legend": "总和",
"refId": "C"
},
{
"expr": "max(arms_system_mem_buffers_bytes{host=\"$host\"})",
"legend": "系统的BufferCache的内存数",
"refId": "D"
},
{
"expr": "max(arms_system_mem_cached_bytes{host=\"$host\"})",
"legend": "系统的PageCache里的内存数",
"refId": "E"
},
{
"expr": "max(arms_system_mem_free_bytes{host=\"$host\"})",
"legend": "系统的空闲内存",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5229fd8c-3e26-44e6-a091-145c3caef46f",
"layout": {
"h": 9,
"i": "5229fd8c-3e26-44e6-a091-145c3caef46f",
"w": 24,
"x": 0,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "负载",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(arms_system_load{host=\"$host\"})",
"legend": "负载",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "60872e48-5445-4ee1-b0a2-19be72b6f737",
"layout": {
"h": 9,
"i": "60872e48-5445-4ee1-b0a2-19be72b6f737",
"w": 24,
"x": 0,
"y": 28
},
"links": [],
"maxPerRow": 4,
"name": "磁盘",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(arms_system_disk_free_bytes{host=\"$host\"})",
"legend": "可用磁盘容量",
"refId": "A"
},
{
"expr": "max(arms_system_disk_total_bytes{host=\"$host\"})",
"legend": "总磁盘容量",
"refId": "B"
},
{
"expr": "max(arms_system_disk_total_bytes{host=~\"$host\"})-max(arms_system_disk_free_bytes{host=~\"$host\"})",
"legend": "已使用磁盘容量",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "517cc410-c4a0-4923-a902-3c102f06cd0c",
"layout": {
"h": 9,
"i": "517cc410-c4a0-4923-a902-3c102f06cd0c",
"w": 24,
"x": 0,
"y": 37
},
"links": [],
"maxPerRow": 4,
"name": "网络流量(Byte)/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_net_in_bytes{host=~\"$host\"}[1m]))",
"legend": "网络接收的字节数",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_net_out_bytes{host=~\"$host\"}[1m]))",
"legend": "网络发送的字节数",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "752d89ce-1136-4ddf-b4b9-1a232a8840db",
"layout": {
"h": 9,
"i": "752d89ce-1136-4ddf-b4b9-1a232a8840db",
"w": 24,
"x": 0,
"y": 46
},
"links": [],
"maxPerRow": 4,
"name": "网络数据包(个)/每分钟",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(max_over_time(arms_system_net_in_packets{host=~\"$host\"}[1m]))",
"legend": "网络接收的报文数",
"refId": "A"
},
{
"expr": "max(max_over_time(arms_system_net_out_packets{host=~\"$host\"}[1m]))",
"legend": "网络发送的报文数",
"refId": "C"
},
{
"expr": "max(max_over_time(arms_system_net_in_errs{host=~\"$host\"}[1m]))",
"legend": "网络接收的错误数",
"refId": "D"
},
{
"expr": "max(max_over_time(arms_system_net_out_errs{host=~\"$host\"}[1m]))",
"legend": "网络丢弃报文数",
"refId": "E"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle,service)",
"multi": false,
"name": "service",
"reg": "",
"type": "query"
},
{
"allOption": false,
"allValue": "*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(arms_system_cpu_idle{service=\"$service\"},host)",
"multi": false,
"name": "host",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327098444000
}
================================================
FILE: integrations/AliYun/dashboards/arms_jvm.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云 ARMS-JVM",
"ident": "",
"tags": "JVM ARMS",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"layout": {
"h": 1,
"i": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2e921e92-069e-46fe-a0ef-d2f37dc22575",
"layout": {
"h": 6,
"i": "96c82fd7-ec94-473e-b2e0-ead52ab390fc",
"isResizable": true,
"w": 8,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "Threads Count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_threads_count{app=\"$app\",host=\"$host\"}",
"legend": "State {{state}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9d8055bd-4c57-4eaa-afc6-b0a727a238f6",
"layout": {
"h": 6,
"i": "afbd0af8-6c44-4c15-bd7f-c047ad41d0d7",
"isResizable": true,
"w": 8,
"x": 8,
"y": 1
},
"maxPerRow": 4,
"name": "Mem Committed Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_committed_bytes{app=\"$app\",host=\"$host\"}",
"legend": "area {{area}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8092fb90-1b91-4e57-82bf-e170bba9099f",
"layout": {
"h": 6,
"i": "84b76aba-c00a-49de-b9f4-1613d2d497af",
"isResizable": true,
"w": 8,
"x": 16,
"y": 1
},
"maxPerRow": 4,
"name": "Mem Init Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_init_bytes{app=\"$app\",host=\"$host\"}",
"legend": "area: {{area}} id:{{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"layout": {
"h": 1,
"i": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "JVM Memory",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c7741620-977b-417d-8db0-1c9f7f98d8f8",
"layout": {
"h": 7,
"i": "81090d87-c72c-4243-b016-000ccdd8a9e7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "JVM Memory(total)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_used_bytes{app=\"$app\",host=\"$host\", area=\"total\"}",
"legend": "Used {{id}}",
"refId": "A"
},
{
"expr": "arms_jvm_mem_max_bytes{app=\"$app\",host=\"$host\", area=\"total\"}",
"legend": "Max {{id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"layout": {
"h": 7,
"i": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "JVM Memory(heap)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_used_bytes{app=\"$app\",host=\"$host\", area=\"heap\"}",
"legend": "Used {{id}}",
"refId": "A"
},
{
"expr": "arms_jvm_mem_max_bytes{app=\"$app\",host=\"$host\", area=\"heap\"}",
"legend": "Max {{id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c268f732-f9c8-4a36-977b-7554f63d84db",
"layout": {
"h": 7,
"i": "7826a7a7-d2bf-474e-aec0-359470732007",
"isResizable": true,
"w": 12,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "JVM Memory(nonheap)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_mem_used_bytes{app=\"$app\",host=\"$host\", area=\"nonheap\"}",
"legend": "Used {{id}}",
"refId": "A"
},
{
"expr": "arms_jvm_mem_max_bytes{app=\"$app\",host=\"$host\", area=\"nonheap\"}",
"legend": "Max {{id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "66fc2b8f-9d66-4421-9acb-fe8af891ffe1",
"layout": {
"h": 7,
"i": "c01f392d-f225-4615-b49a-eee689295c53",
"isResizable": true,
"w": 12,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "JVM Buffer Pool",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_buffer_pool_count{app=\"$app\",host=\"$host\"}",
"legend": "id {{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ee48523f-948d-4828-a606-309ce683e694",
"layout": {
"h": 7,
"i": "ca327ef3-1b8a-4b9a-9b42-62fd41aefb31",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"maxPerRow": 4,
"name": "JVM Buffer Pool Total Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_buffer_pool_total_bytes{app=\"$app\",host=\"$host\"}",
"legend": "id {{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "22b24a2d-9976-47d9-ad4d-bef92880a53e",
"layout": {
"h": 7,
"i": "db4b7e4d-74b2-44fc-aca8-c0cb2635daad",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"maxPerRow": 4,
"name": "JVM Buffer Pool Used Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "arms_jvm_buffer_pool_used_bytes{app=\"$app\",host=\"$host\"}",
"legend": "id {{id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"layout": {
"h": 1,
"i": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 29
},
"name": "GC",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"layout": {
"h": 6,
"i": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 30
},
"maxPerRow": 4,
"name": "过去一分钟GC耗时(秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(arms_jvm_gc_seconds_total{app=\"$app\",host=\"$host\"}[1m])",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cf410459-b5df-4aca-a410-ecda091d6097",
"layout": {
"h": 6,
"i": "cf410459-b5df-4aca-a410-ecda091d6097",
"isResizable": true,
"w": 12,
"x": 12,
"y": 30
},
"maxPerRow": 4,
"name": "过去一分钟GC次数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(arms_jvm_gc_total{app=\"$app\",host=\"$host\"}[1m])",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(arms_jvm_buffer_pool_count, app)",
"label": "app",
"name": "app",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(arms_jvm_buffer_pool_count{app=\"$app\"}, host)",
"label": "host",
"name": "host",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327100994000
}
================================================
FILE: integrations/AliYun/dashboards/cdn.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云CDN",
"ident": "",
"tags": "CDN",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": true,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0430c7e9-7372-45e3-9bb2-c5939baf6bfa",
"layout": {
"h": 4,
"i": "0430c7e9-7372-45e3-9bb2-c5939baf6bfa",
"isResizable": true,
"w": 8,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "网络带宽(bits/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_cdn_bps_isp_value{instance_id=\"$instance_id\"}",
"legend": "峰值 {{instance_id}}",
"refId": "A"
},
{
"expr": "aliyun_acs_cdn_internet_out_average{instance_id=\"$instance_id\"}",
"legend": "均值 {{instance_id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b438ae81-3dfc-4ed8-b66f-262a4b507e4b",
"layout": {
"h": 4,
"i": "b438ae81-3dfc-4ed8-b66f-262a4b507e4b",
"isResizable": true,
"w": 8,
"x": 8,
"y": 0
},
"maxPerRow": 4,
"name": "下行流量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_cdn_internet_out_isp_value{instance_id=\"$instance_id\"}",
"legend": "{{instance_id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "af0874c7-3123-437a-93bc-448f6de8b43b",
"layout": {
"h": 4,
"i": "c6e41c04-d591-4117-bdf1-5dc6e1f4c084",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"maxPerRow": 4,
"name": "每秒访问次数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_cdn_qps_isp_value{instance_id=\"$instance_id\"}",
"legend": "{{instance_id}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": true,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ec8fcf96-1691-4e45-9a5f-2f183021b434",
"layout": {
"h": 4,
"i": "ec8fcf96-1691-4e45-9a5f-2f183021b434",
"isResizable": true,
"w": 8,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "边缘状态码4XX占比(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_cdn_code4xx_isp_value{instance_id=\"$instance_id\"}",
"legend": "峰值{{instance_id}}",
"refId": "A"
},
{
"expr": "aliyun_acs_cdn_bps_average{instance_id=\"$instance_id\"}",
"legend": "均值 {{instance_id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": true,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "25b6e3fa-f6dd-4452-8025-3c7d9a9a592c",
"layout": {
"h": 4,
"i": "e884b781-1bd4-476c-a807-a68a6417764e",
"isResizable": true,
"w": 8,
"x": 8,
"y": 4
},
"maxPerRow": 4,
"name": "边缘状态码5XX占比(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_cdn_code5xx_isp_value{instance_id=\"$instance_id\"}",
"legend": "峰值{{instance_id}}",
"refId": "A"
},
{
"expr": "aliyun_acs_cdn_bps_average{instance_id=\"$instance_id\"}",
"legend": "均值 {{instance_id}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"label": "datasource",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_cdn_qps_isp_value, instance_id)",
"label": "instance_id",
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327104499000
}
================================================
FILE: integrations/AliYun/dashboards/ecs.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云ECS",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceName": "Default",
"datasourceValue": "${datasource}",
"id": "8606d5ad-c3c7-4b1d-86bf-474d3302ee17",
"layout": {
"h": 4,
"i": "8606d5ad-c3c7-4b1d-86bf-474d3302ee17",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "CPU平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_ecs_dashboard_cpu_utilization_average{ident=~\"$ident\"}) by (ident,instance_id)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceName": "Default",
"datasourceValue": "${datasource}",
"id": "c7034fe3-5521-4867-a8bd-429767cc03a2",
"layout": {
"h": 4,
"i": "55404296-0bd9-409d-aeaf-e9c7cceea0dd",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"maxPerRow": 4,
"name": "内存平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_ecs_dashboard_memory_usedutilization_average{ident=~\"$ident\"}) by (ident,instance_id)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceName": "Default",
"datasourceValue": "${datasource}",
"id": "e4c11925-b359-4edb-9269-4bdd4d230224",
"layout": {
"h": 4,
"i": "0c7b3a5a-ef12-4349-be9b-7a245bf01418",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "系统负载[5m]",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_ecs_dashboard_load_5m_average{ident=~\"$ident\"}) by (ident,instance_id)",
"legend": "{{ident}} {{instance_id}} 5分钟负载",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceName": "Default",
"datasourceValue": "${datasource}",
"id": "388d4da6-eb1f-48f1-955d-37579809dfec",
"layout": {
"h": 4,
"i": "5abea3d2-ea82-4bdb-a4f0-4dd1316c0377",
"isResizable": true,
"w": 12,
"x": 12,
"y": 5
},
"maxPerRow": 4,
"name": "磁盘平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_ecs_dashboard_diskusage_utilization_average{ident=~\"$ident\"}) by (ident,instance_id)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f8d19cc9-0168-4c13-b9a9-c7980eced974",
"layout": {
"h": 4,
"i": "f8d19cc9-0168-4c13-b9a9-c7980eced974",
"w": 12,
"x": 0,
"y": 9
},
"maxPerRow": 4,
"name": "网络流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_ecs_dashboard_intranet_in_average{ident=~\"$ident\"}) by (ident,instance_id)",
"legend": "主机:{{ident}} 实例ID: {{instance_id}} 入流量",
"refId": "A",
"step": 300,
"time": {
"end": "now",
"start": "now-5m"
}
},
{
"expr": "sum(aliyun_acs_ecs_dashboard_intranet_out_average{ident=~\"$ident\"}) by (ident,instance_id)",
"legend": "主机:{{ident}} 实例ID: {{instance_id}} 出流量",
"refId": "B",
"step": 300,
"time": {
"end": "now",
"start": "now-5m"
}
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_ecs_dashboard_cpu_utilization_average,ident)",
"multi": true,
"name": "ident",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327106006000
}
================================================
FILE: integrations/AliYun/dashboards/mongodb.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云MongoDB",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": false,
"id": "971ae452-1cf4-4137-b6e3-1fffd1cf1036",
"layout": {
"h": 1,
"i": "971ae452-1cf4-4137-b6e3-1fffd1cf1036",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "分组",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c43eb882-915f-4c38-a0b5-8f33c21ab44a",
"layout": {
"h": 4,
"i": "09903231-6557-42be-9cf3-2873878e9bf2",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "CPU平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_cpu_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": true,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8ab40b92-d31d-419d-b353-1b08ce5ddc25",
"layout": {
"h": 4,
"i": "0a7aeee9-2a64-4484-a66e-0a084a7d507f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"maxPerRow": 4,
"name": "内存平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_memory_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5ce5974c-a74a-4e14-8acd-bace8a24efcc",
"layout": {
"h": 4,
"i": "ca23d79c-ae7a-462b-8ada-22a01437e4b1",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "磁盘平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_disk_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "657686c3-0529-4ebe-995a-edef9e3f6ee6",
"layout": {
"h": 4,
"i": "3775178d-206e-43a6-9d4e-8cbf4ce56790",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"maxPerRow": 4,
"name": "IOPS平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_iops_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "752381f7-6b74-4593-b174-aabee757c011",
"layout": {
"h": 4,
"i": "acccdbbc-22cb-46e9-9a77-81c6c368cfe9",
"isResizable": true,
"w": 6,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "连接数平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_connection_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1e75956d-2cf6-4fed-9700-b279ff229d10",
"layout": {
"h": 4,
"i": "5d10e936-d4d9-48ac-93f4-a16b2c1eae81",
"isResizable": true,
"w": 6,
"x": 6,
"y": 5
},
"maxPerRow": 4,
"name": "每秒访问次数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_qps_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b255683e-e324-49e4-b538-0c25411f27e5",
"layout": {
"h": 4,
"i": "a1b04dc5-bce7-4c1b-b9d3-51b209afc714",
"isResizable": true,
"w": 6,
"x": 12,
"y": 5
},
"maxPerRow": 4,
"name": "连接数使用量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_connection_amount_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "4aa42dc8-8c4b-42d1-939e-ed733cbcb126",
"layout": {
"h": 1,
"i": "4aa42dc8-8c4b-42d1-939e-ed733cbcb126",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "磁盘容量",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0b6f8633-3b4a-47bf-8972-39d128d686eb",
"layout": {
"h": 4,
"i": "263f341f-79ea-4d40-ad35-88eb2dfa6286",
"isResizable": true,
"w": 6,
"x": 0,
"y": 10
},
"maxPerRow": 4,
"name": "实例占用磁盘空间量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_instance_disk_amount_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "388cf815-fdd7-4d94-abe2-6866680a2d5d",
"layout": {
"h": 4,
"i": "6f7771c0-0692-4be1-9953-7803825ce57a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 10
},
"maxPerRow": 4,
"name": "数据占用磁盘空间量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_data_disk_amount_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3d80f914-2b42-479c-9ecc-544490b92d49",
"layout": {
"h": 4,
"i": "2928fb03-7062-469d-ba31-a06a0cbc2d55",
"isResizable": true,
"w": 6,
"x": 12,
"y": 10
},
"maxPerRow": 4,
"name": "日志占用磁盘空间量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_log_disk_amount_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "c15834bc-c823-40d7-8c36-1a4931401a5d",
"layout": {
"h": 1,
"i": "c15834bc-c823-40d7-8c36-1a4931401a5d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 14
},
"name": "网络请求",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3f30d533-bbcd-4264-861d-aa29a9d9c47b",
"layout": {
"h": 4,
"i": "63b08762-b208-473d-8645-0bdd087631a8",
"isResizable": true,
"w": 6,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "内网入流量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_intranet_in_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "57726e1a-7288-4d77-8871-92894dd71014",
"layout": {
"h": 4,
"i": "081050c1-6535-4f3a-8a4b-6c87bc9e0d23",
"isResizable": true,
"w": 6,
"x": 6,
"y": 15
},
"maxPerRow": 4,
"name": "内网出流量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_intranet_out_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "047e624a-9c12-4d83-9445-747f07ed766a",
"layout": {
"h": 4,
"i": "2b8e3e40-241f-41eb-adaa-7e2c08061f00",
"isResizable": true,
"w": 6,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "请求数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_number_requests_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "aef60aa0-4896-4da1-8517-b9bfd4a386f1",
"layout": {
"h": 1,
"i": "aef60aa0-4896-4da1-8517-b9bfd4a386f1",
"isResizable": false,
"w": 24,
"x": 0,
"y": 19
},
"name": "操作次数",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b4be1ae2-bd11-4b1b-a01a-41e1ca62e791",
"layout": {
"h": 4,
"i": "c823f96f-d671-4173-9cf0-c2b420271c49",
"isResizable": true,
"w": 6,
"x": 0,
"y": 20
},
"maxPerRow": 4,
"name": "Insert操作次数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_op_insert_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0a02e31c-5ebc-438e-8012-69f4ef3cc3dd",
"layout": {
"h": 4,
"i": "13a9905c-9eae-4189-bc96-95332d46c8b9",
"isResizable": true,
"w": 6,
"x": 6,
"y": 20
},
"maxPerRow": 4,
"name": "Query操作次数(Frequency)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_op_query_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "35ae128a-e23c-4a0c-b5c1-62289148452f",
"layout": {
"h": 4,
"i": "6cc5c897-2ddb-4790-bfda-0938f8b7cf3d",
"isResizable": true,
"w": 6,
"x": 12,
"y": 20
},
"maxPerRow": 4,
"name": "Update操作次数(Frequency)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_op_update_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1b21c3a5-246b-4473-884b-439af496644e",
"layout": {
"h": 4,
"i": "0aca0c41-4c99-47a2-9298-00d1fe73914f",
"isResizable": true,
"w": 6,
"x": 18,
"y": 20
},
"maxPerRow": 4,
"name": "Delete操作次数(Frequency)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_op_delete_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "42e9cbb6-2219-4ff1-ac71-209f1c8cd120",
"layout": {
"h": 4,
"i": "b3cdc370-248f-4dbe-9d47-7e00df2888c2",
"isResizable": true,
"w": 6,
"x": 0,
"y": 24
},
"maxPerRow": 4,
"name": "Getmore操作次数(Frequency)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_op_getmore_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "877b8ba6-0f7d-437d-b29a-08015c02b1ce",
"layout": {
"h": 4,
"i": "6aa9e8ca-ea2a-4cb6-be1b-59c165aed5d5",
"isResizable": true,
"w": 6,
"x": 6,
"y": 24
},
"maxPerRow": 4,
"name": "Command操作次数(Frequency)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_mongodb_op_command_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_mongodb_qps_average,instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327107424000
}
================================================
FILE: integrations/AliYun/dashboards/mse.json
================================================
{
"id": 0,
"group_id": 0,
"name": "MSE监控大盘",
"ident": "MSE-Monitor",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "aba69dc0-5a11-4bcd-add9-335b5a677bee",
"layout": {
"h": 5,
"i": "aba69dc0-5a11-4bcd-add9-335b5a677bee",
"isResizable": true,
"w": 6,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "PV(一分钟)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(delta(envoy_http_rq_total{envoy_clusterid=\"$envoy_clusterid\"}[1m]))",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"detailName": "详情",
"legengPosition": "right"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "e34a272e-6125-4afa-a2c1-80d7d9078673",
"layout": {
"h": 5,
"i": "116a5607-5860-426e-a560-d3241da88b57",
"isResizable": true,
"w": 9,
"x": 6,
"y": 0
},
"maxPerRow": 4,
"name": "请求成功率",
"options": {
"standardOptions": {
"decimals": 0,
"util": "percentUnit"
}
},
"targets": [
{
"expr": "sum(delta(envoy_http_downstream_rq{envoy_clusterid=\"$envoy_clusterid\"}[3m])) by (response_code_class)",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "pie",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "a8917108-58a6-479a-8ec4-571f1b5a79c2",
"layout": {
"h": 5,
"i": "9be66a1f-c0bb-47dc-a3c0-ad43b588789b",
"isResizable": true,
"w": 9,
"x": 15,
"y": 0
},
"maxPerRow": 4,
"name": "请求量(一分钟)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(delta(envoy_http_downstream_cx_rx_bytes_total{envoy_clusterid=\"$envoy_clusterid\"}[1m]))",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "1b102bee-ccc9-49a0-a1d1-cc097bb6a987",
"layout": {
"h": 6,
"i": "1b102bee-ccc9-49a0-a1d1-cc097bb6a987",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "平均延迟",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(envoy_http_downstream_rq_time_sum{envoy_clusterid=\"$envoy_clusterid\"}[10m])) / sum(rate(envoy_http_downstream_rq_time_count{envoy_clusterid=\"$envoy_clusterid\"}[10m]))",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "b432fc11-2f9d-4b72-826b-6ca787401859",
"layout": {
"h": 6,
"i": "ea4c1073-07d3-4adc-a4d3-4812cc55ad7c",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"maxPerRow": 4,
"name": "P95",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(envoy_http_downstream_rq_time_bucket{envoy_clusterid=\"$envoy_clusterid\"}[10m])) by (le, service))",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "9062d707-d8a7-4a93-82e5-46f6059e8d70",
"layout": {
"h": 6,
"i": "d36246b9-4a9c-4ab0-9171-c5ac330be0ca",
"isResizable": true,
"w": 8,
"x": 16,
"y": 5
},
"maxPerRow": 4,
"name": "QPS",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(envoy_http_downstream_rq{envoy_clusterid=\"$envoy_clusterid\"}[2m]))",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 40,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "c3f64cfd-adb2-4316-bb84-55f88ed513a3",
"layout": {
"h": 6,
"i": "807c34f9-bd61-4da3-ad88-41bb3e045605",
"isResizable": true,
"w": 24,
"x": 0,
"y": 11
},
"maxPerRow": 4,
"name": "Top Service Request",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "label_replace(label_replace(topk(10, sum(delta(envoy_cluster_upstream_rq_total{envoy_clusterid=\"$envoy_clusterid\", cluster_name=~\"outbound_([0-9]+)_(.*)_(.*).svc.cluster.local$\", cluster_name!~\".*waf-proxy.static\", cluster_name!~\"outbound_([0-9]+)_(.*)_kubernetes.default.svc.cluster.local\", cluster_name!~\"outbound_([0-9]+)_(.*)_(.*).kube-system.svc.cluster.local\", cluster_name!~\"outbound_([0-9]+)_(.*)_(.*).arms-prom.svc.cluster.local\"}[1m])) by (cluster_name)), \"service_name\", \"$3\", \"cluster_name\", \"outbound_([0-9]+)_(.*)_(.*).svc.cluster.local$\"), \"port\", \"$1\", \"cluster_name\", \"outbound_([0-9]+)_(.*)_(.*).svc.cluster.local$\")",
"legend": "{{service_name}}:{{port}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 7,
"id": "8df57678-ff19-4b63-b768-4dad3f12222b",
"layout": {
"h": 5,
"i": "44f413ba-3262-4ccf-a4b1-c1165bafaaff",
"isResizable": true,
"w": 24,
"x": 0,
"y": 17
},
"maxPerRow": 4,
"name": "Top Service RT",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "label_replace(label_replace(avg(delta(envoy_cluster_upstream_rq_time_sum{envoy_clusterid=\"$envoy_clusterid\", cluster_name=~\"outbound_([0-9]+)_(.*)_(.*)$\"}[3m])) by (cluster_name) / avg(delta(envoy_cluster_upstream_rq_time_count{envoy_clusterid=\"$envoy_clusterid\", cluster_name=~\"outbound_([0-9]+)_(.*)_(.*)$\"}[1m])) by (cluster_name), \"service_name\", \"$3\", \"cluster_name\", \"outbound_([0-9]+)_(.*)_(.*)$\"), \"port\", \"$1\", \"cluster_name\", \"outbound_([0-9]+)_(.*)_(.*)$\")",
"legend": "{{service_name}}:{{port}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(envoy_cluster_bind_errors, envoy_clusterid)",
"hide": false,
"label": "envoy_clusterid",
"multi": false,
"name": "envoy_clusterid",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327111860000
}
================================================
FILE: integrations/AliYun/dashboards/mysql.json
================================================
{
"name": "阿里云MySQL",
"tags": "阿里云 mysql",
"ident": "",
"configs": {
"panels": [
{
"type": "row",
"id": "1cb8caf3-ef35-4572-9ecc-71b9f063a685",
"name": "关键指标",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "1cb8caf3-ef35-4572-9ecc-71b9f063a685",
"isResizable": false
},
"panels": []
},
{
"type": "timeseries",
"id": "5aad17df-354e-40de-a643-61da6668939b",
"layout": {
"h": 5,
"w": 24,
"x": 0,
"y": 1,
"i": "fcf9515d-3a56-4596-8b3a-d7d8631aa218",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_SlowQueries{instanceName=\"$instance\"}",
"legend": "{{instanceName}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "每秒慢查询数量(countS)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "row",
"id": "2b3a816e-94e2-4c9d-9bb8-770c458033db",
"name": "基础指标",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 6,
"i": "2b3a816e-94e2-4c9d-9bb8-770c458033db",
"isResizable": false
},
"panels": []
},
{
"type": "timeseries",
"id": "12d4a674-6d09-4b02-aa4f-d767531bd368",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 7,
"i": "baba4778-b950-4224-9dac-9ecda041f93b",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_CpuUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "55b17951-a4ae-46a7-a2d7-57db1414f6ff",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 7,
"i": "c4c248bd-21fb-4485-8235-f50640116e65",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MemoryUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "内存使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "02c6af68-0e59-4f62-b0e8-80a9a9d0df82",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 7,
"i": "51cf9211-5e76-4176-b1ec-42929ccc6803",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_DiskUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "磁盘使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b72c5032-1ea0-4c87-9cfd-d21b374680f1",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 11,
"i": "b72c5032-1ea0-4c87-9cfd-d21b374680f1",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_ActiveSessions{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "活跃连接数",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b518c9c4-f0e8-4712-ab67-be4521eeff0c",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 11,
"i": "ff589719-6072-488d-819d-6e080a6f3c60",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_ConnectionUsage{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "连接数使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "86c1f728-ac1e-402b-bea6-2e3979f472c3",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 11,
"i": "5d673c5d-1fbb-4df4-9ece-c991d053ca34",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_IOPSUsage{instanceName=\"$instance\"} ",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "IOPS使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off",
"standardOptions": {
"util": "percent"
}
}
}
]
},
{
"type": "timeseries",
"id": "dc874418-8d11-409c-96e8-e48fac2f6e20",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 15,
"i": "86915dd4-990c-41ba-b048-3da301d97327",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_NetworkInNew{instanceName=\"$instance\"}/ 8",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络流入带宽",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b979878a-81a6-4c0d-960d-22a736d00655",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 15,
"i": "86f9e07f-85dc-44e0-8245-ca0a9b0dfa81",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_NetworkOutNew{instanceName=\"$instance\"}/ 8",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络流出带宽",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "row",
"id": "6d896a20-bf04-4dc7-94da-1394ef109848",
"name": "性能指标",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 19,
"i": "6d896a20-bf04-4dc7-94da-1394ef109848",
"isResizable": false
},
"panels": []
},
{
"type": "timeseries",
"id": "2e545b2b-130b-4829-a2d2-ee5305c302aa",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 20,
"i": "13dceb72-9e9d-483d-86d2-b192debdcece",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_QPS{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "QPS",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "reqps"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "0299da4b-d779-4ed7-9cd5-096f43181b2e",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 20,
"i": "2b23c24e-b6f9-44f5-8151-2d5a7585c31a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_TPS{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "TPS",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "reqps"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "56a0e345-1d4d-4051-a3cf-738bea220f96",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 20,
"i": "d1752ed4-f4a1-4c4b-854f-1c2ef01b34a4",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "AliyunRds_MySQL_IbufUseRatio{instanceName=\"$instance\"}",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "BP利用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"name": "datasource",
"label": "datasource",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "instance",
"label": "",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(AliyunRds_MySQL_SlowQueries, instanceName)"
}
],
"version": "3.0.0"
},
"uuid": 1717556327098444000
}
================================================
FILE: integrations/AliYun/dashboards/nat.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云NAT",
"ident": "",
"tags": "NAT",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": false,
"id": "446c0b21-14f5-4347-a2b6-41a7ffb48e0a",
"layout": {
"h": 1,
"i": "446c0b21-14f5-4347-a2b6-41a7ffb48e0a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Snat Session统计",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ec8fcf96-1691-4e45-9a5f-2f183021b434",
"layout": {
"h": 4,
"i": "ec8fcf96-1691-4e45-9a5f-2f183021b434",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "并发连接数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_session_active_connection_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "be560c38-4046-4f89-af05-bbef24cf21aa",
"layout": {
"h": 4,
"i": "48f9d2e1-600c-401a-b351-8d6ee9e33600",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"maxPerRow": 4,
"name": "并发丢弃连接速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_session_limit_drop_connection_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "12563bff-298b-4b44-bd21-0c47df8f9636",
"layout": {
"h": 4,
"i": "cad14ba1-6830-4873-9c5e-28f2d1c30f55",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "新建连接速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_session_new_connection_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0ba23c30-d378-4916-a833-fe46cdbc1a3f",
"layout": {
"h": 4,
"i": "35188f14-d7ef-46bd-85cb-a510334ce3c1",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"maxPerRow": 4,
"name": "新建丢弃连接速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_session_new_limit_drop_connection_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "13931491-40cd-4a6a-b888-b81e3800a705",
"layout": {
"h": 4,
"i": "345a4573-35a2-4654-9d4d-cdda3134bbac",
"isResizable": true,
"w": 6,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "新建连接水位",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_session_new_connection_water_lever_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9ff84fb6-81bf-43d8-9abf-4b2f95ef1071",
"layout": {
"h": 4,
"i": "5b5e4fc7-ed87-473a-9918-47e3707a639c",
"isResizable": true,
"w": 6,
"x": 6,
"y": 5
},
"maxPerRow": 4,
"name": "并发连接水位",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_session_active_connection_water_lever_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "be39a88e-8433-4cd1-ba3c-68a5bea890bc",
"layout": {
"h": 4,
"i": "cbb88b7e-6c95-41f4-a5ea-e607bcf0a30e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 5
},
"maxPerRow": 4,
"name": "区间内port分配失败的个数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_error_port_allocation_count_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ed75b227-f4bd-4a72-8269-0d7a1b2dd2a7",
"layout": {
"h": 4,
"i": "a25ee068-126f-4d7c-acde-574ecd1534ef",
"isResizable": true,
"w": 6,
"x": 18,
"y": 5
},
"maxPerRow": 4,
"name": "区间内port分配失败的速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_error_port_allocation_rate_value{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "a638124d-151f-4ad8-994d-a29385747ac8",
"layout": {
"h": 1,
"i": "a638124d-151f-4ad8-994d-a29385747ac8",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "入方向统计",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fc7cda13-3ad0-4e05-8861-38989debd507",
"layout": {
"h": 4,
"i": "bf83d311-1cdc-4013-92de-c412a900d2a8",
"isResizable": true,
"w": 6,
"x": 0,
"y": 10
},
"maxPerRow": 4,
"name": "入方向流量速率(bps)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_bw_rate_in_from_inside_value{instance_id=\"$instance_id\"}",
"legend": "from_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_bw_rate_in_from_outside_value{instance_id=\"$instance_id\"}",
"legend": "from_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b07e195-c481-463f-8de9-c5c7f6539175",
"layout": {
"h": 4,
"i": "29363208-08f6-4bfd-a9b0-836062e2b88f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 10
},
"maxPerRow": 4,
"name": "入方向流量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_bytes_in_from_inside_value{instance_id=\"$instance_id\"}",
"legend": "from_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_bytes_in_from_outside_value{instance_id=\"$instance_id\"}",
"legend": "from_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "be3f5f15-d399-4fd9-84d5-d000d0da86e6",
"layout": {
"h": 4,
"i": "0cbe7dd4-eba9-47e6-8c86-e11359a97bc2",
"isResizable": true,
"w": 6,
"x": 12,
"y": 10
},
"maxPerRow": 4,
"name": "入方向包速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_packets_in_from_inside_value{instance_id=\"$instance_id\"}",
"legend": "from_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_packets_in_from_outside_value{instance_id=\"$instance_id\"}",
"legend": "from_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ffa8eb49-3236-43ec-9fe6-00a2a398d64a",
"layout": {
"h": 4,
"i": "339eb9d0-1613-4117-b59c-278b3727ba0a",
"isResizable": true,
"w": 6,
"x": 18,
"y": 10
},
"maxPerRow": 4,
"name": "入方向包量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_pps_rate_in_from_inside_value{instance_id=\"$instance_id\"}",
"legend": "from_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_pps_rate_in_from_outside_value{instance_id=\"$instance_id\"}",
"legend": "from_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "b8e7b1b9-4e8d-4a26-b524-13a1cc141b6e",
"layout": {
"h": 1,
"i": "b8e7b1b9-4e8d-4a26-b524-13a1cc141b6e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 14
},
"name": "出方向统计",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e1d94e76-9e7f-4a7e-b207-e4bc63382e06",
"layout": {
"h": 4,
"i": "8c17c2d6-aba3-4b40-8a85-71e57f53d71b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "出方向流量速率(bps)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_bw_rate_out_to_inside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_bw_rate_out_to_outside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d314b395-16d6-4fe9-a020-84ca656f0117",
"layout": {
"h": 4,
"i": "65c4598a-1a95-4d55-b0ac-ae2446c7c98a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 15
},
"maxPerRow": 4,
"name": "出方向流量(bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_bytes_out_to_inside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_bytes_out_to_outside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "efd5b118-f530-4750-8ac8-c9e8ae4da600",
"layout": {
"h": 4,
"i": "9c22e1d7-8d2c-47c9-83ce-37e9eccbc8a4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "出方向包速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_packets_out_to_inside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_packets_out_to_outside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "50d416eb-3518-4f37-a554-17bb9af79dd8",
"layout": {
"h": 4,
"i": "3bd0c456-6d70-4479-b5aa-ed68705fda7e",
"isResizable": true,
"w": 6,
"x": 18,
"y": 15
},
"maxPerRow": 4,
"name": "出方向包量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_nat_gateway_pps_rate_out_to_inside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_inside",
"refId": "A"
},
{
"expr": "aliyun_acs_nat_gateway_pps_rate_out_to_outside_value{instance_id=\"$instance_id\"}",
"legend": "out_to_outside",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"label": "datasource",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_nat_gateway_bw_rate_in_from_inside_value, instance_id)",
"label": "instance_id",
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327113592000
}
================================================
FILE: integrations/AliYun/dashboards/oss.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云OSS",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "ae6ab52c-c38f-4697-aa93-1a4a7beed598",
"layout": {
"h": 1,
"i": "ae6ab52c-c38f-4697-aa93-1a4a7beed598",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "整体情况",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "aff2b3e8-e536-491e-98e4-e15639922ec5",
"layout": {
"h": 4,
"i": "e440abe0-5b4d-4877-ac1e-1d5b37c3db6c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "可用性/有效请求率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_availability_value{user_id=\"$user_id\"}",
"legend": "可用性",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_request_valid_rate_value{user_id=\"$user_id\"}",
"legend": "有效请求率",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "db8f77c9-fd97-4f58-af82-b51992bd5495",
"layout": {
"h": 4,
"i": "db8f77c9-fd97-4f58-af82-b51992bd5495",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "总请求数/有效请求数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_total_request_count_value{user_id=\"$user_id\"}",
"legend": "总请求数",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_valid_request_count_value{user_id=\"$user_id\"}",
"legend": "效请求数",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e28752b4-5f16-4358-a363-e76ddddb9c24",
"layout": {
"h": 4,
"i": "facf2d97-6eba-4559-80c8-73c0164fa29c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "公网出流量/公网入流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_internet_send_value{user_id=\"$user_id\"}",
"legend": "公网出流量",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_internet_recv_value{user_id=\"$user_id\"}",
"legend": "公网入流量",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "98d4c729-06c9-4e67-aa57-7864f2e46d98",
"layout": {
"h": 1,
"i": "98d4c729-06c9-4e67-aa57-7864f2e46d98",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "请求状态详情",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b8217fcb-278e-4dfe-8449-56bc9ebcb1c3",
"layout": {
"h": 4,
"i": "61ae3ba8-3a75-4297-b53f-f9de5c8476da",
"isResizable": true,
"w": 6,
"x": 0,
"y": 10
},
"maxPerRow": 4,
"name": "服务端错误请求数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_server_error_count_value{user_id=\"$user_id\"}",
"legend": "服务端错误请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1cb9ec59-73dc-4559-b7f0-ca90679bda79",
"layout": {
"h": 4,
"i": "e6566470-8b07-4061-bdff-1cfb38daa963",
"isResizable": true,
"w": 6,
"x": 6,
"y": 10
},
"maxPerRow": 4,
"name": "服务端错误请求占比",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_server_error_rate_value{user_id=\"$user_id\"}",
"legend": "服务端错误请求占比",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5f48c124-d36b-423d-bc11-fa0b9fdca3f2",
"layout": {
"h": 4,
"i": "104c6702-9114-421a-ad5e-7ad4b72d0c02",
"isResizable": true,
"w": 6,
"x": 12,
"y": 10
},
"maxPerRow": 4,
"name": "网络错误请求数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_network_error_count_value{user_id=\"$user_id\"}",
"legend": "网络错误请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "755a1c23-48c5-4491-b009-5137882723b0",
"layout": {
"h": 4,
"i": "082830cf-310a-4108-98e2-0c60d5c5bc35",
"isResizable": true,
"w": 6,
"x": 18,
"y": 10
},
"maxPerRow": 4,
"name": "网络错误请求占比",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_network_error_rate_value{user_id=\"$user_id\"}",
"legend": "网络错误请求占比",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f486359b-2733-4d92-adb9-1348f9565d65",
"layout": {
"h": 4,
"i": "1cb945cc-4b3c-4f2c-ace4-8037ad0e8816",
"isResizable": true,
"w": 6,
"x": 0,
"y": 14
},
"maxPerRow": 4,
"name": "客户端错误请求数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_client_other_error_count_value{user_id=\"$user_id\"}",
"legend": "客户端其他错误请求数",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_client_timeout_error_count_value{user_id=\"$user_id\"}",
"legend": "客户端超时错误请求数",
"refId": "B"
},
{
"expr": "aliyun_acs_oss_dashboard_resource_not_found_error_count_value{user_id=\"$user_id\"}",
"legend": "客户端资源不存在错误请求数",
"refId": "C"
},
{
"expr": "aliyun_acs_oss_dashboard_authorization_error_count_value{user_id=\"$user_id\"}",
"legend": "客户端授权错误请求数",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d525f038-812e-4024-9a8c-d4a9cbcf9d87",
"layout": {
"h": 4,
"i": "a237200b-1f05-4093-9aa6-d1e27770c38f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 14
},
"maxPerRow": 4,
"name": "客户端错误请求占比",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_client_other_error_rate_value{user_id=\"$user_id\"}",
"legend": "客户端其他错误请求占比",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_client_timeout_error_rate_value{user_id=\"$user_id\"}",
"legend": "客户端超时错误请求占比",
"refId": "B"
},
{
"expr": "aliyun_acs_oss_dashboard_resource_not_found_error_rate_value{user_id=\"$user_id\"}",
"legend": "客户端资源不存在错误请求占比",
"refId": "C"
},
{
"expr": "aliyun_acs_oss_dashboard_authorization_error_rate_value{user_id=\"$user_id\"}",
"legend": "客户端授权错误请求占比",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "39c5dc8b-5e87-47c8-89ba-73a9985483e2",
"layout": {
"h": 4,
"i": "74a86f62-668c-426c-b69d-4f9e5fd6fb2b",
"isResizable": true,
"w": 6,
"x": 12,
"y": 14
},
"maxPerRow": 4,
"name": "有效请求数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_success_count_value{user_id=\"$user_id\"}",
"legend": "成功请求数",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_redirect_count_value{user_id=\"$user_id\"}",
"legend": "重定向请求数",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5f10a135-c092-4905-9986-c51d28cb532f",
"layout": {
"h": 4,
"i": "9d9a4a35-a3bf-41c0-a019-8d70d1a38eab",
"isResizable": true,
"w": 6,
"x": 18,
"y": 14
},
"maxPerRow": 4,
"name": "有效请求占比",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_success_rate_value{user_id=\"$user_id\"}",
"legend": "成功请求占比",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_redirect_rate_value{user_id=\"$user_id\"}",
"legend": "重定向请求占比",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "eae6d0ff-c5a3-4d6e-8fd6-53c765562234",
"layout": {
"h": 1,
"i": "eae6d0ff-c5a3-4d6e-8fd6-53c765562234",
"isResizable": false,
"w": 24,
"x": 0,
"y": 18
},
"name": "计量参考",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2722be43-b2aa-44d2-9066-8a2818aadfa2",
"layout": {
"h": 4,
"i": "99bc9b5d-ac08-473a-8566-466777eb62e8",
"isResizable": true,
"w": 6,
"x": 0,
"y": 19
},
"maxPerRow": 4,
"name": "存储大小",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_metering_storage_utilization_value{user_id=\"$user_id\"}",
"legend": "存储大小",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "af133902-12f9-4746-b028-e021391e4a67",
"layout": {
"h": 4,
"i": "ecabf939-7f31-4fd5-a825-7d8ffd4033d7",
"isResizable": true,
"w": 6,
"x": 6,
"y": 19
},
"maxPerRow": 4,
"name": "计费请求数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_metering_get_request_value{user_id=\"$user_id\"}",
"legend": "Get类请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6bb61b7c-4b49-4d9b-a707-34913be40e6e",
"layout": {
"h": 4,
"i": "616e7a61-6bc1-4499-8065-b944789953e0",
"isResizable": true,
"w": 6,
"x": 12,
"y": 19
},
"maxPerRow": 4,
"name": "计量流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_metering_internet_rx_value{user_id=\"$user_id\"}",
"legend": "internet_rx",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_metering_internet_tx_value{user_id=\"$user_id\"}",
"legend": "internet_tx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "d27c946f-33e2-4d35-ad84-87b68a0076f1",
"layout": {
"h": 1,
"i": "d27c946f-33e2-4d35-ad84-87b68a0076f1",
"isResizable": false,
"w": 24,
"x": 0,
"y": 23
},
"name": "平均延时",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9c635833-ebb1-441a-b5ce-81a5bfabaa71",
"layout": {
"h": 4,
"i": "38632126-ab8b-4c9c-b15f-6dbcdc0e8ced",
"isResizable": true,
"w": 6,
"x": 0,
"y": 24
},
"maxPerRow": 4,
"name": "GetObject请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_get_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "GetObject请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_get_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "GetObject请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f1f08b1f-fda7-47a6-803d-fc68599cbbde",
"layout": {
"h": 4,
"i": "a61ee3b0-4097-4ee7-9567-16fcd5cc50a0",
"isResizable": true,
"w": 6,
"x": 6,
"y": 24
},
"maxPerRow": 4,
"name": "HeadObject请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_head_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "HeadObject请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_head_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "HeadObject请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7282be15-52e2-40a2-951f-6140daa7d595",
"layout": {
"h": 4,
"i": "0d424100-822c-4c5f-a02d-883fe3642fab",
"isResizable": true,
"w": 6,
"x": 12,
"y": 24
},
"maxPerRow": 4,
"name": "PutObject请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_put_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "PutObject请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_put_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "PutObject请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6f211349-a330-437a-bfab-29404739098d",
"layout": {
"h": 4,
"i": "58a5a502-0d4d-4b73-a9dd-8b930fc11052",
"isResizable": true,
"w": 6,
"x": 18,
"y": 24
},
"maxPerRow": 4,
"name": "PostObject请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_post_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "PostObject请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_post_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "PostObject请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6b392bdb-9677-42d4-b2fd-de53601fc858",
"layout": {
"h": 4,
"i": "2205cc4b-dcd2-444d-8b37-0694deb9308d",
"isResizable": true,
"w": 6,
"x": 0,
"y": 28
},
"maxPerRow": 4,
"name": "AppendObject请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_append_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "AppendObject请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_append_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "AppendObject请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1400ae72-eae2-4ffe-bcf5-2ebd8194d470",
"layout": {
"h": 4,
"i": "f2015851-061f-4345-8b20-989103549937",
"isResizable": true,
"w": 6,
"x": 6,
"y": 28
},
"maxPerRow": 4,
"name": "UploadPart请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_upload_part_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPart请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_upload_part_server_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPart请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "060e67f9-4db5-4a5f-a24c-3c54efcbf297",
"layout": {
"h": 4,
"i": "8bd12d33-05d8-484a-82b4-d161bf1681d0",
"isResizable": true,
"w": 6,
"x": 12,
"y": 28
},
"maxPerRow": 4,
"name": "UploadPartCopy请求平均延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_upload_part_copy_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPartCopy请求E2E平均延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_upload_part_copy_server_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPartCopy请求平均服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "40fb9cf4-c166-4ab8-b0b5-b510a63821a3",
"layout": {
"h": 1,
"i": "40fb9cf4-c166-4ab8-b0b5-b510a63821a3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "最大延时",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0997c38b-4776-476b-a859-109a6a5a3f74",
"layout": {
"h": 4,
"i": "f72517cd-65b3-40d8-8886-ae0870f4c4ec",
"isResizable": true,
"w": 6,
"x": 0,
"y": 33
},
"maxPerRow": 4,
"name": "GetObject请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_get_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "GetObject请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_get_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "GetObject请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "51bd67db-3c51-43bf-a4ad-fdb9688968c3",
"layout": {
"h": 4,
"i": "911f4590-e7f9-470a-a940-0fd78380200b",
"isResizable": true,
"w": 6,
"x": 6,
"y": 33
},
"maxPerRow": 4,
"name": "HeadObject请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_head_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "HeadObject请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_head_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "HeadObject请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "37e46a51-94cd-4bf5-ac8b-c2034d28f7b1",
"layout": {
"h": 4,
"i": "c2081d5d-f5c7-4989-9613-68e1b4090cd4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 33
},
"maxPerRow": 4,
"name": "PutObject请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_put_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "PutObject请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_put_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "PutObject请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "30782208-bd2e-4e13-b9ad-09b3aab748dc",
"layout": {
"h": 4,
"i": "5fb8bde2-d319-411d-bfac-24a70b80fbc3",
"isResizable": true,
"w": 6,
"x": 18,
"y": 33
},
"maxPerRow": 4,
"name": "PostObject请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_post_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "PostObject请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_post_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "PostObject请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "613657f6-bea5-4efb-a124-c35f0391b719",
"layout": {
"h": 4,
"i": "3cc0b72f-7691-4860-9076-816b4294b8a1",
"isResizable": true,
"w": 6,
"x": 0,
"y": 37
},
"maxPerRow": 4,
"name": "AppendObject请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_append_object_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "AppendObject请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_append_object_server_latency_value{user_id=\"$user_id\"}",
"legend": "AppendObject请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7217044b-7b5a-4484-9eb5-0ecbb28cd8f6",
"layout": {
"h": 4,
"i": "e933e521-d5ba-4f72-81c8-553d4ff025a7",
"isResizable": true,
"w": 6,
"x": 6,
"y": 37
},
"maxPerRow": 4,
"name": "UploadPart请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_upload_part_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPart请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_upload_part_server_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPart请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c1b2077d-024c-4ed1-955e-9f556346e28b",
"layout": {
"h": 4,
"i": "30d3a018-ac07-4dda-a28c-63e34a724f98",
"isResizable": true,
"w": 6,
"x": 12,
"y": 37
},
"maxPerRow": 4,
"name": "UploadPartCopy请求最大延时(Milliseconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_max_upload_part_copy_e2e_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPartCopy请求E2E最大延时",
"refId": "A"
},
{
"expr": "aliyun_acs_oss_dashboard_max_upload_part_copy_server_latency_value{user_id=\"$user_id\"}",
"legend": "UploadPartCopy请求最大服务器延时",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "aee6d2ba-1489-4a09-a9b5-97e22a9c25de",
"layout": {
"h": 1,
"i": "aee6d2ba-1489-4a09-a9b5-97e22a9c25de",
"isResizable": false,
"w": 24,
"x": 0,
"y": 41
},
"name": "请求成功操作分类",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9776ff4c-ee48-4c34-b15f-f59d6957eff6",
"layout": {
"h": 4,
"i": "5f3b1b12-57c7-4ed7-9d36-e54b7693c5ce",
"isResizable": true,
"w": 6,
"x": 0,
"y": 42
},
"maxPerRow": 4,
"name": "GetObject成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_get_object_count_value{user_id=\"$user_id\"}",
"legend": "GetObject成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4d06cc49-1729-4239-bfe9-3d74c41dbaca",
"layout": {
"h": 4,
"i": "d7b32954-4413-4ba2-a854-b5cc5df93e74",
"isResizable": true,
"w": 6,
"x": 6,
"y": 42
},
"maxPerRow": 4,
"name": "HeadObject成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_head_object_count_value{user_id=\"$user_id\"}",
"legend": "HeadObject成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "93f35c48-0d5a-4f0a-a12a-da7a9f259b6e",
"layout": {
"h": 4,
"i": "f2fb1ebc-e309-465a-a13b-a2777639adc2",
"isResizable": true,
"w": 6,
"x": 12,
"y": 42
},
"maxPerRow": 4,
"name": "PutObject成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_put_object_count_value{user_id=\"$user_id\"}",
"legend": "PutObject成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "02427f33-29fe-4c9e-b636-4902409920ae",
"layout": {
"h": 4,
"i": "8e5f7ed6-5721-4323-ac67-b4f1360bf9f5",
"isResizable": true,
"w": 6,
"x": 18,
"y": 42
},
"maxPerRow": 4,
"name": "PostObject成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_post_object_count_value{user_id=\"$user_id\"}",
"legend": "PostObject成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5088ba65-9da3-4d6a-95bb-a64a75b7c687",
"layout": {
"h": 4,
"i": "fba364b1-2ba3-48dc-8107-09a7594fb012",
"isResizable": true,
"w": 6,
"x": 0,
"y": 46
},
"maxPerRow": 4,
"name": "AppendObject成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_append_object_count_value{user_id=\"$user_id\"}",
"legend": "AppendObject成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1c679bf2-3487-46fc-84bc-cd87a141cb56",
"layout": {
"h": 4,
"i": "ac88ce75-7f09-45e8-8088-d74a75e417c5",
"isResizable": true,
"w": 6,
"x": 6,
"y": 46
},
"maxPerRow": 4,
"name": "UploadPart成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_upload_part_count_value{user_id=\"$user_id\"}",
"legend": "UploadPart成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9fb546a9-f159-4187-b054-b6a63d193743",
"layout": {
"h": 4,
"i": "f67a210d-6f21-4a7e-9987-6056d9e2b237",
"isResizable": true,
"w": 6,
"x": 12,
"y": 46
},
"maxPerRow": 4,
"name": "UploadPartCopy成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_upload_part_copy_count_value{user_id=\"$user_id\"}",
"legend": "UploadPartCopy成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "51834321-efc6-4fe8-af4d-de0aa0a8ac32",
"layout": {
"h": 4,
"i": "601ac8db-e263-4465-9dea-136b0480e437",
"isResizable": true,
"w": 6,
"x": 18,
"y": 46
},
"maxPerRow": 4,
"name": "DeleteObject成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_delete_object_count_value{user_id=\"$user_id\"}",
"legend": "DeleteObject成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c455124e-a519-43fb-b6eb-b617e80dcfcc",
"layout": {
"h": 4,
"i": "ad8b49a7-f63e-4952-88b8-afcf44fb674c",
"isResizable": true,
"w": 6,
"x": 0,
"y": 50
},
"maxPerRow": 4,
"name": "DeleteObjects成功请求",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_oss_dashboard_delete_objects_count_value{user_id=\"$user_id\"}",
"legend": "DeleteObjects成功请求数",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_oss_dashboard_get_object_count_value, user_id)",
"multi": true,
"name": "user_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327116137000
}
================================================
FILE: integrations/AliYun/dashboards/polardb_mysql.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云POLARDB-MySQL",
"ident": "",
"tags": "polardb 阿里云",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b72c5032-1ea0-4c87-9cfd-d21b374680f1",
"layout": {
"h": 4,
"i": "b72c5032-1ea0-4c87-9cfd-d21b374680f1",
"isResizable": true,
"w": 8,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "活跃连接数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_active_sessions_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b518c9c4-f0e8-4712-ab67-be4521eeff0c",
"layout": {
"h": 4,
"i": "ff589719-6072-488d-819d-6e080a6f3c60",
"isResizable": true,
"w": 8,
"x": 8,
"y": 0
},
"maxPerRow": 4,
"name": "连接数使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_connection_utilization_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "12d4a674-6d09-4b02-aa4f-d767531bd368",
"layout": {
"h": 4,
"i": "baba4778-b950-4224-9dac-9ecda041f93b",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"maxPerRow": 4,
"name": "CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_cpu_utilization_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "86c1f728-ac1e-402b-bea6-2e3979f472c3",
"layout": {
"h": 4,
"i": "5d673c5d-1fbb-4df4-9ece-c991d053ca34",
"isResizable": true,
"w": 8,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "每秒存储引擎IO吞吐量(KB)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_data_io_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e5a04daf-a6a1-4248-93ac-72dbd3d04f0b",
"layout": {
"h": 4,
"i": "e4c12b45-5748-4568-a108-c7f3b640a24c",
"isResizable": true,
"w": 8,
"x": 8,
"y": 4
},
"maxPerRow": 4,
"name": "每秒存储引擎IO次数(countSecond)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_data_iops_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "56a0e345-1d4d-4051-a3cf-738bea220f96",
"layout": {
"h": 4,
"i": "d1752ed4-f4a1-4c4b-854f-1c2ef01b34a4",
"isResizable": true,
"w": 8,
"x": 16,
"y": 4
},
"maxPerRow": 4,
"name": "每秒IO次数(countSecond)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_iops_usage_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "58799c95-0429-43eb-ba8e-33611d0795ab",
"layout": {
"h": 4,
"i": "fe529101-422b-407c-ad77-b71b58b8848a",
"isResizable": true,
"w": 8,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "内存命中率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_mem_hit_ratio_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "55b17951-a4ae-46a7-a2d7-57db1414f6ff",
"layout": {
"h": 4,
"i": "c4c248bd-21fb-4485-8235-f50640116e65",
"isResizable": true,
"w": 8,
"x": 8,
"y": 8
},
"maxPerRow": 4,
"name": "内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_memory_utilization_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2e545b2b-130b-4829-a2d2-ee5305c302aa",
"layout": {
"h": 4,
"i": "13dceb72-9e9d-483d-86d2-b192debdcece",
"isResizable": true,
"w": 8,
"x": 16,
"y": 8
},
"maxPerRow": 4,
"name": "每秒查询数量(count)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_qps_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "70d33549-dd0d-4b68-90e7-f2759fa6bcd5",
"layout": {
"h": 4,
"i": "496c32c6-49f3-4506-a491-772ac5159f89",
"isResizable": true,
"w": 8,
"x": 0,
"y": 12
},
"maxPerRow": 4,
"name": "每秒慢查询数量(countS)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_slow_queries_ps_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "61443377-131b-4cf9-8b3b-8f1cf4794086",
"layout": {
"h": 4,
"i": "dfcbc2d9-2714-4e8f-b5b6-290a7e9e35ce",
"isResizable": true,
"w": 8,
"x": 8,
"y": 12
},
"maxPerRow": 4,
"name": "每秒事务数(countS)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_polardb_cluster_tps_average{node_id=\"$node_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"label": "datasource",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_polardb_cluster_mps_average, name)",
"label": "",
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_polardb_cluster_mps_average{name=\"$name\"}, cluster_id)",
"label": "cluster_id",
"name": "cluster_id",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_polardb_cluster_active_sessions_average{cluster_id=\"$cluster_id\"}, node_id)",
"label": "node_id",
"name": "node_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327122536000
}
================================================
FILE: integrations/AliYun/dashboards/rds.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云RDS",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"layout": {
"h": 4,
"i": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "CPU平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_cpu_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c43eb882-915f-4c38-a0b5-8f33c21ab44a",
"layout": {
"h": 4,
"i": "09903231-6557-42be-9cf3-2873878e9bf2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"maxPerRow": 4,
"name": "内存平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_memory_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "37447883-ad79-46bc-888a-1be2835c1c64",
"layout": {
"h": 4,
"i": "378a5a26-c28e-4612-af09-f82ec2e11d80",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "磁盘平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_disk_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c9735607-3f24-44a7-bbf1-3ad39441c5c9",
"layout": {
"h": 4,
"i": "93a4c8a6-ac23-4e26-8a38-781ec1668820",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "IOPS平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_iops_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b516e7dc-8022-409d-b907-18c4143df891",
"layout": {
"h": 5,
"i": "b516e7dc-8022-409d-b907-18c4143df891",
"isResizable": true,
"w": 24,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "SQL执行量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_delete_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "delete",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_insert_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "insert",
"refId": "B"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_insert_select_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "insert_select",
"refId": "C"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_update_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "update",
"refId": "D"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_select_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "select",
"refId": "E"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_rds_dashboard_cpu_usage_average,name)",
"multi": false,
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_rds_dashboard_cpu_usage_average{name=\"$name\"},instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327125143000
}
================================================
FILE: integrations/AliYun/dashboards/rds_new.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云RDS_N",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "37447883-ad79-46bc-888a-1be2835c1c64",
"layout": {
"h": 4,
"i": "378a5a26-c28e-4612-af09-f82ec2e11d80",
"isResizable": true,
"w": 8,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "硬盘使用(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_rds_dashboard_disk_usage_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c9735607-3f24-44a7-bbf1-3ad39441c5c9",
"layout": {
"h": 4,
"i": "93a4c8a6-ac23-4e26-8a38-781ec1668820",
"isResizable": true,
"w": 8,
"x": 8,
"y": 0
},
"maxPerRow": 4,
"name": "IOPS使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_rds_dashboard_iops_usage_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9480ea17-27e7-49fa-ab87-3b10f2d4b6ed",
"layout": {
"h": 4,
"i": "d428b946-aa8b-449e-a4b7-303e9423b787",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"maxPerRow": 4,
"name": "连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_rds_dashboard_connection_usage_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"layout": {
"h": 4,
"i": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"isResizable": true,
"w": 8,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "CPU使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_cpu_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c43eb882-915f-4c38-a0b5-8f33c21ab44a",
"layout": {
"h": 4,
"i": "09903231-6557-42be-9cf3-2873878e9bf2",
"isResizable": true,
"w": 8,
"x": 8,
"y": 4
},
"maxPerRow": 4,
"name": "内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_rds_dashboard_memory_usage_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6a03d4e9-f1f8-4989-bbe0-0ef428365d6e",
"layout": {
"h": 4,
"i": "d42cd666-4060-43fc-b0bc-e9698f816c97",
"isResizable": true,
"w": 8,
"x": 16,
"y": 4
},
"maxPerRow": 4,
"name": "MySQL网络入流量(bits/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_rds_dashboard_my_sql_network_in_new_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b516e7dc-8022-409d-b907-18c4143df891",
"layout": {
"h": 5,
"i": "b516e7dc-8022-409d-b907-18c4143df891",
"isResizable": true,
"w": 24,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "SQL执行量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_delete_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "delete",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_insert_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "insert",
"refId": "B"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_insert_select_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "insert_select",
"refId": "C"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_update_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "update",
"refId": "D"
},
{
"expr": "sum(aliyun_acs_rds_dashboard_my_sql_com_select_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "select",
"refId": "E"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7a3fbc77-1d6c-40cf-bec9-38b3b0f45dd9",
"layout": {
"h": 5,
"i": "96d3ff16-9947-4ec6-95a4-bec2bdfd24d0",
"isResizable": true,
"w": 24,
"x": 0,
"y": 13
},
"maxPerRow": 4,
"name": "SQL慢查询",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_rds_dashboard_my_sql_slow_queries_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_rds_dashboard_cpu_usage_average,name)",
"multi": false,
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_rds_dashboard_cpu_usage_average{name=\"$name\"},instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327127106000
}
================================================
FILE: integrations/AliYun/dashboards/redis.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云REDIS",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"layout": {
"h": 4,
"i": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "CPU平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_kvstore_cpu_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c43eb882-915f-4c38-a0b5-8f33c21ab44a",
"layout": {
"h": 4,
"i": "09903231-6557-42be-9cf3-2873878e9bf2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"maxPerRow": 4,
"name": "内存平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_kvstore_memory_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "37447883-ad79-46bc-888a-1be2835c1c64",
"layout": {
"h": 4,
"i": "378a5a26-c28e-4612-af09-f82ec2e11d80",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "连接数平均使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_kvstore_connection_usage_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c9735607-3f24-44a7-bbf1-3ad39441c5c9",
"layout": {
"h": 4,
"i": "93a4c8a6-ac23-4e26-8a38-781ec1668820",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "失败统计平均值",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_kvstore_failed_count_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b516e7dc-8022-409d-b907-18c4143df891",
"layout": {
"h": 5,
"i": "b516e7dc-8022-409d-b907-18c4143df891",
"isResizable": true,
"w": 24,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "网络流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_kvstore_intranet_in_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "in",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_kvstore_intranet_out_average{instance_id=\"$instance_id\"}) by (instance_id)",
"legend": "out",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_cpu_usage_average,name)",
"multi": false,
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_cpu_usage_average{name=\"$name\"},instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327128561000
}
================================================
FILE: integrations/AliYun/dashboards/redis_cluster.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云REDIS 集群版",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": false,
"id": "2c38fd30-4c4c-40a9-ad4a-5c945db32947",
"layout": {
"h": 1,
"i": "2c38fd30-4c4c-40a9-ad4a-5c945db32947",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "默认分组",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"layout": {
"h": 4,
"i": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"isResizable": true,
"w": 8,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "CPU 使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_cpu_usage_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "82fed14f-911c-4f2f-8b39-8630b5cc43dc",
"layout": {
"h": 4,
"i": "def6d721-2816-40ef-b4fe-a111a8c035fe",
"isResizable": true,
"w": 8,
"x": 8,
"y": 1
},
"maxPerRow": 4,
"name": "内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_memory_usage_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "200741b5-1d0c-4bb8-a6a6-d26783e13b31",
"layout": {
"h": 4,
"i": "bef9407d-be17-457b-a824-3a1904c5a20c",
"isResizable": true,
"w": 8,
"x": 16,
"y": 1
},
"maxPerRow": 4,
"name": "连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_connection_usage_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef1e2ca6-268f-47bf-afd3-c9422443ac98",
"layout": {
"h": 4,
"i": "47adc622-6018-4e72-8c01-d7d6c88677a4",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "流入带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_intranet_in_ratio_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e33e2568-eea7-405e-9989-7b1bdf816f65",
"layout": {
"h": 4,
"i": "a229c543-1139-43bf-be02-bbea39e4e6a8",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"maxPerRow": 4,
"name": "流出带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_intranet_out_ratio_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "e469ebd7-9249-41ea-b547-10d697738854",
"layout": {
"h": 1,
"i": "e469ebd7-9249-41ea-b547-10d697738854",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "基础监控项",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1a088a52-fca7-4b55-a2ff-3893419c4bd4",
"layout": {
"h": 4,
"i": "4f26e8c4-c8c7-48db-bfe0-ad1a069b9bd2",
"isResizable": true,
"w": 8,
"x": 0,
"y": 10
},
"maxPerRow": 4,
"name": "DB连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_connection_usage_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "be416c06-1b9e-4861-8c4f-3bfd8737d836",
"layout": {
"h": 4,
"i": "66af6522-4be6-4ff4-87f9-ec56d8f181e7",
"isResizable": true,
"w": 9,
"x": 8,
"y": 10
},
"maxPerRow": 4,
"name": "数据节点CPU 使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_cpu_usage_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e6a480bb-7f56-4a8d-a3af-30c52e423852",
"layout": {
"h": 4,
"i": "857bb7f2-38e4-4416-8d69-1082450b9fd0",
"isResizable": true,
"w": 8,
"x": 0,
"y": 14
},
"maxPerRow": 4,
"name": "数据节点命中率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_hit_rate_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "aba1d918-5600-452a-b5a5-6f3795d0a220",
"layout": {
"h": 4,
"i": "d0beaee6-2eee-4a66-82ba-30e441aed819",
"isResizable": true,
"w": 8,
"x": 8,
"y": 14
},
"maxPerRow": 4,
"name": "数据节点流入带宽(KBytes/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_intranet_in_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5af6d738-b37f-4b40-9b9f-2c774a02bb26",
"layout": {
"h": 4,
"i": "8d4609c8-ce71-4e85-8a41-bc4328b817e5",
"isResizable": true,
"w": 8,
"x": 16,
"y": 14
},
"maxPerRow": 4,
"name": "数据节点流入带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_intranet_in_ratio_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "223b0eb7-b617-4897-8d4e-cc13db7c6846",
"layout": {
"h": 4,
"i": "f7f3d139-28e0-4b70-9529-dc914e77df7b",
"isResizable": true,
"w": 8,
"x": 0,
"y": 18
},
"maxPerRow": 4,
"name": "数据节点流出带宽(KBytes/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_intranet_out_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8e5b9c6b-90ec-4efe-85f4-1e386e28495d",
"layout": {
"h": 4,
"i": "22b8d485-016f-4320-93dd-ee7b41c3ef83",
"isResizable": true,
"w": 8,
"x": 8,
"y": 18
},
"maxPerRow": 4,
"name": "数据节点流出带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_intranet_out_ratio_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "51ac86b0-e108-4bf0-af19-80b0588e883b",
"layout": {
"h": 4,
"i": "82a19a60-ada6-4639-8590-b363c92cd30d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 18
},
"maxPerRow": 4,
"name": "Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_keys_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "54fc0148-930b-47a5-bdf4-06e44c8e1d6a",
"layout": {
"h": 4,
"i": "3d6e3c07-97b6-4930-ada9-21d1ffdfc9c6",
"isResizable": true,
"w": 8,
"x": 0,
"y": 22
},
"maxPerRow": 4,
"name": "数据节点内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_memory_usage_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8b7d5240-0c30-43c0-86d0-6620e5e122e0",
"layout": {
"h": 4,
"i": "91975194-a995-4fe9-8788-7503abfe414a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 22
},
"maxPerRow": 4,
"name": "DB已用连接数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_used_connection_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3e4719d7-db36-47d7-b4ee-fa6bfd691817",
"layout": {
"h": 4,
"i": "b70f9c9c-643c-44fa-89d2-2e99098ab508",
"isResizable": true,
"w": 8,
"x": 16,
"y": 22
},
"maxPerRow": 4,
"name": "阻塞客户端连接数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_blocked_clients_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "98ce0f92-5c53-44a6-bfcb-14de68a56a36",
"layout": {
"h": 4,
"i": "9dd21107-4432-4648-a156-68a6f1c41684",
"isResizable": true,
"w": 8,
"x": 0,
"y": 26
},
"maxPerRow": 4,
"name": "历史累计逐出Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_evicted_keys_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4b9a4e17-35ad-4968-91b9-4d2eca5e2b06",
"layout": {
"h": 4,
"i": "ec228b90-d782-4e0c-8301-6ef7a0a41556",
"isResizable": true,
"w": 8,
"x": 8,
"y": 26
},
"maxPerRow": 4,
"name": "每秒逐出Key总数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_evicted_keys_per_second_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cea77b74-9cfe-4fb6-a9cb-db9ae7f32958",
"layout": {
"h": 4,
"i": "bd004cc1-4d0d-49fb-98fe-b3c12b51399d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 26
},
"maxPerRow": 4,
"name": "历史累计淘汰Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_expired_keys_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "24c3c854-6552-4107-86fc-52420ea03692",
"layout": {
"h": 4,
"i": "3793c7dc-0da9-434f-a330-42ffba31b5b0",
"isResizable": true,
"w": 8,
"x": 0,
"y": 30
},
"maxPerRow": 4,
"name": "每秒淘汰Key总数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_expired_keys_per_second_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5af71ff4-8b40-40cd-b672-29b89921ace3",
"layout": {
"h": 4,
"i": "1293634c-92dc-4137-91e8-903c8d08be8c",
"isResizable": true,
"w": 8,
"x": 8,
"y": 30
},
"maxPerRow": 4,
"name": "已设置过期时间Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_expires_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "490e1a0e-ea61-445a-bc25-38d3abd265c9",
"layout": {
"h": 4,
"i": "576e80a4-9fea-461f-b9c7-70a23da86b92",
"isResizable": true,
"w": 8,
"x": 16,
"y": 30
},
"maxPerRow": 4,
"name": "读QPS(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_get_qps_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6457aad2-7774-4264-ba43-d784ab4fd936",
"layout": {
"h": 4,
"i": "0a2ab22e-af09-46d0-bb98-b148a9760ee9",
"isResizable": true,
"w": 8,
"x": 0,
"y": 34
},
"maxPerRow": 4,
"name": "每秒命中Key数量(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_hits_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "76dfed26-8798-4510-a420-0f8ad7ccb1e5",
"layout": {
"h": 4,
"i": "b62577c7-cea6-4bcd-8807-c4eed591545a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 34
},
"maxPerRow": 4,
"name": "每秒未命中Key数量(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_misses_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d7a87151-35e3-41b0-8fbc-577af8df2d95",
"layout": {
"h": 4,
"i": "190bc267-e449-4d73-a734-50c4193b8f68",
"isResizable": true,
"w": 8,
"x": 16,
"y": 34
},
"maxPerRow": 4,
"name": "其他QPS(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_other_ops_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7a9282ea-fd7b-41d0-ac7a-44ce0612cfa0",
"layout": {
"h": 4,
"i": "e1ef102d-981d-4c62-beae-5af73effe7bd",
"isResizable": true,
"w": 8,
"x": 0,
"y": 38
},
"maxPerRow": 4,
"name": "写QPS(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_put_qps_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f8f71f38-82cd-413c-abed-5e03e2214adf",
"layout": {
"h": 4,
"i": "d5b997c9-5bb3-441b-91c8-92b53ef73355",
"isResizable": true,
"w": 8,
"x": 8,
"y": 38
},
"maxPerRow": 4,
"name": "Lua脚本使用内存量(Byte)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_used_memory_lua_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "723f3cf7-5d92-46b0-a307-ef99f6f62cb2",
"layout": {
"h": 1,
"i": "723f3cf7-5d92-46b0-a307-ef99f6f62cb2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 42
},
"name": "延迟监控项",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "92ff998a-db7d-41d9-b4d4-fa9d04252a4c",
"layout": {
"h": 4,
"i": "26a238e5-ffd0-48b4-b7bd-1fdc9ba20776",
"isResizable": true,
"w": 8,
"x": 0,
"y": 43
},
"maxPerRow": 4,
"name": "数据节点平均响应时间(us)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_sharding_avg_rt_average{instance_id=\"$instance_id\", node_id=\"$node_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "ce1da00c-8e8a-498a-a6da-3e164128cb21",
"layout": {
"h": 1,
"i": "ce1da00c-8e8a-498a-a6da-3e164128cb21",
"isResizable": false,
"w": 24,
"x": 0,
"y": 47
},
"name": "Proxy监控项",
"type": "row"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_sharding_cpu_usage_average,name)",
"multi": false,
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_sharding_cpu_usage_average{name=\"$name\"},instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_sharding_cpu_usage_average{instance_id=\"$instance_id\"}, node_id)",
"multi": false,
"name": "node_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327130386000
}
================================================
FILE: integrations/AliYun/dashboards/redis_new.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云REDIS_N",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": false,
"id": "2c38fd30-4c4c-40a9-ad4a-5c945db32947",
"layout": {
"h": 1,
"i": "2c38fd30-4c4c-40a9-ad4a-5c945db32947",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "默认分组",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"layout": {
"h": 4,
"i": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"isResizable": true,
"w": 8,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "CPU 使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_cpu_usage_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c43eb882-915f-4c38-a0b5-8f33c21ab44a",
"layout": {
"h": 4,
"i": "09903231-6557-42be-9cf3-2873878e9bf2",
"isResizable": true,
"w": 8,
"x": 8,
"y": 1
},
"maxPerRow": 4,
"name": "内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_memory_usage_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "37447883-ad79-46bc-888a-1be2835c1c64",
"layout": {
"h": 4,
"i": "378a5a26-c28e-4612-af09-f82ec2e11d80",
"isResizable": true,
"w": 8,
"x": 16,
"y": 1
},
"maxPerRow": 4,
"name": "连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_connection_usage_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2d1c8cdf-538e-48b5-8563-358f242825e5",
"layout": {
"h": 4,
"i": "764e188d-d728-44a0-a79d-133d957df9a9",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "流入带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_intranet_in_ratio_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0577dc0f-887a-4a54-9100-ef5e5e7443a0",
"layout": {
"h": 4,
"i": "93955b6b-620c-4407-908c-01ba4f544fef",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"maxPerRow": 4,
"name": "流出带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_intranet_out_ratio_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a83bd550-3866-4225-9c09-08dd77e1b281",
"layout": {
"h": 4,
"i": "7b4d509b-1a8d-4f6a-9df4-f14116c4b9eb",
"isResizable": true,
"w": 8,
"x": 16,
"y": 5
},
"maxPerRow": 4,
"name": "连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_connection_usage_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_cpu_usage_average,name)",
"multi": false,
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_cpu_usage_average,instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327133348000
}
================================================
FILE: integrations/AliYun/dashboards/redis_standard.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云REDIS 标准版",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "2c38fd30-4c4c-40a9-ad4a-5c945db32947",
"layout": {
"h": 1,
"i": "2c38fd30-4c4c-40a9-ad4a-5c945db32947",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "默认分组",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"layout": {
"h": 4,
"i": "01f4d444-aa2d-466d-9615-c76baf60a40c",
"isResizable": true,
"w": 8,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "CPU 使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_cpu_usage_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "82fed14f-911c-4f2f-8b39-8630b5cc43dc",
"layout": {
"h": 4,
"i": "def6d721-2816-40ef-b4fe-a111a8c035fe",
"isResizable": true,
"w": 8,
"x": 8,
"y": 1
},
"maxPerRow": 4,
"name": "内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_memory_usage_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "200741b5-1d0c-4bb8-a6a6-d26783e13b31",
"layout": {
"h": 4,
"i": "bef9407d-be17-457b-a824-3a1904c5a20c",
"isResizable": true,
"w": 8,
"x": 16,
"y": 1
},
"maxPerRow": 4,
"name": "连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_connection_usage_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef1e2ca6-268f-47bf-afd3-c9422443ac98",
"layout": {
"h": 4,
"i": "47adc622-6018-4e72-8c01-d7d6c88677a4",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "流入带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_intranet_in_ratio_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e33e2568-eea7-405e-9989-7b1bdf816f65",
"layout": {
"h": 4,
"i": "a229c543-1139-43bf-be02-bbea39e4e6a8",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"maxPerRow": 4,
"name": "流出带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_intranet_out_ratio_average{instance_id=\"$instance_id\"}",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "e469ebd7-9249-41ea-b547-10d697738854",
"layout": {
"h": 1,
"i": "e469ebd7-9249-41ea-b547-10d697738854",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "基础监控项",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1a088a52-fca7-4b55-a2ff-3893419c4bd4",
"layout": {
"h": 4,
"i": "4f26e8c4-c8c7-48db-bfe0-ad1a069b9bd2",
"isResizable": true,
"w": 8,
"x": 0,
"y": 10
},
"maxPerRow": 4,
"name": "DB连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_connection_usage_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "be416c06-1b9e-4861-8c4f-3bfd8737d836",
"layout": {
"h": 4,
"i": "66af6522-4be6-4ff4-87f9-ec56d8f181e7",
"isResizable": true,
"w": 9,
"x": 8,
"y": 10
},
"maxPerRow": 4,
"name": "数据节点CPU 使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_cpu_usage_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e6a480bb-7f56-4a8d-a3af-30c52e423852",
"layout": {
"h": 4,
"i": "857bb7f2-38e4-4416-8d69-1082450b9fd0",
"isResizable": true,
"w": 8,
"x": 0,
"y": 14
},
"maxPerRow": 4,
"name": "数据节点命中率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_hit_rate_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "aba1d918-5600-452a-b5a5-6f3795d0a220",
"layout": {
"h": 4,
"i": "d0beaee6-2eee-4a66-82ba-30e441aed819",
"isResizable": true,
"w": 8,
"x": 8,
"y": 14
},
"maxPerRow": 4,
"name": "数据节点流入带宽(KBytes/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_intranet_in_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5af6d738-b37f-4b40-9b9f-2c774a02bb26",
"layout": {
"h": 4,
"i": "8d4609c8-ce71-4e85-8a41-bc4328b817e5",
"isResizable": true,
"w": 8,
"x": 16,
"y": 14
},
"maxPerRow": 4,
"name": "数据节点流入带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_intranet_in_ratio_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "223b0eb7-b617-4897-8d4e-cc13db7c6846",
"layout": {
"h": 4,
"i": "f7f3d139-28e0-4b70-9529-dc914e77df7b",
"isResizable": true,
"w": 8,
"x": 0,
"y": 18
},
"maxPerRow": 4,
"name": "数据节点流出带宽(KBytes/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_intranet_out_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8e5b9c6b-90ec-4efe-85f4-1e386e28495d",
"layout": {
"h": 4,
"i": "22b8d485-016f-4320-93dd-ee7b41c3ef83",
"isResizable": true,
"w": 8,
"x": 8,
"y": 18
},
"maxPerRow": 4,
"name": "数据节点流出带宽使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_intranet_out_ratio_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "51ac86b0-e108-4bf0-af19-80b0588e883b",
"layout": {
"h": 4,
"i": "82a19a60-ada6-4639-8590-b363c92cd30d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 18
},
"maxPerRow": 4,
"name": "Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_keys_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "54fc0148-930b-47a5-bdf4-06e44c8e1d6a",
"layout": {
"h": 4,
"i": "3d6e3c07-97b6-4930-ada9-21d1ffdfc9c6",
"isResizable": true,
"w": 8,
"x": 0,
"y": 22
},
"maxPerRow": 4,
"name": "数据节点内存使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_memory_usage_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8b7d5240-0c30-43c0-86d0-6620e5e122e0",
"layout": {
"h": 4,
"i": "91975194-a995-4fe9-8788-7503abfe414a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 22
},
"maxPerRow": 4,
"name": "DB已用连接数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_used_connection_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3e4719d7-db36-47d7-b4ee-fa6bfd691817",
"layout": {
"h": 4,
"i": "b70f9c9c-643c-44fa-89d2-2e99098ab508",
"isResizable": true,
"w": 8,
"x": 16,
"y": 22
},
"maxPerRow": 4,
"name": "阻塞客户端连接数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_blocked_clients_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "98ce0f92-5c53-44a6-bfcb-14de68a56a36",
"layout": {
"h": 4,
"i": "9dd21107-4432-4648-a156-68a6f1c41684",
"isResizable": true,
"w": 8,
"x": 0,
"y": 26
},
"maxPerRow": 4,
"name": "历史累计逐出Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_evicted_keys_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4b9a4e17-35ad-4968-91b9-4d2eca5e2b06",
"layout": {
"h": 4,
"i": "ec228b90-d782-4e0c-8301-6ef7a0a41556",
"isResizable": true,
"w": 8,
"x": 8,
"y": 26
},
"maxPerRow": 4,
"name": "每秒逐出Key总数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_evicted_keys_per_second_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cea77b74-9cfe-4fb6-a9cb-db9ae7f32958",
"layout": {
"h": 4,
"i": "bd004cc1-4d0d-49fb-98fe-b3c12b51399d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 26
},
"maxPerRow": 4,
"name": "历史累计淘汰Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_expired_keys_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "24c3c854-6552-4107-86fc-52420ea03692",
"layout": {
"h": 4,
"i": "3793c7dc-0da9-434f-a330-42ffba31b5b0",
"isResizable": true,
"w": 8,
"x": 0,
"y": 30
},
"maxPerRow": 4,
"name": "每秒淘汰Key总数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_expired_keys_per_second_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5af71ff4-8b40-40cd-b672-29b89921ace3",
"layout": {
"h": 4,
"i": "1293634c-92dc-4137-91e8-903c8d08be8c",
"isResizable": true,
"w": 8,
"x": 8,
"y": 30
},
"maxPerRow": 4,
"name": "已设置过期时间Key总数(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_expires_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "490e1a0e-ea61-445a-bc25-38d3abd265c9",
"layout": {
"h": 4,
"i": "576e80a4-9fea-461f-b9c7-70a23da86b92",
"isResizable": true,
"w": 8,
"x": 16,
"y": 30
},
"maxPerRow": 4,
"name": "读QPS(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_get_qps_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6457aad2-7774-4264-ba43-d784ab4fd936",
"layout": {
"h": 4,
"i": "0a2ab22e-af09-46d0-bb98-b148a9760ee9",
"isResizable": true,
"w": 8,
"x": 0,
"y": 34
},
"maxPerRow": 4,
"name": "每秒命中Key数量(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_hits_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "76dfed26-8798-4510-a420-0f8ad7ccb1e5",
"layout": {
"h": 4,
"i": "b62577c7-cea6-4bcd-8807-c4eed591545a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 34
},
"maxPerRow": 4,
"name": "每秒未命中Key数量(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_misses_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d7a87151-35e3-41b0-8fbc-577af8df2d95",
"layout": {
"h": 4,
"i": "190bc267-e449-4d73-a734-50c4193b8f68",
"isResizable": true,
"w": 8,
"x": 16,
"y": 34
},
"maxPerRow": 4,
"name": "其他QPS(个)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_other_ops_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7a9282ea-fd7b-41d0-ac7a-44ce0612cfa0",
"layout": {
"h": 4,
"i": "e1ef102d-981d-4c62-beae-5af73effe7bd",
"isResizable": true,
"w": 8,
"x": 0,
"y": 38
},
"maxPerRow": 4,
"name": "写QPS(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_put_qps_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f8f71f38-82cd-413c-abed-5e03e2214adf",
"layout": {
"h": 4,
"i": "d5b997c9-5bb3-441b-91c8-92b53ef73355",
"isResizable": true,
"w": 8,
"x": 8,
"y": 38
},
"maxPerRow": 4,
"name": "Lua脚本使用内存量(Byte)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_used_memory_lua_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "723f3cf7-5d92-46b0-a307-ef99f6f62cb2",
"layout": {
"h": 1,
"i": "723f3cf7-5d92-46b0-a307-ef99f6f62cb2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 42
},
"name": "延迟监控项",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "92ff998a-db7d-41d9-b4d4-fa9d04252a4c",
"layout": {
"h": 4,
"i": "26a238e5-ffd0-48b4-b7bd-1fdc9ba20776",
"isResizable": true,
"w": 8,
"x": 0,
"y": 43
},
"maxPerRow": 4,
"name": "数据节点平均响应时间(us)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_kvstore_standard_avg_rt_average{instance_id=\"$instance_id\" }",
"instant": false,
"legend": "",
"refId": "A",
"step": 120
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "ce1da00c-8e8a-498a-a6da-3e164128cb21",
"layout": {
"h": 1,
"i": "ce1da00c-8e8a-498a-a6da-3e164128cb21",
"isResizable": false,
"w": 24,
"x": 0,
"y": 47
},
"name": "Proxy监控项",
"type": "row"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_standard_keys_average,name)",
"multi": false,
"name": "name",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_kvstore_standard_keys_average{name=\"$name\"},instance_id)",
"multi": false,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327135170000
}
================================================
FILE: integrations/AliYun/dashboards/slb.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云SLB",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceName": "Default",
"datasourceValue": "${datasource}",
"id": "aa8b2623-1e14-43cd-a3c4-33944a61fcc5",
"layout": {
"h": 4,
"i": "aa8b2623-1e14-43cd-a3c4-33944a61fcc5",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "七层实例QPS使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_slb_dashboard_instance_qps_utilization_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} QPS使用率 ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b2002c63-8f0b-436c-b765-5bb65191f3c2",
"layout": {
"h": 4,
"i": "b2002c63-8f0b-436c-b765-5bb65191f3c2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"maxPerRow": 4,
"name": "7层协议实例Upstream状态码分布",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_slb_dashboard_upstream_code4xx_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 状态码 4xx ",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_slb_dashboard_upstream_code5xx_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 状态码 5xx ",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "71028d82-4804-468f-92f4-3444953b22cc",
"layout": {
"h": 4,
"i": "71028d82-4804-468f-92f4-3444953b22cc",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "新建连接数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_slb_dashboard_instance_new_connection_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 新建连接数",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "25f90635-ff68-4dc2-bfb0-c6634f0e6867",
"layout": {
"h": 4,
"i": "25f90635-ff68-4dc2-bfb0-c6634f0e6867",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "并发连接数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_slb_dashboard_active_connection_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 活跃连接数",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_slb_dashboard_inactive_connection_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 非活跃连接数",
"refId": "B"
},
{
"expr": "sum(aliyun_acs_slb_dashboard_max_connection_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 最大活跃连接数",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fde27e57-bdd6-4fd6-b3c0-75222f736d3b",
"layout": {
"h": 4,
"i": "fde27e57-bdd6-4fd6-b3c0-75222f736d3b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "数据包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_slb_dashboard_packet_rx_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 接收数据包数",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_slb_dashboard_packet_tx_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 发送数据包数",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a0fd47db-0b49-4b71-ae16-b4108324e35a",
"layout": {
"h": 4,
"i": "a0fd47db-0b49-4b71-ae16-b4108324e35a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_acs_slb_dashboard_instance_traffic_rx_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 入流量",
"refId": "A"
},
{
"expr": "sum(aliyun_acs_slb_dashboard_instance_traffic_tx_average{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}} 出流量",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_slb_dashboard_active_connection_average ,instance_id)",
"multi": true,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327138375000
}
================================================
FILE: integrations/AliYun/dashboards/slb_new.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云-负载均衡",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "ca8f4cf3-c45c-44ca-8685-b7563e6fba66",
"layout": {
"h": 1,
"i": "ca8f4cf3-c45c-44ca-8685-b7563e6fba66",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "实例",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "670fc7e8-0b40-4bc7-845b-6bbe8037b8f7",
"layout": {
"h": 4,
"i": "670fc7e8-0b40-4bc7-845b-6bbe8037b8f7",
"isResizable": true,
"w": 8,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "流量(bits/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_traffic_rx_average{instance_id=\"$instance_id\"}",
"legend": "rx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_traffic_tx_average{instance_id=\"$instance_id\"}",
"legend": "tx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "874e39c7-45e6-4146-8198-edd28949168a",
"layout": {
"h": 4,
"i": "38c17f21-f807-4740-b851-a42f19bb962d",
"isResizable": true,
"w": 8,
"x": 8,
"y": 1
},
"maxPerRow": 4,
"name": "数据包数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_packet_rx_average{instance_id=\"$instance_id\"}",
"legend": "rx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_packet_tx_average{instance_id=\"$instance_id\"}",
"legend": "tx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "92d4bf33-a96d-4e69-8c84-d02ad5ce3b5d",
"layout": {
"h": 4,
"i": "11934f91-3570-4e8a-973b-2575abb8d877",
"isResizable": true,
"w": 8,
"x": 16,
"y": 1
},
"maxPerRow": 4,
"name": "并发连接数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_max_connection_average{instance_id=\"$instance_id\"}",
"legend": "rx",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9ff35053-b00e-421c-81e3-4d8b67a37248",
"layout": {
"h": 4,
"i": "0b92d498-85ad-4329-b844-13f4ee56915f",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "新建连接数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_new_connection_average{instance_id=\"$instance_id\"}",
"legend": "rx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_traffic_tx_average{instance_id=\"$instance_id\"}",
"legend": "tx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6f583875-375e-4885-a471-592f8c01b889",
"layout": {
"h": 4,
"i": "cbd6ed4c-65e1-4fe1-8653-e7c037cb5a4a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"maxPerRow": 4,
"name": "丢弃流量(bits/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_drop_traffic_rx_average{instance_id=\"$instance_id\"}",
"legend": "rx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_drop_traffic_tx_average{instance_id=\"$instance_id\"}",
"legend": "tx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "16f91e54-cc25-4027-ad7f-f9a6bf2390e1",
"layout": {
"h": 4,
"i": "e0183572-a2da-433a-bf1e-7ec5988528f7",
"isResizable": true,
"w": 8,
"x": 16,
"y": 5
},
"maxPerRow": 4,
"name": "丢弃数据包数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_drop_packet_rx_average{instance_id=\"$instance_id\"}",
"legend": "rx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_drop_packet_tx_average{instance_id=\"$instance_id\"}",
"legend": "tx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9692546e-31f0-4446-8b4c-6e82fa552aaa",
"layout": {
"h": 4,
"i": "b680e559-3d8f-49bd-b3ea-bd39a9df9da3",
"isResizable": true,
"w": 8,
"x": 0,
"y": 9
},
"maxPerRow": 4,
"name": "丢弃连接数(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_drop_connection_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "875a2ea6-a132-4238-9cbc-2d65dd1a4d21",
"layout": {
"h": 4,
"i": "96bba171-a4ee-4433-8063-d22c19be9a27",
"isResizable": true,
"w": 8,
"x": 8,
"y": 9
},
"maxPerRow": 4,
"name": "七层实例QPS(Count/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_qps_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d15301ff-7240-41ec-a98f-4d0c5b7d6489",
"layout": {
"h": 4,
"i": "c38364ae-b8d6-4d9c-96a0-25be0df01c65",
"isResizable": true,
"w": 8,
"x": 16,
"y": 9
},
"maxPerRow": 4,
"name": "七层实例QPS使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_qps_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "323c5357-3ef6-49a3-8999-5f32138f58d6",
"layout": {
"h": 4,
"i": "b624a147-a21e-4d43-b587-a0719d756be6",
"isResizable": true,
"w": 8,
"x": 0,
"y": 13
},
"maxPerRow": 4,
"name": "实例新建连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_new_connection_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cf66ca56-593c-49e5-b882-7cfc764a8bdd",
"layout": {
"h": 4,
"i": "ea262a5c-cacf-46cf-8eb3-38c8dfe0c394",
"isResizable": true,
"w": 8,
"x": 8,
"y": 13
},
"maxPerRow": 4,
"name": "实例最大连接数使用率(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_max_connection_utilization_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "50655f70-86f6-43bd-8482-27106c4ef241",
"layout": {
"h": 4,
"i": "4ba3ee7b-f640-4390-a2fc-38acb8a1ccdd",
"isResizable": true,
"w": 8,
"x": 16,
"y": 13
},
"maxPerRow": 4,
"name": "七层实例UpstreamRt(ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_upstream_rt_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "adf69431-4342-41e2-8a41-ff45f32361a4",
"layout": {
"h": 4,
"i": "316b1b16-217d-4c7b-bd6c-37efba237391",
"isResizable": true,
"w": 8,
"x": 0,
"y": 17
},
"maxPerRow": 4,
"name": "7层协议实例Upstream状态码分布(Count/Second)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_upstream_code4xx_average{instance_id=\"$instance_id\"}",
"legend": "4xx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_upstream_code5xx_average{instance_id=\"$instance_id\"}",
"legend": "5xx",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "46b8aab8-73ec-4010-a31b-69d1b03ccc96",
"layout": {
"h": 4,
"i": "8b6e7b2c-f4d8-42e4-91cf-e5a830582bf6",
"isResizable": true,
"w": 8,
"x": 8,
"y": 17
},
"maxPerRow": 4,
"name": "7层协议实例状态码分布(Count/Second)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_status_code2xx_average{instance_id=\"$instance_id\"}",
"legend": "2xx",
"refId": "A"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_status_code3xx_average{instance_id=\"$instance_id\"}",
"legend": "3xx",
"refId": "B"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_status_code4xx_average{instance_id=\"$instance_id\"}",
"legend": "4xx",
"refId": "C"
},
{
"expr": "aliyun_acs_slb_dashboard_instance_status_code5xx_average{instance_id=\"$instance_id\"}",
"legend": "5xx",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5345bb38-3349-4d28-90de-b86c12465ecd",
"layout": {
"h": 4,
"i": "9fa233c3-d25f-4faf-b95a-e94f4b382dbc",
"isResizable": true,
"w": 8,
"x": 16,
"y": 17
},
"maxPerRow": 4,
"name": "七层实例RT(ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "aliyun_acs_slb_dashboard_instance_rt_average{instance_id=\"$instance_id\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": false,
"id": "e13cd648-4025-419e-a9cf-f5b714b4055e",
"layout": {
"h": 1,
"i": "e13cd648-4025-419e-a9cf-f5b714b4055e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "端口",
"type": "row"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_acs_slb_dashboard_rt_average, instance_id)",
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327140096000
}
================================================
FILE: integrations/AliYun/dashboards/waf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "阿里云WAF",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ec46b990-faf5-4ed7-a791-bbac5df91636",
"layout": {
"h": 4,
"i": "ec46b990-faf5-4ed7-a791-bbac5df91636",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "4xx 环比率V3",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_waf_4xx_ratio_wafv3_maximum{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "4xx 环比率V3",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "efc75e62-5e75-470d-b12b-a98ca44b268a",
"layout": {
"h": 4,
"i": "584b5a3c-2b7a-4e11-bee5-c2ed8661933e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"maxPerRow": 4,
"name": "5xx 环比率V3",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_waf_5xx_ratio_wafv3_maximum{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "5xx 环比率V3",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "79aefa1b-5e50-4c0c-980d-e5523b859509",
"layout": {
"h": 4,
"i": "fc875397-c1a4-4713-b564-09abf852bcf3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "4xx 环比率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_waf_4xx_ratio_maximum{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "4xx 环比率",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "60c211d4-d51a-4681-b23b-ec8cc5dce7fe",
"layout": {
"h": 4,
"i": "946be0db-32a3-48ea-9473-88fdfa77201d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "5xx 环比率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_waf_5xx_ratio_maximum{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "5xx 环比率",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0de9271c-7b19-4003-ae56-2e273b4b99c4",
"layout": {
"h": 4,
"i": "0de9271c-7b19-4003-ae56-2e273b4b99c4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "QPS环比增长率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_waf_qps_ratio_maximum{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}}QPS 环比增长率",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ccf2ffc1-6f22-4a13-b795-68072c077e1f",
"layout": {
"h": 4,
"i": "d0320716-f704-4b6e-8671-b58fb77a5d7c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "QPS环比下降率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(aliyun_waf_qps_ratio_down_maximum{instance_id=~\"$instance_id\"}) by (instance_id)",
"legend": "{{instance_id}}QPS 环比下降率",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(aliyun_waf_qps_ratio_maximum,instance_id)",
"multi": true,
"name": "instance_id",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327142143000
}
================================================
FILE: integrations/AliYun/markdown/README.md
================================================
# aliyun plugin
## 简介
使用[categraf](https://github.com/flashcatcloud/categraf)中[aliyun](https://github.com/flashcatcloud/categraf/tree/main/inputs/aliyun)插件拉取阿里云云监控的数据(通过 OpenAPI)。
## 授权
获取凭证 [https://usercenter.console.aliyun.com/#/manage/ak](https://usercenter.console.aliyun.com/#/manage/ak)
RAM 用户授权。RAM 用户调用云监控 API 前,需要所属的阿里云账号将权限策略授予对应的 RAM 用户,参见 [RAM 用户权限](https://help.aliyun.com/document_detail/43170.html?spm=a2c4g.11186623.0.0.30c841feqsoAAn)。
可以在 [授权页面](https://ram.console.aliyun.com/permissions) 新增授权,选择对应的用户,授予云监控只读权限 `AliyunCloudMonitorReadOnlyAccess`, 并为授予权限的用户创建accessKey 即可。
## Categraf中conf/input.aliyun/cloud.toml配置文件:
```toml
# # categraf采集周期,阿里云指标的粒度一般是60秒,建议设置不要少于60秒
interval = 120
[[instances]]
## 阿里云资源所处的region
## endpoint region 参考 https://help.aliyun.com/document_detail/28616.html#section-72p-xhs-6qt
region="cn-beijing"
endpoint="metrics.cn-hangzhou.aliyuncs.com"
## 填入你的access_key_id
access_key_id=""
## 填入你的access_key_secret
access_key_secret=""
## 可能无法获取当前最新指标,delay 是指所查询监控指标的截止时间距离现在多久
delay="50m"
## 阿里云指标的最小粒度,60s 是推荐值,再小了部分指标不支持
period="60s"
## 指标所属的namespace ,为空,则表示所有空间指标都要采集
## namespace 参考 https://help.aliyun.com/document_detail/163515.htm?spm=a2c4g.11186623.0.0.44d65c58mhgNw3
namespaces=["acs_ecs_dashboard"]
## 过滤某个namespace下的一个或多个指标
## metric name 参考 https://help.aliyun.com/document_detail/163515.htm?spm=a2c4g.11186623.0.0.401d15c73Z0dZh
## 参考页面中的Metric Id 填入下面的metricName ,页面中包含中文的Metric Name对应接口中的Description
[[instances.metric_filters]]
namespace=""
metric_names=["cpu_cores","vm.TcpCount", "cpu_idle"]
# 阿里云查询指标接口的QPS是50, 这里默认设置为一半
ratelimit=25
# 查询指定namespace指标后, namespace/metric_name等meta信息会缓存起来,catch_ttl 是指标的缓存时间
catch_ttl="1h"
# 每次请求阿里云endpoint的超时时间
timeout="5s"
```
## 效果图
### aliyun ecs

### aliyun rds

### aliyun redis

### aliyun slb

### aliyun waf

================================================
FILE: integrations/AppDynamics/collect/appdynamics/app.toml
================================================
#interval=15s
[[instances]]
#url_base = "http://{{.ip}}:{{.port}}/a.json?metric-path={{.metric_path}}&time-range-type=BETWEEN_TIMES&start-time={{.start_time}}&end-time={{.end_time}}&output=JSON"
#url_vars = [
# { ip="127.0.0.1", port="8090", application="cms", metric_path="Application Infrastructure Performance|AdminServer|Individual Nodes|xxxxx|Agent|App|Availability", start_time="$START_TIME", end_time="$END_TIME"},
#]
# # 指定url_vars中哪些key 作为最终的label附加
# url_var_label_keys= []
# #从url中提取变量
# url_label_key="instance"
# url_label_value="{{.Host}}"
# #自定义 http header
#headers = { Authorization="", X-Forwarded-For="", Host=""}
# #每次请求的超时时间
#timeout="5s"
# # precision of start-time and end-time
#precision="ms"
## basic auth
#username=""
#password=""
# # endtime = now - delay
#delay = "1m"
# # starttime = now - delay - period = endtime - period
#period = "1m"
# # 想要添加的额外label
#labels = {application="cms"}
# # 从返回中过滤哪些指标
filters = ["current", "max", "min", "value","sum", "count"]
# # 限制并发请求量, 最多同时有多少个请求
# # 默认范围(0,100)
#request_inflight= 10
## 强制开启100以上的并发请求 (不推荐)
# force_request_inflight = 1000
# # 是否开启 tls
# use_tls = true
# # tls 最小版本
## tls_min_version = "1.2"
# # tls ca证书路径
## tls_ca = "/etc/categraf/ca.pem"
# # tls cert 路径
## tls_cert = "/etc/categraf/cert.pem"
# # tls key 路径
## tls_key = "/etc/categraf/key.pem"
# # 是否跳过证书验证
## insecure_skip_verify = true
================================================
FILE: integrations/AppDynamics/markdown/README.md
================================================
## AppDynamics
AppDynamics 采集插件, 采集 AppDynamics 数据
## Configuration
```toml
#interval=15s
[[instances]]
#url_base = "http://{{.ip}}:{{.port}}/a.json?metric-path={{.metric_path}}&time-range-type=BETWEEN_TIMES&start-time={{.start_time}}&end-time={{.end_time}}&output=JSON"
#url_vars = [
# { ip="127.0.0.1", port="8090", application="cms", metric_path="Application Infrastructure Performance|AdminServer|Individual Nodes|xxxxx|Agent|App|Availability", start_time="$START_TIME", end_time="$END_TIME"},
#]
# # 指定url_vars中哪些key 作为最终的label附加
# url_var_label_keys= []
# #从url中提取变量
# url_label_key="instance"
# url_label_value="{{.Host}}"
# #自定义 http header
#headers = { Authorization="", X-Forwarded-For="", Host=""}
# #每次请求的超时时间
#timeout="5s"
# # precision of start-time and end-time
#precision="ms"
## basic auth
#username=""
#password=""
# # endtime = now - delay
#delay = "1m"
# # starttime = now - delay - period = endtime - period
#period = "1m"
# # 想要添加的额外label
#labels = {application="cms"}
# # 从返回中过滤哪些指标
filters = ["current", "max", "min", "value","sum", "count"]
# # 限制并发请求量, 最多同时有多少个请求
# # 默认范围(0,100)
#request_inflight= 10
## 强制开启100以上的并发请求 (不推荐)
# force_request_inflight = 1000
# # 是否开启 tls
# use_tls = true
# # tls 最小版本
## tls_min_version = "1.2"
# # tls ca证书路径
## tls_ca = "/etc/categraf/ca.pem"
# # tls cert 路径
## tls_cert = "/etc/categraf/cert.pem"
# # tls key 路径
## tls_key = "/etc/categraf/key.pem"
# # 是否跳过证书验证
## insecure_skip_verify = true
```
================================================
FILE: integrations/AutoMQ/alerts/常用告警规则.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka active controller 数目异常",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(kafka_controller_active_count) by (job) != 1",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327150310000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka Broker 连接数过多",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(kafka_server_connection_count) by (job, instance) \u003e 1000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327151102000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka Fetch 请求高延迟",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "kafka_request_time_99p_milliseconds{type=\"Fetch\"} \u003e 1000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327151678000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka Partition 数量过多",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "kafka_partition_count \u003e 5000",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327152312000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka Produce 请求高延迟",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "kafka_request_time_99p_milliseconds{type=\"Produce\"} \u003e 100",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327152981000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 每秒 Fetch 请求超过1000",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(rate(kafka_request_count_total{type=\"Fetch\"}[1m])) by (job, instance) \u003e 1000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327153589000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 每秒 Produce 请求超过1000",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(rate(kafka_request_count_total{type=\"Produce\"}[1m])) by (job, instance) \u003e 1000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327154308000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 消费延迟较高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(max(kafka_log_end_offset) by (job, topic, partition)) by (job, topic)\n- on (topic) group_left (consumer_group)\nsum(max(kafka_group_commit_offset) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) \u003e 10000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327154879000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 请求 S3 失败率较高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(rate(kafka_stream_operation_latency_count{operation_type=\"S3Request\", status=\"failed\"}[1m])) by (job, operation_name) \u003e 0.1",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327155698000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 请求错误率过高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(rate(kafka_request_error_count_total{error!=\"NONE\"}[1m])) by (job, instance, error) \u003e 0.1",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327156240000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 集群入向网络流量较高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(rate(kafka_broker_network_io_bytes_total{direction=\"in\"}[1m])) by (job) \u003e 50 * 1024 * 1024",
"severity": 2
},
{
"prom_ql": "sum(rate(kafka_broker_network_io_bytes_total{direction=\"in\"}[1m])) by (job) \u003e 20 * 1024 * 1024",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327156954000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Kafka 集群出向网络流量较高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "sum(rate(kafka_broker_network_io_bytes_total{direction=\"out\"}[1m])) by (job) \u003e 50 * 1024 * 1024",
"severity": 2
},
{
"prom_ql": "sum(rate(kafka_broker_network_io_bytes_total{direction=\"out\"}[1m])) by (job) \u003e 20 * 1024 * 1024",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327157473000
}
]
================================================
FILE: integrations/AutoMQ/collect/prometheus/采集OTEL-COLLECTOR的样例.toml
================================================
interval = 15
[[instances]]
urls = [
"http://<otel-collector-ip>:<otel-collector-port>/metrics"
]
url_label_key = "otel_collector"
url_label_value = "{{.Host}}"
================================================
FILE: integrations/AutoMQ/dashboards/broker_metrics.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Broker Metrics",
"ident": "automq-broker-metrics",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "0915f8ee-4c36-487d-b7dc-26566a5297f0",
"layout": {
"h": 4,
"i": "0915f8ee-4c36-487d-b7dc-26566a5297f0",
"isResizable": true,
"w": 8,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Error Rate(req/s)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, error) (rate(kafka_request_error_count_total{job=\"$cluster_id\", instance=~\"$node_id\", error!=\"NONE\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{error}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "e8f9e4d5-de65-4b2b-974e-a2b77eb84cad",
"layout": {
"h": 4,
"i": "e8f9e4d5-de65-4b2b-974e-a2b77eb84cad",
"isResizable": true,
"w": 8,
"x": 8,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Connection Count",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, listener) (kafka_server_connection_count{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#{{listener}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "ba9d84f6-3490-4d01-9733-f8c40e6a3694",
"layout": {
"h": 4,
"i": "ba9d84f6-3490-4d01-9733-f8c40e6a3694",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Partition Count",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_partition_count{job=\"$cluster_id\", instance=~\"$node_id\"}",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2a5b54cf-1ce3-4a0c-ba17-f4ce13fd9d35",
"layout": {
"h": 5,
"i": "2a5b54cf-1ce3-4a0c-ba17-f4ce13fd9d35",
"isResizable": true,
"w": 8,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Messages In",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance) (rate(kafka_message_count_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "ab1ba353-92cb-4dbf-a288-d9c1a1442622",
"layout": {
"h": 5,
"i": "ab1ba353-92cb-4dbf-a288-d9c1a1442622",
"isResizable": true,
"w": 8,
"x": 8,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Bytes In",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"in\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "d4802fc0-962c-44a0-b480-f4c12e9a8bbe",
"layout": {
"h": 5,
"i": "d4802fc0-962c-44a0-b480-f4c12e9a8bbe",
"isResizable": true,
"w": 8,
"x": 16,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Bytes Out",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"out\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fa0fa9d3-9715-439a-8a39-76fbeec93a71",
"layout": {
"h": 5,
"i": "fa0fa9d3-9715-439a-8a39-76fbeec93a71",
"isResizable": true,
"w": 8,
"x": 0,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Messages In Per Topic",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic, instance) (rate(kafka_message_count_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3de2881d-d2db-46f2-9d83-f365371fd78b",
"layout": {
"h": 5,
"i": "3de2881d-d2db-46f2-9d83-f365371fd78b",
"isResizable": true,
"w": 8,
"x": 8,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Bytes In Per Topic",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic, instance) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"in\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3a26bc9b-886f-4ad7-a47f-8c29d7bf3b15",
"layout": {
"h": 5,
"i": "3a26bc9b-886f-4ad7-a47f-8c29d7bf3b15",
"isResizable": true,
"w": 8,
"x": 16,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Bytes Out Per Topic",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic, instance) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"out\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "892bb035-44ad-4e93-bc64-8b70da74079a",
"layout": {
"h": 5,
"i": "892bb035-44ad-4e93-bc64-8b70da74079a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "Produce Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, topic) (rate(kafka_request_count_total{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Produce\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "db955862-42c0-484d-b972-0a99cd92190e",
"layout": {
"h": 5,
"i": "db955862-42c0-484d-b972-0a99cd92190e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "Fetch Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, topic) (rate(kafka_request_count_total{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Fetch\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "35895b86-045b-474d-8248-455cfef787ba",
"layout": {
"h": 5,
"i": "35895b86-045b-474d-8248-455cfef787ba",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "Produce Latency",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance) (kafka_request_time_99p_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Produce\"})",
"legend": "Node-{{instance}}#P99",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(instance) (kafka_request_time_mean_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Produce\"})",
"legend": "Node-{{instance}}#Avg",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum by(instance) (kafka_request_time_50p_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Produce\"})",
"legend": "Node-{{instance}}#P50",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "c323e0d1-9325-48e5-b446-723beb6944cc",
"layout": {
"h": 5,
"i": "c323e0d1-9325-48e5-b446-723beb6944cc",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "Fetch Latency",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance) (kafka_request_time_99p_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Fetch\"})",
"legend": "Node-{{instance}}#P99",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(instance) (kafka_request_time_mean_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Fetch\"})",
"legend": "Node-{{instance}}#Avg",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum by(instance) (kafka_request_time_50p_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=\"Fetch\"})",
"legend": "Node-{{instance}}#P50",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,job)",
"hide": false,
"multi": false,
"name": "cluster_id",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,instance)",
"hide": false,
"multi": true,
"name": "node_id",
"reg": "/(^[0-9]*$)/",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327159415000
}
================================================
FILE: integrations/AutoMQ/dashboards/cluster_overview.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Cluster Overview",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "970f949f-cdc8-4ec0-b2c8-c49d3bd14d8d",
"layout": {
"h": 4,
"i": "970f949f-cdc8-4ec0-b2c8-c49d3bd14d8d",
"isResizable": true,
"w": 3,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#F2495C",
"type": "base",
"value": null
},
{
"color": "#73BF69",
"value": 0
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(job) (kafka_controller_active_count{job=\"$cluster_id\"})",
"legend": "Active Controller",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "bcf3c5d7-9dd1-4cb6-81a1-3198f4b049e2",
"layout": {
"h": 4,
"i": "bcf3c5d7-9dd1-4cb6-81a1-3198f4b049e2",
"isResizable": true,
"w": 3,
"x": 3,
"y": 0
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 1
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "kafka_broker_fenced_count{job=\"$cluster_id\", instance=\"$active_controller\"}",
"legend": "Fenced Broker",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "333648e6-4bef-4e2d-bb84-f1d720d383cc",
"layout": {
"h": 4,
"i": "333648e6-4bef-4e2d-bb84-f1d720d383cc",
"isResizable": true,
"w": 4,
"x": 6,
"y": 0
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#3274D9",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "kafka_topic_count{job=\"$cluster_id\", instance=\"$active_controller\"}",
"legend": "Topics",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "659a8d96-5276-49fb-a011-22d85dace435",
"layout": {
"h": 4,
"i": "659a8d96-5276-49fb-a011-22d85dace435",
"isResizable": true,
"w": 2,
"x": 10,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Error Rate",
"options": {
"standardOptions": {
"decimals": 1,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 1
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(kafka_request_error_count_total{job=\"$cluster_id\", error!=\"NONE\"}[$__rate_interval]))",
"legend": "__auto",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "0650f241-385f-4386-8b76-7f23d53ef8fb",
"layout": {
"h": 8,
"i": "0650f241-385f-4386-8b76-7f23d53ef8fb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Bytes In (+) / Out (-)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(rate(kafka_broker_network_io_bytes_total{job=\"$cluster_id\", direction=\"in\"}[$__rate_interval]))",
"legend": "In",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum(rate(kafka_broker_network_io_bytes_total{job=\"$cluster_id\", direction=\"out\"}[$__rate_interval])) * -1",
"legend": "Out",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "aac842e5-e997-4c0f-84e4-0b104dd303e3",
"layout": {
"h": 4,
"i": "aac842e5-e997-4c0f-84e4-0b104dd303e3",
"isResizable": true,
"w": 6,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#F2495C",
"type": "base",
"value": null
},
{
"color": "#73BF69",
"value": 0
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "kafka_broker_active_count{job=\"$cluster_id\", instance=\"$active_controller\"}",
"legend": "Active Broker",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fb4e88e4-9e6e-44d0-8bb8-1fe717c0b824",
"layout": {
"h": 4,
"i": "fb4e88e4-9e6e-44d0-8bb8-1fe717c0b824",
"isResizable": true,
"w": 4,
"x": 6,
"y": 4
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#3274D9",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "kafka_partition_total_count{job=\"$cluster_id\", instance=\"$active_controller\"}",
"legend": "Partitions",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "b752e09e-4bc4-4e12-935e-d24f1d422a9b",
"layout": {
"h": 4,
"i": "b752e09e-4bc4-4e12-935e-d24f1d422a9b",
"isResizable": true,
"w": 2,
"x": 10,
"y": 4
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#56A64B",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(kafka_log_size{job=\"$cluster_id\"})",
"legend": "Size",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 70,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7b8a904f-179f-4140-9ed4-d109271ec7af",
"layout": {
"h": 5,
"i": "7b8a904f-179f-4140-9ed4-d109271ec7af",
"isResizable": true,
"w": 5,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Group Count",
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(kafka_group_count{job=\"$cluster_id\"})",
"legend": "total",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum(kafka_group_stable_count{job=\"$cluster_id\"})",
"legend": "stable",
"maxDataPoints": 240,
"refId": "D"
},
{
"expr": "sum(kafka_group_dead_count{job=\"$cluster_id\"})",
"legend": "dead",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum(kafka_group_empty_count{job=\"$cluster_id\"})",
"legend": "empty",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "sum(kafka_group_preparing_rebalance_count{job=\"$cluster_id\"})",
"legend": "prepare_rebalance",
"maxDataPoints": 240,
"refId": "E"
},
{
"expr": "sum(kafka_group_completing_rebalance_count{job=\"$cluster_id\"})",
"legend": "completing_rebalance",
"maxDataPoints": 240,
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"links": [
{
"title": "下钻",
"url": "/built-in-components/dashboard/detail?__uuid__=1717556327172992000&TSDB=${DS_PROMETHEUS}\u0026cluster_id=${cluster_id}\u0026group_id=${__field.labels.consumer_group}\u0026partition=all\u0026topic=${__field.labels.topic}"
}
],
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "b7cb494b-413f-4b12-ae1e-7da55b65d5ee",
"layout": {
"h": 5,
"i": "b7cb494b-413f-4b12-ae1e-7da55b65d5ee",
"isResizable": true,
"w": 19,
"x": 5,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Consumer Lag",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "rgba(44, 157, 61, 1)"
},
"type": "special"
},
{
"match": {
"from": 0
},
"result": {
"color": "rgba(228, 22, 22, 1)"
},
"type": "range"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"valueMappings": []
}
}
],
"targets": [
{
"expr": "sum by(topic) (max by(topic, partition) (kafka_log_end_offset{job=\"$cluster_id\"}))\n- on(topic) group_left(consumer_group)\nsum by(consumer_group, topic) (max by(consumer_group, topic, partition) (kafka_group_commit_offset{job=\"$cluster_id\"}))",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"value": "lag"
}
}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"topic"
],
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelValuesToRows",
"linkMode": "appendLinkColumn",
"links": [
{
"title": "下钻",
"url": "/built-in-components/dashboard/detail?__uuid__=1717556327174664000&TSDB=${DS_PROMETHEUS}\u0026cluster_id=${cluster_id}\u0026topic=${__field.labels.topic}"
}
],
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "90716481-5dd6-4c4e-8219-cbcb724a22d9",
"layout": {
"h": 4,
"i": "90716481-5dd6-4c4e-8219-cbcb724a22d9",
"isResizable": true,
"w": 24,
"x": 0,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Topic Statistics",
"options": {
"standardOptions": {
"decimals": 2
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "D"
},
"properties": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"valueMappings": []
}
},
{
"matcher": {
"id": "byFrameRefID",
"value": "A"
},
"properties": {
"standardOptions": {
"decimals": 1,
"util": "bytesSecIEC"
}
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "B"
},
"properties": {
"standardOptions": {
"decimals": 1,
"util": "bytesSecIEC"
}
},
"type": "special"
}
],
"targets": [
{
"expr": "sum by(topic) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"in\"}[$__rate_interval]))",
"legend": "Bytes in",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(topic) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"out\"}[$__rate_interval]))",
"legend": "Bytes out",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum by(topic) (rate(kafka_message_count_total{job=\"$cluster_id\", direction=\"in\"}[$__rate_interval]))",
"legend": "Msgs in",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "sum by(topic) (kafka_log_size{job=\"$cluster_id\"})",
"legend": "Size",
"maxDataPoints": 240,
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"instance"
],
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelValuesToRows",
"linkMode": "appendLinkColumn",
"links": [
{
"title": "下钻",
"url": "/built-in-components/dashboard/detail?__uuid__=1717556327159415000&DS_PROMETHEUS=${DS_PROMETHEUS}\u0026cluster_id=${cluster_id}\u0026node_id=${__field.labels.instance}"
}
],
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "a427c6d3-0fab-45c2-92e5-585e5435fcfe",
"layout": {
"h": 6,
"i": "a427c6d3-0fab-45c2-92e5-585e5435fcfe",
"isResizable": true,
"w": 24,
"x": 0,
"y": 17
},
"links": [],
"maxPerRow": 4,
"name": "Broker Statistics",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "C"
},
"properties": {
"standardOptions": {
"util": "bytesSecIEC"
}
}
},
{
"matcher": {
"id": "byFrameRefID",
"value": "D"
},
"properties": {
"standardOptions": {
"util": "bytesSecIEC"
}
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "H"
},
"properties": {
"standardOptions": {
"util": "milliseconds"
}
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "I"
},
"properties": {
"standardOptions": {
"util": "milliseconds"
}
},
"type": "special"
}
],
"targets": [
{
"expr": "sum by(instance) (kafka_partition_count{job=\"$cluster_id\"})",
"legend": "Partitions",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(instance) (kafka_server_connection_count{job=\"$cluster_id\"})",
"legend": "Connections",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum by(instance) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"in\"}[$__rate_interval]))",
"legend": "Bytes In",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "sum by(instance) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", direction=\"out\"}[$__rate_interval]))",
"legend": "Bytes Out",
"maxDataPoints": 240,
"refId": "D"
},
{
"expr": "sum by(instance) (rate(kafka_message_count_total{job=\"$cluster_id\", direction=\"in\"}[$__rate_interval]))",
"legend": "Msgs In",
"maxDataPoints": 240,
"refId": "E"
},
{
"expr": "sum by(instance) (rate(kafka_request_count_total{job=\"$cluster_id\", type=\"Produce\"}[$__rate_interval]))",
"legend": "Produce",
"maxDataPoints": 240,
"refId": "F"
},
{
"expr": "sum by(instance) (rate(kafka_request_count_total{job=\"$cluster_id\", type=\"Fetch\"}[$__rate_interval]))",
"legend": "Fetch",
"maxDataPoints": 240,
"refId": "G"
},
{
"expr": "sum by(instance) (kafka_request_time_99p_milliseconds{job=\"$cluster_id\", type=\"Produce\"})",
"legend": "Produce P99",
"maxDataPoints": 240,
"refId": "H"
},
{
"expr": "sum by(instance) (kafka_request_time_99p_milliseconds{job=\"$cluster_id\", type=\"Fetch\"})",
"legend": "Fetch P99",
"maxDataPoints": 240,
"refId": "I"
},
{
"expr": "sum by(instance) (rate(kafka_topic_request_failed_total{job=\"$cluster_id\", type=\"produce\"}[$__rate_interval]))",
"legend": "Failed Produce",
"maxDataPoints": 240,
"refId": "J"
},
{
"expr": "sum by(instance) (rate(kafka_topic_request_failed_total{job=\"$cluster_id\", type=\"fetch\"}[$__rate_interval]))",
"legend": "Failed Fetch",
"maxDataPoints": 240,
"refId": "K"
},
{
"expr": "sum by(instance) (rate(kafka_request_error_count_total{job=\"$cluster_id\", error!=\"NONE\"}[$__rate_interval]))",
"legend": "Error Request",
"maxDataPoints": 240,
"refId": "L"
}
],
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"instance": "Node ID"
}
}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,job)",
"hide": false,
"multi": false,
"name": "cluster_id",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(kafka_controller_active_count{job=\"$cluster_id\"} \u003e 0, instance)",
"hide": false,
"multi": false,
"name": "active_controller",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327162929000
}
================================================
FILE: integrations/AutoMQ/dashboards/detailed_metrics.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Detailed Metrics",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "d88272f7-f9f7-4a94-84f7-00a1b3674de9",
"layout": {
"h": 1,
"i": "d88272f7-f9f7-4a94-84f7-00a1b3674de9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "System Overview",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "299f918c-b627-44ab-bf3a-6291389acb72",
"layout": {
"h": 5,
"i": "299f918c-b627-44ab-bf3a-6291389acb72",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "JVM CPU Utilization",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "process_runtime_jvm_cpu_utilization_ratio{job=\"$cluster_id\", instance=~\"$node_id\"}",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "c3f483df-3c9a-47d1-a02e-459bd61f832c",
"layout": {
"h": 5,
"i": "c3f483df-3c9a-47d1-a02e-459bd61f832c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "JVM Heap Memory Utilization",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, type) (process_runtime_jvm_memory_usage_bytes{job=\"$cluster_id\", instance=~\"$node_id\", type=\"heap\"})\n/\nsum by(type, instance) (process_runtime_jvm_memory_limit_bytes{job=\"$cluster_id\", instance=~\"$node_id\", type=\"heap\"})",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7fd86b25-a721-4e30-9495-93349d3dcc42",
"layout": {
"h": 5,
"i": "7fd86b25-a721-4e30-9495-93349d3dcc42",
"isResizable": true,
"w": 8,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Memory Allocation",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "1 - sum by(job, instance) (kafka_stream_buffer_allocated_memory_size_bytes{job=\"$cluster_id\", instance=~\"$node_id\"}) / sum by(job, instance) (kafka_stream_buffer_used_memory_size_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "aa626e9e-1a08-4d34-a5a9-2d6f6dc498ea",
"layout": {
"h": 5,
"i": "aa626e9e-1a08-4d34-a5a9-2d6f6dc498ea",
"isResizable": true,
"w": 8,
"x": 8,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Memory Allocation Detailed",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, type) (kafka_stream_buffer_allocated_memory_size_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#{{type}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "77674110-158a-43cf-96e8-7b5566473c49",
"layout": {
"h": 5,
"i": "77674110-158a-43cf-96e8-7b5566473c49",
"isResizable": true,
"w": 8,
"x": 16,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Cache Size",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance) (kafka_stream_delta_wal_cache_size_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#Delta_WAL_Cache",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(job, instance) (kafka_stream_block_cache_size_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#Block_Cache",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "11cbfa04-73ff-40e1-b577-ca4348c74993",
"layout": {
"h": 1,
"i": "11cbfa04-73ff-40e1-b577-ca4348c74993",
"isResizable": false,
"w": 24,
"x": 0,
"y": 11
},
"name": "Kafka Requests",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "02ac7786-566c-4e24-bc84-213635a8b05f",
"layout": {
"h": 7,
"i": "02ac7786-566c-4e24-bc84-213635a8b05f",
"isResizable": true,
"w": 8,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, job, type) (rate(kafka_request_count_total{job=\"$cluster_id\", instance=~\"$node_id\", type=~\"$request_type\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{type}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2e16284f-d7ae-4acc-98ee-05ee32c80d72",
"layout": {
"h": 7,
"i": "2e16284f-d7ae-4acc-98ee-05ee32c80d72",
"isResizable": true,
"w": 8,
"x": 8,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (Avg)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_request_time_mean_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=~\"$request_type\"}",
"legend": "Node-{{instance}}#{{type}}",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "c1dfc44e-1ac9-4f28-b10e-c40077dd708a",
"layout": {
"h": 7,
"i": "c1dfc44e-1ac9-4f28-b10e-c40077dd708a",
"isResizable": true,
"w": 8,
"x": 16,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (P99)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_request_time_99p_milliseconds{job=\"$cluster_id\", instance=~\"$node_id\", type=~\"$request_type\"}",
"legend": "Node-{{instance}}#{{type}}",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "e437d394-379d-41e7-9df3-9486b0380dde",
"layout": {
"h": 6,
"i": "e437d394-379d-41e7-9df3-9486b0380dde",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "IO Thread Idle Ratio",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_io_threads_idle_rate_1m{job=\"$cluster_id\", instance=~\"$node_id\"}",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "1bb285b7-7d02-430e-9b66-7d985c87c32a",
"layout": {
"h": 6,
"i": "1bb285b7-7d02-430e-9b66-7d985c87c32a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "Network Thread Idle Ratio",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_network_threads_idle_rate{job=\"$cluster_id\", instance=~\"$node_id\"}",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "55b0d7fc-9ae3-4ae2-971a-593d49bb9650",
"layout": {
"h": 1,
"i": "55b0d7fc-9ae3-4ae2-971a-593d49bb9650",
"isResizable": false,
"w": 24,
"x": 0,
"y": 25
},
"name": "S3Stream Request",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fb96dfec-bbb8-4372-bc81-a57794a45e39",
"layout": {
"h": 6,
"i": "fb96dfec-bbb8-4372-bc81-a57794a45e39",
"isResizable": true,
"w": 8,
"x": 0,
"y": 26
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, operation_name, operation_type) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"$s3stream_request\", operation_type=\"S3Stream\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "700efda2-c816-4014-b7fd-e02262e23218",
"layout": {
"h": 6,
"i": "700efda2-c816-4014-b7fd-e02262e23218",
"isResizable": true,
"w": 8,
"x": 8,
"y": 26
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (Avg)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"$s3stream_request\", operation_type=\"S3Stream\"}",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "74596486-1c73-46ad-be71-4c84f8782a8f",
"layout": {
"h": 6,
"i": "74596486-1c73-46ad-be71-4c84f8782a8f",
"isResizable": true,
"w": 8,
"x": 16,
"y": 26
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (P99)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"$s3stream_request\", operation_type=\"S3Stream\"}",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "5cfbeac7-6507-4d22-8ad3-5ceb37b70c55",
"layout": {
"h": 1,
"i": "5cfbeac7-6507-4d22-8ad3-5ceb37b70c55",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "S3 Request",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "4f64e96e-bccf-43e0-b15b-4a3cf85c2555",
"layout": {
"h": 6,
"i": "4f64e96e-bccf-43e0-b15b-4a3cf85c2555",
"isResizable": true,
"w": 12,
"x": 0,
"y": 33
},
"links": [],
"maxPerRow": 4,
"name": "Error Rate",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 0.1
}
],
"style": "off"
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(operation_name, instance) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_type=\"S3Request\", status=\"failed\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "ad31f321-fc64-4a40-8b63-cba3f82ee493",
"layout": {
"h": 6,
"i": "ad31f321-fc64-4a40-8b63-cba3f82ee493",
"isResizable": true,
"w": 12,
"x": 12,
"y": 33
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, operation_name) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"$s3request\", operation_type=\"S3Request\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "91eb39ef-6c0c-4f97-a395-f2a282cf0f02",
"layout": {
"h": 5,
"i": "91eb39ef-6c0c-4f97-a395-f2a282cf0f02",
"isResizable": true,
"w": 12,
"x": 0,
"y": 39
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (Avg)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"targets": [
{
"expr": "avg by(instance, operation_name) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"$s3request\", operation_type=\"S3Request\"})",
"legend": "Node-{{instance}}#{{operation_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "768b847a-214d-4fa0-bb25-465f12d53ef3",
"layout": {
"h": 5,
"i": "768b847a-214d-4fa0-bb25-465f12d53ef3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 39
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (P99)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"$s3request\", operation_type=\"S3Request\"})",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "0fc296de-7e62-4422-a2bf-193b11c3f66f",
"layout": {
"h": 1,
"i": "0fc296de-7e62-4422-a2bf-193b11c3f66f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 44
},
"name": "Delta WAL",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "f52eb9f4-198b-422b-b669-58c3684177a3",
"layout": {
"h": 5,
"i": "f52eb9f4-198b-422b-b669-58c3684177a3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 45
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, operation_name) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"append_wal|append_wal_full\", operation_type=\"S3Storage\", stage=\"complete\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "bd378630-0756-4734-95ae-49b61d7ad45c",
"layout": {
"h": 5,
"i": "bd378630-0756-4734-95ae-49b61d7ad45c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 45
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name, stage) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"append_wal\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{stage}}_Avg",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance, operation_name, stage) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"append_wal\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{stage}}_P99",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "de311c0f-406e-45fb-b489-14fcb535332e",
"layout": {
"h": 1,
"i": "de311c0f-406e-45fb-b489-14fcb535332e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 50
},
"name": "Delta WAL Cache",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "94970a87-f173-4828-89ba-421c31b62252",
"layout": {
"h": 5,
"i": "94970a87-f173-4828-89ba-421c31b62252",
"isResizable": true,
"w": 8,
"x": 0,
"y": 51
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, operation_name) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"append_log_cache|append_log_cache_full\", operation_type=\"S3Storage\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(job, instance, operation_name, status) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_type=\"S3Storage\", operation_name=\"read_log_cache\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "b3714360-1340-4088-80cd-587fdd847888",
"layout": {
"h": 5,
"i": "b3714360-1340-4088-80cd-587fdd847888",
"isResizable": true,
"w": 8,
"x": 8,
"y": 51
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (Avg)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"append_log_cache\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance, operation_name, status) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_log_cache\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "778750a6-c587-42d7-b7a2-9ab123ad2be8",
"layout": {
"h": 5,
"i": "778750a6-c587-42d7-b7a2-9ab123ad2be8",
"isResizable": true,
"w": 8,
"x": 16,
"y": 51
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency (P99)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"append_log_cache\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance, operation_name, status) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_log_cache\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "7d25b5df-0130-4464-87eb-91939d141487",
"layout": {
"h": 1,
"i": "7d25b5df-0130-4464-87eb-91939d141487",
"isResizable": false,
"w": 24,
"x": 0,
"y": 56
},
"name": "Upload WAL",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "c3cb53a0-80cf-43fa-b758-e41c5a354b33",
"layout": {
"h": 6,
"i": "c3cb53a0-80cf-43fa-b758-e41c5a354b33",
"isResizable": true,
"w": 12,
"x": 0,
"y": 57
},
"links": [],
"maxPerRow": 4,
"name": "Upload WAL Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, operation_name) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"upload_wal|force_upload_wal\", operation_type=\"S3Storage\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "99715d27-cc5a-4625-aa37-f05594e7af58",
"layout": {
"h": 6,
"i": "99715d27-cc5a-4625-aa37-f05594e7af58",
"isResizable": true,
"w": 12,
"x": 12,
"y": 57
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name, stage) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"upload_wal|force_upload_wal\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{stage}}_Avg",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance, operation_name, stage) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"upload_wal|force_upload_wal\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{stage}}_P99",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "debcdc1d-d36b-4992-a8d4-f9b02bf3299c",
"layout": {
"h": 1,
"i": "debcdc1d-d36b-4992-a8d4-f9b02bf3299c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 63
},
"name": "Block Cache",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3afefad7-cd2d-4cb8-9071-6bebd067870d",
"layout": {
"h": 5,
"i": "3afefad7-cd2d-4cb8-9071-6bebd067870d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 64
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, operation_name, status) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=~\"read_block_cache\", operation_type=\"S3Storage\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2aa45efb-e3d4-4a8c-996c-a23ecbedbfe7",
"layout": {
"h": 5,
"i": "2aa45efb-e3d4-4a8c-996c-a23ecbedbfe7",
"isResizable": true,
"w": 12,
"x": 12,
"y": 64
},
"links": [],
"maxPerRow": 4,
"name": "Request Latency",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name, status) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_block_cache\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}_Avg",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance, operation_name, status) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_block_cache\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}_P99",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "46168b03-fae4-415c-b17e-28b40a436a1e",
"layout": {
"h": 6,
"i": "46168b03-fae4-415c-b17e-28b40a436a1e",
"isResizable": true,
"w": 8,
"x": 0,
"y": 69
},
"links": [],
"maxPerRow": 4,
"name": "Read Ahead Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance, operation_name, status) (rate(kafka_stream_operation_latency_count{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_ahead\", operation_type=\"S3Storage\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "1b7708a0-3e19-421a-8a22-155c78846797",
"layout": {
"h": 6,
"i": "1b7708a0-3e19-421a-8a22-155c78846797",
"isResizable": true,
"w": 8,
"x": 8,
"y": 69
},
"links": [],
"maxPerRow": 4,
"name": "Read Ahead Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance) (rate(kafka_stream_read_ahead_size_sum_bytes{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "957402a2-77c2-46a7-8258-461b3c463345",
"layout": {
"h": 6,
"i": "957402a2-77c2-46a7-8258-461b3c463345",
"isResizable": true,
"w": 8,
"x": 16,
"y": 69
},
"links": [],
"maxPerRow": 4,
"name": "Read Ahead Latency",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name, status) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_ahead\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}_Avg",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance, operation_name, status) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_name=\"read_ahead\", operation_type=\"S3Storage\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{status}}_P99",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "2f2fd063-d7d5-4bb9-9c8a-b0d5d41920a1",
"layout": {
"h": 5,
"i": "2f2fd063-d7d5-4bb9-9c8a-b0d5d41920a1",
"isResizable": true,
"w": 8,
"x": 0,
"y": 75
},
"links": [],
"maxPerRow": 4,
"name": "Read Ahead Size Avg",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance) (kafka_stream_read_ahead_size_mean_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "c2fa4768-45d7-4653-97ef-0c26d78ffe15",
"layout": {
"h": 5,
"i": "c2fa4768-45d7-4653-97ef-0c26d78ffe15",
"isResizable": true,
"w": 8,
"x": 8,
"y": 75
},
"links": [],
"maxPerRow": 4,
"name": "Available Inflight Read Ahead Size",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(job, instance) (kafka_stream_available_inflight_read_ahead_size_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "3ecc36e8-5ad0-404b-a70e-b3a036ec3325",
"layout": {
"h": 5,
"i": "3ecc36e8-5ad0-404b-a70e-b3a036ec3325",
"isResizable": true,
"w": 8,
"x": 16,
"y": 75
},
"links": [],
"maxPerRow": 4,
"name": "Read Ahead Throttle Time",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance) (kafka_stream_read_ahead_limiter_queue_time_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}_Avg",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance) (kafka_stream_read_ahead_limiter_queue_time_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}_P99",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "8c0dcf14-6d8b-4525-a23e-07c7d4bcb116",
"layout": {
"h": 1,
"i": "8c0dcf14-6d8b-4525-a23e-07c7d4bcb116",
"isResizable": false,
"w": 24,
"x": 0,
"y": 80
},
"name": "Networks",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3c3878b8-665b-43ac-b59b-09457ea69114",
"layout": {
"h": 6,
"i": "3c3878b8-665b-43ac-b59b-09457ea69114",
"isResizable": true,
"w": 12,
"x": 0,
"y": 81
},
"links": [],
"maxPerRow": 4,
"name": "Total Bytes In / Out",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, job) (rate(kafka_stream_network_inbound_usage_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#In",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(instance, job) (rate(kafka_stream_network_outbound_usage_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval])) * -1",
"legend": "Node-{{instance}}#Out",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "a0f1b7de-5c7d-488e-8699-5839453ba1c1",
"layout": {
"h": 6,
"i": "a0f1b7de-5c7d-488e-8699-5839453ba1c1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 81
},
"links": [],
"maxPerRow": 4,
"name": "Bytes In / Out",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, job) (rate(kafka_stream_download_size_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#In_S3",
"maxDataPoints": 240,
"refId": "D"
},
{
"expr": "sum by(instance, job) (rate(kafka_stream_upload_size_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval])) * -1",
"legend": "Node-{{instance}}#Out_S3",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "sum by(instance, job, direction) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\", direction=\"in\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#In_Msg",
"maxDataPoints": 240,
"refId": "E"
},
{
"expr": "sum by(instance, job, direction) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\", direction=\"out\"}[$__rate_interval])) * -1",
"legend": "Node-{{instance}}#Out_Msg",
"maxDataPoints": 240,
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "40f8fcd4-8d2c-44b7-8697-abc455f3dbaf",
"layout": {
"h": 6,
"i": "40f8fcd4-8d2c-44b7-8697-abc455f3dbaf",
"isResizable": true,
"w": 12,
"x": 0,
"y": 87
},
"links": [],
"maxPerRow": 4,
"name": "Available Bandwidth",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(instance, job) (kafka_stream_network_inbound_available_bandwidth_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#In",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum by(instance, job) (kafka_stream_network_outbound_available_bandwidth_bytes{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#Out",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2e1cc35b-aed8-498e-9ab2-1a06732921f8",
"layout": {
"h": 6,
"i": "2e1cc35b-aed8-498e-9ab2-1a06732921f8",
"isResizable": true,
"w": 12,
"x": 12,
"y": 87
},
"links": [],
"maxPerRow": 4,
"name": "Network Limiter Queue Time",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance) (kafka_stream_network_inbound_limiter_queue_time_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#In_Avg",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg by(instance) (kafka_stream_network_outbound_limiter_queue_time_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#Out_Avg",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "avg by(instance) (kafka_stream_network_inbound_limiter_queue_time_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#In_P99",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "avg by(instance) (kafka_stream_network_outbound_limiter_queue_time_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\"})",
"legend": "Node-{{instance}}#Out_P99",
"maxDataPoints": 240,
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "3a5d1265-435f-4b55-a0ff-8d756abf2482",
"layout": {
"h": 1,
"i": "3a5d1265-435f-4b55-a0ff-8d756abf2482",
"isResizable": false,
"w": 24,
"x": 0,
"y": 93
},
"name": "S3 Object",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7890d4fb-abd0-4909-bf18-40aad3396865",
"layout": {
"h": 5,
"i": "7890d4fb-abd0-4909-bf18-40aad3396865",
"isResizable": true,
"w": 12,
"x": 0,
"y": 94
},
"links": [],
"maxPerRow": 4,
"name": "S3 Object Count Total",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(state) (kafka_stream_s3_object_count{job=\"$cluster_id\"})",
"legend": "__auto",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3cbffbbb-a527-4c70-88c3-a40e6da04a70",
"layout": {
"h": 5,
"i": "3cbffbbb-a527-4c70-88c3-a40e6da04a70",
"isResizable": true,
"w": 12,
"x": 12,
"y": 94
},
"links": [],
"maxPerRow": 4,
"name": "S3 Object Size Total",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_stream_s3_object_size_bytes{job=\"$cluster_id\"}",
"legend": "Size",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "86821c1e-4735-4a2a-8493-8e83bef54acb",
"layout": {
"h": 5,
"i": "86821c1e-4735-4a2a-8493-8e83bef54acb",
"isResizable": true,
"w": 12,
"x": 0,
"y": 99
},
"links": [],
"maxPerRow": 4,
"name": "StreamSetObject Num",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(node_id) (kafka_stream_stream_set_object_num{job=\"$cluster_id\", node_id=~\"$node_id\"})",
"legend": "Node-{{node_id}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "d9608713-38c0-417a-a315-39e66177db50",
"layout": {
"h": 5,
"i": "d9608713-38c0-417a-a315-39e66177db50",
"isResizable": true,
"w": 12,
"x": 12,
"y": 99
},
"links": [],
"maxPerRow": 4,
"name": "StreamObject Num",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_stream_stream_object_num{job=\"$cluster_id\"}",
"legend": "Count",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "f087fb68-3ef2-4dc0-a406-4ac554b630be",
"layout": {
"h": 6,
"i": "f087fb68-3ef2-4dc0-a406-4ac554b630be",
"isResizable": true,
"w": 8,
"x": 0,
"y": 104
},
"links": [],
"maxPerRow": 4,
"name": "Get Object Throughput By Size",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(size, instance) (rate(kafka_stream_operation_latency_count{operation_type=\"S3Request\", operation_name=\"get_object\", job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{size}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "434a445d-419a-4b59-a284-aee5faf7fe62",
"layout": {
"h": 6,
"i": "434a445d-419a-4b59-a284-aee5faf7fe62",
"isResizable": true,
"w": 8,
"x": 8,
"y": 104
},
"links": [],
"maxPerRow": 4,
"name": "Get Object Request Latency Avg",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name, size) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_type=\"S3Request\", operation_name=\"get_object\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{size}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "90282958-1378-45e8-a535-6ac1d92a568d",
"layout": {
"h": 6,
"i": "90282958-1378-45e8-a535-6ac1d92a568d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 104
},
"links": [],
"maxPerRow": 4,
"name": "Get Object Request Latency P99",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, operation_name, size) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_type=\"S3Request\", operation_name=\"get_object\"})",
"legend": "Node-{{instance}}#{{operation_name}}-{{size}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "abff8180-66cf-48a0-a29a-ee17f8f01390",
"layout": {
"h": 7,
"i": "abff8180-66cf-48a0-a29a-ee17f8f01390",
"isResizable": true,
"w": 8,
"x": 0,
"y": 110
},
"links": [],
"maxPerRow": 4,
"name": "Upload Object Throughput By Size",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(size, instance) (rate(kafka_stream_operation_latency_count{operation_type=\"S3Request\", operation_name=~\"put_object|upload_part\", job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]))",
"legend": "Node-{{instance}}#{{size}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "4aff03b3-88f3-4ef0-99c9-9c97813ad474",
"layout": {
"h": 7,
"i": "4aff03b3-88f3-4ef0-99c9-9c97813ad474",
"isResizable": true,
"w": 8,
"x": 8,
"y": 110
},
"links": [],
"maxPerRow": 4,
"name": "Put Object Request Latency Avg",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, size) (kafka_stream_operation_latency_mean_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_type=\"S3Request\", operation_name=~\"put_object|upload_part\"})",
"legend": "Node-{{instance}}#{{size}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "bda7d930-ce56-4201-996c-a36ac54aaa8d",
"layout": {
"h": 7,
"i": "bda7d930-ce56-4201-996c-a36ac54aaa8d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 110
},
"links": [],
"maxPerRow": 4,
"name": "Put Object Request Latency P99",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "avg by(instance, size) (kafka_stream_operation_latency_99p_nanoseconds{job=\"$cluster_id\", instance=~\"$node_id\", operation_type=\"S3Request\", operation_name=~\"put_object|upload_part\"})",
"legend": "Node-{{instance}}#{{size}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "2bc4051b-eba3-4917-a30b-893d004069cf",
"layout": {
"h": 1,
"i": "2bc4051b-eba3-4917-a30b-893d004069cf",
"isResizable": false,
"w": 24,
"x": 0,
"y": 117
},
"name": "Compaction",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7c60cf60-593c-46f5-9e26-81851b3df023",
"layout": {
"h": 6,
"i": "7c60cf60-593c-46f5-9e26-81851b3df023",
"isResizable": true,
"w": 12,
"x": 0,
"y": 118
},
"links": [],
"maxPerRow": 4,
"name": "Compaction Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(kafka_stream_compaction_read_size_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval])",
"legend": "Node-{{instance}}#In",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "rate(kafka_stream_compaction_write_size_bytes_total{job=\"$cluster_id\", instance=~\"$node_id\"}[$__rate_interval]) * -1",
"legend": "Node-{{instance}}#Out",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "93d6e740-d0b6-42ac-8a4a-a7c513214916",
"layout": {
"h": 1,
"i": "93d6e740-d0b6-42ac-8a4a-a7c513214916",
"isResizable": false,
"w": 24,
"x": 0,
"y": 124
},
"name": "Auto-Balancer",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "4f70ad3c-ca70-4dc4-bfde-ab09ddd4fcc4",
"layout": {
"h": 6,
"i": "4f70ad3c-ca70-4dc4-bfde-ab09ddd4fcc4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 125
},
"links": [],
"maxPerRow": 4,
"name": "Metrics Delay",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "kafka_stream_auto_balancer_metrics_time_delay_milliseconds{job=\"$cluster_id\", node_id=~\"$node_id\"}",
"legend": "Node-{{node_id}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,job)",
"hide": false,
"multi": false,
"name": "cluster_id",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,instance)",
"hide": false,
"multi": true,
"name": "node_id",
"reg": "/(^[0-9]*$)/",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(kafka_request_count_total,type)",
"hide": false,
"multi": true,
"name": "request_type",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(kafka_stream_operation_latency_count{operation_type=\"S3Stream\"},operation_name)",
"hide": false,
"multi": true,
"name": "s3stream_request",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(kafka_stream_operation_latency_count{operation_type=\"S3Request\"},operation_name)",
"hide": false,
"multi": true,
"name": "s3request",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327167247000
}
================================================
FILE: integrations/AutoMQ/dashboards/group_metrics.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Group Metrics",
"ident": "automq-group-metrics",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "cac8a249-bb61-4c2d-bc90-91a7dac58f3b",
"layout": {
"h": 5,
"i": "cac8a249-bb61-4c2d-bc90-91a7dac58f3b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Consumer Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(consumer_group, topic, partition) (rate(kafka_group_commit_offset{job=\"$cluster_id\", consumer_group=~\"$group_id\", topic=~\"$topic\", partition=~\"$partition\"}[$__rate_interval]))",
"legend": "{{consumer_group}}#{{topic}}-{{partition}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "fc0afd7c-0161-4ee4-88de-81c74f432769",
"layout": {
"h": 5,
"i": "fc0afd7c-0161-4ee4-88de-81c74f432769",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Consumer Lag",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic) (max by(topic, partition) (kafka_log_end_offset{job=\"$cluster_id\", topic=~\"$topic\", partition=~\"$partition\"}))\n- on(topic) group_left(consumer_group)\nsum by(consumer_group, topic) (max by(consumer_group, topic, partition) (kafka_group_commit_offset{job=\"$cluster_id\", consumer_group=~\"$group_id\", topic=~\"$topic\", partition=~\"$partition\"}))",
"legend": "{{consumer_group}}#{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"hide": false,
"name": "TSDB",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,job)",
"hide": false,
"multi": false,
"name": "cluster_id",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(kafka_group_commit_offset,consumer_group)",
"hide": false,
"multi": true,
"name": "group_id",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(kafka_group_commit_offset,topic)",
"hide": false,
"multi": true,
"name": "topic",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(kafka_group_commit_offset,partition)",
"hide": false,
"multi": true,
"name": "partition",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327172992000
}
================================================
FILE: integrations/AutoMQ/dashboards/topic_metrics.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Topic Metrics",
"ident": "automq-topic-metrics",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "d4df5a68-ae8b-40e9-8f88-cd2e3f88e783",
"layout": {
"h": 3,
"i": "d4df5a68-ae8b-40e9-8f88-cd2e3f88e783",
"isResizable": true,
"w": 5,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(job) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", topic=~\"$topic\", partition=~\"$partition\", direction=\"in\"}[$__rate_interval]))",
"legend": "Bytes In Total",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "52e25ff6-5ef8-4237-a49d-85473f9f90af",
"layout": {
"h": 3,
"i": "52e25ff6-5ef8-4237-a49d-85473f9f90af",
"isResizable": true,
"w": 3,
"x": 5,
"y": 0
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "rgba(148, 112, 255, 1)",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(topic) (kafka_log_size{job=\"$cluster_id\", topic=~\"$topic\"})",
"legend": "Size",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "75a6d933-f8d8-46f8-8cf0-9ef9dcc4e86b",
"layout": {
"h": 6,
"i": "75a6d933-f8d8-46f8-8cf0-9ef9dcc4e86b",
"isResizable": true,
"w": 8,
"x": 8,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Messages In",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic) (rate(kafka_message_count_total{job=\"$cluster_id\", topic=~\"$topic\", direction=\"in\"}[$__rate_interval]))",
"legend": "{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "f1bbfda3-0fd8-4b85-b725-222f992fbbc9",
"layout": {
"h": 6,
"i": "f1bbfda3-0fd8-4b85-b725-222f992fbbc9",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Request Throughput",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic, type) (rate(kafka_topic_request_count_total{job=\"$cluster_id\", topic=~\"$topic\"}[$__rate_interval]))",
"legend": "{{type}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "56a85016-ca99-4495-be6a-53de1366a396",
"layout": {
"h": 3,
"i": "56a85016-ca99-4495-be6a-53de1366a396",
"isResizable": true,
"w": 5,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(job) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", topic=~\"$topic\", partition=~\"$partition\", direction=\"out\"}[$__rate_interval]))",
"legend": "Bytes Out Total",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "6efe4439-00ab-41d8-a4f0-b2117ad13648",
"layout": {
"h": 3,
"i": "6efe4439-00ab-41d8-a4f0-b2117ad13648",
"isResizable": true,
"w": 3,
"x": 5,
"y": 3
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "rgba(148, 112, 255, 1)",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "count by(topic) (kafka_log_size{job=\"$cluster_id\", topic=~\"$topic\"})",
"legend": "Partition Count",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "f3da44a7-bcb6-4b92-a5f9-b7e6856ced1a",
"layout": {
"h": 6,
"i": "f3da44a7-bcb6-4b92-a5f9-b7e6856ced1a",
"isResizable": true,
"w": 8,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Size",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic) (kafka_log_size{job=\"$cluster_id\", topic=~\"$topic\"})",
"legend": "{{topic}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "1354e591-5030-42f0-a3c3-eeb1e7c0a537",
"layout": {
"h": 6,
"i": "1354e591-5030-42f0-a3c3-eeb1e7c0a537",
"isResizable": true,
"w": 8,
"x": 8,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Bytes In",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic, partition) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", topic=~\"$topic\", direction=\"in\", partition=~\"$partition\"}[$__rate_interval]))",
"legend": "{{topic}}-{{partition}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.15,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "a91a656b-e5b1-4265-ba88-f1881b324137",
"layout": {
"h": 6,
"i": "a91a656b-e5b1-4265-ba88-f1881b324137",
"isResizable": true,
"w": 8,
"x": 16,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Bytes Out",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 1,
"util": "bytesSecIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum by(topic, partition) (rate(kafka_network_io_bytes_total{job=\"$cluster_id\", topic=~\"$topic\", direction=\"out\", partition=~\"$partition\"}[$__rate_interval]))",
"legend": "{{topic}}-{{partition}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${TSDB}",
"id": "5c3e188e-3d7c-4bc9-b8c7-c19c39a8d396",
"layout": {
"h": 4,
"i": "5c3e188e-3d7c-4bc9-b8c7-c19c39a8d396",
"isResizable": true,
"w": 24,
"x": 0,
"y": 12
},
"maxPerRow": 4,
"name": "Consume group lag",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "sum by(topic) (max by(topic, partition) (kafka_log_end_offset{job=\"$cluster_id\", topic=~\"$topic\", partition=~\"$partition\"}))\n- on(topic) group_left(consumer_group)\nsum by(consumer_group, topic) (max by(consumer_group, topic, partition) (kafka_group_commit_offset{job=\"$cluster_id\", topic=~\"$topic\", partition=~\"$partition\"}))",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"hide": false,
"name": "TSDB",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(process_runtime_jvm_cpu_utilization_ratio,job)",
"hide": false,
"multi": false,
"name": "cluster_id",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(kafka_log_size,topic)",
"hide": false,
"multi": true,
"name": "topic",
"reg": "",
"type": "query"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${TSDB}"
},
"definition": "label_values(kafka_log_size{topic=\"$topic\"},partition)",
"hide": false,
"multi": true,
"name": "partition",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327174664000
}
================================================
FILE: integrations/AutoMQ/markdown/overview.md
================================================
## 前言
AutoMQ 官方文档提供了指标吐出方式以及和监控系统的整合方式,具体可以参考[AutoMQ](https://docs.automq.com/zh/docs/automq-opensource/LkwkwdQlwizjqckhj0dcc2IdnDh)。
## 推荐方式
建议采用 AutoMQ 文档中的方案二:使用 Prometheus OTLP Receiver 的方式,把所有的指标都收集到 OTel Collector 中,然后使用 Prometheus 或者 Categraf 直接去拉取数据即可。假如使用 Categraf,就是使用 prometheus 插件去拉取数据,比如我们为 prometheus 插件提供一个单独的 automq.toml 的配置文件:`conf/input.prometheus/automq.toml` ,内容如下:
```toml
[[instances]]
urls = [
"http://<otel-collector-ip>:<port>/metrics"
]
url_label_key = "otel_collector"
url_label_value = "{{.Host}}"
```
注意,url_label_key 一般都是指定为 instance,但是这里故意指定为其他字符串,是因为 AutoMQ 原始的指标中包含了 instance 标签,为了避免冲突,所以指定为其他字符串。
================================================
FILE: integrations/AutoMQ/metrics/exporter.json
================================================
[
{
"id": 0,
"uuid": 1717556327176351000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Broker节点入站网络IO速率",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_network_io_bytes_total{direction=\"in\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Broker节点入站网络IO速率",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node inbound network IO rate",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327179998000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Broker节点出站网络IO速率",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_network_io_bytes_total{direction=\"out\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Broker节点出站网络IO速率",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node outbound network IO rate",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327182502000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Broker节点当前分配的分区数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "kafka_partition_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Broker节点当前分配的分区数量",
"note": ""
},
{
"lang": "en_US",
"name": "The number of partitions currently allocated to the Broker node",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327184905000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Broker节点每秒消息吞吐量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_message_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Broker节点每秒消息吞吐量",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node message throughput per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327187098000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Consumer Group提交偏移量总和",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, consumer_group, topic) (max by(job, consumer_group, topic, partition) (kafka_group_commit_offset))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Consumer Group提交偏移量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Consumer Group Commit Offset Sum",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327189255000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Consumer Group提交偏移量速率(每个实例)",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, consumer_group, topic, partition) (rate(kafka_group_commit_offset[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Consumer Group提交偏移量速率(每个实例)",
"note": ""
},
{
"lang": "en_US",
"name": "Consumer Group commit offset rate (per instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327191388000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Consumer Group数量总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (kafka_group_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Consumer Group数量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Consumer Group Quantity Sum",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327193321000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Consumer Group最大提交偏移量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "max by(job, consumer_group, topic, partition) (kafka_group_commit_offset)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Consumer Group最大提交偏移量",
"note": ""
},
{
"lang": "en_US",
"name": "Consumer Group Maximum commit offset",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327195219000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Fetch请求处理时间的50分位数",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_50p_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Fetch请求处理时间的50分位数",
"note": ""
},
{
"lang": "en_US",
"name": "50th quantile of Fetch request processing time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327197092000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Fetch请求处理时间的99分位数",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_99p_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Fetch请求处理时间的99分位数",
"note": ""
},
{
"lang": "en_US",
"name": "99th quantile of Fetch request processing time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327198909000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Fetch请求处理时间的99分位数(每个实例)",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "avg by(job, instance) (kafka_request_time_99p_milliseconds)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Fetch请求处理时间的99分位数(每个实例)",
"note": ""
},
{
"lang": "en_US",
"name": "99th quantile of Fetch request processing time (per instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327201071000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Fetch请求处理时间的平均值",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_mean_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Fetch请求处理时间的平均值",
"note": ""
},
{
"lang": "en_US",
"name": "Average of Fetch request processing time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327202824000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Fetch请求处理时间的最大值",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_max_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Fetch请求处理时间的最大值",
"note": ""
},
{
"lang": "en_US",
"name": "Maximum Fetch request processing time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327204459000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Fetch请求失败速率(每个实例)",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_topic_request_failed_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Fetch请求失败速率(每个实例)",
"note": ""
},
{
"lang": "en_US",
"name": "Fetch request failure rate (per instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327206530000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Kafka Controller活跃状态计数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job) (kafka_controller_active_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Kafka Controller活跃状态计数",
"note": ""
},
{
"lang": "en_US",
"name": "Kafka Controller active status count",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327208370000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Produce请求处理时间的50分位数",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_50p_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Produce请求处理时间的50分位数",
"note": ""
},
{
"lang": "en_US",
"name": "50th Quantile of Processing Time for Produce requests",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327210667000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Produce请求处理时间的99分位数",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_99p_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Produce请求处理时间的99分位数",
"note": ""
},
{
"lang": "en_US",
"name": "99th quantile of processing time for Produce requests",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327212669000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Produce请求处理时间的99分位数(每个实例)",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "avg by(job, instance) (kafka_request_time_99p_milliseconds)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Produce请求处理时间的99分位数(每个实例)",
"note": ""
},
{
"lang": "en_US",
"name": "99th quantile of Produce request processing time (per instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327214580000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Produce请求处理时间的平均值",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_mean_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Produce请求处理时间的平均值",
"note": ""
},
{
"lang": "en_US",
"name": "Average Produce request processing time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327217055000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Produce请求处理时间的最大值",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "kafka_request_time_max_milliseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Produce请求处理时间的最大值",
"note": ""
},
{
"lang": "en_US",
"name": "The maximum processing time of the Produce request",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327219060000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "Produce请求失败速率(每个实例)",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_topic_request_failed_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Produce请求失败速率(每个实例)",
"note": ""
},
{
"lang": "en_US",
"name": "Produce request failure rate (per instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327221295000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "各个Consumer Group和Topic的最大提交偏移量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "max by(job, consumer_group, topic, partition) (kafka_group_commit_offset)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "各个Consumer Group和Topic的最大提交偏移量",
"note": ""
},
{
"lang": "en_US",
"name": "Maximum commit offsets for each Consumer Group and Topic",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327223767000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "各个Topic分区的最大日志结束偏移量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "max by(job, topic, partition) (kafka_log_end_offset)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "各个Topic分区的最大日志结束偏移量",
"note": ""
},
{
"lang": "en_US",
"name": "Maximum end of log offset for each Topic partition",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327225577000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "实例的Fetch请求速率",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance, topic) (rate(kafka_request_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "实例的Fetch请求速率",
"note": ""
},
{
"lang": "en_US",
"name": "Fetch request rate of an instance",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327227527000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "实例的Produce请求速率",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance, topic) (rate(kafka_request_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "实例的Produce请求速率",
"note": ""
},
{
"lang": "en_US",
"name": "Instance's Produce request rate",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327229750000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "已死亡的Consumer Group数量总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (kafka_group_dead_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "已死亡的Consumer Group数量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Total number of dead Consumer Groups",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327231514000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "日志大小总和",
"unit": "bytesSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (max by(job, topic, partition) (kafka_log_size))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "日志大小总和",
"note": ""
},
{
"lang": "en_US",
"name": "Sum of log sizes",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327233243000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "日志消费延迟 lag",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic) (max by(job, topic, partition) (kafka_log_end_offset{}))\n- on(job, topic) group_left(consumer_group)\nsum by(job, consumer_group, topic) (max by(job, consumer_group, topic, partition) (kafka_group_commit_offset{}))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "日志消费延迟 lag",
"note": ""
},
{
"lang": "en_US",
"name": "Log consumption delay lag",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327236120000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "日志结束偏移量总和",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic) (max by(job, topic, partition) (kafka_log_end_offset))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "日志结束偏移量总和",
"note": ""
},
{
"lang": "en_US",
"name": "End of log offset sum",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327237878000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "日志结束最大偏移量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "max by(job, topic, partition) (kafka_log_end_offset)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "日志结束最大偏移量",
"note": ""
},
{
"lang": "en_US",
"name": "End of Log Max Offset",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327240138000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "正在准备进行rebalance的Consumer Group数量总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (kafka_group_preparing_rebalance_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "正在准备进行rebalance的Consumer Group数量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Sum number of Consumer Groups in preparation for rebalance",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327241777000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "正在等待Leader进行状态分配的Consumer Group数量总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (kafka_group_completing_rebalance_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "正在等待Leader进行状态分配的Consumer Group数量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Sum number of Consumer Groups waiting for status allocation by Leader",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327243616000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点入站网络IO速率sum by(topic)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic) (rate(kafka_network_io_bytes_total{direction=\"in\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点入站网络IO速率sum by(topic)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node inbound network IO rate per topic sum by (topic)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327245855000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点入站网络IO速率sum by(topic, instance)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic, instance) (rate(kafka_network_io_bytes_total{direction=\"in\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点入站网络IO速率sum by(topic, instance)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node inbound network IO rate per topic sum by (topic, instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327247589000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点出站网络IO速率sum by(topic)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic) (rate(kafka_network_io_bytes_total{direction=\"out\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点出站网络IO速率sum by(topic)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node outbound network IO rate per topic sum by (topic)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327249421000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点出站网络IO速率sum by(topic, instance)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic, instance) (rate(kafka_network_io_bytes_total{direction=\"out\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点出站网络IO速率sum by(topic, instance)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node outbound network IO rate per topic sum by (topic, instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327251637000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点出站网络IO速率sum by(topic, partition)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic, partition) (rate(kafka_network_io_bytes_total{direction=\"out\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点出站网络IO速率sum by(topic, partition)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node outbound network IO rate per topic sum by (topic, partition)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327253444000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点消息吞吐量(入站)sum by(topic)",
"unit": "mps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic) (rate(kafka_message_count_total{direction=\"in\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点消息吞吐量(入站)sum by(topic)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node message throughput per topic (inbound) sum by (topic)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327255550000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点消息吞吐量(入站)sum by(topic, instance)",
"unit": "mps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic, instance) (rate(kafka_message_count_total{direction=\"in\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点消息吞吐量(入站)sum by(topic, instance)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node message throughput per topic (inbound) sum by (topic, instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327257265000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的Broker节点网络IO速率(集群级别)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job) (rate(kafka_network_io_bytes_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的Broker节点网络IO速率(集群级别)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node network IO rate per Topic (cluster level)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327259010000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的日志大小",
"unit": "bytesSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic) (kafka_log_size)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的日志大小",
"note": ""
},
{
"lang": "en_US",
"name": "Log size per Topic",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327260869000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的消息吞吐量sum by(topic, type)",
"unit": "mps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic, type) (rate(kafka_message_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的消息吞吐量sum by(topic, type)",
"note": ""
},
{
"lang": "en_US",
"name": "Message throughput per topic sum by (topic, type)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327262606000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个Topic的请求计数sum by(topic, type)",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, topic, type) (rate(kafka_topic_request_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个Topic的请求计数sum by(topic, type)",
"note": ""
},
{
"lang": "en_US",
"name": "Request count per topic sum by (topic, type)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327264897000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个实例的Broker节点出站网络IO速率(实例级别)",
"unit": "bytesSecSI",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_network_io_bytes_total{direction=\"out\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个实例的Broker节点出站网络IO速率(实例级别)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node outbound network IO rate per instance (instance level)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327267156000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个实例的Broker节点分区数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (kafka_partition_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个实例的Broker节点分区数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Broker node partitions per instance",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327269529000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个实例的Broker节点消息吞吐量(入站)",
"unit": "mps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_message_count_total{direction=\"in\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个实例的Broker节点消息吞吐量(入站)",
"note": ""
},
{
"lang": "en_US",
"name": "Broker node message throughput per instance (inbound)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327271703000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个实例的Fetch请求速率",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_request_count_total{type=\"Fetch\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个实例的Fetch请求速率",
"note": ""
},
{
"lang": "en_US",
"name": "Fetch request rate per instance",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327273861000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个实例的Produce请求速率",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_request_count_total{type=\"Produce\"}[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个实例的Produce请求速率",
"note": ""
},
{
"lang": "en_US",
"name": "Produce request rate per instance",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327276362000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "每个实例的节点当前建立的连接数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (kafka_server_connection_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每个实例的节点当前建立的连接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of connections currently established by nodes per instance",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327278608000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "活跃Broker数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "kafka_broker_active_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "活跃Broker数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of active brokers",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327280516000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "稳定状态的Consumer Group数量总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (kafka_group_stable_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "稳定状态的Consumer Group数量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Sum of the number of steady-state Consumer Groups",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327282783000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "空的Consumer Group数量总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (kafka_group_empty_count)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "空的Consumer Group数量总和",
"note": ""
},
{
"lang": "en_US",
"name": "Sum number of empty Consumer Groups",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327284934000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "节点当前建立的连接数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "kafka_server_connection_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "节点当前建立的连接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of connections currently established by the node",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327287401000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "被fence的Broker数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "kafka_broker_fenced_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "被fence的Broker数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Brokers fenced",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327291657000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "请求错误计数总和",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "sum by (job) (rate(kafka_request_error_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "请求错误计数总和",
"note": ""
},
{
"lang": "en_US",
"name": "Request error count sum",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327295172000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "请求错误计数率",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "rate(kafka_request_error_count_total[1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "请求错误计数率",
"note": ""
},
{
"lang": "en_US",
"name": "Request error count rate",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327297787000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "请求错误计数率(每个实例)",
"unit": "reqps",
"note": "",
"lang": "zh_CN",
"expression": "sum by(job, instance) (rate(kafka_request_error_count_total[1m]))",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "请求错误计数率(每个实例)",
"note": ""
},
{
"lang": "en_US",
"name": "Request error count rate (per instance)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327300342000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "集群Topic总数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "kafka_topic_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "集群Topic总数",
"note": ""
},
{
"lang": "en_US",
"name": "Total number of cluster topics",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327302857000,
"collector": "Exporter",
"typ": "AutoMQ",
"name": "集群分区总数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "kafka_partition_total_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "集群分区总数",
"note": ""
},
{
"lang": "en_US",
"name": "Total number of cluster partitions",
"note": ""
}
]
}
]
================================================
FILE: integrations/Bind/collect/bind/bind.toml
================================================
[[instances]]
urls = [
# "http://localhost:8053/xml/v3",
]
gather_memory_contexts = true
gather_views = true
timeout = "5s"
# labels={app="bind"}
================================================
FILE: integrations/Bind/markdown/README.md
================================================
forked from [telegraf/bind](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/bind)
配置示例
```
[[instances]]
urls = [
#"http://localhost:8053/xml/v3",
]
timeout = "5s"
gather_memory_contexts = true
gather_views = true
```
================================================
FILE: integrations/Canal/dashboards/canal_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Canal instances",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "c876d624-3357-4511-bed9-fd30fd82e2e8",
"layout": {
"h": 1,
"i": "c876d624-3357-4511-bed9-fd30fd82e2e8",
"w": 24,
"x": 0,
"y": 0
},
"name": "Instance status",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal instance 基本信息。",
"id": "235ad194-ad14-4431-9633-543bd679e9dc",
"layout": {
"h": 5,
"i": "235ad194-ad14-4431-9633-543bd679e9dc",
"w": 6,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Basic",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"repeat": null,
"targets": [
{
"expr": "canal_instance{destination=~\"$destination\"}",
"legend": "Destination: {{destination}}",
"refId": "A"
},
{
"expr": "canal_instance_parser_mode{destination=~\"$destination\"}",
"legend": "Parallel parser: {{parallel}}",
"refId": "B"
},
{
"expr": "canal_instance_store{destination=~\"$destination\"}",
"legend": "Batch mode: {{batchMode}}",
"refId": "C"
},
{
"expr": "canal_instance_store{destination=~\"$destination\"}",
"legend": "Buffer size: {{size}}",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal instance 网络带宽占用。\ninbound: 读取MySQL binlog.\noutbound: 对Client端传输格式化binlog.",
"id": "13080d70-8fe9-4388-9d72-f587b3b480b9",
"layout": {
"h": 5,
"i": "13080d70-8fe9-4388-9d72-f587b3b480b9",
"w": 6,
"x": 6,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Network bandwidth",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(canal_instance_received_binlog_bytes{destination=~\"$destination\", parser=\"0\"}[2m]) / 1024",
"legend": "inbound",
"refId": "A"
},
{
"expr": "rate(canal_instance_client_bytes{destination=~\"$destination\"}[2m]) / 1024",
"legend": "outbound",
"refId": "B"
},
{
"expr": "rate(canal_instance_received_binlog_bytes{destination=~\"$destination\", parser=\"1\"}[2m]) / 1024",
"legend": "inbound-1",
"refId": "C"
},
{
"expr": "rate(canal_instance_received_binlog_bytes{destination=~\"$destination\", parser=\"2\"}[2m]) / 1024",
"legend": "inbound-2",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "master: Canal server相对于MySQL master的延时。通过master heartbeat机制能刷新idle状态下的延时。\nput: store put操作的时间点为基准。\nget: client get操作的时间点为基准。\nack: client ack操作的时间点为基准。",
"id": "8f46e835-1c9a-4ec1-94d4-2416646061d6",
"layout": {
"h": 5,
"i": "8f46e835-1c9a-4ec1-94d4-2416646061d6",
"w": 6,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Delay",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "canal_instance_traffic_delay{destination=~\"$destination\"} / 1000",
"legend": "master",
"refId": "D"
},
{
"expr": "canal_instance_put_delay{destination=~\"$destination\"} / 1000",
"legend": "put",
"refId": "A"
},
{
"expr": "canal_instance_get_delay{destination=~\"$destination\"} / 1000",
"legend": "get",
"refId": "B"
},
{
"expr": "canal_instance_ack_delay{destination=~\"$destination\"} / 1000",
"legend": "ack",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "sink线程blocking占比;dump线程blocking占比(仅parallel mode)。",
"id": "986e87d8-37d3-4995-8f8d-7141d4b792af",
"layout": {
"h": 5,
"i": "986e87d8-37d3-4995-8f8d-7141d4b792af",
"w": 6,
"x": 18,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Blocking",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "clamp_max(rate(canal_instance_publish_blocking_time{destination=~\"$destination\", parser=\"0\"}[2m]), 1000) / 10",
"legend": "dump",
"refId": "B"
},
{
"expr": "clamp_max(rate(canal_instance_sink_blocking_time{destination=~\"$destination\"}[2m]), 1000) / 10",
"legend": "sink",
"refId": "A"
},
{
"expr": "clamp_max(rate(canal_instance_publish_blocking_time{destination=~\"$destination\", parser=\"1\"}[2m]), 1000) / 10",
"legend": "dump-1",
"refId": "C"
},
{
"expr": "clamp_max(rate(canal_instance_publish_blocking_time{destination=~\"$destination\", parser=\"2\"}[2m]), 1000) / 10",
"legend": "dump-2",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "d51067ea-6e23-41c1-91d9-dd264c041add",
"layout": {
"h": 1,
"i": "d51067ea-6e23-41c1-91d9-dd264c041add",
"w": 24,
"x": 0,
"y": 6
},
"name": "Throughput",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Instance处理binlog的TPS(以master变更行数table rows为基准计算)。\nput: put操作TPS。\nget: get操作TPS。\nack: ack操作TPS。",
"id": "66a0392c-ffbe-42a5-b423-7e83007318b2",
"layout": {
"h": 5,
"i": "66a0392c-ffbe-42a5-b423-7e83007318b2",
"w": 6,
"x": 0,
"y": 7
},
"links": [],
"maxPerRow": 4,
"name": "TPS(table rows)",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(canal_instance_put_rows{destination=~\"$destination\"}[2m])",
"legend": "put",
"refId": "A"
},
{
"expr": "rate(canal_instance_get_rows{destination=~\"$destination\"}[2m])",
"legend": "get",
"refId": "B"
},
{
"expr": "rate(canal_instance_ack_rows{destination=~\"$destination\"}[2m])",
"legend": "ack",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal instance 处理binlog的TPS,以MySQL transaction为单位计算。",
"id": "daaa8cfd-3238-4ee2-ae30-4d5bd0c899d7",
"layout": {
"h": 5,
"i": "daaa8cfd-3238-4ee2-ae30-4d5bd0c899d7",
"w": 6,
"x": 6,
"y": 7
},
"links": [],
"maxPerRow": 4,
"name": "TPS(MySQL transaction)",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(canal_instance_transactions{destination=~\"$destination\"}[2m])",
"legend": "transactions",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "72043279-0d2a-47d9-97f9-80cb4c595bc4",
"layout": {
"h": 1,
"i": "72043279-0d2a-47d9-97f9-80cb4c595bc4",
"w": 24,
"x": 0,
"y": 12
},
"name": "Client",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal instance接收到的请求统计,结果按packet type分类。",
"id": "bae11c21-18ad-4964-90a5-09ba9b083fb9",
"layout": {
"h": 5,
"i": "bae11c21-18ad-4964-90a5-09ba9b083fb9",
"w": 6,
"x": 0,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Client requests",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "canal_instance_client_packets{destination=~\"$destination\"}",
"legend": "{{packetType}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "client 请求的GET与ACK包的QPS。",
"id": "9e94fdcd-fc2a-4dc1-8f64-9242409a7345",
"layout": {
"h": 5,
"i": "9e94fdcd-fc2a-4dc1-8f64-9242409a7345",
"w": 6,
"x": 6,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Client QPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(canal_instance_client_packets{destination=~\"$destination\",packetType=\"GET\"}[2m])",
"legend": "GET",
"refId": "A"
},
{
"expr": "rate(canal_instance_client_packets{destination=~\"$destination\",packetType=\"CLIENTACK\"}[2m])",
"legend": "ACK",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "server响应GET请求,但返回空包的占比。",
"id": "7fc92a43-77c1-469f-b1b4-dc2b5c45a81d",
"layout": {
"h": 5,
"i": "7fc92a43-77c1-469f-b1b4-dc2b5c45a81d",
"w": 6,
"x": 12,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Empty packets",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(canal_instance_client_empty_batches{destination=~\"$destination\"}[2m])",
"legend": "empty",
"refId": "A"
},
{
"expr": "rate(canal_instance_client_packets{destination=~\"$destination\", packetType=\"GET\"}[2m])",
"legend": "nonempty",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal client 请求响应时间的概况。",
"id": "99623fe5-652c-4091-81d9-942e94b4ac4c",
"layout": {
"h": 5,
"i": "99623fe5-652c-4091-81d9-942e94b4ac4c",
"w": 6,
"x": 18,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Response time",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(canal_instance_client_request_latency_bucket{destination=~\"$destination\"}[2m])",
"legend": "{{le}}ms",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "24ce3843-8303-4a17-ac74-36f3ddc50af5",
"layout": {
"h": 1,
"i": "24ce3843-8303-4a17-ac74-36f3ddc50af5",
"w": 24,
"x": 0,
"y": 18
},
"name": "Store",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal instance ringbuffer内未释放的events数量。",
"id": "326dcd0e-dc9c-42b9-83a7-ea59648092cc",
"layout": {
"h": 5,
"i": "326dcd0e-dc9c-42b9-83a7-ea59648092cc",
"w": 6,
"x": 0,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "Store remain events",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "canal_instance_store_produce_seq{destination=~\"$destination\"} - canal_instance_store_consume_seq{destination=~\"$destination\"}",
"legend": "events",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Canal instance ringbuffer 内未释放events占用内存。",
"id": "c328535d-9864-4681-865c-8551115dac07",
"layout": {
"h": 5,
"i": "c328535d-9864-4681-865c-8551115dac07",
"w": 6,
"x": 6,
"y": 19
},
"links": [],
"maxPerRow": 4,
"name": "Store remain mem",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(canal_instance_store_produce_mem{destination=~\"$destination\"} - canal_instance_store_consume_mem{destination=~\"$destination\"}) / 1024",
"legend": "memsize",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"hide": false,
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"allValue": null,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(canal_instance, destination)",
"hide": false,
"multi": false,
"name": "destination",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327308578000
}
================================================
FILE: integrations/Canal/markdown/README.md
================================================
## canal
canal 默认提供了 prometheus 格式指标的接口 [Prometheus-QuickStart](https://github.com/alibaba/canal/wiki/Prometheus-QuickStart),所以可以直接通过 [prometheus 插件](https://flashcat.cloud/docs/content/flashcat-monitor/categraf/plugin/prometheus)采集。
================================================
FILE: integrations/Ceph/alerts/ceph_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephErrorState",
"note": "Ceph is in Error state longer than 5m, please check status of pools and OSDs",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "ceph_health_status \u003e 1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_health_status \u003e 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327313004000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephOsdReweighted",
"note": "OSD {{ $labels.ceph_daemon}} on cluster {{ $labels.cluster}} was reweighted for too long. Please either create silent or fix that issue",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "ceph_osd_weight \u003c 1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_osd_weight \u003c 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327313504000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephOSDUtilizatoin",
"note": "Osd free space for {{ $labels.osd }} is higher tan 90%. Please validate why its so big, reweight or add storage",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "ceph_osd_utilization \u003e 90",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_osd_utilization \u003e 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327314160000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephPgActivating",
"note": "Some groups are activating for too long on {{ $labels.cluster }}. Those PGs are unavailable for too long!",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "ceph_pg_activating \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_pg_activating \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327314779000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephPgBackfillTooFull",
"note": "Some groups are located on full OSD on cluster {{ $labels.cluster }}. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "ceph_pg_backfill_toofull \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_pg_backfill_toofull \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327315319000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephPgDown",
"note": "Some groups are down (unavailable) for too long on {{ $labels.cluster }}. Please ensure that all the data are available",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 180,
"prom_ql": "ceph_pg_down \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_pg_down \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327315892000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephPgIncomplete",
"note": "Some groups are incomplete (unavailable) for too long on {{ $labels.cluster }}. Please ensure that all the data are available",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "ceph_pg_incomplete \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_pg_incomplete \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327316390000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephPgInconsistent",
"note": "Some groups are inconsistent for too long on {{ $labels.cluster }}. Data is available but inconsistent across nodes",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "ceph_pg_inconsistent \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_pg_inconsistent \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327316911000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephPgUnavailable",
"note": "Some groups are unavailable on {{ $labels.cluster }}. Please check their detailed status and current configuration.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "ceph_pg_total - ceph_pg_active \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_pg_total - ceph_pg_active \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327317557000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephTargetDown",
"note": "CEPH target down for more than 2m, please check - it could be a either exporter crash or a whole cluster crash",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "up{job=\"ceph\"} == 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "up{job=\"ceph\"} == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327318071000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CephWarnState",
"note": "Ceph is in Warn state longer than 30m, please check status of pools and OSDs",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "ceph_health_status == 1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_health_status == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327318544000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MonitorAvailableStorage",
"note": "Monitor storage for {{ $labels.monitor }} less than 30% - please check why its too high",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "ceph_monitor_avail_percent \u003c 30",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_monitor_avail_percent \u003c 30",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327319044000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MonitorClockSkewTooHigh",
"note": "Monitor clock skew detected on {{ $labels.monitor }} - please check ntp and hardware clock settings",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "abs(ceph_monitor_clock_skew_seconds) \u003e 0.1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "abs(ceph_monitor_clock_skew_seconds) \u003e 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327319491000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "OsdApplyLatencyTooHigh",
"note": "OSD latency for {{ $labels.osd }} is too high. Please check if it doesn't stuck in weird state",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 90,
"prom_ql": "ceph_osd_perf_apply_latency_seconds \u003e 10",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_osd_perf_apply_latency_seconds \u003e 10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327320012000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "OsdDown",
"note": "OSD is down longer than 30 min, please check what's the status",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "ceph_osd_up == 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ceph_osd_up == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327320476000
}
]
================================================
FILE: integrations/Ceph/dashboards/ceph_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Ceph - Cluster By Categraf",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "40d15965-877f-4cb9-8e8f-b6d43eaf477b",
"layout": {
"h": 1,
"i": "40d15965-877f-4cb9-8e8f-b6d43eaf477b",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "CLUSTER STATE",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "d30b94a5-5875-4c55-85d3-352d6bc4d419",
"layout": {
"h": 3,
"i": "d30b94a5-5875-4c55-85d3-352d6bc4d419",
"isResizable": true,
"w": 3,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ceph_health_status{cluster=\"$cluster\"}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "f22cc692-4e4e-44e6-ae1e-f4750e6ced36",
"layout": {
"h": 3,
"i": "f22cc692-4e4e-44e6-ae1e-f4750e6ced36",
"isResizable": true,
"w": 3,
"x": 3,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Write Throughput",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=\"$cluster\"}[5m]))",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "aaf30364-54b0-47b1-9e99-351f4f65a780",
"layout": {
"h": 3,
"i": "aaf30364-54b0-47b1-9e99-351f4f65a780",
"isResizable": true,
"w": 3,
"x": 6,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Read Throughput",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=\"$cluster\"}[5m]))",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "ce61e127-e379-4fd9-bcd4-37a3bbca1ace",
"layout": {
"h": 3,
"i": "ce61e127-e379-4fd9-bcd4-37a3bbca1ace",
"isResizable": true,
"w": 3,
"x": 9,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Cluster Capacity",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ceph_cluster_total_bytes{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "ddf70b0b-f207-489d-95b0-fc3a457993a6",
"layout": {
"h": 6,
"i": "ddf70b0b-f207-489d-95b0-fc3a457993a6",
"isResizable": true,
"w": 3,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Available Capacity",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "(ceph_cluster_total_bytes{cluster=\"$cluster\"}-ceph_cluster_total_used_bytes{cluster=\"$cluster\"})/ceph_cluster_total_bytes{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "6114e5c7-73a8-4c35-8b49-85ed0fe5ca75",
"layout": {
"h": 3,
"i": "6114e5c7-73a8-4c35-8b49-85ed0fe5ca75",
"isResizable": true,
"w": 3,
"x": 15,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Number of Objects",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ceph_cluster_total_objects{cluster=\"$cluster\"}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "45150843-0f61-4e6c-9dc2-484c693afd4e",
"layout": {
"h": 3,
"i": "45150843-0f61-4e6c-9dc2-484c693afd4e",
"isResizable": true,
"w": 3,
"x": 18,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Bytes Written",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(ceph_osd_op_w_in_bytes{cluster=\"$cluster\"})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "9eb35dad-3e2f-4020-97b7-2a4567bc4e9c",
"layout": {
"h": 3,
"i": "9eb35dad-3e2f-4020-97b7-2a4567bc4e9c",
"isResizable": true,
"w": 3,
"x": 21,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Bytes Read",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(ceph_osd_op_r_out_bytes{cluster=\"$cluster\"})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "c08d5553-c189-424d-be0c-15fdced7a402",
"layout": {
"h": 3,
"i": "c08d5553-c189-424d-be0c-15fdced7a402",
"isResizable": true,
"w": 3,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Alerts",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(ALERTS{cluster='$cluster', alertstate='firing'}) OR vector(0)",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "92dc4651-a023-4c7a-b998-88a6ba844d7f",
"layout": {
"h": 3,
"i": "92dc4651-a023-4c7a-b998-88a6ba844d7f",
"isResizable": true,
"w": 3,
"x": 3,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Write IOPS",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_op_w{cluster=\"$cluster\"}[5m]))",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "72056479-6031-4014-8d72-5cf06e50099c",
"layout": {
"h": 3,
"i": "72056479-6031-4014-8d72-5cf06e50099c",
"isResizable": true,
"w": 3,
"x": 6,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Read IOPS",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_op_r{cluster=\"$cluster\"}[5m]))",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "c750f49f-05e1-4a22-8efe-eb0e42b87950",
"layout": {
"h": 3,
"i": "c750f49f-05e1-4a22-8efe-eb0e42b87950",
"isResizable": true,
"w": 3,
"x": 9,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Used Capacity",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ceph_cluster_total_used_bytes{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "d20d66ff-ce39-47d6-800c-8ab99941bf6a",
"layout": {
"h": 3,
"i": "d20d66ff-ce39-47d6-800c-8ab99941bf6a",
"isResizable": true,
"w": 3,
"x": 15,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Difference",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ceph_cluster_total_objects{cluster=\"$cluster\"}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "a36e7fb7-c450-4dcf-9cac-3dd20092cdef",
"layout": {
"h": 3,
"i": "a36e7fb7-c450-4dcf-9cac-3dd20092cdef",
"isResizable": true,
"w": 3,
"x": 18,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Mon Session Num",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(ceph_mon_num_sessions{cluster='$cluster'})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "065cb6bd-fca7-46b0-a9d0-7f4866a583fb",
"layout": {
"h": 3,
"i": "065cb6bd-fca7-46b0-a9d0-7f4866a583fb",
"isResizable": true,
"w": 3,
"x": 21,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Monitors In Quorum",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(ceph_mon_quorum_status{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "4ea9d78b-193e-41a9-8ed5-b52487f6b460",
"layout": {
"h": 1,
"i": "4ea9d78b-193e-41a9-8ed5-b52487f6b460",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "OSD STATE",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "2e9efb16-1239-4f9a-b105-1cbcdc2e9a25",
"layout": {
"h": 3,
"i": "2e9efb16-1239-4f9a-b105-1cbcdc2e9a25",
"isResizable": true,
"w": 2,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "OSDs OUT",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(ceph_osd_up{cluster=\"$cluster\"}) - count(ceph_osd_in{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "323101fb-f562-4c6d-89d7-39899549ba5b",
"layout": {
"h": 3,
"i": "323101fb-f562-4c6d-89d7-39899549ba5b",
"isResizable": true,
"w": 2,
"x": 2,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "OSDs DOWN",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(ceph_osd_up{cluster=\"$cluster\"} == 0.0) OR vector(0)",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "5add038b-eb04-40dd-9bcc-8e620e72549b",
"layout": {
"h": 3,
"i": "5add038b-eb04-40dd-9bcc-8e620e72549b",
"isResizable": true,
"w": 2,
"x": 4,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "OSDs UP",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(ceph_osd_up{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "5295e4fa-4027-408d-acef-ee1e6b662c3b",
"layout": {
"h": 3,
"i": "5295e4fa-4027-408d-acef-ee1e6b662c3b",
"isResizable": true,
"w": 2,
"x": 6,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "OSDs IN",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(ceph_osd_in{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "6ef3f924-1ada-4d7e-9a34-2b00bf607e0e",
"layout": {
"h": 3,
"i": "6ef3f924-1ada-4d7e-9a34-2b00bf607e0e",
"isResizable": true,
"w": 2,
"x": 8,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Avg PGs",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "avg(ceph_osd_numpg{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "b6109c27-3022-45bc-8d69-46db095335b4",
"layout": {
"h": 3,
"i": "b6109c27-3022-45bc-8d69-46db095335b4",
"isResizable": true,
"w": 3,
"x": 10,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Avg Apply Latency",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "avg(ceph_osd_apply_latency_ms{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "ba070ba7-8fb3-44af-9bcf-5f34eeb96919",
"layout": {
"h": 3,
"i": "ba070ba7-8fb3-44af-9bcf-5f34eeb96919",
"isResizable": true,
"w": 3,
"x": 13,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Avg Commit Latency",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "avg(ceph_osd_commit_latency_ms{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "fe6884f3-8cb6-4688-a189-3d733090f041",
"layout": {
"h": 3,
"i": "fe6884f3-8cb6-4688-a189-3d733090f041",
"isResizable": true,
"w": 4,
"x": 16,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Avg Op Write Latency",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "avg(rate(ceph_osd_op_w_latency_sum{cluster=\"$cluster\"}[5m]) / rate(ceph_osd_op_w_latency_count{cluster=\"$cluster\"}[5m]) \u003e= 0)",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "7d2d0613-206b-487b-8c58-56f36ba4f145",
"layout": {
"h": 3,
"i": "7d2d0613-206b-487b-8c58-56f36ba4f145",
"isResizable": true,
"w": 4,
"x": 20,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Avg Op Read Latency",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "avg(rate(ceph_osd_op_r_latency_sum{cluster=\"$cluster\"}[5m])/rate(ceph_osd_op_r_latency_count{cluster=\"$cluster\"}[5m]) \u003e= 0)",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "8b5713db-43e7-470e-8758-108e51543bcc",
"layout": {
"h": 1,
"i": "8b5713db-43e7-470e-8758-108e51543bcc",
"isResizable": false,
"w": 24,
"x": 0,
"y": 11
},
"name": "CLUSTER",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "d756bbce-10d0-4301-994d-4e381be883ee",
"layout": {
"h": 8,
"i": "d756bbce-10d0-4301-994d-4e381be883ee",
"isResizable": true,
"w": 8,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Capacity",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "ceph_cluster_total_bytes{cluster=\"$cluster\"}",
"legend": "Total Capacity",
"refId": "C"
},
{
"expr": "ceph_cluster_total_bytes{cluster=\"$cluster\"}-ceph_cluster_total_used_bytes{cluster=\"$cluster\"}",
"legend": "Available",
"refId": "A"
},
{
"expr": "ceph_cluster_total_used_bytes{cluster=\"$cluster\"}",
"legend": "Used",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "496e43b1-cb84-42cf-b0db-d4c5fb651d45",
"layout": {
"h": 8,
"i": "496e43b1-cb84-42cf-b0db-d4c5fb651d45",
"isResizable": true,
"w": 8,
"x": 8,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "IOPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_op_w{cluster=\"$cluster\"}[5m]))",
"legend": "Write",
"refId": "A"
},
{
"expr": "sum(irate(ceph_osd_op_r{cluster=\"$cluster\"}[5m]))",
"legend": "Read",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "22fc4713-e9f9-4b47-aa65-d7c2067452c1",
"layout": {
"h": 8,
"i": "22fc4713-e9f9-4b47-aa65-d7c2067452c1",
"isResizable": true,
"w": 8,
"x": 16,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Throughput",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_op_w_in_bytes{cluster=\"$cluster\"}[5m]))",
"legend": "Write",
"refId": "A"
},
{
"expr": "sum(irate(ceph_osd_op_r_out_bytes{cluster=\"$cluster\"}[5m]))",
"legend": "Read",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "a5c43a7c-b6cf-4bc5-acef-c7ef845c4620",
"layout": {
"h": 8,
"i": "a5c43a7c-b6cf-4bc5-acef-c7ef845c4620",
"isResizable": true,
"w": 8,
"x": 0,
"y": 20
},
"links": [],
"maxPerRow": 4,
"name": "Pool Used Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(ceph_pool_bytes_used{cluster='$cluster'}) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster='$cluster'})",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"id": "729139a3-0097-45a5-86ee-63608734baef",
"layout": {
"h": 8,
"i": "729139a3-0097-45a5-86ee-63608734baef",
"isResizable": true,
"w": 8,
"x": 8,
"y": 20
},
"links": [],
"maxPerRow": 4,
"name": "Pool RAW Used Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(ceph_pool_raw_bytes_used{cluster='$cluster'}) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster='$cluster'})",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "c97c4581-6163-4b3c-bd18-b710244f607b",
"layout": {
"h": 8,
"i": "c97c4581-6163-4b3c-bd18-b710244f607b",
"isResizable": true,
"w": 8,
"x": 16,
"y": 20
},
"links": [],
"maxPerRow": 4,
"name": "Objects Per Pool",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(ceph_pool_objects{cluster='$cluster'}) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster='$cluster'})",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"id": "e04d410c-280a-4f4a-90c0-f19779d30753",
"layout": {
"h": 7,
"i": "e04d410c-280a-4f4a-90c0-f19779d30753",
"isResizable": true,
"w": 8,
"x": 0,
"y": 28
},
"links": [],
"maxPerRow": 4,
"name": "Pool Quota Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(ceph_pool_quota_bytes{cluster='$cluster'}) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster='$cluster'})",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"id": "94f01b89-eeef-4b73-922f-e24539683f85",
"layout": {
"h": 7,
"i": "94f01b89-eeef-4b73-922f-e24539683f85",
"isResizable": true,
"w": 8,
"x": 8,
"y": 28
},
"links": [],
"maxPerRow": 4,
"name": "Pool Objects Quota",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(ceph_pool_quota_objects{cluster='$cluster'}) *on (pool_id) group_left(name)(ceph_pool_metadata{cluster='$cluster'})",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"id": "6fce7e94-658a-43f0-b71d-6d4884b0729f",
"layout": {
"h": 7,
"i": "6fce7e94-658a-43f0-b71d-6d4884b0729f",
"isResizable": true,
"w": 8,
"x": 16,
"y": 28
},
"links": [],
"maxPerRow": 4,
"name": "OSD Type Count",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "count(ceph_bluestore_commit_lat_count{cluster='$cluster'})",
"legend": "BlueStore",
"refId": "A"
},
{
"expr": "count(ceph_filestore_journal_latency_count{cluster='$cluster'})",
"legend": "FileStore",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "8d15424c-09e2-4277-b03c-9f387794c246",
"layout": {
"h": 1,
"i": "8d15424c-09e2-4277-b03c-9f387794c246",
"isResizable": false,
"w": 24,
"x": 0,
"y": 35
},
"name": "Alerts",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "39fe8468-9d6a-4ea0-86cb-82629123b5b2",
"layout": {
"h": 6,
"i": "39fe8468-9d6a-4ea0-86cb-82629123b5b2",
"isResizable": true,
"w": 8,
"x": 0,
"y": 36
},
"links": [],
"maxPerRow": 4,
"name": "Alerts from CephThanos",
"targets": [
{
"expr": "ALERTS{cluster='$cluster', alertstate='firing'}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "00669ac0-4fac-4771-ba40-390bc2bd3131",
"layout": {
"h": 6,
"i": "00669ac0-4fac-4771-ba40-390bc2bd3131",
"isResizable": true,
"w": 8,
"x": 8,
"y": 36
},
"links": [],
"maxPerRow": 4,
"name": "Top Sluggish OSD's",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "topk(5,sort_desc(ceph_osd_apply_latency_ms{cluster='$cluster'} + ceph_osd_commit_latency_ms{cluster='$cluster'}))",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "021f7151-84f5-465c-85be-2f4d80d3087a",
"layout": {
"h": 6,
"i": "021f7151-84f5-465c-85be-2f4d80d3087a",
"isResizable": true,
"w": 8,
"x": 16,
"y": 36
},
"links": [],
"maxPerRow": 4,
"name": "Down OSD's",
"targets": [
{
"expr": "ceph_osd_up{cluster=\"$cluster\"} == 0",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"collapsed": false,
"id": "4276718f-312d-469a-a079-36696fef7926",
"layout": {
"h": 1,
"i": "4276718f-312d-469a-a079-36696fef7926",
"isResizable": false,
"w": 24,
"x": 0,
"y": 42
},
"name": "Ceph Versions",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "b5196610-3a49-4c6a-99fb-49173346db75",
"layout": {
"h": 9,
"i": "b5196610-3a49-4c6a-99fb-49173346db75",
"isResizable": true,
"w": 6,
"x": 0,
"y": 43
},
"links": [],
"maxPerRow": 4,
"name": "Ceph OSD Versions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "count by (ceph_version)(ceph_osd_metadata{cluster='$cluster'})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "3ceb91fe-65a0-408b-96ca-1bdd3b003890",
"layout": {
"h": 9,
"i": "3ceb91fe-65a0-408b-96ca-1bdd3b003890",
"isResizable": true,
"w": 6,
"x": 6,
"y": 43
},
"links": [],
"maxPerRow": 4,
"name": "Ceph Mon Versions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "count by (ceph_version)(ceph_mon_metadata{cluster='$cluster'})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "72476720-2416-49d3-af67-8ed7da951d4f",
"layout": {
"h": 9,
"i": "72476720-2416-49d3-af67-8ed7da951d4f",
"isResizable": true,
"w": 6,
"x": 12,
"y": 43
},
"links": [],
"maxPerRow": 4,
"name": "Ceph MDS Versions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "count by (ceph_version)(ceph_mds_metadata{cluster='$cluster'})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "9fe26b14-d9ce-41b6-af0b-f0e353789959",
"layout": {
"h": 9,
"i": "9fe26b14-d9ce-41b6-af0b-f0e353789959",
"isResizable": true,
"w": 6,
"x": 18,
"y": 43
},
"links": [],
"maxPerRow": 4,
"name": "Ceph RGW Versions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "count by (ceph_version)(ceph_rgw_metadata{cluster='$cluster'})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "55fde6b5-f588-4a82-b6df-7faeb819a6ed",
"layout": {
"h": 1,
"i": "55fde6b5-f588-4a82-b6df-7faeb819a6ed",
"isResizable": false,
"w": 24,
"x": 0,
"y": 52
},
"name": "OBJECTS",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"id": "cfd5482c-be7c-4072-8404-3f1c34095825",
"layout": {
"h": 12,
"i": "cfd5482c-be7c-4072-8404-3f1c34095825",
"isResizable": true,
"w": 6,
"x": 0,
"y": 53
},
"links": [],
"maxPerRow": 4,
"name": "Objects in the Cluster",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "ceph_cluster_total_objects{cluster=\"$cluster\"}",
"legend": "Total",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "7b173926-a111-48c6-b31c-e153ce874b92",
"layout": {
"h": 12,
"i": "7b173926-a111-48c6-b31c-e153ce874b92",
"isResizable": true,
"w": 8,
"x": 6,
"y": 53
},
"links": [],
"maxPerRow": 4,
"name": "PGs State",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "ceph_pg_active{cluster=\"$cluster\"}",
"legend": "Active",
"refId": "M"
},
{
"expr": "ceph_pg_clean{cluster=\"$cluster\"}",
"legend": "Clean",
"refId": "U"
},
{
"expr": "ceph_pg_peering{cluster=\"$cluster\"}",
"legend": "Peering",
"refId": "I"
},
{
"expr": "ceph_pg_degraded{cluster=\"$cluster\"}",
"legend": "Degraded",
"refId": "B"
},
{
"expr": "ceph_pg_stale{cluster=\"$cluster\"}",
"legend": "Stale",
"refId": "C"
},
{
"expr": "ceph_unclean_pgs{cluster=\"$cluster\"}",
"legend": "Unclean",
"refId": "D"
},
{
"expr": "ceph_pg_undersized{cluster=\"$cluster\"}",
"legend": "Undersized",
"refId": "E"
},
{
"expr": "ceph_pg_incomplete{cluster=\"$cluster\"}",
"legend": "Incomplete",
"refId": "G"
},
{
"expr": "ceph_pg_forced_backfill{cluster=\"$cluster\"}",
"legend": "Forced Backfill",
"refId": "H"
},
{
"expr": "ceph_pg_inconsistent{cluster=\"$cluster\"}",
"legend": "Inconsistent",
"refId": "F"
},
{
"expr": "ceph_pg_forced_recovery{cluster=\"$cluster\"}",
"legend": "Forced Recovery",
"refId": "J"
},
{
"expr": "ceph_pg_creating{cluster=\"$cluster\"}",
"legend": "Creating",
"refId": "K"
},
{
"expr": "ceph_pg_wait_backfill{cluster=\"$cluster\"}",
"legend": "Wait Backfill",
"refId": "L"
},
{
"expr": "ceph_pg_deep{cluster=\"$cluster\"}",
"legend": "Deep",
"refId": "N"
},
{
"expr": "ceph_pg_scrubbing{cluster=\"$cluster\"}",
"legend": "Scrubbing",
"refId": "O"
},
{
"expr": "ceph_pg_recovering{cluster=\"$cluster\"}",
"legend": "Recovering",
"refId": "P"
},
{
"expr": "ceph_pg_repair{cluster=\"$cluster\"}",
"legend": "Repair",
"refId": "Q"
},
{
"expr": "ceph_pg_down{cluster=\"$cluster\"}",
"legend": "Down",
"refId": "R"
},
{
"expr": "ceph_pg_peered{cluster=\"$cluster\"}",
"legend": "Peered",
"refId": "S"
},
{
"expr": "ceph_pg_backfill{cluster=\"$cluster\"}",
"legend": "Backfill",
"refId": "T"
},
{
"expr": "ceph_pg_remapped{cluster=\"$cluster\"}",
"legend": "Remapped",
"refId": "V"
},
{
"expr": "ceph_pg_backfill_toofull{cluster=\"$cluster\"}",
"legend": "Backfill Toofull",
"refId": "W"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "3fb23975-8958-45ec-a963-610fd376ec7a",
"layout": {
"h": 6,
"i": "3fb23975-8958-45ec-a963-610fd376ec7a",
"isResizable": true,
"w": 10,
"x": 14,
"y": 53
},
"links": [],
"maxPerRow": 4,
"name": "Stuck PGs",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "ceph_pg_degraded{cluster=\"$cluster\"}",
"legend": "Degraded",
"refId": "F"
},
{
"expr": "ceph_pg_stale{cluster=\"$cluster\"}",
"legend": "Stale",
"refId": "A"
},
{
"expr": "ceph_pg_undersized{cluster=\"$cluster\"}",
"legend": "Undersized",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "2fb7f0a1-4f10-4435-bff7-a5cdbb68ad79",
"layout": {
"h": 6,
"i": "2fb7f0a1-4f10-4435-bff7-a5cdbb68ad79",
"isResizable": true,
"w": 10,
"x": 14,
"y": 59
},
"links": [],
"maxPerRow": 4,
"name": "Recovery Operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(ceph_osd_recovery_ops{cluster=\"$cluster\"}[$interval]))",
"legend": "OPS",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "4ae16220-033e-4b18-ab00-190388f95783",
"layout": {
"h": 1,
"i": "4ae16220-033e-4b18-ab00-190388f95783",
"isResizable": false,
"w": 24,
"x": 0,
"y": 65
},
"name": "LATENCY",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "de0f7837-69c6-4a10-a362-814893d10a3f",
"layout": {
"h": 8,
"i": "de0f7837-69c6-4a10-a362-814893d10a3f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 66
},
"links": [],
"maxPerRow": 4,
"name": "OSD Apply Latency Distribution",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "ceph_osd_apply_latency_ms{cluster='$cluster'}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "43fb926a-064f-4107-be33-b0b16af9924f",
"layout": {
"h": 8,
"i": "43fb926a-064f-4107-be33-b0b16af9924f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 66
},
"links": [],
"maxPerRow": 4,
"name": "OSD Commit Latency Distribution",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "ceph_osd_commit_latency_ms{cluster='$cluster'}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "b65f0eef-cd72-4a22-9143-3937626a3762",
"layout": {
"h": 8,
"i": "b65f0eef-cd72-4a22-9143-3937626a3762",
"isResizable": true,
"w": 12,
"x": 0,
"y": 74
},
"links": [],
"maxPerRow": 4,
"name": "OSD Read Op Latency Distribution",
"targets": [
{
"expr": "rate(ceph_osd_op_r_latency_sum{cluster=\"$cluster\"}[5m]) / rate(ceph_osd_op_r_latency_count{cluster=\"$cluster\"}[5m]) \u003e= 0",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "7ebe8e1e-7b43-4371-b6d2-b934d8f189b6",
"layout": {
"h": 8,
"i": "7ebe8e1e-7b43-4371-b6d2-b934d8f189b6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 74
},
"links": [],
"maxPerRow": 4,
"name": "OSD Write Op Latency Distribution",
"targets": [
{
"expr": "rate(ceph_osd_op_w_latency_sum{cluster=\"$cluster\"}[5m]) / rate(ceph_osd_op_w_latency_count{cluster=\"$cluster\"}[5m]) \u003e= 0",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "83ce1df6-4421-4692-be4c-85cf3b24bfa9",
"layout": {
"h": 7,
"i": "83ce1df6-4421-4692-be4c-85cf3b24bfa9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "Avg OSD Op Latency",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "avg(rate(ceph_osd_op_r_latency_sum{cluster=\"$cluster\"}[5m]) / rate(ceph_osd_op_r_latency_count{cluster=\"$cluster\"}[5m]) \u003e= 0)",
"legend": "read",
"refId": "A"
},
{
"expr": "avg(rate(ceph_osd_op_w_latency_sum{cluster=\"$cluster\"}[5m]) / rate(ceph_osd_op_w_latency_count{cluster=\"$cluster\"}[5m]) \u003e= 0)",
"legend": "write",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DATASOURCE}",
"id": "4e31065f-fc5e-4e3c-ac34-a7ec9141ece8",
"layout": {
"h": 7,
"i": "4e31065f-fc5e-4e3c-ac34-a7ec9141ece8",
"isResizable": true,
"w": 12,
"x": 12,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "AVG OSD Apply + Commit Latency",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "avg(ceph_osd_apply_latency_ms{cluster=\"$cluster\"})",
"legend": "apply",
"refId": "A"
},
{
"expr": "avg(ceph_osd_commit_latency_ms{cluster=\"$cluster\"})",
"legend": "commit",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DATASOURCE",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${DATASOURCE}"
},
"definition": "label_values(ceph_health_status,cluster)",
"multi": false,
"name": "cluster",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327323021000
}
================================================
FILE: integrations/Ceph/markdown/README.md
================================================
# ceph plugin
开启 ceph prometheus 支持
```bash
ceph mgr module enable prometheus
```
## 采集配置
既然 ceph 可以暴露 prometheus 协议的 metrics 数据,则直接使用 prometheus 插件抓取即可。
categraf 配置文件:`conf/input.prometheus/prometheus.toml`
```toml
[[instances]]
urls = [
"http://192.168.11.181:9283/metrics"
]
labels = {service="ceph",cluster="ceph-cluster-001"}
```
## 仪表盘效果
夜莺内置仪表盘中已经内置了 ceph 的仪表盘,导入即可使用。

## 告警规则
夜莺内置告警规则中已经内置了 ceph 的告警规则,导入即可使用。

================================================
FILE: integrations/ClickHouse/alerts/clickhouse_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf ZooKeeper故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "avg(clickhouse_metrics_zoo_keeper_session ) != 1",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153856411000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf 内存使用",
"note": "内存使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_memory_tracking / clickhouse_asynchronous_metrics_os_memory_total * 100 \u003e 90",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_memory_tracking/ clickhouse_asynchronous_metrics_os_memory_total * 100 \u003e 80",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153858877000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf 磁盘使用",
"note": "磁盘使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_asynchronous_metrics_disk_available_default / (clickhouse_asynchronous_metrics_disk_available_default + clickhouse_asynchronous_metrics_disk_used_default) * 100 \u003c 10",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_asynchronous_metrics_disk_available_default / (clickhouse_asynchronous_metrics_disk_available_default + clickhouse_asynchronous_metrics_disk_used_default) * 100 \u003c 20",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153860224000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Categraf 网络故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_network_send \u003e 250 or clickhouse_metrics_network_receive \u003e 250",
"severity": 2
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "clickhouse_metrics_network_send \u003e 250 or clickhouse_metrics_network_receive \u003e 250",
"severity": 3
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "increase(clickhouse_metrics_interserver_connection[5m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153861525000
}
]
================================================
FILE: integrations/ClickHouse/alerts/clickhouse_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 认证错误",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) \u003e 0",
"severity": 2
},
{
"prom_ql": "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153863782000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter ZooKeeper故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "avg(ClickHouseMetrics_ZooKeeperSession) != 1",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153865298000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 内存使用",
"note": "内存使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseMetrics_MemoryTracking / ClickHouseAsyncMetrics_OSMemoryTotal * 100 \u003e 90",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseMetrics_MemoryTracking / ClickHouseAsyncMetrics_OSMemoryTotal * 100 \u003e 80",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153866296000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 副本错误",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1",
"severity": 1
},
{
"prom_ql": " ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1",
"severity": 1
},
{
"prom_ql": " ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153867268000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 磁盘使用",
"note": "磁盘使用报警",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1,
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 \u003c 10",
"severity": 1
},
{
"keys": {
"labelKey": "",
"valueKey": ""
},
"prom_ql": "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 \u003c 20",
"severity": 2
},
{
"prom_ql": "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 \u003c 20",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153868363000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ClickHouse Exporter 网络故障",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ClickHouseMetrics_NetworkSend \u003e 250 or ClickHouseMetrics_NetworkReceive \u003e 250",
"severity": 2
},
{
"prom_ql": "ClickHouseMetrics_NetworkSend \u003e 250 or ClickHouseMetrics_NetworkReceive \u003e 250",
"severity": 3
},
{
"prom_ql": "increase(ClickHouseMetrics_InterserverConnection[5m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1719305153869486000
}
]
================================================
FILE: integrations/ClickHouse/collect/clickhouse/clickhouse.toml
================================================
# # collect interval
# interval = 15
# Read metrics from one or many ClickHouse servers
[[instances]]
## Username for authorization on ClickHouse server
username = "default"
## Password for authorization on ClickHouse server
# password = ""
## HTTP(s) timeout while getting metrics values
## The timeout includes connection time, any redirects, and reading the
## response body.
# timeout = 5
## List of servers for metrics scraping
## metrics scrape via HTTP(s) clickhouse interface
## https://clickhouse.tech/docs/en/interfaces/http/
# servers = ["http://127.0.0.1:8123"]
## If "auto_discovery" is "true", the plugin tries to connect to all servers
## available in the cluster using the same "user:password" described in the
## "username" and "password" parameters, and gets the server hostname list
## from the "system.clusters" table. See
## - https://clickhouse.tech/docs/en/operations/system_tables/#system-clusters
## - https://clickhouse.tech/docs/en/operations/server_settings/settings/#server_settings_remote_servers
## - https://clickhouse.tech/docs/en/operations/table_engines/distributed/
## - https://clickhouse.tech/docs/en/operations/table_engines/replication/#creating-replicated-tables
# auto_discovery = true
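## Filter cluster names in "system.clusters" when "auto_discovery" is "true".
## When this filter is present, a "WHERE cluster IN (...)" filter is applied.
## Please use only full cluster names here; regexp and glob filters are not
## allowed. For example, given "/etc/clickhouse-server/config.d/remote.xml":
## <yandex>
##  <remote_servers>
##    <my-own-cluster>
##        <shard>
##          <replica><host>clickhouse-ru-1.local</host><port>9000</port></replica>
##          <replica><host>clickhouse-ru-2.local</host><port>9000</port></replica>
##        </shard>
##        <shard>
##          <replica><host>clickhouse-eu-1.local</host><port>9000</port></replica>
##          <replica><host>clickhouse-eu-2.local</host><port>9000</port></replica>
##        </shard>
##    </my-own-cluster>
##  </remote_servers>
##
## </yandex>
##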
## example: cluster_include = ["my-own-cluster"]
# cluster_include = []
## Filter cluster names in "system.clusters" when "auto_discovery" is
## "true". When this filter is present, a "WHERE cluster NOT IN (...)"
## filter is applied.
## example: cluster_exclude = ["my-internal-not-discovered-cluster"]
# cluster_exclude = []
## Optional TLS Config
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
# [[instances.metrics]]
# measurement = "sessions"
# label_fields = [ "status", "type" ]
# metric_fields = [ "value" ]
# timeout = "3s"
# request = '''
# SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type
# '''
================================================
FILE: integrations/ClickHouse/dashboards/clickhouse_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ClickHouse_Categraf",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"graphTooltip": "default",
"graphZoom": "default",
"panels": [
{
"collapsed": false,
"id": "be21493f-8eb4-4947-9e34-0109fcda9a51",
"layout": {
"h": 1,
"i": "be21493f-8eb4-4947-9e34-0109fcda9a51",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "General",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "内存占用",
"id": "8687b3a6-c7f2-4af6-8509-29c99969a55d",
"layout": {
"h": 7,
"i": "8687b3a6-c7f2-4af6-8509-29c99969a55d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "Memory",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_memory_tracking",
"legend": "分配的内存总量",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "tcp连接数",
"id": "af8f29d3-8992-450e-8caa-749518669da1",
"layout": {
"h": 7,
"i": "af8f29d3-8992-450e-8caa-749518669da1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "Connections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_tcp_connection",
"legend": "与 TCP 服务器(带本地接口的客户端)的连接数,也包括服务器-服务器连接",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "cd6bd256-20d0-4892-9c99-7b9976fd9013",
"layout": {
"h": 1,
"i": "cd6bd256-20d0-4892-9c99-7b9976fd9013",
"isResizable": false,
"w": 24,
"x": 0,
"y": 15
},
"name": "Queries",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "需要解释和可能执行的查询数量,不包括失败的查询",
"id": "0524900d-99fd-42b8-a647-d4da462fc309",
"layout": {
"h": 8,
"i": "0524900d-99fd-42b8-a647-d4da462fc309",
"isResizable": true,
"w": 11,
"x": 0,
"y": 16
},
"maxPerRow": 4,
"name": "查询总数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_query[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_query[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "SELECT查询的数量",
"id": "9e288fcd-4513-46bb-bcb5-b2f4e0b4edde",
"layout": {
"h": 8,
"i": "d5701b8e-6926-4349-934d-0999de4666a5",
"isResizable": true,
"w": 12,
"x": 11,
"y": 16
},
"maxPerRow": 4,
"name": "SELECT 查询数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_select_query[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_select_query[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "6a95c841-5ac7-4cd9-9597-21b0f41441e4",
"layout": {
"h": 9,
"i": "c14878d8-c197-45ab-802e-f328b84deeff",
"isResizable": true,
"w": 11,
"x": 0,
"y": 24
},
"maxPerRow": 4,
"name": "查询平均延迟",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(clickhouse_events_query_time_microseconds[1m]) / (increase(clickhouse_events_query_time_microseconds[1m]) + 0.001)",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "4584d584-054b-4d42-b2f9-215a71fca118",
"layout": {
"h": 9,
"i": "050414c6-8e31-49d1-8ddd-1916bf2101f5",
"isResizable": true,
"w": 12,
"x": 11,
"y": 24
},
"maxPerRow": 4,
"name": "SELECT查询平均延迟",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(clickhouse_events_select_query_time_microseconds[1m] )/ (increase(clickhouse_events_select_query_time_microseconds[1m]) + 0.001)",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "9da4b0b9-6843-4225-b3ba-dec31b2ca7c0",
"layout": {
"h": 1,
"i": "9da4b0b9-6843-4225-b3ba-dec31b2ca7c0",
"isResizable": false,
"w": 24,
"x": 0,
"y": 50
},
"name": "Insert",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "所有表INSERT的行数",
"id": "6ea9312d-2334-4912-864c-4cea25bf65f9",
"layout": {
"h": 8,
"i": "6ea9312d-2334-4912-864c-4cea25bf65f9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 51
},
"maxPerRow": 4,
"name": "Inserted 插入行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_inserted_rows[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_inserted_rows[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "所有表INSERT的字节数(未压缩列以它们在内存中存储的形式)",
"id": "8a4987ad-c10f-419c-b2e9-107502e4bc20",
"layout": {
"h": 8,
"i": "993c68cb-fabe-4742-b1a3-b9bd317d9725",
"isResizable": true,
"w": 11,
"x": 12,
"y": 51
},
"maxPerRow": 4,
"name": "Inserted 插入字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_inserted_bytes[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_inserted_bytes[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制的次数",
"id": "dd6d8098-1cab-483d-9462-3436678cbbb8",
"layout": {
"h": 8,
"i": "ecfdbaee-44bb-47ad-a871-e1448a7eac25",
"isResizable": true,
"w": 12,
"x": 0,
"y": 59
},
"maxPerRow": 4,
"name": "延迟插入次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_metrics_delayed_inserts[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_metrics_delayed_inserts[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "681645ab-ef95-4875-9f97-2bf3d0981bf9",
"layout": {
"h": 1,
"i": "681645ab-ef95-4875-9f97-2bf3d0981bf9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 83
},
"name": "Select",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从所有表SELECT的字节数(未压缩列以它们在内存中存储的形式)",
"id": "244865f5-c78a-4b21-8068-462612b8730c",
"layout": {
"h": 10,
"i": "628a3845-fc7d-400c-b16c-6181faddd7b9",
"isResizable": true,
"w": 11,
"x": 0,
"y": 84
},
"maxPerRow": 4,
"name": "SELECT查询的字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_selected_bytes[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_selected_bytes[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从所有表SELECT的行数",
"id": "a7e4857b-cf88-4ed7-8cf6-adadeb521e78",
"layout": {
"h": 10,
"i": "2b63803d-ef02-49bc-aba9-41ac7e85bb9d",
"isResizable": true,
"w": 11,
"x": 11,
"y": 84
},
"maxPerRow": 4,
"name": "SELECT查询的行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_selected_rows[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_selected_rows[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "1d95c7da-85e1-4a6b-a6a9-68eb19ca821e",
"layout": {
"h": 1,
"i": "1d95c7da-85e1-4a6b-a6a9-68eb19ca821e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 104
},
"name": "IO",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "6be5bfc5-8d91-414f-9781-e4b390307961",
"layout": {
"h": 8,
"i": "6be5bfc5-8d91-414f-9781-e4b390307961",
"isResizable": true,
"w": 11,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "lseek函数调用次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_seek[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_seek[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "df47ce32-e3b6-4d5b-8bc9-4a640ec9c75c",
"layout": {
"h": 8,
"i": "4b6a7bf4-e91a-41fb-b2a1-61d79805f2c5",
"isResizable": true,
"w": 11,
"x": 11,
"y": 5
},
"maxPerRow": 4,
"name": "打开的文件数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_file_open[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_file_open[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件描述符进行读取(read/pread)的次数,不包括套接字",
"id": "26318204-5e2c-43dd-93b1-2d0ff34c3c9b",
"layout": {
"h": 8,
"i": "e4f66145-d9c6-48df-a9f5-a98ec26cf1e2",
"isResizable": true,
"w": 11,
"x": 0,
"y": 13
},
"maxPerRow": 4,
"name": "从FD文件描述符读取次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_read_buffer_from_file_descriptor_read[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_read_buffer_from_file_descriptor_read[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "向FD写入的次数,不包括套接字",
"id": "222a5968-672b-4f4f-ba1a-c823c4be1116",
"layout": {
"h": 8,
"i": "3a378a00-6b9f-4098-8a2d-ed32628cc182",
"isResizable": true,
"w": 11,
"x": 11,
"y": 13
},
"maxPerRow": 4,
"name": "向FD写入的次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_write_buffer_from_file_descriptor_write[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_write_buffer_from_file_descriptor_write[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "fc2a63e9-aec2-4755-b4a2-8b0f54868bc6",
"layout": {
"h": 8,
"i": "7b9965c5-b341-4713-88f1-e5a48183ef94",
"isResizable": true,
"w": 11,
"x": 0,
"y": 21
},
"maxPerRow": 4,
"name": "向FD写入的字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_write_buffer_from_file_descriptor_write_bytes[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_write_buffer_from_file_descriptor_write_bytes[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件描述符读取的字节数。如果文件是压缩的,这将显示压缩后的数据大小",
"id": "b5546ead-ef3d-406c-acfc-33759f9d5ad9",
"layout": {
"h": 8,
"i": "7e16e52b-7c39-409b-b2f0-1e9f1feaa451",
"isResizable": true,
"w": 11,
"x": 11,
"y": 21
},
"maxPerRow": 4,
"name": "从FD读取的字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "irate(clickhouse_events_read_buffer_from_file_descriptor_read_bytes[2m]) * $trends",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_read_buffer_from_file_descriptor_read_bytes[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从压缩源(文件,网络)读取的压缩块数(独立压缩的数据块)",
"id": "dc75e7de-7728-4f12-91fc-789cdc2610dd",
"layout": {
"h": 9,
"i": "ccfe64c6-a0b2-4179-abce-49a772d43a94",
"isResizable": true,
"w": 11,
"x": 0,
"y": 29
},
"maxPerRow": 4,
"name": "读取的压缩块数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_compressed_read_buffer_blocks[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_compressed_read_buffer_blocks[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
},
{
"__mode__": "__query__",
"expr": "clickhouse_events_compressed_read_buffer_blocks",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从压缩源(文件,网络)读取的未压缩字节数(解压后的字节数)",
"id": "44c6e097-4114-4ef5-89d3-96da2c11ccf3",
"layout": {
"h": 9,
"i": "f3dfd2fc-a656-4ef0-a326-1977908ffd90",
"isResizable": true,
"w": 11,
"x": 11,
"y": 29
},
"maxPerRow": 4,
"name": "读取的未压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_compressed_read_buffer_bytes[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_compressed_read_buffer_bytes[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "f358d0ce-ab09-4f5d-8477-c6c47341f185",
"layout": {
"h": 1,
"i": "f358d0ce-ab09-4f5d-8477-c6c47341f185",
"isResizable": false,
"w": 24,
"x": 0,
"y": 138
},
"name": "Replicas",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在ClickHouse集群中,由于特定情况而暂时处于只读状态的复制表数量。这种情况通常发生在以下两种情形下:\n\nZooKeeper会话丢失后重新初始化:ClickHouse使用ZooKeeper来维护集群间的协调和一致性。如果与ZooKeeper的会话丢失,为了防止数据不一致,相关的复制表可能会被自动设置为只读模式,直到与ZooKeeper的连接恢复并且数据同步完成为止。\n\n未配置ZooKeeper启动:如果ClickHouse实例在没有正确配置ZooKeeper的情况下启动,它可能无法确定其在复制集群中的准确状态,从而为了安全起见,将涉及复制的表设置为只读,以避免潜在的数据冲突或不一致性问题。",
"id": "24f224bb-3f8c-4dae-9485-d2015e238ba4",
"layout": {
"h": 10,
"i": "24f224bb-3f8c-4dae-9485-d2015e238ba4",
"isResizable": true,
"w": 15,
"x": 0,
"y": 6
},
"maxPerRow": 4,
"name": "只读状态的复制表数量",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "clickhouse_metrics_readonly_replica",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "9bdb151e-adfb-4985-ad38-50f83a4b13e1",
"layout": {
"h": 1,
"i": "9bdb151e-adfb-4985-ad38-50f83a4b13e1",
"isResizable": false,
"w": 24,
"x": 0,
"y": 149
},
"name": "Merge",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "启动的后台合并次数",
"id": "505ab54d-1897-45d6-a087-db2c2570f8f7",
"layout": {
"h": 8,
"i": "505ab54d-1897-45d6-a087-db2c2570f8f7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 150
},
"maxPerRow": 4,
"name": "后台合并次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merge[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merge[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "后台合并读取的行数。这是合并前的行数",
"id": "cf1cbd45-4762-4b57-b674-91520ee63ff4",
"layout": {
"h": 8,
"i": "019fba59-0df6-45d8-a7d0-3adbf32b44f5",
"isResizable": true,
"w": 11,
"x": 12,
"y": 150
},
"maxPerRow": 4,
"name": "合并读取的行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merged_rows[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merged_rows[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "后台合并读取的未压缩字节数(列以它们在内存中存储的形式)。这是合并前的字节数",
"id": "bee87dc6-ace1-47c2-8676-3a7a4c33d2bd",
"layout": {
"h": 9,
"i": "2a43db17-8490-4f9c-a165-a547b112c31b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 158
},
"maxPerRow": 4,
"name": "合并读取的未压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merged_uncompressed_bytes[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merged_uncompressed_bytes[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "63c16da6-f138-4f8f-9a36-a0f71454ba5d",
"layout": {
"h": 9,
"i": "972b412f-af69-4709-858d-cdae2a1ab35f",
"isResizable": true,
"w": 11,
"x": 12,
"y": 158
},
"maxPerRow": 4,
"name": "合并平均持续时间",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(clickhouse_events_merges_time_milliseconds[2m]) / (increase(clickhouse_events_merge[2m]) + 0.001)",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "a01d3de2-a79d-49d9-b238-e5db403d8259",
"layout": {
"h": 8,
"i": "8b2138af-31bb-4c88-b983-08eead1da3ba",
"isResizable": true,
"w": 12,
"x": 0,
"y": 167
},
"maxPerRow": 4,
"name": "MergeTree表插入的行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merge_tree_data_writer_rows[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merge_tree_data_writer_rows[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "向MergeTree系列表(包括ReplicatedMergeTree等变种)中插入数据时所用的数据块数量",
"id": "87a118d2-305d-4eb9-921a-39bbf11b56b5",
"layout": {
"h": 8,
"i": "ea833f12-d5fa-419f-9825-5fd105a709df",
"isResizable": true,
"w": 11,
"x": 12,
"y": 167
},
"maxPerRow": 4,
"name": "MergeTree插入块数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merge_tree_data_writer_blocks[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merge_tree_data_writer_blocks[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "插入到MergeTree表的未压缩字节数(列以它们在内存中存储的形式)",
"id": "3dca4a05-b0cc-4e63-ae32-ee5261b6b7b3",
"layout": {
"h": 8,
"i": "84893385-fa2f-4016-b820-df600797b4a9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 175
},
"maxPerRow": 4,
"name": "MergeTree 表插入的未压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merge_tree_data_writer_uncompressed_bytes[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merge_tree_data_writer_uncompressed_bytes[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "插入到MergeTree表的数据写入到文件系统的字节数",
"id": "6777102c-5efe-4082-9a86-d58f76d751ba",
"layout": {
"h": 8,
"i": "96a43967-afda-4c33-80f2-ce1237afeb35",
"isResizable": true,
"w": 11,
"x": 12,
"y": 175
},
"maxPerRow": 4,
"name": "MergeTree 表写入的压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_merge_tree_data_writer_compressed_bytes[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_merge_tree_data_writer_compressed_bytes[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "为当前运行的后台合并保留的磁盘空间。它略大于当前合并部分的总大小",
"id": "2b70d3a1-f9e2-4b35-b181-809091fe0aa0",
"layout": {
"h": 8,
"i": "8612a9c6-bdbd-4eea-837e-84bf93205446",
"isResizable": true,
"w": 12,
"x": 0,
"y": 183
},
"maxPerRow": 4,
"name": "保留空间",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "clickhouse_metrics_disk_space_reserved_for_merge",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "13a904ce-b661-4abb-bce6-d3f91cd09a05",
"layout": {
"h": 1,
"i": "13a904ce-b661-4abb-bce6-d3f91cd09a05",
"isResizable": false,
"w": 24,
"x": 0,
"y": 232
},
"name": "Cache",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件缓存中打开文件时没有命中缓存的次数",
"id": "6423431d-916e-4a79-9db1-d0146e0c83fd",
"layout": {
"h": 11,
"i": "443329a2-56bf-4632-8491-ca581b88b93b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 233
},
"maxPerRow": 4,
"name": "文件缓存未命中次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(clickhouse_events_opened_file_cache_misses[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(clickhouse_events_opened_file_cache_misses[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "b408fca8-4054-4ea6-a131-c3c76439d036",
"layout": {
"h": 1,
"i": "b408fca8-4054-4ea6-a131-c3c76439d036",
"isResizable": false,
"w": 24,
"x": 0,
"y": 255
},
"name": "Parts",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "目前正在生成的部分,不在数据部分列表中",
"id": "3a06dade-4969-40bf-aab1-1a5f3950a75f",
"layout": {
"h": 8,
"i": "3a06dade-4969-40bf-aab1-1a5f3950a75f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 256
},
"maxPerRow": 4,
"name": "临时部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_temporary",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在数据部分中,但不用于SELECT查询的部分",
"id": "fd497233-3f93-4f73-9dc7-4045d1454eef",
"layout": {
"h": 8,
"i": "cd29ec5b-f032-4464-a1b5-ed987e7d5cd3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 256
},
"maxPerRow": 4,
"name": "预提交部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_pre_committed",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "已经提交的数据部分的数量",
"id": "b688234f-b66a-440f-8a95-20e2a45df833",
"layout": {
"h": 7,
"i": "d12a6546-3f67-4c25-90ed-16ec3d13ec94",
"isResizable": true,
"w": 12,
"x": 0,
"y": 264
},
"maxPerRow": 4,
"name": "提交部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_committed",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "暂时保留、等待相关查询结束即可清理的数据分区的数量。",
"id": "85faac0b-b923-4f38-b10a-9a38b98f89ee",
"layout": {
"h": 7,
"i": "ee8c8080-9a96-445e-a45a-164b735ae0fa",
"isResizable": true,
"w": 12,
"x": 12,
"y": 264
},
"maxPerRow": 4,
"name": "过时分区",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_outdated",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "1e45da14-63a6-4adb-be34-a83f9469af84",
"layout": {
"h": 7,
"i": "fca8edbe-2eb2-45a0-8886-ba554aa98100",
"isResizable": true,
"w": 12,
"x": 0,
"y": 271
},
"maxPerRow": 4,
"name": "正删除数据",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_deleting",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "记录了当前有多少个数据分区正处于这样一种待删除的状态——即这些分区已经被逻辑上标记为“待清理”,并且它们的物理删除操作将在资源被正式释放时由系统自动执行。",
"id": "0fb33f4e-2392-4fb7-bbdb-486c8520e957",
"layout": {
"h": 7,
"i": "c09e1b70-3d5c-4d1d-9bac-9c7d0957ff22",
"isResizable": true,
"w": 12,
"x": 12,
"y": 271
},
"maxPerRow": 4,
"name": "待清理",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_delete_on_destroy",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "以 Wide(宽)格式存储的数据部分(part)数量。在 Wide 格式中,每一列都存储在文件系统中的独立文件里,通常用于体积较大的数据部分;当查询涉及大量列时,需要打开的文件数也会相应增多。",
"id": "75a6156e-66c8-4b05-a7f3-303d6babe521",
"layout": {
"h": 7,
"i": "cd3637a2-e1af-4c1d-ab12-b5e85caf6835",
"isResizable": true,
"w": 12,
"x": 0,
"y": 278
},
"maxPerRow": 4,
"name": "宽数据分区",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_wide",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "以 Compact(紧凑)格式存储的数据部分(part)数量。在 Compact 格式中,所有列都存储在同一个文件里,通常用于体积较小的数据部分,可以减少小批量频繁写入时产生的文件数量。注意:Compact 指的是存储格式,而不是数据压缩。",
"id": "bb287913-7ef7-408f-9feb-d875b3683157",
"layout": {
"h": 7,
"i": "a40b9909-1cc3-47bd-b28e-aca03bd777bd",
"isResizable": true,
"w": 12,
"x": 12,
"y": 278
},
"maxPerRow": 4,
"name": "压缩数据分区",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_parts_compact",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "5aa89429-c4b8-4733-a7b9-2abfc45fb836",
"layout": {
"h": 1,
"i": "5aa89429-c4b8-4733-a7b9-2abfc45fb836",
"isResizable": false,
"w": 24,
"x": 0,
"y": 314
},
"name": "Distributed",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "衡量的是向远程服务器发送数据的连接数量,这些数据最初是插入到 Distributed 表中的。Distributed 表是 ClickHouse 中用于分布式存储和处理数据的一种特殊表类型,它实际上不存储数据,而是将数据操作(如 INSERT 查询)转发到表所定义的远程节点上。通过监控这个指标,您可以了解到数据分布的活跃程度,即 ClickHouse 集群内部数据流动的频繁程度,以及分布式查询或数据加载操作的负载情况。高数值可能意味着大量的数据正在被分布式地处理或存储,这可能是系统繁忙或数据管道高效运行的迹象。然而,如果发现该指标异常高且伴随性能问题,可能需要进一步调查网络状况、远程服务器的处理能力或分布式表的配置,以优化数据传输效率和系统整体性能。",
"id": "b9dde172-3e02-4e71-891f-5fb97c9c0571",
"layout": {
"h": 11,
"i": "b9dde172-3e02-4e71-891f-5fb97c9c0571",
"isResizable": true,
"w": 11,
"x": 0,
"y": 315
},
"maxPerRow": 4,
"name": "向远程服务器发送数据的连接数量",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_distributed_send",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "等待异步插入到 Distributed 表中的文件数量\n这个指标对于监控数据导入或异步处理流程的效率至关重要。如果数值较大,可能意味着有大量数据正在排队等待写入 Distributed 表,这可能会影响数据的实时性或处理速度。高数值还可能指示了数据导入作业的积压、网络瓶颈、远程节点处理能力不足或其他与分布式系统相关的性能问题。\n\n监控 ClickHouseMetrics_DistributedFilesToInsert 可以帮助识别和诊断数据流中的瓶颈,进而采取相应的优化措施,比如调整插入作业的并发度、优化网络配置、增加目标表的处理能力或调整分布式表的配置参数,以确保数据能够高效、及时地被分布式存储和处理。",
"id": "4f1782d0-5572-4a75-ba90-c74914246557",
"layout": {
"h": 11,
"i": "dd975296-d2cd-42fe-b23b-bb8c8dfa1c17",
"isResizable": true,
"w": 11,
"x": 11,
"y": 315
},
"maxPerRow": 4,
"name": "插入文件数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_distributed_files_to_insert",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "2ab85aaf-f0af-41b1-9760-34b9177ae600",
"layout": {
"h": 1,
"i": "2ab85aaf-f0af-41b1-9760-34b9177ae600",
"isResizable": false,
"w": 24,
"x": 0,
"y": 348
},
"name": "Background pool",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "后台任务池中的活跃任务数",
"id": "f329dc5d-6276-4652-855f-b26153d9155a",
"layout": {
"h": 9,
"i": "f329dc5d-6276-4652-855f-b26153d9155a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"maxPerRow": 4,
"name": "后台任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_common_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台获取任务池(BackgroundFetchesPool)当前活跃任务的数量。",
"id": "71ba46cc-be89-400f-ab7a-27189c309e25",
"layout": {
"h": 9,
"i": "ebdfc65c-3b5e-4fa9-8165-6dd47ebab38d",
"isResizable": true,
"w": 11,
"x": 12,
"y": 22
},
"maxPerRow": 4,
"name": "后台获取任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_fetches_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse中后台移动任务池(BackgroundMovePool)当前激活的任务数量",
"id": "47096333-948c-4010-bb0e-ced6581ea2c5",
"layout": {
"h": 8,
"i": "b942ad7d-9119-4eaf-9971-c8453257b1b8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 31
},
"maxPerRow": 4,
"name": "后台移动任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_move_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台调度任务池(BackgroundSchedulePool)当前激活的任务数量",
"id": "d8d3c10b-236d-408f-b20c-012d7444b71e",
"layout": {
"h": 8,
"i": "9d862255-175f-4e4a-89d1-ebac3ca82e8f",
"isResizable": true,
"w": 11,
"x": 12,
"y": 31
},
"maxPerRow": 4,
"name": "后台调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_schedule_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台缓冲区刷新调度任务池(BackgroundBufferFlushSchedulePool)当前激活的任务数量。这个任务池专注于处理与缓冲区数据定期刷新相关的后台任务",
"id": "b16eff73-78d4-4554-946c-8e36dcee3678",
"layout": {
"h": 8,
"i": "76625404-ebc4-492b-800c-fa8756154ada",
"isResizable": true,
"w": 12,
"x": 0,
"y": 39
},
"maxPerRow": 4,
"name": "后台缓冲区刷新调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_buffer_flush_schedule_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台分布式调度任务池(BackgroundDistributedSchedulePool)当前激活的任务数量。这个任务池专门负责处理异步执行的分布式数据发送任务",
"id": "9d3dece1-0765-4e25-a21e-e71f6ac57848",
"layout": {
"h": 8,
"i": "b7d2e412-b86a-4b15-9fb8-6a50e676e5e1",
"isResizable": true,
"w": 11,
"x": 12,
"y": 39
},
"maxPerRow": 4,
"name": "后台分布式调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_distributed_schedule_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台处理池(针对消息流处理的部分)当前激活的任务数量,专注于消息代理相关的后台任务",
"id": "4f590117-7281-443a-8713-5ea9696bd3c1",
"layout": {
"h": 9,
"i": "45ffe751-fb27-45e7-8d22-39820c539149",
"isResizable": true,
"w": 12,
"x": 0,
"y": 47
},
"maxPerRow": 4,
"name": "后台消息代理调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundMessageBrokerSchedulePoolTask",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "clickhouse_metrics_background_message_broker_schedule_pool_task",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
}
],
"var": [
{
"datasource": {
"cate": "prometheus",
"value": 1
},
"definition": "label_values(ClickHouseMetrics_Move,instance)",
"hide": false,
"name": "instance",
"type": "query"
},
{
"definition": "1,null",
"hide": false,
"label": "",
"multi": false,
"name": "peeks",
"type": "custom"
},
{
"definition": "1,null",
"hide": false,
"name": "trends",
"type": "custom"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1719305153872258000
}
================================================
FILE: integrations/ClickHouse/dashboards/clickhouse_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ClickHouse_Exporter",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"graphTooltip": "default",
"graphZoom": "default",
"panels": [
{
"collapsed": true,
"id": "be21493f-8eb4-4947-9e34-0109fcda9a51",
"layout": {
"h": 1,
"i": "be21493f-8eb4-4947-9e34-0109fcda9a51",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "General",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "内存占用",
"id": "8687b3a6-c7f2-4af6-8509-29c99969a55d",
"layout": {
"h": 7,
"i": "8687b3a6-c7f2-4af6-8509-29c99969a55d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "Memory",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_MemoryTracking",
"legend": "分配的内存总量",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "tcp连接数",
"id": "af8f29d3-8992-450e-8caa-749518669da1",
"layout": {
"h": 7,
"i": "af8f29d3-8992-450e-8caa-749518669da1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "Connections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_TCPConnection",
"legend": "与 TCP 服务器(带本地接口的客户端)的连接数,也包括服务器-服务器连接",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "由于慢查询读取,降低查询处理线程数的次数\n\npeek\n先计算每2分钟的瞬时速率,再在1小时(1h)的总时间跨度里,以每1分钟(1m)为一个时间片,寻找每个时间片内所有计算出的瞬时速率的最大值。这意味着它返回的是过去1小时内,每1分钟时间片内的最大瞬时读取回退事件速率。",
"id": "15c72e83-be8d-4f2c-b87e-a30e74d8aefd",
"layout": {
"h": 7,
"i": "15c72e83-be8d-4f2c-b87e-a30e74d8aefd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "查询处理线程降低次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"match": {},
"result": {},
"type": "textValue"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReadBackoff{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"instant": false,
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "irate(ClickHouseProfileEvents_ReadBackoff{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件中进行慢查询读取的次数,这表明系统过载\n\ntrends\n计算过去2分钟内,匹配$instance条件的所有ClickHouse实例上慢速读取事件的平均每秒增长数量",
"id": "1855230f-78e9-4eda-8e76-1921e780b222",
"layout": {
"h": 7,
"i": "1855230f-78e9-4eda-8e76-1921e780b222",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "慢查询次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SlowRead{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SlowRead{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": false,
"id": "cd6bd256-20d0-4892-9c99-7b9976fd9013",
"layout": {
"h": 1,
"i": "cd6bd256-20d0-4892-9c99-7b9976fd9013",
"isResizable": false,
"w": 24,
"x": 0,
"y": 29
},
"name": "Queries",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "需要解释和可能执行的查询数量,不包括失败的查询",
"id": "0524900d-99fd-42b8-a647-d4da462fc309",
"layout": {
"h": 7,
"i": "0524900d-99fd-42b8-a647-d4da462fc309",
"isResizable": true,
"w": 12,
"x": 0,
"y": 44
},
"maxPerRow": 4,
"name": "查询总数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_Query{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_Query{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "SELECT查询的数量",
"id": "9e288fcd-4513-46bb-bcb5-b2f4e0b4edde",
"layout": {
"h": 7,
"i": "d5701b8e-6926-4349-934d-0999de4666a5",
"isResizable": true,
"w": 11,
"x": 12,
"y": 44
},
"maxPerRow": 4,
"name": "SELECT 查询数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SelectQuery{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SelectQuery{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "与查询数相同,但仅限于INSERT查询",
"id": "75f49b49-a926-48fa-8147-7e5c9414f029",
"layout": {
"h": 7,
"i": "cc2806a9-e121-4763-adfc-adb0b08477fa",
"isResizable": true,
"w": 12,
"x": 0,
"y": 51
},
"maxPerRow": 4,
"name": "INSERT 查询数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_InsertQuery{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_InsertQuery{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "与失败的查询相同,但仅限于SELECT查询",
"id": "9545befd-9040-4d54-88b0-bee01b8cd7f2",
"layout": {
"h": 7,
"i": "4daaaa6e-9df7-4f3d-8abc-e8281cc6145f",
"isResizable": true,
"w": 11,
"x": 12,
"y": 51
},
"maxPerRow": 4,
"name": "失败的SELECT查询数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_FailedSelectQuery{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_FailedSelectQuery{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "失败的查询数量",
"id": "ee8e2bcd-2312-41cc-b2b8-89bae9861797",
"layout": {
"h": 8,
"i": "e947240d-8086-4d90-9b4b-3d976387f40c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 58
},
"maxPerRow": 4,
"name": "失败的查询数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_FailedQuery{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_FailedQuery{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "05749cc9-2c52-4662-b9b0-de66c3ebd45a",
"layout": {
"h": 8,
"i": "cc19fb12-ba06-447e-b2de-f28a2c946903",
"isResizable": true,
"w": 11,
"x": 12,
"y": 58
},
"maxPerRow": 4,
"name": "查询内存限制超标次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_QueryMemoryLimitExceeded{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_QueryMemoryLimitExceeded{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "6a95c841-5ac7-4cd9-9597-21b0f41441e4",
"layout": {
"h": 9,
"i": "c14878d8-c197-45ab-802e-f328b84deeff",
"isResizable": true,
"w": 12,
"x": 0,
"y": 66
},
"maxPerRow": 4,
"name": "查询平均延迟",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(ClickHouseProfileEvents_QueryTimeMicroseconds{instance=~\"$instance\"}[1m]) / (increase(ClickHouseProfileEvents_Query{instance=~\"$instance\"}[1m]) + 0.001)",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "与失败的查询相同,但仅限于INSERT查询",
"id": "1c8f70b9-ec42-4d31-9fc1-892e742db86f",
"layout": {
"h": 9,
"i": "8087f7f0-3130-4c61-95a3-9ae36d97990b",
"isResizable": true,
"w": 11,
"x": 12,
"y": 66
},
"maxPerRow": 4,
"name": "失败的INSERT查询数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_FailedInsertQuery{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_FailedInsertQuery{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "4584d584-054b-4d42-b2f9-215a71fca118",
"layout": {
"h": 8,
"i": "050414c6-8e31-49d1-8ddd-1916bf2101f5",
"isResizable": true,
"w": 12,
"x": 0,
"y": 75
},
"maxPerRow": 4,
"name": "SELECT查询平均延迟",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(ClickHouseProfileEvents_SelectQueryTimeMicroseconds{instance=~\"$instance\"}[1m] )/ (increase(ClickHouseProfileEvents_SelectQuery{instance=~\"$instance\"}[1m]) + 0.001)",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "47083dd6-3164-479c-8d62-f32ee9f988ad",
"layout": {
"h": 8,
"i": "8f07dca0-24cf-4a86-8099-5b27ba271d8c",
"isResizable": true,
"w": 11,
"x": 12,
"y": 75
},
"maxPerRow": 4,
"name": "INSERT查询平均延迟",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(ClickHouseProfileEvents_InsertQueryTimeMicroseconds{instance=~\"$instance\"}[1m]) / (increase(ClickHouseProfileEvents_InsertQuery{instance=~\"$instance\"}[1m]) + 0.001)",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "9da4b0b9-6843-4225-b3ba-dec31b2ca7c0",
"layout": {
"h": 1,
"i": "9da4b0b9-6843-4225-b3ba-dec31b2ca7c0",
"isResizable": false,
"w": 24,
"x": 0,
"y": 30
},
"name": "Insert",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "所有表INSERT的行数",
"id": "6ea9312d-2334-4912-864c-4cea25bf65f9",
"layout": {
"h": 8,
"i": "6ea9312d-2334-4912-864c-4cea25bf65f9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 123
},
"maxPerRow": 4,
"name": "Inserted 插入行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_InsertedRows{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_InsertedRows{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "所有表INSERT的字节数(未压缩列以它们在内存中存储的形式)",
"id": "8a4987ad-c10f-419c-b2e9-107502e4bc20",
"layout": {
"h": 8,
"i": "993c68cb-fabe-4742-b1a3-b9bd317d9725",
"isResizable": true,
"w": 11,
"x": 12,
"y": 123
},
"maxPerRow": 4,
"name": "Inserted 插入字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_InsertedBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_InsertedRows{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制的次数",
"id": "dd6d8098-1cab-483d-9462-3436678cbbb8",
"layout": {
"h": 8,
"i": "ecfdbaee-44bb-47ad-a871-e1448a7eac25",
"isResizable": true,
"w": 12,
"x": 0,
"y": 131
},
"maxPerRow": 4,
"name": "延迟插入次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_DelayedInserts{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_DelayedInserts{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被拒绝的次数",
"id": "c03aa1f6-e382-4230-abf0-246ef2464438",
"layout": {
"h": 8,
"i": "c3aab493-a207-44f1-8931-4232bb2f7ecc",
"isResizable": true,
"w": 11,
"x": 12,
"y": 131
},
"maxPerRow": 4,
"name": "拒绝插入次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_RejectedInserts{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_RejectedInserts{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制时的总等待时间(毫秒)",
"id": "edbe905e-81f8-4b14-8758-aa934d11f60c",
"layout": {
"h": 8,
"i": "27591842-66d4-47b6-b7e7-1d4897f1793b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 139
},
"maxPerRow": 4,
"name": "延迟插入阻塞的平均等待时间",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(ClickHouseProfileEvents_DelayedInsertsMilliseconds{instance=~\"$instance\"}[1m])/(increase(ClickHouseProfileEvents_DelayedInserts{instance=~\"$instance\"}[1m])+0.01)",
"legend": " {{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "681645ab-ef95-4875-9f97-2bf3d0981bf9",
"layout": {
"h": 1,
"i": "681645ab-ef95-4875-9f97-2bf3d0981bf9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 31
},
"name": "Select",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从MergeTree表读取的数据部分数",
"id": "45448ee2-bff9-451b-ad37-dbc34554e57b",
"layout": {
"h": 8,
"i": "45448ee2-bff9-451b-ad37-dbc34554e57b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 186
},
"maxPerRow": 4,
"name": "读取的数据部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SelectedParts{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SelectedParts{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从MergeTree表读取的所有数据部分中(非相邻)的范围数",
"id": "baff20c6-7cc9-47e3-8546-b3c49fc8426f",
"layout": {
"h": 8,
"i": "083f8612-e154-40fa-8061-64c29e0bf354",
"isResizable": true,
"w": 11,
"x": 12,
"y": 186
},
"maxPerRow": 4,
"name": "读取的范围数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SelectedRanges{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SelectedRanges{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从MergeTree表读取的标记数(索引粒度)",
"id": "efd49ff5-cb50-4df9-8fd0-e6af921dffec",
"layout": {
"h": 7,
"i": "7fc603a7-0a32-4b23-b359-b368ddae749e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 194
},
"maxPerRow": 4,
"name": "读取的标记数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SelectedMarks{instance=~\"$instance\"}[2m])[1h:1m]) * $trends",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SelectedMarks{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从所有表SELECT的行数",
"id": "a7e4857b-cf88-4ed7-8cf6-adadeb521e78",
"layout": {
"h": 7,
"i": "2b63803d-ef02-49bc-aba9-41ac7e85bb9d",
"isResizable": true,
"w": 11,
"x": 12,
"y": 194
},
"maxPerRow": 4,
"name": "SELECT查询的行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SelectedRows{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SelectedRows{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从所有表SELECT的字节数(未压缩列以它们在内存中存储的形式)",
"id": "244865f5-c78a-4b21-8068-462612b8730c",
"layout": {
"h": 7,
"i": "628a3845-fc7d-400c-b16c-6181faddd7b9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 201
},
"maxPerRow": 4,
"name": "SELECT查询的字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_SelectedBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_SelectedBytes{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "1d95c7da-85e1-4a6b-a6a9-68eb19ca821e",
"layout": {
"h": 1,
"i": "1d95c7da-85e1-4a6b-a6a9-68eb19ca821e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "IO",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "6be5bfc5-8d91-414f-9781-e4b390307961",
"layout": {
"h": 8,
"i": "6be5bfc5-8d91-414f-9781-e4b390307961",
"isResizable": true,
"w": 12,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "lseek函数调用次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_Seek{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_Seek{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "df47ce32-e3b6-4d5b-8bc9-4a640ec9c75c",
"layout": {
"h": 8,
"i": "4b6a7bf4-e91a-41fb-b2a1-61d79805f2c5",
"isResizable": true,
"w": 11,
"x": 12,
"y": 5
},
"maxPerRow": 4,
"name": "打开的文件数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_FileOpen{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_FileOpen{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件描述符进行读取(read/pread)的次数,不包括套接字",
"id": "26318204-5e2c-43dd-93b1-2d0ff34c3c9b",
"layout": {
"h": 8,
"i": "e4f66145-d9c6-48df-a9f5-a98ec26cf1e2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"maxPerRow": 4,
"name": "从FD文件描述符读取次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorRead{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorRead{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "向FD写入的次数,不包括套接字",
"id": "222a5968-672b-4f4f-ba1a-c823c4be1116",
"layout": {
"h": 8,
"i": "3a378a00-6b9f-4098-8a2d-ed32628cc182",
"isResizable": true,
"w": 11,
"x": 12,
"y": 13
},
"maxPerRow": 4,
"name": "向FD写入的次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWrite{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWrite{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件描述符读取(read/pread)失败的次数",
"id": "8de4adc5-c3c7-416c-b579-65436ca2504c",
"layout": {
"h": 7,
"i": "94330666-c48d-4479-83d9-44e2cb6134d6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 21
},
"maxPerRow": 4,
"name": "从FD读取失败次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadFailed{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadFailed{instance=~\"$instance\"}[1m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从文件描述符读取的字节数。如果文件是压缩的,这将显示压缩后的数据大小",
"id": "b5546ead-ef3d-406c-acfc-33759f9d5ad9",
"layout": {
"h": 7,
"i": "7e16e52b-7c39-409b-b2f0-1e9f1feaa451",
"isResizable": true,
"w": 11,
"x": 12,
"y": 21
},
"maxPerRow": 4,
"name": "从FD读取的字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "fc2a63e9-aec2-4755-b4a2-8b0f54868bc6",
"layout": {
"h": 8,
"i": "7b9965c5-b341-4713-88f1-e5a48183ef94",
"isResizable": true,
"w": 12,
"x": 0,
"y": 28
},
"maxPerRow": 4,
"name": "向FD写入的字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWriteBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWriteBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从压缩源(文件,网络)读取的未压缩字节数(解压后的字节数)",
"id": "44c6e097-4114-4ef5-89d3-96da2c11ccf3",
"layout": {
"h": 8,
"i": "f3dfd2fc-a656-4ef0-a326-1977908ffd90",
"isResizable": true,
"w": 11,
"x": 12,
"y": 28
},
"maxPerRow": 4,
"name": "读取的未压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_CompressedReadBufferBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_CompressedReadBufferBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从压缩源(文件,网络)读取的压缩块数(独立压缩的数据块)",
"id": "dc75e7de-7728-4f12-91fc-789cdc2610dd",
"layout": {
"h": 8,
"i": "ccfe64c6-a0b2-4179-abce-49a772d43a94",
"isResizable": true,
"w": 12,
"x": 0,
"y": 36
},
"maxPerRow": 4,
"name": "读取的压缩块数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_CompressedReadBufferBlocks{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_CompressedReadBufferBlocks{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "f358d0ce-ab09-4f5d-8477-c6c47341f185",
"layout": {
"h": 1,
"i": "f358d0ce-ab09-4f5d-8477-c6c47341f185",
"isResizable": false,
"w": 24,
"x": 0,
"y": 33
},
"name": "Replicas",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在ClickHouse集群中,由于特定情况而暂时处于只读状态的复制表数量。这种情况通常发生在以下两种情形下:\n\nZooKeeper会话丢失后重新初始化:ClickHouse使用ZooKeeper来维护集群间的协调和一致性。如果与ZooKeeper的会话丢失,为了防止数据不一致,相关的复制表可能会被自动设置为只读模式,直到与ZooKeeper的连接恢复并且数据同步完成为止。\n\n未配置ZooKeeper启动:如果ClickHouse实例在没有正确配置ZooKeeper的情况下启动,它可能无法确定其在复制集群中的准确状态,从而为了安全起见,将涉及复制的表设置为只读,以避免潜在的数据冲突或不一致性问题。",
"id": "24f224bb-3f8c-4dae-9485-d2015e238ba4",
"layout": {
"h": 8,
"i": "24f224bb-3f8c-4dae-9485-d2015e238ba4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 210
},
"maxPerRow": 4,
"name": "只读状态的复制表数量",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_ReadonlyReplica",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从ReplicatedMergeTree表的副本下载(fetch)数据部分失败的次数",
"id": "f71717f8-f96e-4805-968f-c98680a4a264",
"layout": {
"h": 8,
"i": "e998108e-a208-4ff6-a046-42d9ff5ba736",
"isResizable": true,
"w": 11,
"x": 12,
"y": 210
},
"maxPerRow": 4,
"name": "复制部分下载失败次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedPartFailedFetches{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReplicatedPartFailedFetches{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "从ReplicatedMergeTree表的副本成功下载(fetch)数据部分的次数",
"id": "f38eeb1b-6682-4e2a-82c4-ddecf95c4029",
"layout": {
"h": 9,
"i": "d31de4a5-434f-4985-a077-5a58d0645340",
"isResizable": true,
"w": 12,
"x": 0,
"y": 218
},
"maxPerRow": 4,
"name": "复制部分下载次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedPartFetches{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReplicatedPartFetches{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在处理ReplicatedMergeTree表时,我们选择从副本中直接下载已经合并过的数据分区的次数,而不是自己进行合并操作。通常情况下,为了节省网络流量,ClickHouse更倾向于自己执行合并操作。但是,在两种特殊情况下会发生这种从副本直接下载已合并分区的情况:\n\n缺少合并所需源分区:当本地缺少执行某次合并操作所需的全部原始数据分区时,ClickHouse可能会选择从其他副本那里直接获取已经完成合并的数据分区,以避免因数据不完整导致的合并失败。\n\n数据分区过于陈旧:如果某个数据分区已经非常旧,意味着它可能已经被其他副本提前合并并且不再直接可得,这时ClickHouse也可能选择直接从其他副本下载已合并的结果,以减少复杂性和潜在的延迟。\n\n这个指标反映了在维持数据一致性和优化资源使用(如减少网络流量与提高合并效率)之间的权衡情况。通过监控此指标,可以了解系统在实际运行中如何平衡这些因素,并据此调整策略或优化配置以达到更好的性能和资源利用率。",
"id": "eebb688f-f08d-4802-a4bb-22c69dd59dba",
"layout": {
"h": 9,
"i": "031520ac-4699-4c5e-bd44-c0237dabaea1",
"isResizable": true,
"w": 11,
"x": 12,
"y": 218
},
"maxPerRow": 4,
"name": "复制部分获取或合并次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedPartFetchesOfMerged{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReplicatedPartFetchesOfMerged{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ReplicatedMergeTree表的数据分区成功进行合并的次数",
"id": "e80ea72d-67b4-46ac-9573-3805b4587f24",
"layout": {
"h": 9,
"i": "1b550063-b9bc-4672-825f-fe64a5dc26c2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 227
},
"maxPerRow": 4,
"name": "复制分区合并次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedPartMerges{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReplicatedPartMerges{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "数据部分在任何副本上都不存在的次数(即使是现在离线的副本)。这些数据部分肯定丢失了。由于异步复制(如果未启用仲裁插入(quorum insert)),当写入数据部分的副本失败并且在故障后重新联机时不包含该数据部分,这是正常现象",
"id": "e3e0c41e-01b6-42a0-be74-ca8f94d7afb3",
"layout": {
"h": 9,
"i": "630f24d4-9fba-47cd-92b3-a3eb909f0053",
"isResizable": true,
"w": 11,
"x": 12,
"y": 227
},
"maxPerRow": 4,
"name": "复制部分数据丢失次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedDataLoss{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_ReplicatedDataLoss{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "9bdb151e-adfb-4985-ad38-50f83a4b13e1",
"layout": {
"h": 1,
"i": "9bdb151e-adfb-4985-ad38-50f83a4b13e1",
"isResizable": false,
"w": 24,
"x": 0,
"y": 34
},
"name": "Merge",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "启动的后台合并次数",
"id": "505ab54d-1897-45d6-a087-db2c2570f8f7",
"layout": {
"h": 8,
"i": "505ab54d-1897-45d6-a087-db2c2570f8f7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 7
},
"maxPerRow": 4,
"name": "后台合并次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_Merge{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_Merge{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "后台合并读取的行数。这是合并前的行数",
"id": "cf1cbd45-4762-4b57-b674-91520ee63ff4",
"layout": {
"h": 8,
"i": "019fba59-0df6-45d8-a7d0-3adbf32b44f5",
"isResizable": true,
"w": 11,
"x": 12,
"y": 7
},
"maxPerRow": 4,
"name": "合并读取的行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MergedRows{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MergedRows{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "后台合并读取的未压缩字节数(列以它们在内存中存储的形式)。这是合并前的字节数",
"id": "bee87dc6-ace1-47c2-8676-3a7a4c33d2bd",
"layout": {
"h": 9,
"i": "2a43db17-8490-4f9c-a165-a547b112c31b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "合并读取的未压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MergedUncompressedBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MergedUncompressedBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "63c16da6-f138-4f8f-9a36-a0f71454ba5d",
"layout": {
"h": 9,
"i": "972b412f-af69-4709-858d-cdae2a1ab35f",
"isResizable": true,
"w": 11,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "合并平均持续时间",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(ClickHouseProfileEvents_MergesTimeMilliseconds{instance=~\"$instance\"}[2m]) / increase(ClickHouseProfileEvents_Merge{instance=~\"$instance\"}[2m])",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "a01d3de2-a79d-49d9-b238-e5db403d8259",
"layout": {
"h": 8,
"i": "8b2138af-31bb-4c88-b983-08eead1da3ba",
"isResizable": true,
"w": 12,
"x": 0,
"y": 24
},
"maxPerRow": 4,
"name": "MergeTree表插入的行数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterRows{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MergeTreeDataWriterRows{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "向MergeTree系列表(包括ReplicatedMergeTree等变种)中插入数据时所用的数据块数量",
"id": "87a118d2-305d-4eb9-921a-39bbf11b56b5",
"layout": {
"h": 8,
"i": "ea833f12-d5fa-419f-9825-5fd105a709df",
"isResizable": true,
"w": 11,
"x": 12,
"y": 24
},
"maxPerRow": 4,
"name": "MergeTree插入块数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterBlocks{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MergeTreeDataWriterBlocks{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "插入到MergeTree表的未压缩字节数(列以它们在内存中存储的形式)",
"id": "3dca4a05-b0cc-4e63-ae32-ee5261b6b7b3",
"layout": {
"h": 8,
"i": "84893385-fa2f-4016-b820-df600797b4a9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"maxPerRow": 4,
"name": "MergeTree 表插入的未压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterUncompressedBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MergeTreeDataWriterUncompressedBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "插入到MergeTree表的数据写入到文件系统的字节数",
"id": "6777102c-5efe-4082-9a86-d58f76d751ba",
"layout": {
"h": 8,
"i": "96a43967-afda-4c33-80f2-ce1237afeb35",
"isResizable": true,
"w": 11,
"x": 12,
"y": 32
},
"maxPerRow": 4,
"name": "MergeTree 表写入的压缩字节数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterCompressedBytes{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MergeTreeDataWriterCompressedBytes{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "为当前运行的后台合并保留的磁盘空间。它略大于当前合并部分的总大小",
"id": "2b70d3a1-f9e2-4b35-b181-809091fe0aa0",
"layout": {
"h": 8,
"i": "8612a9c6-bdbd-4eea-837e-84bf93205446",
"isResizable": true,
"w": 12,
"x": 0,
"y": 40
},
"maxPerRow": 4,
"name": "保留空间",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_DiskSpaceReservedForMerge",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "13a904ce-b661-4abb-bce6-d3f91cd09a05",
"layout": {
"h": 1,
"i": "13a904ce-b661-4abb-bce6-d3f91cd09a05",
"isResizable": false,
"w": 24,
"x": 0,
"y": 76
},
"name": "Cache",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"id": "b19b7613-4e09-4412-a256-fd1900db5b8f",
"layout": {
"h": 8,
"i": "b19b7613-4e09-4412-a256-fd1900db5b8f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 49
},
"maxPerRow": 4,
"name": "未压缩缓存命中次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_UncompressedCacheHits{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks\n\n",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_UncompressedCacheHits{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"id": "cad9b78d-c186-4232-9fc4-dfa52d7225d9",
"layout": {
"h": 8,
"i": "2f8aec48-ccea-4793-b390-e560e57465f8",
"isResizable": true,
"w": 11,
"x": 12,
"y": 49
},
"maxPerRow": 4,
"name": "未压缩缓存未命中次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_UncompressedCacheMisses{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_UncompressedCacheMisses{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"id": "2011555e-68d4-47b5-8a36-cc0055b51fc9",
"layout": {
"h": 8,
"i": "9760b76c-c63c-4a73-b6a4-6ad709bb6e87",
"isResizable": true,
"w": 12,
"x": 0,
"y": 57
},
"maxPerRow": 4,
"name": "标记缓存命中次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_MarkCacheHits{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MarkCacheHits{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"id": "6423431d-916e-4a79-9db1-d0146e0c83fd",
"layout": {
"h": 8,
"i": "443329a2-56bf-4632-8491-ca581b88b93b",
"isResizable": true,
"w": 11,
"x": 12,
"y": 57
},
"maxPerRow": 4,
"name": "标记缓存未命中次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(ClickHouseProfileEvents_MarkCacheMisses{instance=~\"$instance\"}[2m]) * $trends",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_MarkCacheMisses{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "b408fca8-4054-4ea6-a131-c3c76439d036",
"layout": {
"h": 1,
"i": "b408fca8-4054-4ea6-a131-c3c76439d036",
"isResizable": false,
"w": 24,
"x": 0,
"y": 93
},
"name": "Parts",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "目前正在生成的部分,不在数据部分列表中",
"id": "3a06dade-4969-40bf-aab1-1a5f3950a75f",
"layout": {
"h": 8,
"i": "3a06dade-4969-40bf-aab1-1a5f3950a75f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 9
},
"maxPerRow": 4,
"name": "临时部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsTemporary",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在数据部分中,但不用于SELECT查询的部分\t\n",
"id": "fd497233-3f93-4f73-9dc7-4045d1454eef",
"layout": {
"h": 8,
"i": "cd29ec5b-f032-4464-a1b5-ed987e7d5cd3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 9
},
"maxPerRow": 4,
"name": "预提交部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsPreCommitted",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "已经提交的数据部分的数量",
"id": "b688234f-b66a-440f-8a95-20e2a45df833",
"layout": {
"h": 7,
"i": "d12a6546-3f67-4c25-90ed-16ec3d13ec94",
"isResizable": true,
"w": 12,
"x": 0,
"y": 17
},
"maxPerRow": 4,
"name": "提交部分数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsCommitted",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "暂时保留、等待相关查询结束即可清理的数据分区的数量。",
"id": "85faac0b-b923-4f38-b10a-9a38b98f89ee",
"layout": {
"h": 7,
"i": "ee8c8080-9a96-445e-a45a-164b735ae0fa",
"isResizable": true,
"w": 12,
"x": 12,
"y": 17
},
"maxPerRow": 4,
"name": "过时分区",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsOutdated",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "",
"id": "1e45da14-63a6-4adb-be34-a83f9469af84",
"layout": {
"h": 7,
"i": "fca8edbe-2eb2-45a0-8886-ba554aa98100",
"isResizable": true,
"w": 12,
"x": 0,
"y": 24
},
"maxPerRow": 4,
"name": "正删除数据",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsDeleting",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "记录了当前有多少个数据分区正处于这样一种待删除的状态——即这些分区已经被逻辑上标记为“待清理”,并且它们的物理删除操作将在资源被正式释放时由系统自动执行。",
"id": "0fb33f4e-2392-4fb7-bbdb-486c8520e957",
"layout": {
"h": 7,
"i": "c09e1b70-3d5c-4d1d-9bac-9c7d0957ff22",
"isResizable": true,
"w": 12,
"x": 12,
"y": 24
},
"maxPerRow": 4,
"name": "待清理",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsDeleteOnDestroy",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "包含了大量列或者说是字段广泛的数据。一个宽数据分区意味着单个数据块或行包含很多列,这可能会影响到数据处理和查询的效率,尤其是在涉及大量列的选择或聚合操作时。",
"id": "75a6156e-66c8-4b05-a7f3-303d6babe521",
"layout": {
"h": 7,
"i": "cd3637a2-e1af-4c1d-ab12-b5e85caf6835",
"isResizable": true,
"w": 12,
"x": 0,
"y": 31
},
"maxPerRow": 4,
"name": "宽数据分区",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsWide",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "当前数据库中经过压缩处理的数据分区的数量。这些分区通过算法减少了数据占用的空间,同时保持了数据的完整性和查询的可行性。",
"id": "bb287913-7ef7-408f-9feb-d875b3683157",
"layout": {
"h": 7,
"i": "a40b9909-1cc3-47bd-b28e-aca03bd777bd",
"isResizable": true,
"w": 12,
"x": 12,
"y": 31
},
"maxPerRow": 4,
"name": "压缩数据分区 ",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_PartsCompact",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "5aa89429-c4b8-4733-a7b9-2abfc45fb836",
"layout": {
"h": 1,
"i": "5aa89429-c4b8-4733-a7b9-2abfc45fb836",
"isResizable": false,
"w": 24,
"x": 0,
"y": 94
},
"name": "Distributed",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "衡量的是向远程服务器发送数据的连接数量,这些数据最初是插入到 Distributed 表中的。Distributed 表是 ClickHouse 中用于分布式存储和处理数据的一种特殊表类型,它实际上不存储数据,而是将数据操作(如 INSERT 查询)转发到表所定义的远程节点上。通过监控这个指标,您可以了解到数据分布的活跃程度,即 ClickHouse 集群内部数据流动的频繁程度,以及分布式查询或数据加载操作的负载情况。高数值可能意味着大量的数据正在被分布式地处理或存储,这可能是系统繁忙或数据管道高效运行的迹象。然而,如果发现该指标异常高且伴随性能问题,可能需要进一步调查网络状况、远程服务器的处理能力或分布式表的配置,以优化数据传输效率和系统整体性能。",
"id": "b9dde172-3e02-4e71-891f-5fb97c9c0571",
"layout": {
"h": 9,
"i": "b9dde172-3e02-4e71-891f-5fb97c9c0571",
"isResizable": true,
"w": 12,
"x": 0,
"y": 67
},
"maxPerRow": 4,
"name": "向远程服务器发送数据的连接数量",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_DistributedSend",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "等待异步插入到 Distributed 表中的文件数量\n这个指标对于监控数据导入或异步处理流程的效率至关重要。如果数值较大,可能意味着有大量数据正在排队等待写入 Distributed 表,这可能会影响数据的实时性或处理速度。高数值还可能指示了数据导入作业的积压、网络瓶颈、远程节点处理能力不足或其他与分布式系统相关的性能问题。\n\n监控 ClickHouseMetrics_DistributedFilesToInsert 可以帮助识别和诊断数据流中的瓶颈,进而采取相应的优化措施,比如调整插入作业的并发度、优化网络配置、增加目标表的处理能力或调整分布式表的配置参数,以确保数据能够高效、及时地被分布式存储和处理。",
"id": "4f1782d0-5572-4a75-ba90-c74914246557",
"layout": {
"h": 9,
"i": "dd975296-d2cd-42fe-b23b-bb8c8dfa1c17",
"isResizable": true,
"w": 11,
"x": 12,
"y": 67
},
"maxPerRow": 4,
"name": "插入文件数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_DistributedFilesToInsert",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "向 Distributed 表插入数据块的操作因为待处理字节数量过高而被限制或延迟的次数。在 ClickHouse 中,为了防止瞬间大量数据涌入导致的网络拥塞或远程节点处理压力过大,系统会实施一定的流量控制策略。当待处理的数据量(以字节为单位)超过某一阈值时,新的插入请求不会立即执行,而是被暂时搁置或排队,直至系统资源得到释放或负载减轻",
"id": "c4d506ae-7b4b-48e4-bb9a-9ba769bf5b5d",
"layout": {
"h": 8,
"i": "cd092198-32df-46c9-80e5-cf5cb961c699",
"isResizable": true,
"w": 12,
"x": 0,
"y": 76
},
"maxPerRow": 4,
"name": "分布式延迟插入",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_DistributedDelayedInserts{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_DistributedDelayedInserts{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "向 Distributed 表插入数据块的操作因待处理字节数量过多而被拒绝的次数,具体原因是触发了 \"Too many bytes\"(字节过多)异常",
"id": "26bb38f5-075c-4589-9bd0-6bcfcc2e0cfe",
"layout": {
"h": 8,
"i": "8b14f559-5849-46c3-a897-685d7912f274",
"isResizable": true,
"w": 11,
"x": 12,
"y": 76
},
"maxPerRow": 4,
"name": " 分布式拒绝插入",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_DistributedRejectedInserts{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_DistributedRejectedInserts{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在因待处理字节数量过多而导致向 Distributed 表插入数据块的操作被限制期间,所有延迟插入操作总共花费的毫秒数的平均值。这个指标反映的是插入延迟的具体时间成本,帮助理解数据写入过程中因流量控制而累积的等待时间。",
"id": "0c15cda6-0281-4274-878b-909eee7f7442",
"layout": {
"h": 8,
"i": "3aa55c66-2947-4f05-82fd-0607c6262c6a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 84
},
"maxPerRow": 4,
"name": " 平均延迟插入耗时",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(ClickHouseProfileEvents_DistributedDelayedInsertsMilliseconds{instance=~\"$instance\"}[2m]) / increase(ClickHouseProfileEvents_DistributedDelayedInserts{instance=~\"$instance\"}[2m])",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse尝试与分布式表中的远程节点建立连接时失败并进行重试的总次数。这意味着在执行分布式查询或数据插入操作时,ClickHouse遇到网络问题、远程节点不可达或其他连接错误,导致初次连接尝试失败,随后系统按照配置进行了重试",
"id": "3fb44129-7577-4f12-a7b4-03a850685bab",
"layout": {
"h": 8,
"i": "b4725cd2-0e9b-4f38-b9ac-838681e511ea",
"isResizable": true,
"w": 11,
"x": 12,
"y": 84
},
"maxPerRow": 4,
"name": "分布式连接失败尝试次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(ClickHouseProfileEvents_DistributedConnectionFailTry{instance=~\"$instance\"}[2m]) * $trends",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_DistributedConnectionFailTry{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "在ClickHouse进行所有重试尝试之后仍然未能成功建立到分布式表中远程节点的连接的累计次数。这意味着在尝试与分布式集群中的某个节点通信时,尽管系统已经按照配置进行了多次重试,但最终还是未能成功建立连接。",
"id": "154c703c-0bef-43f1-8bbe-d9145f5edda2",
"layout": {
"h": 8,
"i": "7622fae5-074a-4072-bd02-010f7c5adbd1",
"isResizable": true,
"w": 12,
"x": 0,
"y": 92
},
"maxPerRow": 4,
"name": "所有重试后分布式连接失败总数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(ClickHouseProfileEvents_DistributedConnectionFailAtAll{instance=~\"$instance\"}[2m]) * $trends",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_DistributedConnectionFailAtAll{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "因超时而未能成功完成的同步插入Distributed表的操作",
"id": "34b8272c-ef89-4f0c-a2a5-d7a4706c008d",
"layout": {
"h": 8,
"i": "5622086d-6ca8-4d9e-85e8-a966f5ae7ff6",
"isResizable": true,
"w": 11,
"x": 12,
"y": 92
},
"maxPerRow": 4,
"name": " 同步插入到 Distributed 表时超时",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "max_over_time(irate(ClickHouseProfileEvents_DistributedSyncInsertionTimeoutExceeded{instance=\"$instance\"}[2m]) [1h:1m]) * $peeks",
"legend": "peaks - {{instance}}",
"maxDataPoints": 240,
"refId": "A"
},
{
"__mode__": "__query__",
"expr": "rate(ClickHouseProfileEvents_DistributedSyncInsertionTimeoutExceeded{instance=~\"$instance\"}[2m]) * $trends",
"legend": "trend - {{instance}}",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "2ab85aaf-f0af-41b1-9760-34b9177ae600",
"layout": {
"h": 1,
"i": "2ab85aaf-f0af-41b1-9760-34b9177ae600",
"isResizable": false,
"w": 24,
"x": 0,
"y": 128
},
"name": "Background pool",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "后台任务池中的活跃任务数",
"id": "f329dc5d-6276-4652-855f-b26153d9155a",
"layout": {
"h": 9,
"i": "f329dc5d-6276-4652-855f-b26153d9155a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 101
},
"maxPerRow": 4,
"name": "后台任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundCommonPoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台获取任务池(BackgroundFetchesPool)当前活跃任务的数量。",
"id": "71ba46cc-be89-400f-ab7a-27189c309e25",
"layout": {
"h": 9,
"i": "ebdfc65c-3b5e-4fa9-8165-6dd47ebab38d",
"isResizable": true,
"w": 11,
"x": 12,
"y": 101
},
"maxPerRow": 4,
"name": " 后台获取任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": " ClickHouseMetrics_BackgroundFetchesPoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse中后台移动任务池(BackgroundMovePool)当前激活的任务数量",
"id": "47096333-948c-4010-bb0e-ced6581ea2c5",
"layout": {
"h": 8,
"i": "b942ad7d-9119-4eaf-9971-c8453257b1b8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 110
},
"maxPerRow": 4,
"name": "后台移动任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundMovePoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台调度任务池(BackgroundSchedulePool)当前激活的任务数量",
"id": "d8d3c10b-236d-408f-b20c-012d7444b71e",
"layout": {
"h": 8,
"i": "9d862255-175f-4e4a-89d1-ebac3ca82e8f",
"isResizable": true,
"w": 11,
"x": 12,
"y": 110
},
"maxPerRow": 4,
"name": "后台调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundSchedulePoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台缓冲区刷新调度任务池(BackgroundBufferFlushSchedulePool)当前激活的任务数量。这个任务池专注于处理与缓冲区数据定期刷新相关的后台任务",
"id": "b16eff73-78d4-4554-946c-8e36dcee3678",
"layout": {
"h": 8,
"i": "76625404-ebc4-492b-800c-fa8756154ada",
"isResizable": true,
"w": 12,
"x": 0,
"y": 118
},
"maxPerRow": 4,
"name": "后台缓冲区刷新调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundBufferFlushSchedulePoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台分布式调度任务池(BackgroundDistributedSchedulePool)当前激活的任务数量。这个任务池专门负责处理异步执行的分布式数据发送任务",
"id": "9d3dece1-0765-4e25-a21e-e71f6ac57848",
"layout": {
"h": 8,
"i": "b7d2e412-b86a-4b15-9fb8-6a50e676e5e1",
"isResizable": true,
"w": 11,
"x": 12,
"y": 118
},
"maxPerRow": 4,
"name": "后台分布式调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundDistributedSchedulePoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"description": "ClickHouse 中后台处理池(针对消息流处理的部分)当前激活的任务数量,专注于消息代理相关的后台任务",
"id": "4f590117-7281-443a-8713-5ea9696bd3c1",
"layout": {
"h": 9,
"i": "45ffe751-fb27-45e7-8d22-39820c539149",
"isResizable": true,
"w": 12,
"x": 0,
"y": 126
},
"maxPerRow": 4,
"name": "后台消息代理调度任务池任务数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ClickHouseMetrics_BackgroundMessageBrokerSchedulePoolTask",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
}
],
"var": [
{
"datasource": {
"cate": "prometheus",
"value": 1
},
"definition": "label_values(ClickHouseMetrics_Move,instance)",
"hide": false,
"name": "instance",
"type": "query"
},
{
"definition": "1,null",
"hide": false,
"label": "",
"multi": false,
"name": "peeks",
"type": "custom"
},
{
"definition": "1,null",
"hide": false,
"name": "trends",
"type": "custom"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1719305153880302000
}
================================================
FILE: integrations/ClickHouse/markdown/README.md
================================================
# ClickHouse Input Plugin
This plugin gathers statistics from a
[ClickHouse](https://github.com/ClickHouse/ClickHouse) server.
## Global configuration options
In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and field or create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Configuration
```toml
# # collect interval
# interval = 15
# Read metrics from one or many ClickHouse servers
[[instances]]
## Username for authorization on ClickHouse server
username = "default"
## Password for authorization on ClickHouse server
# password = ""
## HTTP(s) timeout while getting metrics values
## The timeout includes connection time, any redirects, and reading the
## response body.
# timeout = 5
## List of servers for metrics scraping
## metrics scrape via HTTP(s) clickhouse interface
## https://clickhouse.tech/docs/en/interfaces/http/
servers = ["http://127.0.0.1:8123"]
## If "auto_discovery" is "true" plugin tries to connect to all servers
## available in the cluster with using same "user:password" described in
## "user" and "password" parameters and get this server hostname list from
## "system.clusters" table. See
## - https://clickhouse.tech/docs/en/operations/system_tables/#system-clusters
## - https://clickhouse.tech/docs/en/operations/server_settings/settings/#server_settings_remote_servers
## - https://clickhouse.tech/docs/en/operations/table_engines/distributed/
## - https://clickhouse.tech/docs/en/operations/table_engines/replication/#creating-replicated-tables
# auto_discovery = true
## Filter cluster names in "system.clusters" when "auto_discovery" is "true"
## when this filter present then "WHERE cluster IN (...)" filter will apply
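## please use only full cluster names here; regexp and glob filters are not
## allowed. For example, given "/etc/clickhouse-server/config.d/remote.xml":
## <yandex>
## <remote_servers>
## <my-own-cluster>
## <shard>
## <replica><host>clickhouse-ru-1.local</host><port>9000</port></replica>
## <replica><host>clickhouse-ru-2.local</host><port>9000</port></replica>
## </shard>
## <shard>
## <replica><host>clickhouse-eu-1.local</host><port>9000</port></replica>
## <replica><host>clickhouse-eu-2.local</host><port>9000</port></replica>
## </shard>
## </my-own-cluster>
## </remote_servers>
##
## </yandex>
##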
## example: cluster_include = ["my-own-cluster"]
# cluster_include = []
## Filter cluster names in "system.clusters" when "auto_discovery" is
## "true" when this filter present then "WHERE cluster NOT IN (...)"
## filter will apply
## example: cluster_exclude = ["my-internal-not-discovered-cluster"]
# cluster_exclude = []
## Optional TLS Config
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
## Metrics
- clickhouse_events (see [system.events][system.events] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- all rows from [system.events][system.events]
- clickhouse_metrics (see [system.metrics][system.metrics] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- all rows from [system.metrics][system.metrics]
- clickhouse_asynchronous_metrics (see [system.asynchronous_metrics][system.asynchronous_metrics]
for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- all rows from [system.asynchronous_metrics][system.asynchronous_metrics]
- clickhouse_tables
- tags:
- source (ClickHouse server hostname)
- table
- database
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- bytes
- parts
- rows
- clickhouse_zookeeper (see [system.zookeeper][system.zookeeper] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- root_nodes (count of node where path=/)
- clickhouse_replication_queue (see [system.replication_queue][system.replication_queue] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- too_many_tries_replicas (count of replicas which have `num_tries > 1`)
- clickhouse_detached_parts (see [system.detached_parts][system.detached_parts] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- detached_parts (total detached parts for all tables and databases
from [system.detached_parts][system.detached_parts])
- clickhouse_dictionaries (see [system.dictionaries][system.dictionaries] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- dict_origin (xml Filename when dictionary created from *_dictionary.xml,
database.table when dictionary created from DDL)
- fields:
- is_loaded (0 - when the dictionary data failed to load, 1 - when the
dictionary data loaded successfully)
- bytes_allocated (bytes allocated in RAM after a dictionary loaded)
- clickhouse_mutations (see [system.mutations][system.mutations] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- running - gauge which shows how many mutations are not yet complete
- failed - counter which shows the total number of failed mutations since
the first clickhouse-server run
- completed - counter which shows the total number of successfully finished
mutations since the first clickhouse-server run
- clickhouse_disks (see [system.disks][system.disks] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- name (disk name in storage configuration)
- path (path to disk)
- fields:
- free_space_percent - 0-100, gauge which show current percent of
free disk space bytes relative to total disk space bytes
- keep_free_space_percent - 0-100, gauge which show current percent
of required keep free disk bytes relative to total disk space bytes
- clickhouse_processes (see [system.processes][system.processes] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- fields:
- percentile_50 - float gauge which show 50% percentile (quantile 0.5) for
`elapsed` field of running processes
- percentile_90 - float gauge which show 90% percentile (quantile 0.9) for
`elapsed` field of running processes
- longest_running - float gauge which show maximum value for `elapsed`
field of running processes
- clickhouse_text_log (see [system.text_log][system.text_log] for details)
- tags:
- source (ClickHouse server hostname)
- cluster (Name of the cluster [optional])
- shard_num (Shard number in the cluster [optional])
- level (message level, only messages with level less or equal Notice are
collected)
- fields:
- messages_last_10_min - gauge which show how many messages collected
## Example Output
```text
clickhouse_events,cluster=test_cluster_two_shards_localhost,host=kshvakov,source=localhost,shard_num=1 read_compressed_bytes=212i,arena_alloc_chunks=35i,function_execute=85i,merge_tree_data_writer_rows=3i,rw_lock_acquired_read_locks=421i,file_open=46i,io_buffer_alloc_bytes=86451985i,inserted_bytes=196i,regexp_created=3i,real_time_microseconds=116832i,query=23i,network_receive_elapsed_microseconds=268i,merge_tree_data_writer_compressed_bytes=1080i,arena_alloc_bytes=212992i,disk_write_elapsed_microseconds=556i,inserted_rows=3i,compressed_read_buffer_bytes=81i,read_buffer_from_file_descriptor_read_bytes=148i,write_buffer_from_file_descriptor_write=47i,merge_tree_data_writer_blocks=3i,soft_page_faults=896i,hard_page_faults=7i,select_query=21i,merge_tree_data_writer_uncompressed_bytes=196i,merge_tree_data_writer_blocks_already_sorted=3i,user_time_microseconds=40196i,compressed_read_buffer_blocks=5i,write_buffer_from_file_descriptor_write_bytes=3246i,io_buffer_allocs=296i,created_write_buffer_ordinary=12i,disk_read_elapsed_microseconds=59347044i,network_send_elapsed_microseconds=1538i,context_lock=1040i,insert_query=1i,system_time_microseconds=14582i,read_buffer_from_file_descriptor_read=3i 1569421000000000000
clickhouse_asynchronous_metrics,cluster=test_cluster_two_shards_localhost,host=kshvakov,source=localhost,shard_num=1 jemalloc.metadata_thp=0i,replicas_max_relative_delay=0i,jemalloc.mapped=1803177984i,jemalloc.allocated=1724839256i,jemalloc.background_thread.run_interval=0i,jemalloc.background_thread.num_threads=0i,uncompressed_cache_cells=0i,replicas_max_absolute_delay=0i,mark_cache_bytes=0i,compiled_expression_cache_count=0i,replicas_sum_queue_size=0i,number_of_tables=35i,replicas_max_merges_in_queue=0i,replicas_max_inserts_in_queue=0i,replicas_sum_merges_in_queue=0i,replicas_max_queue_size=0i,mark_cache_files=0i,jemalloc.background_thread.num_runs=0i,jemalloc.active=1726210048i,uptime=158i,jemalloc.retained=380481536i,replicas_sum_inserts_in_queue=0i,uncompressed_cache_bytes=0i,number_of_databases=2i,jemalloc.metadata=9207704i,max_part_count_for_partition=1i,jemalloc.resident=1742442496i 1569421000000000000
clickhouse_metrics,cluster=test_cluster_two_shards_localhost,host=kshvakov,source=localhost,shard_num=1 replicated_send=0i,write=0i,ephemeral_node=0i,zoo_keeper_request=0i,distributed_files_to_insert=0i,replicated_fetch=0i,background_schedule_pool_task=0i,interserver_connection=0i,leader_replica=0i,delayed_inserts=0i,global_thread_active=41i,merge=0i,readonly_replica=0i,memory_tracking_in_background_schedule_pool=0i,memory_tracking_for_merges=0i,zoo_keeper_session=0i,context_lock_wait=0i,storage_buffer_bytes=0i,background_pool_task=0i,send_external_tables=0i,zoo_keeper_watch=0i,part_mutation=0i,disk_space_reserved_for_merge=0i,distributed_send=0i,version_integer=19014003i,local_thread=0i,replicated_checks=0i,memory_tracking=0i,memory_tracking_in_background_processing_pool=0i,leader_election=0i,revision=54425i,open_file_for_read=0i,open_file_for_write=0i,storage_buffer_rows=0i,rw_lock_waiting_readers=0i,rw_lock_waiting_writers=0i,rw_lock_active_writers=0i,local_thread_active=0i,query_preempted=0i,tcp_connection=1i,http_connection=1i,read=2i,query_thread=0i,dict_cache_requests=0i,rw_lock_active_readers=1i,global_thread=43i,query=1i 1569421000000000000
clickhouse_tables,cluster=test_cluster_two_shards_localhost,database=system,host=kshvakov,source=localhost,shard_num=1,table=trace_log bytes=754i,parts=1i,rows=1i 1569421000000000000
clickhouse_tables,cluster=test_cluster_two_shards_localhost,database=default,host=kshvakov,source=localhost,shard_num=1,table=example bytes=326i,parts=2i,rows=2i 1569421000000000000
```
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
[system.asynchronous_metrics]: https://clickhouse.tech/docs/en/operations/system-tables/asynchronous_metrics/
[system.detached_parts]: https://clickhouse.tech/docs/en/operations/system-tables/detached_parts/
[system.dictionaries]: https://clickhouse.tech/docs/en/operations/system-tables/dictionaries/
[system.disks]: https://clickhouse.tech/docs/en/operations/system-tables/disks/
[system.events]: https://clickhouse.tech/docs/en/operations/system-tables/events/
[system.metrics]: https://clickhouse.tech/docs/en/operations/system-tables/metrics/
[system.mutations]: https://clickhouse.tech/docs/en/operations/system-tables/mutations/
[system.processes]: https://clickhouse.tech/docs/en/operations/system-tables/processes/
[system.replication_queue]: https://clickhouse.com/docs/en/operations/system-tables/replication_queue/
[system.text_log]: https://clickhouse.tech/docs/en/operations/system-tables/text_log/
[system.zookeeper]: https://clickhouse.tech/docs/en/operations/system-tables/zookeeper/
================================================
FILE: integrations/ClickHouse/metrics/clickhouse_by_categraf.json
================================================
[
{
"id": 0,
"uuid": 1719305153888541000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse HTTP 连接数",
"unit": "sishort",
"note": "通过HTTP协议连接到ClickHouse服务器的客户端数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_http_connection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse HTTP 连接数",
"note": "通过HTTP协议连接到ClickHouse服务器的客户端数量。"
},
{
"lang": "en_US",
"name": "ClickHouse HTTP Connections",
"note": "The number of clients connected to the ClickHouse server via the HTTP protocol."
}
]
},
{
"id": 0,
"uuid": 1719305153889950000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse INSERT查询平均时间",
"unit": "sishort",
"note": "插入查询执行的平均时间(微秒)。",
"lang": "zh_CN",
"expression": "clickhouse_events_insert_query_time_microseconds_microseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse INSERT查询平均时间",
"note": "插入查询执行的平均时间(微秒)。"
},
{
"lang": "en_US",
"name": "ClickHouse INSERT query average time",
"note": "The average time in microseconds for the insertion query to execute."
}
]
},
{
"id": 0,
"uuid": 1719305153890963000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse SELECT 查询数",
"unit": "none",
"note": "执行的选择(SELECT)查询的数量",
"lang": "zh_CN",
"expression": "clickhouse_events_select_query",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse SELECT 查询数",
"note": "执行的选择(SELECT)查询的数量"
},
{
"lang": "en_US",
"name": "ClickHouse SELECT Query Number",
"note": "Number of SELECT queries executed"
}
]
},
{
"id": 0,
"uuid": 1719305153892134000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse SELECT查询平均时间",
"unit": "sishort",
"note": "选择查询执行的平均时间(微秒)。",
"lang": "zh_CN",
"expression": "clickhouse_events_select_query_time_microseconds_microseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse SELECT查询平均时间",
"note": "选择查询执行的平均时间(微秒)。"
},
{
"lang": "en_US",
"name": "ClickHouse SELECT query average time",
"note": "The average time in microseconds for SELECT queries to execute."
}
]
},
{
"id": 0,
"uuid": 1719305153893317000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse TCP 连接数",
"unit": "sishort",
"note": "通过TCP协议连接到ClickHouse服务器的客户端数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_tcp_connection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse TCP 连接数",
"note": "通过TCP协议连接到ClickHouse服务器的客户端数量。"
},
{
"lang": "en_US",
"name": "ClickHouse TCP Connections",
"note": "The number of clients connected to the ClickHouse server via the TCP protocol."
}
]
},
{
"id": 0,
"uuid": 1719305153894646000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 临时数据量",
"unit": "sishort",
"note": "临时数据部分的数量,这些部分当前正在生成。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_temporary",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 临时数据量",
"note": "临时数据部分的数量,这些部分当前正在生成。"
},
{
"lang": "en_US",
"name": "ClickHouse Temporary Data Volume",
"note": "The number of temporary data sections that are currently being generated."
}
]
},
{
"id": 0,
"uuid": 1719305153896151000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 分布式表连接数",
"unit": "sishort",
"note": "发送到分布式表的远程服务器的数据连接数。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_distributed_send",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 分布式表连接数",
"note": "发送到分布式表的远程服务器的数据连接数。"
},
{
"lang": "en_US",
"name": "ClickHouse Distributed Table Connections",
"note": "The number of connections sending data to remote servers for distributed tables."
}
]
},
{
"id": 0,
"uuid": 1719305153897491000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 宽数据量",
"unit": "sishort",
"note": "宽数据部分的数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_wide",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 宽数据量",
"note": "宽数据部分的数量。"
},
{
"lang": "en_US",
"name": "ClickHouse wide data volume",
"note": "Number of wide data sections."
}
]
},
{
"id": 0,
"uuid": 1719305153899026000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 待插入分布式表文件数",
"unit": "sishort",
"note": "等待异步插入到分布式表的文件数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_distributed_files_to_insert",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 待插入分布式表文件数",
"note": "等待异步插入到分布式表的文件数量。"
},
{
"lang": "en_US",
"name": "ClickHouse Number of distributed table files to be inserted",
"note": "The number of files waiting to be inserted asynchronously into the distributed table."
}
]
},
{
"id": 0,
"uuid": 1719305153900278000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 提交前数据量",
"unit": "sishort",
"note": "提交前的数据部分数量,这些部分在data_parts列表中,但不用于SELECT查询。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_pre_committed",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 提交前数据量",
"note": "提交前的数据部分数量,这些部分在data_parts列表中,但不用于SELECT查询。"
},
{
"lang": "en_US",
"name": "ClickHouse pre-committed data parts",
"note": "The number of pre-committed data parts, which are in the data_parts list but are not used for SELECT queries."
}
]
},
{
"id": 0,
"uuid": 1719305153901527000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 提交后数据量",
"unit": "sishort",
"note": "提交后的数据部分数量,这些部分在data_parts列表中,并且用于SELECT查询。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_committed",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 提交后数据量",
"note": "提交后的数据部分数量,这些部分在data_parts列表中,并且用于SELECT查询。"
},
{
"lang": "en_US",
"name": "ClickHouse committed data parts",
"note": "The number of committed data parts, which are in the data_parts list and are used for SELECT queries."
}
]
},
{
"id": 0,
"uuid": 1719305153902727000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 插入未压缩",
"unit": "sishort",
"note": " 插入操作写入的未压缩字节数。",
"lang": "zh_CN",
"expression": "clickhouse_events_inserted_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 插入未压缩",
"note": " 插入操作写入的未压缩字节数。"
},
{
"lang": "en_US",
"name": "ClickHouse Insert Uncompressed",
"note": "The number of uncompressed bytes written by the insert operation."
}
]
},
{
"id": 0,
"uuid": 1719305153904402000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 插入行数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "clickhouse_events_inserted_rows",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 插入行数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of ClickHouse inserted rows",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1719305153905722000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 查询优先级",
"unit": "sishort",
"note": "由于优先级设置,被停止并等待的查询数量。\n",
"lang": "zh_CN",
"expression": "clickhouse_metrics_query_preempted",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 查询优先级",
"note": "由于优先级设置,被停止并等待的查询数量。\n"
},
{
"lang": "en_US",
"name": "ClickHouse Query Priority",
"note": "The number of queries that were stopped and waiting due to the priority setting. \n"
}
]
},
{
"id": 0,
"uuid": 1719305153906824000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 查询总数",
"unit": "none",
"note": "ClickHouse执行的查询总数。",
"lang": "zh_CN",
"expression": "clickhouse_events_query",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 查询总数",
"note": "ClickHouse执行的查询总数。"
},
{
"lang": "en_US",
"name": "Total ClickHouse Queries",
"note": "The total number of queries executed by ClickHouse."
}
]
},
{
"id": 0,
"uuid": 1719305153907953000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 查询总时间",
"unit": "milliseconds",
"note": "查询执行的总时间(微秒)。",
"lang": "zh_CN",
"expression": "clickhouse_events_query_time_microseconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 查询总时间",
"note": "查询执行的总时间(微秒)。"
},
{
"lang": "en_US",
"name": "Total ClickHouse query time",
"note": "The total time in microseconds for the query to execute."
}
]
},
{
"id": 0,
"uuid": 1719305153909480000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 正被删除数据量",
"unit": "sishort",
"note": "正在被删除的数据部分数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_deleting",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 正被删除数据量",
"note": "正在被删除的数据部分数量。"
},
{
"lang": "en_US",
"name": "ClickHouse Amount of Data being Deleted",
"note": "The number of data parts being deleted."
}
]
},
{
"id": 0,
"uuid": 1719305153911177000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 移动池活动任务数",
"unit": "sishort",
"note": "后台移动池中的活动任务数,用于处理数据移动。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_background_move_pool_task",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 移动池活动任务数",
"note": "后台移动池中的活动任务数,用于处理数据移动。"
},
{
"lang": "en_US",
"name": "Number of active tasks in ClickHouse move pool",
"note": "The number of active tasks in the background move pool, used to handle data moves."
}
]
},
{
"id": 0,
"uuid": 1719305153912274000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 紧凑数据量",
"unit": "sishort",
"note": "紧凑数据部分的数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_compact",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 紧凑数据量",
"note": "紧凑数据部分的数量。"
},
{
"lang": "en_US",
"name": "ClickHouse Compact Data Volume",
"note": "Number of compact data sections."
}
]
},
{
"id": 0,
"uuid": 1719305153913312000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 缓冲区活动任务数",
"unit": "sishort",
"note": "后台缓冲区冲洗调度池中的活动任务数,用于定期缓冲区冲洗。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_background_buffer_flush_schedule_pool_task",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 缓冲区活动任务数",
"note": "后台缓冲区冲洗调度池中的活动任务数,用于定期缓冲区冲洗。"
},
{
"lang": "en_US",
"name": "Number of active tasks in ClickHouse buffer",
"note": "The number of active tasks in the background buffer flushing scheduling pool for periodic buffer flushing."
}
]
},
{
"id": 0,
"uuid": 1719305153914788000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 跨磁盘量",
"unit": "sishort",
"note": "移动到另一个磁盘并应在析构函数中删除的数据部分数量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_delete_on_destroy",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 跨磁盘量",
"note": "移动到另一个磁盘并应在析构函数中删除的数据部分数量。"
},
{
"lang": "en_US",
"name": "ClickHouse cross-disk volume",
"note": "The number of portions of data that are moved to another disk and should be deleted in the destructor."
}
]
},
{
"id": 0,
"uuid": 1719305153916159000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse 过时数据量",
"unit": "sishort",
"note": " 过时的数据部分数量,这些部分不是活动数据部分,但当前SELECT查询可能使用它们。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_parts_outdated",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 过时数据量",
"note": " 过时的数据部分数量,这些部分不是活动数据部分,但当前SELECT查询可能使用它们。"
},
{
"lang": "en_US",
"name": "ClickHouse Obsolete Data Volume",
"note": "The number of obsolete data parts that are not active data parts, but may be used by the current SELECT query."
}
]
},
{
"id": 0,
"uuid": 1719305153917507000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse中内存使用情况",
"unit": "sishort",
"note": "ClickHouse服务器使用的总内存量。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_memory_tracking",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse中内存使用情况",
"note": "ClickHouse服务器使用的总内存量。"
},
{
"lang": "en_US",
"name": "Memory usage in ClickHouse",
"note": "The total amount of memory used by the ClickHouse server."
}
]
},
{
"id": 0,
"uuid": 1719305153918455000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse中数据库数量",
"unit": "none",
"note": "ClickHouse数据库数量",
"lang": "zh_CN",
"expression": "clickhouse_asynchronous_metrics_number_of_databases",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse中数据库数量",
"note": "ClickHouse数据库数量"
},
{
"lang": "en_US",
"name": "Number of databases in ClickHouse",
"note": "Number of ClickHouse databases"
}
]
},
{
"id": 0,
"uuid": 1719305153919709000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse中表的数量",
"unit": "none",
"note": "ClickHouse表数量",
"lang": "zh_CN",
"expression": "clickhouse_asynchronous_metrics_number_of_tables",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse中表的数量",
"note": "ClickHouse表数量"
},
{
"lang": "en_US",
"name": "Number of tables in ClickHouse",
"note": "Number of ClickHouse tables"
}
]
},
{
"id": 0,
"uuid": 1719305153920898000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse修订",
"unit": "none",
"note": "ClickHouse服务器的修订号,通常是一个用于标识特定构建的数字。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_revision",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse修订",
"note": "ClickHouse服务器的修订号,通常是一个用于标识特定构建的数字。"
},
{
"lang": "en_US",
"name": "ClickHouse Revision",
"note": "The revision number of the ClickHouse server, usually a number used to identify a specific build."
}
]
},
{
"id": 0,
"uuid": 1719305153921934000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse服务器运行时间",
"unit": "sishort",
"note": "ClickHouse服务器自启动以来的运行时间。",
"lang": "zh_CN",
"expression": "clickhouse_asynchronous_metrics_uptime",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse服务器运行时间",
"note": "ClickHouse服务器自启动以来的运行时间。"
},
{
"lang": "en_US",
"name": "ClickHouse server runtime",
"note": "The running time of the ClickHouse server since it started."
}
]
},
{
"id": 0,
"uuid": 1719305153923130000,
"collector": "Categraf",
"typ": "ClickHouse",
"name": "ClickHouse版本号",
"unit": "none",
"note": "ClickHouse服务器的版本号,以整数形式表示。",
"lang": "zh_CN",
"expression": "clickhouse_metrics_version_integer",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse版本号",
"note": "ClickHouse服务器的版本号,以整数形式表示。"
},
{
"lang": "en_US",
"name": "ClickHouse version number",
"note": "Version number of the ClickHouse server, expressed as an integer."
}
]
}
]
================================================
FILE: integrations/ClickHouse/metrics/clickhouse_by_exporter.json
================================================
[
{
"id": 0,
"uuid": 1719305153924793000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "ClickHouse Tcp 连接数",
"unit": "none",
"note": "tcp连接数",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_TCPConnection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse Tcp 连接数",
"note": "tcp连接数"
},
{
"lang": "en_US",
"name": "Number of ClickHouse Tcp connections",
"note": "tcp connections"
}
]
},
{
"id": 0,
"uuid": 1719305153926074000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "ClickHouse 内存",
"unit": "bitsIEC",
"note": "分配的内存总量",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_MemoryTracking",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ClickHouse 内存",
"note": "分配的内存总量"
},
{
"lang": "en_US",
"name": "ClickHouse Memory",
"note": "Total memory allocated"
}
]
},
{
"id": 0,
"uuid": 1719305153927130000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "INSERT查询平均延迟",
"unit": "microseconds",
"note": "INSERT查询平均延迟",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_InsertQueryTimeMicroseconds[1m]) / (increase(ClickHouseProfileEvents_InsertQuery[1m]) + 0.001)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "INSERT查询平均延迟",
"note": "INSERT查询平均延迟"
},
{
"lang": "en_US",
"name": "INSERT query average latency",
"note": "INSERT query average latency"
}
]
},
{
"id": 0,
"uuid": 1719305153928310000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "INSERT查询数",
"unit": "queries",
"note": "与查询数相同,但仅限于INSERT查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_InsertQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "INSERT查询数",
"note": "与查询数相同,但仅限于INSERT查询"
},
{
"lang": "en_US",
"name": "INSERT queries",
"note": "Same number as queries, but only INSERT queries"
}
]
},
{
"id": 0,
"uuid": 1719305153929755000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "lseek函数调用次数",
"unit": "times",
"note": "'lseek'函数被调用的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_Seek[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "lseek函数调用次数",
"note": "'lseek'函数被调用的次数"
},
{
"lang": "en_US",
"name": "Number of lseek function calls",
"note": "Number of times the 'lseek' function was called"
}
]
},
{
"id": 0,
"uuid": 1719305153931299000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表写入的压缩字节数",
"unit": "bytes",
"note": "",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterCompressedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MergeTree表写入的压缩字节数",
"note": ""
},
{
"lang": "en_US",
"name": "Compressed bytes written to the MergeTree table",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1719305153932255000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表插入的数据块数",
"unit": "blocks",
"note": "插入到MergeTree表的数据块数。每个块形成一个数据部分",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterBlocks[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MergeTree表插入的数据块数",
"note": "插入到MergeTree表的数据块数。每个块形成一个数据部分"
},
{
"lang": "en_US",
"name": "Number of data blocks inserted into the MergeTree table",
"note": "The number of data blocks inserted into the MergeTree table. Each block forms a data part"
}
]
},
{
"id": 0,
"uuid": 1719305153933664000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表插入的未压缩字节数",
"unit": "bytes",
"note": "插入到MergeTree表的未压缩字节数(列以它们在内存中存储的形式)\n\n在ClickHouse数据库中,当数据被插入到MergeTree系列表(包括ReplicatedMergeTree等)时,在数据实际被写入磁盘并经过压缩处理之前,在内存中以原始格式暂存时所占用的字节数量。这里的“未压缩”意味着数据还未经过ClickHouse为了节省存储空间而在存储阶段执行的列式存储压缩算法处理",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterUncompressedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MergeTree表插入的未压缩字节数",
"note": "插入到MergeTree表的未压缩字节数(列以它们在内存中存储的形式)\n\n在ClickHouse数据库中,当数据被插入到MergeTree系列表(包括ReplicatedMergeTree等)时,在数据实际被写入磁盘并经过压缩处理之前,在内存中以原始格式暂存时所占用的字节数量。这里的“未压缩”意味着数据还未经过ClickHouse为了节省存储空间而在存储阶段执行的列式存储压缩算法处理"
},
{
"lang": "en_US",
"name": "Number of uncompressed bytes inserted into the MergeTree table",
"note": "Number of uncompressed bytes inserted into the MergeTree table (columns in the form they are stored in memory) \n \nIn a ClickHouse database, when data is inserted into a MergeTree series table (including ReplicatedMergeTree, etc.), the number of bytes occupied when it is temporarily stored in the original format in memory before the data is actually written to disk and compressed. \"Uncompressed\" here means that the data has not been processed by the columnar storage compression algorithm that ClickHouse executes during the storage phase to save storage space"
}
]
},
{
"id": 0,
"uuid": 1719305153934869000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "MergeTree表插入的行数",
"unit": "rows",
"note": "插入到MergeTree表的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergeTreeDataWriterRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MergeTree表插入的行数",
"note": "插入到MergeTree表的行数"
},
{
"lang": "en_US",
"name": "Number of rows inserted into the MergeTree table",
"note": "Number of rows inserted into the MergeTree table"
}
]
},
{
"id": 0,
"uuid": 1719305153935835000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询平均延迟",
"unit": "microseconds",
"note": "SELECT查询平均延迟",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_SelectQueryTimeMicroseconds[1m]) / (increase(ClickHouseProfileEvents_SelectQuery[1m]) + 0.001)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "SELECT查询平均延迟",
"note": "SELECT查询平均延迟"
},
{
"lang": "en_US",
"name": "SELECT query average latency",
"note": "SELECT query average latency"
}
]
},
{
"id": 0,
"uuid": 1719305153937045000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询数",
"unit": "queries",
"note": "SELECT查询的数量",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "SELECT查询数",
"note": "SELECT查询的数量"
},
{
"lang": "en_US",
"name": "SELECT Query Number",
"note": "Number of SELECT queries"
}
]
},
{
"id": 0,
"uuid": 1719305153938270000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询的字节数",
"unit": "bytes",
"note": "从所有表SELECT的字节数(未压缩列以它们在内存中存储的形式)",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "SELECT查询的字节数",
"note": "从所有表SELECT的字节数(未压缩列以它们在内存中存储的形式)"
},
{
"lang": "en_US",
"name": "Number of bytes of SELECT query",
"note": "Number of bytes SELECTed from all tables (uncompressed; for columns, as they are stored in memory)"
}
]
},
{
"id": 0,
"uuid": 1719305153939642000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "SELECT查询的行数",
"unit": "rows",
"note": "从所有表SELECT的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "SELECT查询的行数",
"note": "从所有表SELECT的行数"
},
{
"lang": "en_US",
"name": "Number of rows of the SELECT query",
"note": "The number of rows SELECTed from all tables"
}
]
},
{
"id": 0,
"uuid": 1719305153940852000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "TCP连接数",
"unit": "connections",
"note": "与 TCP 服务器(带本地接口的客户端)的连接数,也包括服务器-服务器连接",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_TCPConnection",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "TCP连接数",
"note": "与 TCP 服务器(带本地接口的客户端)的连接数,也包括服务器-服务器连接"
},
{
"lang": "en_US",
"name": "TCP Connections",
"note": "Number of connections to TCP servers (clients with local interfaces), including server-server connections"
}
]
},
{
"id": 0,
"uuid": 1719305153941862000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "临时部分数",
"unit": "parts",
"note": "目前正在生成的部分,不在数据部分列表中",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_PartsTemporary",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "临时部分数",
"note": "目前正在生成的部分,不在数据部分列表中"
},
{
"lang": "en_US",
"name": "Number of temporary parts",
"note": "Parts that are currently being generated and are not yet in the data parts list"
}
]
},
{
"id": 0,
"uuid": 1719305153942864000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "从文件描述符读取失败次数",
"unit": "times",
"note": "从文件描述符读取(read/pread)失败的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadFailed[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "从文件描述符读取失败次数",
"note": "从文件描述符读取(read/pread)失败的次数"
},
{
"lang": "en_US",
"name": "Number of read failures from file descriptor",
"note": "Number of failed reads (read/pread) from file descriptors"
}
]
},
{
"id": 0,
"uuid": 1719305153943822000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "从文件描述符读取次数",
"unit": "reads",
"note": "从文件描述符进行读取(read/pread)的次数,不包括套接字",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorRead[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "从文件描述符读取次数",
"note": "从文件描述符进行读取(read/pread)的次数,不包括套接字"
},
{
"lang": "en_US",
"name": "Read times from file descriptor",
"note": "Number of reads (read/pread) made from file descriptors, excluding sockets"
}
]
},
{
"id": 0,
"uuid": 1719305153944918000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "从文件描述符读取的字节数",
"unit": "bytes",
"note": "从文件描述符读取的字节数。如果文件是压缩的,这将显示压缩后的数据大小",
"lang": "zh_CN",
"expression": "irate(ClickHouseProfileEvents_ReadBufferFromFileDescriptorReadBytes[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "从文件描述符读取的字节数",
"note": "从文件描述符读取的字节数。如果文件是压缩的,这将显示压缩后的数据大小"
},
{
"lang": "en_US",
"name": "Number of bytes read from file descriptor",
"note": "The number of bytes read from the file descriptor. If the file is compressed, this will show the size of the compressed data"
}
]
},
{
"id": 0,
"uuid": 1719305153946307000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "保留空间",
"unit": "bytes",
"note": "为当前运行的后台合并保留的磁盘空间。它略大于当前合并部分的总大小",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_DiskSpaceReservedForMerge",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "保留空间",
"note": "为当前运行的后台合并保留的磁盘空间。它略大于当前合并部分的总大小"
},
{
"lang": "en_US",
"name": "Reserve space",
"note": "Disk space reserved for the currently running background merge. It is slightly larger than the total size of the current merged portion"
}
]
},
{
"id": 0,
"uuid": 1719305153947296000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "内存占用",
"unit": "bytes",
"note": "分配的内存总量",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_MemoryTracking",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存占用",
"note": "分配的内存总量"
},
{
"lang": "en_US",
"name": "Memory footprint",
"note": "Total memory allocated"
}
]
},
{
"id": 0,
"uuid": 1719305153948199000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "写入文件描述符次数",
"unit": "writes",
"note": "写入文件描述符(write/pwrite)的次数,不包括套接字",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWrite[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "写入文件描述符次数",
"note": "写入文件描述符(write/pwrite)的次数,不包括套接字"
},
{
"lang": "en_US",
"name": "Number of writes to file descriptor",
"note": "Number of writes to file descriptors (write/pwrite), excluding sockets"
}
]
},
{
"id": 0,
"uuid": 1719305153949391000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "写入文件描述符的字节数",
"unit": "bytes",
"note": "写入文件描述符的字节数。如果文件是压缩的,这将显示压缩后的数据大小",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_WriteBufferFromFileDescriptorWriteBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "写入文件描述符的字节数",
"note": "写入文件描述符的字节数。如果文件是压缩的,这将显示压缩后的数据大小"
},
{
"lang": "en_US",
"name": "Number of bytes written to file descriptor",
"note": "The number of bytes written to the file descriptor. If the file is compressed, this will show the size of the compressed data"
}
]
},
{
"id": 0,
"uuid": 1719305153950577000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "合并平均持续时间",
"unit": "milliseconds",
"note": "合并的平均持续时间",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_MergesTimeMilliseconds[1m]) / increase(ClickHouseProfileEvents_Merge[1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "合并平均持续时间",
"note": "合并的平均持续时间"
},
{
"lang": "en_US",
"name": "Average merge duration",
"note": "Average duration of merges"
}
]
},
{
"id": 0,
"uuid": 1719305153953146000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "合并读取的未压缩字节数",
"unit": "bytes",
"note": "后台合并读取的未压缩字节数(列以它们在内存中存储的形式)。这是合并前的字节数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergedUncompressedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "合并读取的未压缩字节数",
"note": "后台合并读取的未压缩字节数(列以它们在内存中存储的形式)。这是合并前的字节数"
},
{
"lang": "en_US",
"name": "Uncompressed bytes read by merging",
"note": "Number of uncompressed bytes read by background merge (columns in the form they are stored in memory). This is the number of bytes before merging"
}
]
},
{
"id": 0,
"uuid": 1719305153954853000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "合并读取的行数",
"unit": "rows",
"note": "后台合并读取的行数。这是合并前的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MergedRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "合并读取的行数",
"note": "后台合并读取的行数。这是合并前的行数"
},
{
"lang": "en_US",
"name": "Number of rows read by merges",
"note": "Number of rows read by background merge. This is the number of rows before merging"
}
]
},
{
"id": 0,
"uuid": 1719305153956608000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "后台合并次数",
"unit": "times",
"note": "启动的后台合并次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_Merge[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "后台合并次数",
"note": "启动的后台合并次数"
},
{
"lang": "en_US",
"name": "Number of background merges",
"note": "Number of background merges initiated"
}
]
},
{
"id": 0,
"uuid": 1719305153957668000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "复制部分合并次数",
"unit": "times",
"note": "ReplicatedMergeTree表的数据部分成功合并的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedPartMerges[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "复制部分合并次数",
"note": "ReplicatedMergeTree表的数据部分成功合并的次数"
},
{
"lang": "en_US",
"name": "Number of replicated part merges",
"note": "Number of successful merges of data parts of ReplicatedMergeTree tables"
}
]
},
{
"id": 0,
"uuid": 1719305153958797000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "复制部分数据丢失次数",
"unit": "times",
"note": "数据在任何副本上都不存在的次数(即使是现在离线的副本)。这些数据部分肯定丢失了。由于异步复制(如果未启用配额插入),当写入数据部分的副本失败并且在故障后重新联机时不包含该数据部分,这是正常现象",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReplicatedDataLoss[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "复制部分数据丢失次数",
"note": "数据在任何副本上都不存在的次数(即使是现在离线的副本)。这些数据部分肯定丢失了。由于异步复制(如果未启用配额插入),当写入数据部分的副本失败并且在故障后重新联机时不包含该数据部分,这是正常现象"
},
{
"lang": "en_US",
"name": "Number of data losses in the replication part",
"note": "The number of times a data part does not exist on any replica (even on replicas that are currently offline). These data parts are definitely lost. This is normal with asynchronous replication (if quorum inserts are not enabled), when the replica to which the data part was written fails and, after coming back online, does not contain that data part"
}
]
},
{
"id": 0,
"uuid": 1719305153959861000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "失败的INSERT查询数",
"unit": "times",
"note": "与失败的查询相同,但仅限于INSERT查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FailedInsertQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "失败的INSERT查询数",
"note": "与失败的查询相同,但仅限于INSERT查询"
},
{
"lang": "en_US",
"name": "Number of failed INSERT queries",
"note": "Same as the failed query but only for the INSERT query"
}
]
},
{
"id": 0,
"uuid": 1719305153961190000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "失败的SELECT查询数",
"unit": "queries",
"note": "与失败的查询相同,但仅限于SELECT查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FailedSelectQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "失败的SELECT查询数",
"note": "与失败的查询相同,但仅限于SELECT查询"
},
{
"lang": "en_US",
"name": "Number of failed SELECT queries",
"note": "Same as the failed query but only for the SELECT query"
}
]
},
{
"id": 0,
"uuid": 1719305153962249000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "失败的查询数",
"unit": "queries",
"note": "失败的查询数量",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FailedQuery[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "失败的查询数",
"note": "失败的查询数量"
},
{
"lang": "en_US",
"name": "Number of failed queries",
"note": "Number of failed queries"
}
]
},
{
"id": 0,
"uuid": 1719305153963287000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "延迟插入次数",
"unit": "times",
"note": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_DelayedInserts[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "延迟插入次数",
"note": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制的次数"
},
{
"lang": "en_US",
"name": "Delayed Insertion Number",
"note": "The number of times an INSERT of a block into a MergeTree table was throttled because the partition has too many active data parts"
}
]
},
{
"id": 0,
"uuid": 1719305153964822000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "延迟插入阻塞的平均等待时间",
"unit": "milliseconds",
"note": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制时的总等待时间(毫秒)",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_DelayedInsertsMilliseconds[1m]) / (increase(ClickHouseProfileEvents_DelayedInserts[1m]) + 0.01)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "延迟插入阻塞的平均等待时间",
"note": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被限制时的总等待时间(毫秒)"
},
{
"lang": "en_US",
"name": "Average latency of delayed insertion blocking",
"note": "Total waiting time (milliseconds) when the block of INSERT to MergeTree table is restricted due to the number of active data parts of the partition"
}
]
},
{
"id": 0,
"uuid": 1719305153966130000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "慢查询次数",
"unit": "times",
"note": "从文件中进行慢查询读取的次数,这表明系统过载",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SlowRead[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "慢查询次数",
"note": "从文件中进行慢查询读取的次数,这表明系统过载"
},
{
"lang": "en_US",
"name": "Number of slow reads",
"note": "Number of slow reads from a file, which indicates system overload"
}
]
},
{
"id": 0,
"uuid": 1719305153967132000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "打开的文件数",
"unit": "files",
"note": "打开的文件数量",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_FileOpen[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "打开的文件数",
"note": "打开的文件数量"
},
{
"lang": "en_US",
"name": "Number of open files",
"note": "Number of open files"
}
]
},
{
"id": 0,
"uuid": 1719305153968376000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "拒绝插入次数",
"unit": "times",
"note": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被拒绝的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_RejectedInserts[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拒绝插入次数",
"note": "由于分区的活动数据部分数量过多,INSERT到MergeTree表的块被拒绝的次数"
},
{
"lang": "en_US",
"name": "Number of rejected insertions",
"note": "Number of times a block of INSERT to MergeTree table was rejected due to an excessive number of active data parts of the partition"
}
]
},
{
"id": 0,
"uuid": 1719305153969972000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "提交部分数",
"unit": "parts",
"note": "已经提交的数据部分的数量",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_PartsCommitted",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "提交部分数",
"note": "已经提交的数据部分的数量"
},
{
"lang": "en_US",
"name": "Number of committed parts",
"note": "Number of data parts that have been committed"
}
]
},
{
"id": 0,
"uuid": 1719305153971113000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "插入字节数",
"unit": "bytes",
"note": "所有表INSERT的字节数(未压缩列以它们在内存中存储的形式)",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_InsertedBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "插入字节数",
"note": "所有表INSERT的字节数(未压缩列以它们在内存中存储的形式)"
},
{
"lang": "en_US",
"name": "Insert bytes",
"note": "Number of bytes INSERTed into all tables (uncompressed; for columns, as they are stored in memory)"
}
]
},
{
"id": 0,
"uuid": 1719305153972182000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "插入行数",
"unit": "rows",
"note": "所有表INSERT的行数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_InsertedRows[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "插入行数",
"note": "所有表INSERT的行数"
},
{
"lang": "en_US",
"name": "Number of rows inserted",
"note": "Number of rows INSERTed into all tables"
}
]
},
{
"id": 0,
"uuid": 1719305153973527000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "未压缩缓存命中次数",
"unit": "times",
"note": "未压缩缓存命中的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_UncompressedCacheHits[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "未压缩缓存命中次数",
"note": "未压缩缓存命中的次数"
},
{
"lang": "en_US",
"name": "Number of uncompressed cache hits",
"note": "Number of uncompressed cache hits"
}
]
},
{
"id": 0,
"uuid": 1719305153974747000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "未压缩缓存未命中次数",
"unit": "times",
"note": "未压缩缓存未命中的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_UncompressedCacheMisses[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "未压缩缓存未命中次数",
"note": "未压缩缓存未命中的次数"
},
{
"lang": "en_US",
"name": "Number of uncompressed cache misses",
"note": "Number of uncompressed cache misses"
}
]
},
{
"id": 0,
"uuid": 1719305153976184000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询内存限制超标次数",
"unit": "times",
"note": "查询内存限制超标的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_QueryMemoryLimitExceeded[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "查询内存限制超标次数",
"note": "查询内存限制超标的次数"
},
{
"lang": "en_US",
"name": "Number of times the query memory limit was exceeded",
"note": "The number of times the query memory limit was exceeded"
}
]
},
{
"id": 0,
"uuid": 1719305153977623000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询处理线程降低次数",
"unit": "times",
"note": "由于慢查询读取,降低查询处理线程数的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_ReadBackoff[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "查询处理线程降低次数",
"note": "由于慢查询读取,降低查询处理线程数的次数"
},
{
"lang": "en_US",
"name": "Query processing thread reduction times",
"note": "Number of times the number of query processing threads was lowered due to slow reads"
}
]
},
{
"id": 0,
"uuid": 1719305153978786000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询平均延迟",
"unit": "microseconds",
"note": "查询平均延迟",
"lang": "zh_CN",
"expression": "increase(ClickHouseProfileEvents_QueryTimeMicroseconds[1m]) / (increase(ClickHouseProfileEvents_Query[1m]) + 0.001)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "查询平均延迟",
"note": "查询平均延迟"
},
{
"lang": "en_US",
"name": "Average query latency",
"note": "Average query latency"
}
]
},
{
"id": 0,
"uuid": 1719305153980379000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "查询总数",
"unit": "queries",
"note": "需要解释和可能执行的查询数量,不包括失败的查询",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_Query[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "查询总数",
"note": "需要解释和可能执行的查询数量,不包括失败的查询"
},
{
"lang": "en_US",
"name": "Total queries",
"note": "Number of queries to be interpreted and possibly executed, excluding failed queries"
}
]
},
{
"id": 0,
"uuid": 1719305153981570000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "标记缓存命中次数",
"unit": "times",
"note": "标记缓存命中的次数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_MarkCacheHits[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "标记缓存命中次数",
"note": "标记缓存命中的次数"
},
{
"lang": "en_US",
"name": "Mark cache hits",
"note": "Number of mark cache hits"
}
]
},
{
"id": 0,
"uuid": 1719305153983200000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "标记缓存未命中次数",
"unit": "times",
"note": "标记缓存未命中的次数",
"lang": "zh_CN",
"expression": "rate(ClickHouseProfileEvents_MarkCacheMisses[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "标记缓存未命中次数",
"note": "标记缓存未命中的次数"
},
{
"lang": "en_US",
"name": "Mark cache misses",
"note": "Number of mark cache misses"
}
]
},
{
"id": 0,
"uuid": 1719305153984657000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的压缩块数",
"unit": "blocks",
"note": "从压缩源(文件,网络)读取的压缩块数(独立压缩的数据块)",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_CompressedReadBufferBlocks[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "读取的压缩块数",
"note": "从压缩源(文件,网络)读取的压缩块数(独立压缩的数据块)"
},
{
"lang": "en_US",
"name": "Number of compressed blocks read",
"note": "Number of compressed blocks read from compression source (file, network) (independently compressed data blocks)"
}
]
},
{
"id": 0,
"uuid": 1719305153985923000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的数据部分数",
"unit": "parts",
"note": "从MergeTree表读取的数据部分数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedParts[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "读取的数据部分数",
"note": "从MergeTree表读取的数据部分数"
},
{
"lang": "en_US",
"name": "Number of data parts read",
"note": "Number of data parts read from the MergeTree table"
}
]
},
{
"id": 0,
"uuid": 1719305153987437000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的未压缩字节数",
"unit": "bytes",
"note": "从压缩源(文件,网络)读取的未压缩字节数(解压后的字节数)",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_CompressedReadBufferBytes[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "读取的未压缩字节数",
"note": "从压缩源(文件,网络)读取的未压缩字节数(解压后的字节数)"
},
{
"lang": "en_US",
"name": "Number of uncompressed bytes read",
"note": "Number of uncompressed bytes read from compressed source (file, network) (number of decompressed bytes)"
}
]
},
{
"id": 0,
"uuid": 1719305153988925000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的标记数",
"unit": "marks",
"note": "从MergeTree表读取的标记数(索引粒度)",
"lang": "zh_CN",
"expression": "irate(ClickHouseProfileEvents_SelectedMarks[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "读取的标记数",
"note": "从MergeTree表读取的标记数(索引粒度)"
},
{
"lang": "en_US",
"name": "Number of marks read",
"note": "Number of marks (index granules) read from MergeTree tables"
}
]
},
{
"id": 0,
"uuid": 1719305153990107000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "读取的范围数",
"unit": "ranges",
"note": "从MergeTree表读取的所有数据部分中(非相邻)的范围数",
"lang": "zh_CN",
"expression": "max_over_time(irate(ClickHouseProfileEvents_SelectedRanges[2m]) [1h:1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "读取的范围数",
"note": "从MergeTree表读取的所有数据部分中(非相邻)的范围数"
},
{
"lang": "en_US",
"name": "Number of ranges read",
"note": "Number of ranges (non-adjacent) in all data parts read from the MergeTree table"
}
]
},
{
"id": 0,
"uuid": 1719305153991749000,
"collector": "Exporter",
"typ": "ClickHouse",
"name": "预提交部分数",
"unit": "parts",
"note": "在数据部分中,但不用于SELECT查询的部分",
"lang": "zh_CN",
"expression": "ClickHouseMetrics_PartsPreCommitted",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "预提交部分数",
"note": "在数据部分中,但不用于SELECT查询的部分"
},
{
"lang": "en_US",
"name": "Number of pre-commit parts",
"note": "Parts that are in the data parts list but are not used for SELECT queries"
}
]
}
]
================================================
FILE: integrations/CloudWatch/collect/cloudwatch/cloud.toml
================================================
interval="5m"
[[instances]]
## Amazon Region
# list of region and endpoint, see https://docs.aws.amazon.com/general/latest/gr/cw_region.html
region = "us-east-1"
## Amazon Credentials
## Credentials are loaded in the following order
## 1) Web identity provider credentials via STS if role_arn and
## web_identity_token_file are specified
## 2) Assumed credentials via STS if role_arn is specified
## 3) explicit credentials from 'access_key' and 'secret_key'
## 4) shared profile from 'profile'
## 5) environment variables
## 6) shared credentials file
## 7) EC2 Instance Profile
# access_key = ""
# secret_key = ""
# token = ""
# role_arn = ""
# web_identity_token_file = ""
# role_session_name = ""
# profile = ""
# shared_credential_file = ""
## Endpoint to make request against, the correct endpoint is automatically
## determined and this option should only be set if you wish to override the
## default.
## ex: endpoint_url = "http://localhost:8000"
endpoint_url = "https://monitoring.ap-southeast-1.amazonaws.com"
## Set http_proxy
# use_system_proxy = false
# http_proxy_url = "http://localhost:8888"
## The minimum period for CloudWatch metrics is 1 minute (60s). However not
## all metrics are made available to the 1 minute period. Some are collected
## at 3 minute, 5 minute, or larger intervals.
## See https://aws.amazon.com/cloudwatch/faqs/#monitoring.
## Note that if a period is configured that is smaller than the minimum for a
## particular metric, that metric will not be returned by the CloudWatch API
## and will not be collected by Categraf.
#
## Collection Delay (required)
## Must account for metrics availability via CloudWatch API
delay = "5m"
## Requested CloudWatch aggregation Period (required)
## Must be a multiple of 60s.
period = "5m"
## Recommended if "delay" and "period" are both within 3 hours of request
## time. Invalid values will be ignored. Recently Active feature will only
## poll for CloudWatch ListMetrics values that occurred within the last 3h.
## If enabled, it will reduce total API usage of the CloudWatch ListMetrics
## API and require less memory to retain.
## Do not enable if "period" or "delay" is longer than 3 hours, as it will
## not return data more than 3 hours old.
## See https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_ListMetrics.html
#recently_active = "PT3H"
## Configure the TTL for the internal cache of metrics.
# cache_ttl = "1h"
## Metric Statistic Namespaces (required)
# namespaces = ["AWS/EC2"]
## Maximum requests per second. Note that the global default AWS rate limit
## is 50 reqs/sec, so if you define multiple namespaces, these should add up
## to a maximum of 50.
## See http://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html
# ratelimit = 25
## Timeout for http requests made by the cloudwatch client.
# timeout = "5s"
## Batch Size
## The size of each batch to send requests to CloudWatch. 500 is the
## suggested largest size. If a request gets too large (413 errors), consider
## reducing this amount.
# batch_size = 500
## Namespace-wide statistic filters. These allow fewer queries to be made to
## cloudwatch.
# statistic_include = ["average", "sum", "minimum", "maximum", "sample_count"]
# statistic_exclude = []
## Metrics to Pull
## Defaults to all Metrics in Namespace if nothing is provided
## Refreshes Namespace available metrics every 1h
#[[instances.metrics]]
# names = ["Latency", "RequestCount"]
#
# ## Statistic filters for Metric. These allow for retrieving specific
# ## statistics for an individual metric.
# # statistic_include = ["average", "sum", "minimum", "maximum", "sample_count"]
# # statistic_exclude = []
#
# ## Dimension filters for Metric.
# ## All dimensions defined for the metric names must be specified in order
# ## to retrieve the metric statistics.
# ## 'value' has wildcard / 'glob' matching support such as 'p-*'.
# [[instances.metrics.dimensions]]
# name = "LoadBalancerName"
# value = "p-example"
================================================
FILE: integrations/CloudWatch/dashboards/dashboard-by-aws-rds.json
================================================
{
"id": 0,
"group_id": 0,
"name": "AWS RDS Telegraf",
"ident": "",
"tags": "AWS Cloudwatch Telegraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "2ceac4da-53d8-432d-ad43-51a25cf63b21",
"layout": {
"h": 1,
"i": "2ceac4da-53d8-432d-ad43-51a25cf63b21",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Common metrics",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds cpu 利用率平均值",
"id": "2002c9f5-6177-4239-a0c6-2981edacae5a",
"layout": {
"h": 6,
"i": "2002c9f5-6177-4239-a0c6-2981edacae5a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"name": "RDS CPU利用率(百分比)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#d0021b",
"value": 80
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_cpu_utilization_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 数据库连接平均值",
"id": "c54b9dca-88ce-425a-bf75-6d8b363f6ebb",
"layout": {
"h": 6,
"i": "05ddf798-e5f8-4b34-96f1-aaa2a45d1207",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"name": "RDS 数据库连接数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#d0021b",
"value": 100
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_database_connections_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 可用存储空间平均值",
"id": "997a6214-2ac0-46c6-a0b9-046810b2b8cf",
"layout": {
"h": 6,
"i": "2d42ff70-a867-4f02-9980-5f20c017a21e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 7
},
"name": "RDS 可用存储空间(字节)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#d0021b",
"value": 10000000000
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_free_storage_space_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 可用内存平均值",
"id": "6c00311c-e931-487f-b088-3a3bfafc84ef",
"layout": {
"h": 6,
"i": "89bbb148-7fb3-4492-a5d6-abd0bb5df667",
"isResizable": true,
"w": 12,
"x": 12,
"y": 7
},
"name": "RDS 可用内存(MB)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#d0021b",
"value": 2000000000
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_freeable_memory_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds lvm 写入 iops 平均值",
"id": "990ab5a1-4aa5-47c3-b7b7-a65f63459119",
"layout": {
"h": 6,
"i": "18640a88-13c0-4ce7-8456-60b20f8c7422",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "RDS 写入IOPS(次数/秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_lvm_write_iops_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 读取 iops 平均值",
"id": "a61a80da-7d0a-45a5-a868-bd442b3aa4cf",
"layout": {
"h": 6,
"i": "010a63f8-2a08-4d56-9131-0f9e50a7e2f4",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "RDS 读取IOPS(次数/秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_read_iops_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 写入吞吐量平均值",
"id": "2e605342-3413-4004-9fcf-3dbbfa7e7be3",
"layout": {
"h": 6,
"i": "58987f8f-09d3-445f-b22f-5f872f5b9dde",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"name": "RDS 写入吞吐量(MB/秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_write_throughput_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 读取吞吐量平均值",
"id": "1ef3f98d-1b54-408a-8cc2-4570c327d705",
"layout": {
"h": 6,
"i": "23e7b924-d638-4293-9840-78fb129d5410",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"name": "RDS 读取吞吐量(MB/秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_read_throughput_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "07e3cd80-1984-4ebe-a037-526e6a186ebb",
"layout": {
"h": 1,
"i": "07e3cd80-1984-4ebe-a037-526e6a186ebb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 25
},
"name": "Network metrics",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 网络接收吞吐量平均值",
"id": "4ba500c9-e87e-41e4-bbc1-82fec507da9d",
"layout": {
"h": 6,
"i": "e1573095-990a-468d-bf2f-7bbf5a6dcb42",
"isResizable": true,
"w": 12,
"x": 0,
"y": 26
},
"name": "RDS 网络接收吞吐量(MB/秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_network_receive_throughput_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 网络传输吞吐量平均值",
"id": "edee8285-1274-4ddc-b166-fb773c764c2b",
"layout": {
"h": 6,
"i": "0493a01d-d066-482a-b677-2d9ae1d9a30b",
"isResizable": true,
"w": 12,
"x": 12,
"y": 26
},
"name": "RDS 网络传输吞吐量(MB/秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_network_transmit_throughput_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 写入延迟平均值",
"id": "ecb9b8a5-b168-4a65-b7f6-7912ab6c6b22",
"layout": {
"h": 6,
"i": "fb7ee87d-7bec-4123-ab16-7ef2b6838d8c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "RDS 写入延迟(毫秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_write_latency_average{region=\"$region\",db_instance_identifier=\"$instance\"} * 1000",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 读取延迟平均值",
"id": "60d009fa-e547-45be-a862-9b156c15b675",
"layout": {
"h": 6,
"i": "d652843b-4005-4448-8342-b3761f58677b",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "RDS 读取延迟(毫秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_read_latency_average{region=\"$region\",db_instance_identifier=\"$instance\"} * 1000",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "3fafd89f-e6dc-4666-96b7-9f2dc216f496",
"layout": {
"h": 1,
"i": "3fafd89f-e6dc-4666-96b7-9f2dc216f496",
"isResizable": false,
"w": 24,
"x": 0,
"y": 38
},
"name": "Additional metrics",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 磁盘队列深度平均值",
"id": "7edcf2a8-16f3-49ef-9026-e53dc5e72c69",
"layout": {
"h": 6,
"i": "b36508a8-057d-44fe-9899-74862407fd03",
"isResizable": true,
"w": 12,
"x": 0,
"y": 39
},
"name": "RDS 队列深度(数量)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_disk_queue_depth_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 二进制日志磁盘使用情况 (MB)",
"id": "42143731-22a9-45b4-bb1e-ddb8f2c11a70",
"layout": {
"h": 6,
"i": "ca09fee2-6496-444a-937d-3fc2d7483630",
"isResizable": true,
"w": 12,
"x": 12,
"y": 39
},
"name": "RDS 二进制日志磁盘使用情况 (MB)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_bin_log_disk_usage_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 交换分区使用平均值",
"id": "51c6f9d9-30db-4514-a54d-712e1a570b23",
"layout": {
"h": 6,
"i": "1252f5b7-278b-4cd9-9f36-8fb5ccf6ee51",
"isResizable": true,
"w": 12,
"x": 0,
"y": 45
},
"name": "RDS 交换分区使用情况(MB)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_swap_usage_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* Telegraf Gather AWS Cloudwatch RDS\n* cloudwatch aws rds 突发余额平均值",
"id": "767bcc71-3f71-443a-9713-03f587ccc350",
"layout": {
"h": 6,
"i": "05473d8c-ea01-40c7-b4d4-47378a42aa3e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 45
},
"name": "RDS 突发信用余额平均值(百分比)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": 110
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cloudwatch_aws_rds_burst_balance_average{region=\"$region\",db_instance_identifier=\"$instance\"}",
"legend": "{{db_instance_identifier}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(cloudwatch_aws_rds_cpu_utilization_average, region)",
"multi": false,
"name": "region",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(cloudwatch_aws_rds_cpu_utilization_average{region=\"$region\"}, db_instance_identifier)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327336057000
}
================================================
FILE: integrations/CloudWatch/markdown/README.md
================================================
Forked from [telegraf/cloudwatch](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/cloudwatch)
____
# Amazon CloudWatch Statistics Input Plugin
This plugin will pull Metric Statistics from Amazon CloudWatch.
## Amazon Authentication
This plugin uses a credential chain for Authentication with the CloudWatch
API endpoint. In the following order the plugin will attempt to authenticate.
1. Web identity provider credentials via STS if `role_arn` and
`web_identity_token_file` attributes are specified
2. Assumed credentials via STS if `role_arn` attribute is specified
(source credentials are evaluated from subsequent rules)
3. Explicit credentials from `access_key`, `secret_key`, and `token` attributes
4. Shared profile from `profile` attribute
5. [Environment Variables][env]
6. [Shared Credentials][credentials]
7. [EC2 Instance Profile][iam-roles]
## Global configuration options
In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and fields, or create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
## Configuration
```toml @sample.conf
# Pull Metric Statistics from Amazon CloudWatch
[[instances]]
## Amazon Region
region = "us-east-1"
## Amazon Credentials
## Credentials are loaded in the following order
## 1) Web identity provider credentials via STS if role_arn and
## web_identity_token_file are specified
## 2) Assumed credentials via STS if role_arn is specified
## 3) explicit credentials from 'access_key' and 'secret_key'
## 4) shared profile from 'profile'
## 5) environment variables
## 6) shared credentials file
## 7) EC2 Instance Profile
# access_key = ""
# secret_key = ""
# token = ""
# role_arn = ""
# web_identity_token_file = ""
# role_session_name = ""
# profile = ""
# shared_credential_file = ""
## Endpoint to make request against, the correct endpoint is automatically
## determined and this option should only be set if you wish to override the
## default.
## ex: endpoint_url = "http://localhost:8000"
# endpoint_url = ""
## Set http_proxy
# use_system_proxy = false
# http_proxy_url = "http://localhost:8888"
## The minimum period for CloudWatch metrics is 1 minute (60s). However not
## all metrics are made available to the 1 minute period. Some are collected
## at 3 minute, 5 minute, or larger intervals.
## See https://aws.amazon.com/cloudwatch/faqs/#monitoring.
## Note that if a period is configured that is smaller than the minimum for a
## particular metric, that metric will not be returned by the CloudWatch API
## and will not be collected by Categraf.
## Collection Delay (required)
## Must account for metrics availability via CloudWatch API
delay = "5m"
## Requested CloudWatch aggregation Period (required)
## Must be a multiple of 60s.
period = "5m"
## Recommended if "delay" and "period" are both within 3 hours of request
## time. Invalid values will be ignored. Recently Active feature will only
## poll for CloudWatch ListMetrics values that occurred within the last 3h.
## If enabled, it will reduce total API usage of the CloudWatch ListMetrics
## API and require less memory to retain.
## Do not enable if "period" or "delay" is longer than 3 hours, as it will
## not return data more than 3 hours old.
## See https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_ListMetrics.html
#recently_active = "PT3H"
## Configure the TTL for the internal cache of metrics.
# cache_ttl = "1h"
## Metric Statistic Namespaces (required)
namespaces = ["AWS/ELB"]
## Maximum requests per second. Note that the global default AWS rate limit
## is 50 reqs/sec, so if you define multiple namespaces, these should add up
## to a maximum of 50.
## See http://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html
# ratelimit = 25
## Timeout for http requests made by the cloudwatch client.
# timeout = "5s"
## Batch Size
## The size of each batch to send requests to CloudWatch. 500 is the
## suggested largest size. If a request gets too large (413 errors), consider
## reducing this amount.
# batch_size = 500
## Namespace-wide statistic filters. These allow fewer queries to be made to
## cloudwatch.
# statistic_include = ["average", "sum", "minimum", "maximum", "sample_count"]
# statistic_exclude = []
## Metrics to Pull
## Defaults to all Metrics in Namespace if nothing is provided
## Refreshes Namespace available metrics every 1h
# [[instances.metrics]]
# names = ["Latency", "RequestCount"]
#
# ## Statistic filters for Metric. These allow for retrieving specific
# ## statistics for an individual metric.
# # statistic_include = ["average", "sum", "minimum", "maximum", "sample_count"]
# # statistic_exclude = []
#
# ## Dimension filters for Metric.
# ## All dimensions defined for the metric names must be specified in order
# ## to retrieve the metric statistics.
# ## 'value' has wildcard / 'glob' matching support such as 'p-*'.
# [[instances.metrics.dimensions]]
# name = "LoadBalancerName"
# value = "p-example"
```
Please note, the `namespace` option is deprecated in favor of the `namespaces`
list option.
## Requirements and Terminology
Plugin Configuration utilizes [CloudWatch concepts][concept] and access
pattern to allow monitoring of any CloudWatch Metric.
- `region` must be a valid AWS [region][] value
- `period` must be a valid CloudWatch [period][] value
- `namespaces` must be a list of valid CloudWatch [namespace][] value(s)
- `names` must be valid CloudWatch [metric][] names
- `dimensions` must be valid CloudWatch [dimension][] name/value pairs
Omitting or specifying a value of `'*'` for a dimension value configures all
available metrics that contain a dimension with the specified name to be
retrieved. If specifying >1 dimension, then the metric must contain *all* the
configured dimensions where the value of the wildcard dimension is ignored.
Example:
```toml
[[instances]]
period = "1m"
interval = "5m"
[[instances.metrics]]
names = ["Latency"]
## Dimension filters for Metric (optional)
[[instances.metrics.dimensions]]
name = "LoadBalancerName"
value = "p-example"
[[instances.metrics.dimensions]]
name = "AvailabilityZone"
value = "*"
```
If the following ELBs are available:
- name: `p-example`, availabilityZone: `us-east-1a`
- name: `p-example`, availabilityZone: `us-east-1b`
- name: `q-example`, availabilityZone: `us-east-1a`
- name: `q-example`, availabilityZone: `us-east-1b`
Then 2 metrics will be output:
- name: `p-example`, availabilityZone: `us-east-1a`
- name: `p-example`, availabilityZone: `us-east-1b`
If the `AvailabilityZone` wildcard dimension was omitted, then a single metric
(name: `p-example`) would be exported containing the aggregate values of the ELB
across availability zones.
To maximize efficiency and savings, consider making fewer requests by increasing
`interval` but keeping `period` at the duration you would like metrics to be
reported. The above example will request metrics from Cloudwatch every 5 minutes
but will output five metrics timestamped one minute apart.
## Restrictions and Limitations
- CloudWatch metrics are not available instantly via the CloudWatch API.
You should adjust your collection `delay` to account for this lag in metrics
availability based on your [monitoring subscription level][using]
- CloudWatch API usage incurs cost - see [GetMetricData Pricing][pricing]
## Metrics
Each CloudWatch Namespace monitored records a measurement with fields for each
available Metric Statistic. Namespace and Metrics are represented in [snake
case](https://en.wikipedia.org/wiki/Snake_case)
- cloudwatch_{namespace}
- {metric}_sum (metric Sum value)
- {metric}_average (metric Average value)
- {metric}_minimum (metric Minimum value)
- {metric}_maximum (metric Maximum value)
- {metric}_sample_count (metric SampleCount value)
### Tags
Each measurement is tagged with the following identifiers to uniquely identify
the associated metric. Tag Dimension names are represented in [snake
case](https://en.wikipedia.org/wiki/Snake_case)
- All measurements have the following tags:
- region (CloudWatch Region)
- {dimension-name} (Cloudwatch Dimension value - one per metric dimension)
## Troubleshooting
You can use the aws cli to get a list of available metrics and dimensions:
```shell
aws cloudwatch list-metrics --namespace AWS/EC2 --region us-east-1
aws cloudwatch list-metrics --namespace AWS/EC2 --region us-east-1 --metric-name CPUCreditBalance
```
If the expected metrics are not returned, you can try getting them manually
for a short period of time:
```shell
aws cloudwatch get-metric-data \
--start-time 2018-07-01T00:00:00Z \
--end-time 2018-07-01T00:15:00Z \
--metric-data-queries '[
{
"Id": "avgCPUCreditBalance",
"MetricStat": {
"Metric": {
"Namespace": "AWS/EC2",
"MetricName": "CPUCreditBalance",
"Dimensions": [
{
"Name": "InstanceId",
"Value": "i-deadbeef"
}
]
},
"Period": 300,
"Stat": "Average"
},
"Label": "avgCPUCreditBalance"
}
]'
```
## Example Output
```shell
$ ./categraf --inputs cloudwatch --test
> cloudwatch_aws_elb,load_balancer_name=p-example,region=us-east-1 latency_average=0.004810798017284538,latency_maximum=0.1100282669067383,latency_minimum=0.0006084442138671875,latency_sample_count=4029,latency_sum=19.382705211639404 1459542420000000000
```
[concept]: http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_concepts.html
[credentials]: https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/configuring-sdk.html#shared-credentials-file
[dimension]: http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_concepts.html#Dimension
[env]: https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/configuring-sdk.html#environment-variables
[iam-roles]: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html
[metric]: http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_concepts.html#Metric
[namespace]: http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_concepts.html#Namespace
[period]: http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_concepts.html#CloudWatchPeriods
[pricing]: https://aws.amazon.com/cloudwatch/pricing/
[region]: http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_concepts.html#CloudWatchRegions
[using]: http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html
================================================
FILE: integrations/Consul/collect/consul/consul.toml
================================================
# # collect interval
# interval = 15
[[instances]]
## Consul server address
# address = "localhost:8500"
## URI scheme for the Consul server, one of "http", "https"
# scheme = "http"
## ACL token used in every request
# token = ""
## HTTP Basic Authentication username and password.
# username = ""
# password = ""
## Data center to query the health checks from
# datacenter = ""
## Allows any Consul server (non-leader) to service a read.
## Default is true
# allow_stale = true
## Forces the read to be fully consistent.
## Default is false
# require_consistent = false
## Prefix from which to expose key/value pairs.
# kv_prefix = ""
## Regex that determines which keys to expose.
## Default is ".*"
# kv_filter = ".*"
## Optional TLS Config
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/Consul/markdown/README.md
================================================
# Consul Input Plugin
This plugin will collect statistics about all health checks registered in
Consul. It uses the [Consul API][1] to query the data. It will not report the
[telemetry][2] but Consul can report those stats already using StatsD protocol
if needed.
[1]: https://www.consul.io/docs/agent/http/health.html#health_state
[2]: https://www.consul.io/docs/agent/telemetry.html
## Configuration
```toml
# Gather health check statuses from services registered in Consul
[[instances]]
## Consul server address
# address = "localhost:8500"
## URI scheme for the Consul server, one of "http", "https"
# scheme = "http"
## ACL token used in every request
# token = ""
## HTTP Basic Authentication username and password.
# username = ""
# password = ""
## Data center to query the health checks from
# datacenter = ""
## Optional TLS Config
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
```
## Metrics
| name | help |
| ----------------------------- | ----------------------------------------------------------------------------------------------------- |
| consul_up | Was the last query of Consul successful. |
| consul_scrape_use_seconds | scrape use seconds. |
| consul_raft_peers | How many peers (servers) are in the Raft cluster. |
| consul_raft_leader | Does Raft cluster have a leader (according to this node). |
| consul_serf_lan_members | How many members are in the cluster. |
| consul_serf_lan_member_status | Status of member in the cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. |
| consul_serf_wan_member_status | Status of member in the wan cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. |
| consul_catalog_services | How many services are in the cluster. |
| consul_service_tag | Tags of a service. |
| consul_health_node_status | Status of health checks associated with a node. |
| consul_health_service_status | Status of health checks associated with a service. |
| consul_service_checks | Link the service id and check name if available. |
| consul_catalog_kv | The values for selected keys in Consul's key/value catalog. Keys with non-numeric values are omitted. |
Some additional metrics do not have fixed names; see the [Agent Metrics][Agent Metrics] documentation for more details.
[Agent Metrics]: https://developer.hashicorp.com/consul/api-docs/agent#view-metrics
## Example Output
```text
consul_up address=localhost:8500 agent_hostname=hostname 1
consul_scrape_use_seconds address=localhost:8500 agent_hostname=hostname 0.015674053
consul_raft_peers address=localhost:8500 agent_hostname=hostname 1
consul_raft_leader address=localhost:8500 agent_hostname=hostname 1
consul_serf_lan_members address=localhost:8500 agent_hostname=hostname 1
consul_serf_lan_member_status address=localhost:8500 agent_hostname=hostname member=localhost.localdomain 1
consul_serf_wan_member_status address=localhost:8500 agent_hostname=hostname dc=dc1 member=localhost.localdomain.dc1 1
consul_catalog_services address=localhost:8500 agent_hostname=hostname 1
consul_health_node_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain status=passing 1
consul_health_node_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain status=warning 0
consul_health_node_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain status=critical 0
consul_health_node_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain status=maintenance 0
consul_health_service_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo status=passing 1
consul_health_service_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo status=warning 0
consul_health_service_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo status=critical 0
consul_health_service_status address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo status=maintenance 0
consul_service_checks address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo status=critical 1
consul_service_tag address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo tag=tag1 1
consul_service_tag address=localhost:8500 agent_hostname=hostname check_id=service:demo check_name=Service 'demo' check node=localhost.localdomain service_id=demo service_name=demo tag=tag2 1
```
================================================
FILE: integrations/Dns_Query/collect/dns_query/dns_query.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
# #
auto_detect_local_dns_server = false
## servers to query
# servers = ["8.8.8.8"]
servers = []
## Network is the network protocol name.
# network = "udp"
## Domains or subdomains to query.
# domains = ["."]
## Query record type.
## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV.
# record_type = "A"
## Dns server port.
# port = 53
## Query timeout in seconds.
# timeout = 2
================================================
FILE: integrations/Dns_Query/markdown/README.md
================================================
# 应用场景
一般用于对DNS服务器的响应监测,帮助运维快速定位网络问题。
# 部署场景
不需要每台虚拟机都启用此插件,建议是独立或复合的某一台虚拟机启用此插件。
# 配置场景
```
本配置启用或数据定义如下功能:
使用本机DNS查询域名解析质量。
使用外部DNS查询域名解析质量。
使用不同记录类型进行DNS查询。
每种查询都设置超时时间5秒。
增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。
在domains字段处增加自己想要被DNS查询的域名,一般填写公司业务系统的域名或第三方依赖的业务系统。
```
# 修改dns_query.toml文件配置
以下文件内容配置作为参考:
```
[root@aliyun input.dns_query]# cat dns_query.toml
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
labels = { cloud="huaweicloud", region="huabei-beijing-4",azone="az1", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
# #
auto_detect_local_dns_server = true
### A record
## servers to query
servers = ["223.5.5.5","114.114.114.114","119.29.29.29"]
## Network is the network protocol name.
# network = "udp"
## Domains or subdomains to query.
domains = ["www.huaweicloud.com", "www.baidu.com", "www.tapd.cn"]
## Query record type.
## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV.
record_type = "A"
## Dns server port.
# port = 53
## Query timeout in seconds.
timeout = 5
### CNAME record
[[instances]]
# # append some labels for series
labels = { cloud="huaweicloud", region="huabei-beijing-4",azone="az1", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
# #
auto_detect_local_dns_server = false
## servers to query
servers = ["223.5.5.5","114.114.114.114","119.29.29.29"]
## Network is the network protocol name.
# network = "udp"
## Domains or subdomains to query.
domains = ["www.huaweicloud.com", "www.baidu.com", "www.tapd.cn"]
## Query record type.
## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV.
record_type = "CNAME"
## Dns server port.
# port = 53
## Query timeout in seconds.
timeout = 5
### NS record
[[instances]]
# # append some labels for series
labels = { cloud="huaweicloud", region="huabei-beijing-4",azone="az1", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
# #
auto_detect_local_dns_server = false
## servers to query
servers = ["223.5.5.5","114.114.114.114","119.29.29.29"]
## Network is the network protocol name.
# network = "udp"
## Domains or subdomains to query.
domains = ["www.huaweicloud.com", "www.baidu.com", "www.tapd.cn"]
## Query record type.
## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV.
record_type = "NS"
## Dns server port.
# port = 53
## Query timeout in seconds.
timeout = 5
```
# 测试配置
```
./categraf --test --inputs dns_query
....... A记录同理就省略
20:51:34 dns_query_rcode_value agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.tapd.cn product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0
20:51:34 dns_query_result_code agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.tapd.cn product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0
20:51:34 dns_query_query_time_ms agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.tapd.cn product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 33.500371
20:51:34 dns_query_rcode_value agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.baidu.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0
20:51:34 dns_query_result_code agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.baidu.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0
20:51:34 dns_query_query_time_ms agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.baidu.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 34.328242
20:51:34 dns_query_rcode_value agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.huaweicloud.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0
20:51:34 dns_query_result_code agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.huaweicloud.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29 0
20:51:34 dns_query_query_time_ms agent_hostname=aliyun.tjf.n9e.001 azone=az1 cloud=huaweicloud domain=www.huaweicloud.com product=n9e record_type=CNAME region=huabei-beijing-4 server=119.29.29.29
.....
```
# 重启服务
```
重启categraf服务生效
systemctl daemon-reload && systemctl restart categraf && systemctl status categraf
查看启动日志是否有错误
journalctl -f -n 500 -u categraf | grep -E 'E!|W!'
```
# 检查数据呈现
等待1-2分钟后数据就会在图表中展示出来,如图:

# 监控告警规则配置
```
个人经验仅供参考,一般DNS解析延迟时间:
超过2000毫秒,为P2级别,启用企业微信应用推送告警,3分钟内恢复发出恢复告警。
超过5000毫秒,为P1级别,启用电话语音告警&企业微信应用告警,3分钟内恢复发出恢复告警。
为什么会这么考量设计?
在用到DNS监控时,一般公司业务是遍布全国的,然而全国各个地区在解析DNS存在各种场景因素导致的DNS问题(如DNS被劫持、片区DNS服务器故障等),所以需要以高级别对待。
从收到告警到恢复告警设置3分钟的意图是防止期间是短暂时间有问题,同时也给SLA(99.99%)给足处理时长。
```
================================================
FILE: integrations/Docker/collect/docker/docker.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Docker Endpoint
## To use TCP, set endpoint = "tcp://[ip]:[port]"
## To use environment variables (ie, docker-machine), set endpoint = "ENV"
# endpoint = "unix:///var/run/docker.sock"
endpoint = ""
## Set to true to collect Swarm metrics(desired_replicas, running_replicas)
gather_services = false
gather_extend_memstats = false
container_id_label_enable = true
container_id_label_short_style = false
## Containers to include and exclude. Globs accepted.
## Note that an empty array for both will include all containers
container_name_include = []
container_name_exclude = []
## Container states to include and exclude. Globs accepted.
## When empty only containers in the "running" state will be captured.
## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
# container_state_include = []
# container_state_exclude = []
## Timeout for docker list, info, and stats commands
timeout = "5s"
## Specifies for which classes a per-device metric should be issued
## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) and 'network' (eth0, eth1, ...)
## Please note that this setting has no effect if 'perdevice' is set to 'true'
perdevice_include = []
## Specifies for which classes a total metric should be issued. Total is an aggregate of the 'perdevice' values.
## Possible values are 'cpu', 'blkio' and 'network'
## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin.
## Please note that this setting has no effect if 'total' is set to 'false'
total_include = ["cpu", "blkio", "network"]
## Which environment variables should we use as a tag
##tag_env = ["JAVA_HOME", "HEAP_SIZE"]
## docker labels to include and exclude as tags. Globs accepted.
## Note that an empty array for both will include all labels as tags
docker_label_include = []
docker_label_exclude = ["annotation*", "io.kubernetes*", "*description*", "*maintainer*", "*hash", "*author*", "*org_*", "*date*", "*url*", "*docker_compose*"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: integrations/Docker/markdown/README.md
================================================
# docker
forked from telegraf/inputs.docker。容器监控更推荐采用 cAdvisor 采集数据,然后用 input.prometheus 来采集 cAdvisor 的数据。Docker 插件基本可以忽略了。
## change
1. Using `container_id` as label not field
1. Some metrics have been deleted
## 容器ID标签
通过下面两个配置来控制 container_id 这个标签:
```ini
container_id_label_enable = true
container_id_label_short_style = false
```
默认 container_id_label_enable 设置为 true,表示启用,即会把容器ID放到标签里,container_id_label_short_style 是短格式,容器ID很长,如果把 short_style 设置为 true,就会只截取前面12位
## 权限问题
Categraf 最好是用 root 账号来运行,否则,请求 docker.sock 可能会遇到权限问题,需要把 Categraf 的运行账号,加到 docker group 中,假设 Categraf 使用 categraf 账号运行:
```
sudo usermod -aG docker categraf
```
## 运行在容器里
如果 Categraf 运行在容器中,docker 的 unix socket 就需要挂到 Categraf 的容器里,比如通过 `-v /var/run/docker.sock:/var/run/docker.sock` 这样的参数来启动 Categraf 的容器。如果是在 compose 环境下,也可以在 docker compose 配置中加上 volume 的配置:
```yaml
volumes:
- /var/run/docker.sock:/var/run/docker.sock
```
## 停用该插件
- 方法一:把 `input.docker` 目录改个别的名字,不用 `input.` 打头
- 方法二:docker.toml 中的 endpoint 配置留空
================================================
FILE: integrations/Doris/alerts/doris_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris JVM 线程数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "jvm_thread",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784164489700,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris JVM内存使用率",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "sum(jvm_heap_size_bytes{type=\"used\"})by(ident)/sum(jvm_heap_size_bytes{type=\"committed\"})by(ident)",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784169880800,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE 1 分钟 Load Avg",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_be_load_average{mode=\"1_minutes\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784171517400,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE 1 分钟新增tcp包接收错误的次数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "increase(doris_be_snmp_tcp_in_errs[1m])",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784172854000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE 15 分钟 Load Avg",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_be_load_average{mode=\"15_minutes\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784174515000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE 5 分钟 Load Avg",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_be_load_average{mode=\"5_minutes\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784175691000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE batch 的线程池队列积压",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_be_add_batch_task_queue_size > 20",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784177409800,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE CPU 使用率",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "(sum(doris_be_cpu)by(instance)-sum(doris_be_cpu{mode=~\"idle|iowait\"})by(instance))/sum(doris_be_cpu)by(instance)*100 > 70",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784179069200,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE OlapScanner 线程池积压",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_be_scanner_thread_pool_queue_size > 0",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784180644600,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_BE 发送数据包的线程池出现积压",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_be_send_batch_thread_pool_queue_size > 0",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784181912800,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 95百分位查询延迟",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_query_latency_ms{quantile=\"0.95\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784184238600,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 99百分位查询延迟",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_query_latency_ms{quantile=\"0.99\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784185714400,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 事务 publish 耗时95分位",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_txn_publish_latency_ms{quantile=\"0.95\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784187173400,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 事务 publish 耗时99分位",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_txn_publish_latency_ms{quantile=\"0.99\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784188659500,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 事务执行耗时95分位",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_txn_exec_latency_ms{quantile=\"0.95\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784190024000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 事务执行耗时99分位",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_txn_exec_latency_ms{quantile=\"0.99\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784191466000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 失败的事务数量",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_txn_counter{type=\"failed\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784192987100,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 异常事务的数量",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "sum(doris_fe_txn_status{type=~\"aborted|unknown\"})by(type)",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784194383000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 日志写入延迟95分位",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_editlog_write_latency_ms{quantile=\"0.95\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784195737900,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 日志写入延迟99分位",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_editlog_write_latency_ms{quantile=\"0.99\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784197151700,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 每秒查询数量",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_qps",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784198568000,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 每秒错误查询数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_query_err_rate",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784199959300,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 清理元数据文件失败的次数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "increase(doris_fe_image_clean[1m]) > 0",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784201470700,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 清理元数据日志失败的次数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "increase(doris_fe_edit_log_clean[1m]) > 0",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784202892800,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 生成元数据镜像文件失败的次数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "increase(doris_fe_image_write[1m]) > 0",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784204482600,
"cur_event_count": 0,
"update_by_nickname": "管理员"
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Doris_FE 被拒绝的事务数量",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"keys": {
"labelKey": "",
"metricKey": "",
"valueKey": ""
},
"prom_ql": "doris_fe_txn_counter{type=\"reject\"}",
"severity": 2
}
],
"version": "v1"
},
"event_relabel_config": null,
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {
"for_duration": 60,
"new_severity": 2,
"notify_max_number": 0,
"notify_repeat_step": 60
},
"network_device_config": {},
"notify_aggregation": {
"wait": 1
}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1730292784205710300,
"cur_event_count": 0,
"update_by_nickname": "管理员"
}
]
================================================
FILE: integrations/Doris/collect/prometheus/collect_doris_examples.toml
================================================
# doris_fe
[[instances]]
# 配置 fe metrics 服务地址
urls = [
"http://127.0.0.1:8030/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
# 指定 fe 服务的 group 和 job 标签;仪表盘变量会引用这两个标签,可根据实际需求修改。
labels = { group = "fe",job = "doris_cluster01"}
# doris_be
[[instances]]
# 配置 be metrics 服务地址
urls = [
"http://127.0.0.1:8040/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
# 指定 be 服务的 group 和 job 标签;仪表盘变量会引用这两个标签,可根据实际需求修改。
labels = { group = "be",job = "doris_cluster01"}
================================================
FILE: integrations/Doris/dashboards/Doris_Overview.json
================================================
{
"name": "Doris Overview",
"tags": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "5d9daaa1-b1de-476d-a924-c2a3abe4778f",
"layout": {
"h": 1,
"i": "5d9daaa1-b1de-476d-a924-c2a3abe4778f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Overview",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Number of Doris clusters",
"id": "2696e097-5c15-4c1a-81f5-58d5b923cfc6",
"layout": {
"h": 7,
"i": "2696e097-5c15-4c1a-81f5-58d5b923cfc6",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Cluster Number",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(node_info{type=\"is_master\"})",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Dead Frontends will be shown as Colored points.\nIf all Frontends are alive, all points should be Green.",
"id": "80bdedc3-e2e4-4198-b1ce-a94c8aa417f1",
"layout": {
"h": 7,
"i": "80bdedc3-e2e4-4198-b1ce-a94c8aa417f1",
"isResizable": true,
"w": 10,
"x": 4,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Frontends Status",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(up{group=\"fe\"} == 0) +0",
"legend": "{{job}}-{{instance}}: DEAD",
"refId": "B"
},
{
"expr": "(up{group=\"fe\"} == 1) +0",
"legend": "{{job}}-{{instance}}: ALIVE",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Dead Backends will be shown as Colored points.\nIf all Backends are alive, all points should be Green.",
"id": "757fca81-5e3f-48de-ab61-ad8150422d50",
"layout": {
"h": 7,
"i": "757fca81-5e3f-48de-ab61-ad8150422d50",
"isResizable": true,
"w": 10,
"x": 14,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Backends Status",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(up{group=\"be\"} == 0) +0",
"legend": "{{job}}-{{instance}}: DEAD",
"refId": "B"
},
{
"expr": "(up{group=\"be\"} == 1) +0",
"legend": "{{job}}-{{instance}}: ALIVE",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The JVM heap usage percent of each Frontend of each Doris cluster.",
"id": "27d22825-5a24-4d53-87d1-2639f3e13a70",
"layout": {
"h": 7,
"i": "27d22825-5a24-4d53-87d1-2639f3e13a70",
"isResizable": true,
"w": 8,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Cluster FE JVM Heap Stat",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(jvm_heap_size_bytes{group=\"fe\", type=\"used\"} * 100) by (instance, job) / sum(jvm_heap_size_bytes{group=\"fe\", type=\"max\"}) by (instance, job)",
"legend": "{{job}}-{{instance}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The Backend CPU idle overview of each Doris cluster.\nThe detail Backend CPU idle info can be seen in 'BE' section.",
"id": "df750dad-1be8-4b67-976d-91e751724193",
"layout": {
"h": 7,
"i": "df750dad-1be8-4b67-976d-91e751724193",
"isResizable": true,
"w": 8,
"x": 8,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Cluster BE CPU Idle",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(rate(doris_be_cpu{mode=\"idle\"}[$interval])) by (job)) / (sum(rate(doris_be_cpu[$interval])) by (job))",
"legend": "{{job}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The Backend memory usage overview of each Doris cluster.\nThe detail backend memory usage can be seen in 'BE' section.",
"id": "dd16ac62-5af4-40e0-a449-b0a95f32b33b",
"layout": {
"h": 7,
"i": "dd16ac62-5af4-40e0-a449-b0a95f32b33b",
"isResizable": true,
"w": 8,
"x": 16,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Cluster BE Mem Stat",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "avg(doris_be_memory_allocated_bytes) by (job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "QPS statistic group by cluster.\nThe QPS of each cluster is the sum of all queries processed on all Frontends.",
"id": "df810a3f-79ae-4a56-868e-abd9dee23ecc",
"layout": {
"h": 7,
"i": "df810a3f-79ae-4a56-868e-abd9dee23ecc",
"isResizable": true,
"w": 8,
"x": 0,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Cluster QPS Stat",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (job)(rate(doris_fe_query_total{group=\"fe\"}[$interval]))",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The Disk state. GREEN point means this disk is ONLINE. RED point means this disk is OFFLINE",
"id": "dbb3fd31-5577-4d89-9c5d-801469286c35",
"layout": {
"h": 7,
"i": "dbb3fd31-5577-4d89-9c5d-801469286c35",
"isResizable": true,
"w": 8,
"x": 8,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Disk State",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(doris_be_disks_state{job=\"$cluster_name\"} == 0)+0",
"legend": "{{instance}}: {{path}} OFFLINE",
"refId": "A"
},
{
"expr": "(doris_be_disks_state{job=\"$cluster_name\"} == 1)+0",
"legend": "{{instance}}: {{path}} ONLINE",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "4934c795-bb60-412d-8a5e-0aeb9db6905e",
"layout": {
"h": 1,
"i": "4934c795-bb60-412d-8a5e-0aeb9db6905e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 22
},
"name": "Cluster Overview",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Total Frontends node number",
"id": "81e85606-4059-4728-8624-a1c3adaf4356",
"layout": {
"h": 6,
"i": "81e85606-4059-4728-8624-a1c3adaf4356",
"isResizable": true,
"w": 4,
"x": 0,
"y": 23
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] FE Node",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(up{group=\"fe\", job=\"$cluster_name\"})",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Total alive number of Frontends. Normally, it should be equal to the Total number of Frontends",
"id": "e469b68d-9351-4083-b4bc-1fb3f410efd9",
"layout": {
"h": 6,
"i": "e469b68d-9351-4083-b4bc-1fb3f410efd9",
"isResizable": true,
"w": 4,
"x": 4,
"y": 23
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] FE Alive",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(up{group=\"fe\", job=\"$cluster_name\"}==1)",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Total Backends node number",
"id": "332a4eca-3ca3-4f73-b800-4a88dffc8c1e",
"layout": {
"h": 6,
"i": "332a4eca-3ca3-4f73-b800-4a88dffc8c1e",
"isResizable": true,
"w": 4,
"x": 8,
"y": 23
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Node",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(up{group=\"be\", job=\"$cluster_name\"})",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Total alive number of Backends. Normally, it should be equal to the Total number of Backends.",
"id": "2303b720-98e1-421c-918d-e6b613b3036d",
"layout": {
"h": 6,
"i": "2303b720-98e1-421c-918d-e6b613b3036d",
"isResizable": true,
"w": 4,
"x": 12,
"y": 23
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Alive",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(up{group=\"be\", job=\"$cluster_name\"}==1)",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Total used disk capacity of all Backends.",
"id": "8ff1a193-dd17-40c8-a25d-39985edef8ee",
"layout": {
"h": 6,
"i": "8ff1a193-dd17-40c8-a25d-39985edef8ee",
"isResizable": true,
"w": 4,
"x": 16,
"y": 23
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Used Capacity",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "SUM(doris_be_disks_local_used_capacity{job=\"$cluster_name\"})",
"refId": "B"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Total disk capacity of all Backends",
"id": "d3762969-ca1b-4794-b710-ceeee5820008",
"layout": {
"h": 6,
"i": "d3762969-ca1b-4794-b710-ceeee5820008",
"isResizable": true,
"w": 4,
"x": 20,
"y": 23
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Total Capacity",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "SUM(doris_be_disks_total_capacity{job=\"$cluster_name\"})",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The max replayed meta data journal id on Frontends.\nNormally, all Frontends should be same on this metrics, or just slightly different for a short period.",
"id": "9c5fb54f-2428-4ece-8057-b75cf4cbbef9",
"layout": {
"h": 6,
"i": "9c5fb54f-2428-4ece-8057-b75cf4cbbef9",
"isResizable": true,
"w": 6,
"x": 0,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Max Replayed journal id",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_max_journal_id{job=\"$cluster_name\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The counter of meta data image generation on Master Frontend. And the counter of image successfully pushing to other Non-master Frontends.\nThese metrics is expected to increase at reasonable intervals. And normally, they should be equal.",
"id": "40779789-0758-4a7b-916e-d66f54c4d096",
"layout": {
"h": 6,
"i": "40779789-0758-4a7b-916e-d66f54c4d096",
"isResizable": true,
"w": 6,
"x": 6,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Image counter",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_image_write{job=\"$cluster_name\", instance=\"$fe_master\"}",
"legend": "{{instance}}-write",
"refId": "A"
},
{
"expr": "doris_fe_image_push{job=\"$cluster_name\", instance=\"$fe_master\"}",
"legend": "{{instance}}-push",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The left Y axes shows write latency of 99th. The right Y axes shows the write per seconds of journal.",
"id": "1c1d3ebe-182b-4182-ae0a-4e7339b1eb42",
"layout": {
"h": 6,
"i": "1c1d3ebe-182b-4182-ae0a-4e7339b1eb42",
"isResizable": true,
"w": 6,
"x": 12,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BDBJE Write",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_editlog_write_latency_ms{job=\"$cluster_name\", instance=\"$fe_master\",quantile=\"0.99\"}",
"legend": "{{instance}}-99th",
"refId": "A"
},
{
"expr": "rate(doris_fe_edit_log{job=\"$cluster_name\", type=\"write\"}[$interval])",
"legend": "{{instance}}-write-rate",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The left Y axes shows the read per seconds of journal.",
"id": "a4aee7a4-3acc-4259-827f-4e28a669ea18",
"layout": {
"h": 6,
"i": "a4aee7a4-3acc-4259-827f-4e28a669ea18",
"isResizable": true,
"w": 6,
"x": 18,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BDBJE Read",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_fe_edit_log{job=\"$cluster_name\", type=\"read\"}[$interval])",
"legend": "{{instance}}-read-rate",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The edit log size for each FE",
"id": "2137be68-6db0-4ee3-a6b1-127b3d00c146",
"layout": {
"h": 6,
"i": "2137be68-6db0-4ee3-a6b1-127b3d00c146",
"isResizable": true,
"w": 6,
"x": 0,
"y": 35
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Edit Log Size",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_edit_log{job=\"$cluster_name\", type=\"bytes\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The edit log clean of each FE",
"id": "fff6178f-056a-4354-b714-71e00eb35b7e",
"layout": {
"h": 6,
"i": "fff6178f-056a-4354-b714-71e00eb35b7e",
"isResizable": true,
"w": 6,
"x": 6,
"y": 35
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Edit Log Clean",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_edit_log_clean{job=\"$cluster_name\", type=\"success\"}",
"legend": "{{instance}}_success",
"refId": "A"
},
{
"expr": "doris_fe_edit_log_clean{job=\"$cluster_name\", type=\"failed\"}",
"legend": "{{instance}}_failed",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The FE collect compaction score of each BE",
"id": "0e7a3912-ef46-442a-96a9-ca98f1b3ec1f",
"layout": {
"h": 6,
"i": "0e7a3912-ef46-442a-96a9-ca98f1b3ec1f",
"isResizable": true,
"w": 6,
"x": 12,
"y": 35
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] FE Collect Compaction Score",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_max_tablet_compaction_score{job=\"$cluster_name\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The compaction score of each BE",
"id": "96c4f66c-f8e5-4020-b13a-ad5f39b4c7bf",
"layout": {
"h": 6,
"i": "96c4f66c-f8e5-4020-b13a-ad5f39b4c7bf",
"isResizable": true,
"w": 6,
"x": 18,
"y": 35
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Compaction Score",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_tablet_max_compaction_score{job=\"$cluster_name\", instance=\"$fe_master\"}",
"legend": "{{backend}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The image Write of each FE",
"id": "09b36550-35e9-4a57-8384-e49b159d4bb6",
"layout": {
"h": 6,
"i": "09b36550-35e9-4a57-8384-e49b159d4bb6",
"isResizable": true,
"w": 6,
"x": 0,
"y": 41
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Image Write",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_image_write{job=\"$cluster_name\", type=\"success\"}",
"legend": "{{instance}}_success",
"refId": "A"
},
{
"expr": "doris_fe_image_write{job=\"$cluster_name\", type=\"failed\"}",
"legend": "{{instance}}_failed",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The image push of each FE",
"id": "490f2533-e2eb-4a49-9b51-878281a79b1a",
"layout": {
"h": 6,
"i": "490f2533-e2eb-4a49-9b51-878281a79b1a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 41
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Image Push",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_image_push{job=\"$cluster_name\", type=\"success\"}",
"legend": "{{instance}}_success",
"refId": "A"
},
{
"expr": "doris_fe_image_push{job=\"$cluster_name\", type=\"failed\"}",
"legend": "{{instance}}_failed",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The image clean of each FE",
"id": "cffa363d-b514-42ed-be6b-67c93325d9e1",
"layout": {
"h": 6,
"i": "cffa363d-b514-42ed-be6b-67c93325d9e1",
"isResizable": true,
"w": 6,
"x": 12,
"y": 41
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Image Clean",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_image_clean{job=\"$cluster_name\", type=\"success\"}",
"legend": "{{instance}}_success",
"refId": "A"
},
{
"expr": "doris_fe_image_clean{job=\"$cluster_name\", type=\"failed\"}",
"legend": "{{instance}}_failed",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Number of tablets begin scheduled. These tablet may be in recovery process or balance process",
"id": "1cab833b-22db-494c-bd44-fe7e22b91321",
"layout": {
"h": 6,
"i": "1cab833b-22db-494c-bd44-fe7e22b91321",
"isResizable": true,
"w": 6,
"x": 18,
"y": 41
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Scheduling Tablets",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_scheduled_tablet_num{job=\"$cluster_name\", instance=\"$fe_master\"}",
"legend": "Scheduling tablet number",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The max IO util of each Backend",
"id": "854f7bb3-cf8e-48a3-81aa-50f83d89d6c0",
"layout": {
"h": 6,
"i": "854f7bb3-cf8e-48a3-81aa-50f83d89d6c0",
"isResizable": true,
"w": 6,
"x": 0,
"y": 47
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE IO Util",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_be_max_disk_io_util_percent{job=\"$cluster_name\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "016e25d8-b7ca-4492-95f2-9caaaeb9897a",
"layout": {
"h": 1,
"i": "016e25d8-b7ca-4492-95f2-9caaaeb9897a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 53
},
"name": "Query Statistic",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Requests per seconds on each Frontends.\nRequests include all requests sending to the Frontends.",
"id": "1d0ea05c-654a-4e55-a56a-f8cf29b3e109",
"layout": {
"h": 9,
"i": "1d0ea05c-654a-4e55-a56a-f8cf29b3e109",
"isResizable": true,
"w": 8,
"x": 0,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] RPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_fe_request_total{job=\"$cluster_name\", group=\"fe\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Queries per seconds on each Frontends.\nQueries only include Select requests.",
"id": "8147ab91-ab8a-47c9-9049-08fbb77f9412",
"layout": {
"h": 9,
"i": "8147ab91-ab8a-47c9-9049-08fbb77f9412",
"isResizable": true,
"w": 8,
"x": 8,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] QPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_fe_query_total{job=\"$cluster_name\", group=\"fe\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "99 quantiles of query latency on each Frontends.",
"id": "8de6fab5-27b8-4f40-8209-8a702ec9f665",
"layout": {
"h": 9,
"i": "8de6fab5-27b8-4f40-8209-8a702ec9f665",
"isResizable": true,
"w": 8,
"x": 16,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] 99th Latency",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(doris_fe_query_latency_ms{job=\"$cluster_name\", quantile=\"0.99\"}) by (instance)",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates 95 to 99 quantiles of query latency on each Frontends.\nRight Y axes indicates the query rate per 1 min.",
"id": "6d6c3311-86dd-443f-97bf-2e9874b61650",
"layout": {
"h": 6,
"i": "6d6c3311-86dd-443f-97bf-2e9874b61650",
"isResizable": true,
"w": 8,
"x": 0,
"y": 63
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] Query Percentile",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_query_latency_ms{job=\"$cluster_name\", instance=\"$fe_instance\"}",
"legend": "{{quantile}}",
"refId": "A"
},
{
"expr": "rate(doris_fe_query_latency_ms_count{job=\"$cluster_name\", instance=\"$fe_instance\"}[1m])",
"legend": "query rate",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the accumulated error queries number.\nRight Y axes indicates the error query rate per 1 min.\nNormally, the error query rate should be 0.",
"id": "dec70a12-df04-4763-8f51-30a5b1d4c2c5",
"layout": {
"h": 6,
"i": "dec70a12-df04-4763-8f51-30a5b1d4c2c5",
"isResizable": true,
"w": 8,
"x": 8,
"y": 63
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Query Error [1m]",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_query_err{job=\"$cluster_name\"}",
"legend": "Err Counter-{{instance}}",
"refId": "B"
},
{
"expr": "rate(doris_fe_query_err{job=\"$cluster_name\"}[$interval])",
"legend": "Err Rate-{{instance}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The connections' number to each Frontends.",
"id": "5297af7c-771c-467f-a8c0-4d7311151492",
"layout": {
"h": 6,
"i": "5297af7c-771c-467f-a8c0-4d7311151492",
"isResizable": true,
"w": 8,
"x": 16,
"y": 63
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_connection_total{job=\"$cluster_name\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "3f88e434-af57-4b10-a36a-6af0c40b8fcd",
"layout": {
"h": 1,
"i": "3f88e434-af57-4b10-a36a-6af0c40b8fcd",
"isResizable": false,
"w": 24,
"x": 0,
"y": 69
},
"name": "Jobs",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Statistic of Broker load jobs's num in each Load State.",
"id": "36b2c233-f929-4d0a-af5a-1b4c433b8ba6",
"layout": {
"h": 6,
"i": "36b2c233-f929-4d0a-af5a-1b4c433b8ba6",
"isResizable": true,
"w": 6,
"x": 0,
"y": 70
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Broker Load Job",
"options": {},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"BROKER\", instance=\"$fe_master\"}",
"refId": "A"
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Statistic of load jobs's num in each Load State which is generated by Insert Stmt.",
"id": "e4518ad0-a300-4e22-9114-7bcc2a770baa",
"layout": {
"h": 6,
"i": "e4518ad0-a300-4e22-9114-7bcc2a770baa",
"isResizable": true,
"w": 6,
"x": 6,
"y": 70
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Insert Load Job",
"options": {},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"INSERT\", instance=\"$fe_master\"}",
"refId": "A"
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Statistic of Routine load jobs's num in each Load State.",
"id": "90ad8b66-2b7d-4efc-9dc9-bc9fee72492c",
"layout": {
"h": 6,
"i": "90ad8b66-2b7d-4efc-9dc9-bc9fee72492c",
"isResizable": true,
"w": 6,
"x": 12,
"y": 70
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Routine Load Job",
"options": {},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"ROUTINE_LOAD\", instance=\"$fe_master\"}",
"refId": "A"
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Statistic of Spark load jobs's num in each Load State.",
"id": "8e431597-7fef-4a81-9c60-caafc9150e75",
"layout": {
"h": 6,
"i": "8e431597-7fef-4a81-9c60-caafc9150e75",
"isResizable": true,
"w": 6,
"x": 18,
"y": 70
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Spark Load Job",
"options": {},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"SPARK\", instance=\"$fe_master\"}",
"refId": "A"
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The trend report of broker load job",
"id": "6398fd57-672a-4104-bbf9-5b6a08a31d02",
"layout": {
"h": 6,
"i": "6398fd57-672a-4104-bbf9-5b6a08a31d02",
"isResizable": true,
"w": 6,
"x": 0,
"y": 76
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Broker load tendency",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"BROKER\", instance=\"$fe_master\", state=\"PENDING\"}",
"legend": "PENDING",
"refId": "A"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"BROKER\", instance=\"$fe_master\", state=\"ETL\"}",
"legend": "ETL",
"refId": "B"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"BROKER\", instance=\"$fe_master\", state=\"LOADING\"}",
"legend": "LOADING",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The trend report of insert load job",
"id": "2cf4e7c5-0edc-411d-bc33-863d6fda4107",
"layout": {
"h": 6,
"i": "2cf4e7c5-0edc-411d-bc33-863d6fda4107",
"isResizable": true,
"w": 6,
"x": 6,
"y": 76
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Insert load tendency",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"INSERT\", instance=\"$fe_master\", state=\"PENDING\"}",
"legend": "PENDING",
"refId": "A"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"INSERT\", instance=\"$fe_master\", state=\"ETL\"}",
"legend": "ETL",
"refId": "B"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"INSERT\", instance=\"$fe_master\", state=\"LOADING\"}",
"legend": "LOADING",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The trend report of routine load job",
"id": "0899fb9a-dcc6-4eb8-a8dc-96d3f2124b10",
"layout": {
"h": 6,
"i": "0899fb9a-dcc6-4eb8-a8dc-96d3f2124b10",
"isResizable": true,
"w": 6,
"x": 12,
"y": 76
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Routine load tendency",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"ROUTINE_LOAD\", instance=\"$fe_master\", state=\"NEED_SCHEDULE\"}",
"legend": "NEED_SCHEDULE",
"refId": "A"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"ROUTINE_LOAD\", instance=\"$fe_master\", state=\"RUNNING\"}",
"legend": "RUNNING",
"refId": "B"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"ROUTINE_LOAD\", instance=\"$fe_master\", state=\"PAUSED\"}",
"legend": "PAUSED",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The trend report of spark load job",
"id": "ea340542-58e0-4ced-a758-a0563d3346bd",
"layout": {
"h": 6,
"i": "ea340542-58e0-4ced-a758-a0563d3346bd",
"isResizable": true,
"w": 6,
"x": 18,
"y": 76
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Spark load tendency",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"SPARK\", instance=\"$fe_master\", state=\"PENDING\"}",
"legend": "PENDING",
"refId": "A"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"SPARK\", instance=\"$fe_master\", state=\"ETL\"}",
"legend": "ETL",
"refId": "B"
},
{
"expr": "doris_fe_job{job=\"$cluster_name\", exported_job=\"load\", type=\"SPARK\", instance=\"$fe_master\", state=\"LOADING\"}",
"legend": "LOADING",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Number of running schema change jobs.",
"id": "f0df6aae-52a4-4a92-a4a9-b6a0679be656",
"layout": {
"h": 3,
"i": "f0df6aae-52a4-4a92-a4a9-b6a0679be656",
"isResizable": true,
"w": 6,
"x": 0,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] SC Job",
"options": {},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", instance=\"$fe_master\", type=\"SCHEMA_CHANGE\"}",
"legend": "asds",
"refId": "A"
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Queue size of report in Master FE.",
"id": "33d04fa7-1c77-4d0c-a43b-4b36539fc5e6",
"layout": {
"h": 6,
"i": "33d04fa7-1c77-4d0c-a43b-4b36539fc5e6",
"isResizable": true,
"w": 6,
"x": 6,
"y": 82
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Report queue size",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_report_queue_size{job=\"$cluster_name\", instance=\"$fe_master\"}",
"legend": "Report queue size",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Number of running rollup jobs.",
"id": "aa2551f4-361d-498b-a7ff-ed9485346db2",
"layout": {
"h": 3,
"i": "aa2551f4-361d-498b-a7ff-ed9485346db2",
"isResizable": true,
"w": 6,
"x": 0,
"y": 85
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Rollup Job",
"options": {},
"targets": [
{
"expr": "doris_fe_job{job=\"$cluster_name\", instance=\"$fe_master\", type=\"ROLLUP\"}",
"refId": "A"
}
],
"type": "unknown",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "b6ec71fc-8801-49d9-b418-80a56527637e",
"layout": {
"h": 1,
"i": "b6ec71fc-8801-49d9-b418-80a56527637e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 88
},
"name": "Transaction",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Show the number and rate of txn begin and success",
"id": "03f0fe6c-9794-4bde-af91-d19caecd6166",
"layout": {
"h": 6,
"i": "03f0fe6c-9794-4bde-af91-d19caecd6166",
"isResizable": true,
"w": 4,
"x": 0,
"y": 89
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Txn Begin/Success on FE",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_txn_counter{type=\"begin\"}",
"legend": "txn begin",
"refId": "A"
},
{
"expr": "doris_fe_txn_counter{type=\"begin\"}",
"legend": "txn success",
"refId": "D"
},
{
"expr": "irate(doris_fe_txn_counter{type=\"begin\"}[$interval])",
"legend": "txn begin rate",
"refId": "B"
},
{
"expr": "irate(doris_fe_txn_counter{type=\"begin\"}[$interval])",
"legend": "txn success rate",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Show the failed txn request. Including rejected request and failed txn",
"id": "fb50e00a-30c4-4ad4-9cbd-49defebfd090",
"layout": {
"h": 6,
"i": "fb50e00a-30c4-4ad4-9cbd-49defebfd090",
"isResizable": true,
"w": 5,
"x": 4,
"y": 89
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Txn Failed/Reject on FE",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_fe_txn_counter{type=\"reject\"}[$interval])",
"legend": "txn reject rate",
"refId": "C"
},
{
"expr": "rate(doris_fe_txn_counter{type=\"failed\"}[$interval])",
"legend": "txn failed rate",
"refId": "D"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The number of total publish task request and error rate.",
"id": "cfc68c22-6619-478b-9e12-0a15314662f1",
"layout": {
"h": 6,
"i": "cfc68c22-6619-478b-9e12-0a15314662f1",
"isResizable": true,
"w": 5,
"x": 9,
"y": 89
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Publish Task on BE",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"publish\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"publish\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Show the txn rstatus on FE",
"id": "0a89693c-6530-46dd-a01a-0f00cee1fc75",
"layout": {
"h": 6,
"i": "0a89693c-6530-46dd-a01a-0f00cee1fc75",
"isResizable": true,
"w": 5,
"x": 14,
"y": 89
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] fe_txn_status",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_txn_status{group=\"fe\", job=\"$cluster_name\", type=\"prepare\"}",
"legend": "prepare",
"refId": "A"
},
{
"expr": "doris_fe_txn_status{group=\"fe\", job=\"$cluster_name\", type=\"precommitted\"}",
"legend": "precommitted",
"refId": "B"
},
{
"expr": "doris_fe_txn_status{group=\"fe\", job=\"$cluster_name\", type=\"committed\"}",
"legend": "committed",
"refId": "C"
},
{
"expr": "doris_fe_txn_status{group=\"fe\", job=\"$cluster_name\", type=\"aborted\"}",
"legend": "aborted",
"refId": "D"
},
{
"expr": "doris_fe_txn_status{group=\"fe\", job=\"$cluster_name\", type=\"visible\"}",
"legend": "visible",
"refId": "E"
},
{
"expr": "doris_fe_txn_status{group=\"fe\", job=\"$cluster_name\", type=\"unknown\"}",
"legend": "unknown",
"refId": "F"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the total received bytes rate of txn. Right Y axes indicates the loaded rows rate of txn.",
"id": "a965e2f4-1210-4a9c-a60c-142e3ef20857",
"layout": {
"h": 6,
"i": "a965e2f4-1210-4a9c-a60c-142e3ef20857",
"isResizable": true,
"w": 5,
"x": 19,
"y": 89
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Txn Load Bytes/Rows rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(doris_be_stream_load{group=\"be\", job=\"$cluster_name\", type=\"receive_bytes\"}[$interval]))",
"legend": "bytes",
"refId": "A"
},
{
"expr": "sum(rate(doris_be_stream_load{group=\"be\", job=\"$cluster_name\", type=\"load_rows\"}[$interval]))",
"legend": "rows",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "d5660739-4863-45ab-8989-321806539fa2",
"layout": {
"h": 1,
"i": "d5660739-4863-45ab-8989-321806539fa2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 95
},
"name": "FE JVM",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "JVM Heap usage of specified Frontend.\nLeft Y Axes shows the used/max heap size.\nRight Y Axes shows the used percentage.",
"id": "4a802abe-2eb9-43b2-80ef-83edc18d69b1",
"layout": {
"h": 6,
"i": "4a802abe-2eb9-43b2-80ef-83edc18d69b1",
"isResizable": true,
"w": 6,
"x": 0,
"y": 96
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] JVM Heap",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_heap_size_bytes{instance=\"$fe_instance\", job=\"$cluster_name\", type=\"used\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_heap_size_bytes{instance=\"$fe_instance\", job=\"$cluster_name\", type=\"max\"}",
"legend": "max",
"refId": "B"
},
{
"expr": "sum(jvm_heap_size_bytes{instance=\"$fe_instance\", job=\"$cluster_name\", type=\"used\"}) * 100 / sum(jvm_heap_size_bytes{instance=\"$fe_instance\", job=\"$cluster_name\", type=\"max\"})",
"legend": "percentage",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "JVM Non Heap usage of specified Frontend.\nLeft Y Axes shows the used/committed non heap size.",
"id": "d9854424-b151-4c38-853a-4526cd31edb9",
"layout": {
"h": 6,
"i": "d9854424-b151-4c38-853a-4526cd31edb9",
"isResizable": true,
"w": 6,
"x": 6,
"y": 96
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] JVM Non Heap",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_non_heap_size_bytes{instance=\"$fe_instance\", job=\"$cluster_name\", type=\"used\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_non_heap_size_bytes{instance=\"$fe_instance\", job=\"$cluster_name\", type=\"committed\"}",
"legend": "committed",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "JVM old generation usage of specified Frontend. Left Y Axes shows the used/max old generation size. Right Y Axes shows the used percentage.\nNormally, the usage percentage should be less than 80%.",
"id": "c30e8716-0e03-4b28-a9a7-4c3ea862f458",
"layout": {
"h": 6,
"i": "c30e8716-0e03-4b28-a9a7-4c3ea862f458",
"isResizable": true,
"w": 6,
"x": 12,
"y": 96
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] JVM Old",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_old_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"used\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_old_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"max\"}",
"legend": "max",
"refId": "B"
},
{
"expr": "sum(jvm_old_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"used\"}) * 100 / sum(jvm_old_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"max\"})",
"legend": "percentage",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "JVM full gc stat of specified Frontend. \nLeft Y Axes shows times of full gc.\nRight Y Axes shows the time cost of each full gc.",
"id": "1ae86a38-9421-4472-b23a-1f41828a24e4",
"layout": {
"h": 6,
"i": "1ae86a38-9421-4472-b23a-1f41828a24e4",
"isResizable": true,
"w": 6,
"x": 18,
"y": 96
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] JVM Old GC",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_old_gc{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"count\"}",
"legend": "count",
"refId": "A"
},
{
"expr": "sum(jvm_old_gc{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"time\"}) / sum(jvm_old_gc{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"count\"})",
"legend": "avg time",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "JVM young generation usage of specified Frontend.\nLeft Y Axes shows the used/max young generation size.\nRight Y Axes shows the used percentage.",
"id": "d6d6155c-470d-4fb6-a4db-fded915c5cef",
"layout": {
"h": 6,
"i": "d6d6155c-470d-4fb6-a4db-fded915c5cef",
"isResizable": true,
"w": 6,
"x": 0,
"y": 102
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] JVM Young",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_young_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"used\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_young_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"max\"}",
"legend": "max",
"refId": "B"
},
{
"expr": "sum(jvm_young_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"used\"}) * 100 / sum(jvm_young_size_bytes{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"max\"})",
"legend": "percentage",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "JVM young gc stat of specified Frontend. \nLeft Y Axes shows times of young gc.\nRight Y Axes shows the time cost of each young gc.",
"id": "1e9c7d1d-f69d-4c95-bfa7-d46014151f21",
"layout": {
"h": 6,
"i": "1e9c7d1d-f69d-4c95-bfa7-d46014151f21",
"isResizable": true,
"w": 6,
"x": 6,
"y": 102
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] [$fe_instance] JVM Young GC",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_young_gc{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"count\"}",
"legend": "count",
"refId": "A"
},
{
"expr": "sum(jvm_young_gc{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"time\"}) / sum(jvm_young_gc{job=\"$cluster_name\", instance=\"$fe_instance\", type=\"count\"})",
"legend": "avg time",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Num of threads of FE JVM",
"id": "dd35a648-f3f2-4ce5-8f3c-315ccc3ad655",
"layout": {
"h": 6,
"i": "dd35a648-f3f2-4ce5-8f3c-315ccc3ad655",
"isResizable": true,
"w": 6,
"x": 12,
"y": 102
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] JVM Threads",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_thread{job=\"$cluster_name\", group=\"fe\", type=\"count\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "e9f5be81-e476-4056-bb0f-065a9082b26c",
"layout": {
"h": 1,
"i": "e9f5be81-e476-4056-bb0f-065a9082b26c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 108
},
"name": "BE",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "CPU idle stat of Backends.\nLow means CPU is busy.",
"id": "d174d1e4-3f66-45c6-b2c8-e0ad12662bd3",
"layout": {
"h": 9,
"i": "d174d1e4-3f66-45c6-b2c8-e0ad12662bd3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 109
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE CPU Idle",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(rate(doris_be_cpu{mode=\"idle\", job=\"$cluster_name\"}[$interval])) by (job, instance)) / (sum(rate(doris_be_cpu{job=\"$cluster_name\"}[$interval])) by (job, instance)) * 100",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Memory usage of Backends.",
"id": "6d71521c-9ef9-4204-aba6-cf2a7659b5cd",
"layout": {
"h": 9,
"i": "6d71521c-9ef9-4204-aba6-cf2a7659b5cd",
"isResizable": true,
"w": 12,
"x": 12,
"y": 109
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Mem",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_be_memory_allocated_bytes{job=\"$cluster_name\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Network send(Left Y)/receive(Right Y) bytes rate of all device except 'lo'",
"id": "48850636-2977-43c0-bac3-2860f6e95eb1",
"layout": {
"h": 7,
"i": "48850636-2977-43c0-bac3-2860f6e95eb1",
"isResizable": true,
"w": 8,
"x": 0,
"y": 118
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Net send/receive bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(doris_be_network_send_bytes{job=\"$cluster_name\", group=\"be\", device!=\"lo\"}[$interval])",
"legend": "{{instance}}-{{device}}-send",
"refId": "A"
},
{
"expr": "irate(doris_be_network_receive_bytes{job=\"$cluster_name\", group=\"be\", device!=\"lo\"}[$interval])",
"legend": "{{instance}}-{{device}}-receive",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Disk capacity usage of Backends",
"id": "3e344063-09e3-44e2-a544-80879a68bb67",
"layout": {
"h": 7,
"i": "3e344063-09e3-44e2-a544-80879a68bb67",
"isResizable": true,
"w": 8,
"x": 8,
"y": 118
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Disk Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(SUM(doris_be_disks_total_capacity{job=\"$cluster_name\"}) by (instance, path) - SUM(doris_be_disks_avail_capacity{job=\"$cluster_name\"}) by (instance, path)) / SUM(doris_be_disks_total_capacity{job=\"$cluster_name\"}) by (instance, path)",
"legend": "{{instance}}:{{path}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Number of tablets of each Backends",
"id": "10bc7e4d-44f0-4ec7-b16a-72eae631fb50",
"layout": {
"h": 7,
"i": "10bc7e4d-44f0-4ec7-b16a-72eae631fb50",
"isResizable": true,
"w": 8,
"x": 16,
"y": 118
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Tablet Distribution",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_fe_tablet_num{job=\"$cluster_name\", instance=\"$fe_master\"}",
"legend": "{{backend}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The file descriptor usage of Backends. Left Y axes shows the used fd num. Right Y axes shows the soft limit open file number.",
"id": "06bfa07c-aacd-44b6-8059-1d859999457d",
"layout": {
"h": 7,
"i": "06bfa07c-aacd-44b6-8059-1d859999457d",
"isResizable": true,
"w": 8,
"x": 0,
"y": 125
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE FD count",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_be_process_fd_num_used{job=\"$cluster_name\", group=\"be\"}",
"legend": "{{instance}}-used",
"refId": "A"
},
{
"expr": "doris_be_process_fd_num_limit_soft{job=\"$cluster_name\", group=\"be\"}",
"legend": "{{instance}}-soft limit",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The thread number of Backends",
"id": "d22e80d3-b055-4520-a667-aeee7b754d42",
"layout": {
"h": 7,
"i": "d22e80d3-b055-4520-a667-aeee7b754d42",
"isResizable": true,
"w": 8,
"x": 8,
"y": 125
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE thread num",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_be_process_thread_num{job=\"$cluster_name\", group=\"be\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "IO util of Backends.\nHigh means I/O is busy.",
"id": "621fec2e-c115-4c11-9420-f96720c0d348",
"layout": {
"h": 7,
"i": "621fec2e-c115-4c11-9420-f96720c0d348",
"isResizable": true,
"w": 8,
"x": 16,
"y": 125
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Disk IO util",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "doris_be_max_disk_io_util_percent",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Base compaction rate of Backends.\nNormally, base compaction only runs between 20:00 to 4:00 and it is configurable.\nRight Y axes indicates the total base compaction bytes.",
"id": "5cf950b7-0b3d-4eab-8f23-b5623cb0c590",
"layout": {
"h": 5,
"i": "5cf950b7-0b3d-4eab-8f23-b5623cb0c590",
"isResizable": true,
"w": 12,
"x": 0,
"y": 132
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Compaction Base",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_be_compaction_bytes_total{type=\"base\", job=\"$cluster_name\"}[$interval])",
"legend": "{{instance}}",
"refId": "A"
},
{
"expr": "sum(doris_be_compaction_bytes_total{type=\"base\", job=\"$cluster_name\"})",
"legend": "Total",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Cumulative compaction rate of Backends.\nRight Y axes indicates the total cumulative compaction bytes.",
"id": "111d1ab7-f1ad-4de3-be45-9015d0ca1967",
"layout": {
"h": 5,
"i": "111d1ab7-f1ad-4de3-be45-9015d0ca1967",
"isResizable": true,
"w": 12,
"x": 12,
"y": 132
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Compaction Cumulate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_be_compaction_bytes_total{type=\"cumulative\", job=\"$cluster_name\"}[$interval])",
"legend": "{{instance}}",
"refId": "A"
},
{
"expr": "SUM(doris_be_compaction_bytes_total{type=\"cumulative\", job=\"$cluster_name\"})",
"legend": "Total",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Loading rate of Backends.\nThis indicates the rate of file downloading in LOADING state of load job(MINI and BROKER load).\nRight Y axes indicates the total rate of file downloading.",
"id": "5ee12bf7-aaab-4f49-baa7-406c3782239d",
"layout": {
"h": 5,
"i": "5ee12bf7-aaab-4f49-baa7-406c3782239d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 137
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Push Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_be_push_request_write_bytes{job=\"$cluster_name\"}[$interval])",
"legend": "{{instance}}",
"refId": "A"
},
{
"expr": "sum(rate(doris_be_push_request_write_bytes{job=\"$cluster_name\"}[$interval]))",
"legend": "Total rate",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Loading rows rate of Backends.\nThis indicates the rate of rows loaded in LOADING state of load job. Right Y axes shows the total push rate of cluster.",
"id": "ba09e247-7df0-42f5-9f08-2352e54a0644",
"layout": {
"h": 5,
"i": "ba09e247-7df0-42f5-9f08-2352e54a0644",
"isResizable": true,
"w": 12,
"x": 12,
"y": 137
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Push Rows",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_be_push_request_write_rows{job=\"$cluster_name\"}[$interval])",
"legend": "{{instance}}",
"refId": "A"
},
{
"expr": "sum(rate(doris_be_push_request_write_rows{job=\"$cluster_name\"}[$interval]))",
"legend": "Total",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes shows the write rate of tablet header saved in rocksdb. Right Y axes shows the duration of each write operation.",
"id": "579dbc8d-11a0-4eda-b5f0-3e8f89d26268",
"layout": {
"h": 10,
"i": "579dbc8d-11a0-4eda-b5f0-3e8f89d26268",
"isResizable": true,
"w": 12,
"x": 0,
"y": 142
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Tablet Meta Write",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(doris_be_meta_request_total{job=\"$cluster_name\", type=\"write\"}[$interval])",
"legend": "{{instance}}-rate",
"refId": "B"
},
{
"expr": "doris_be_meta_request_duration{job=\"$cluster_name\", type=\"write\"} / doris_be_meta_request_total{job=\"$cluster_name\", type=\"write\"}",
"legend": "{{instance}}-latency",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Scan rows rate of Backends.\nThis indicates the read rows rate when processing queries.",
"id": "24202a67-6d39-42b1-afb8-b1058ab6da0c",
"layout": {
"h": 5,
"i": "24202a67-6d39-42b1-afb8-b1058ab6da0c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 142
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] BE Scan Rows",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(doris_be_query_scan_rows{job=\"$cluster_name\"}[$interval])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes shows the read rate of tablet header saved in rocksdb. Right Y axes shows the duration of each read operation.",
"id": "d00679d1-d426-4848-ad94-c51ddae6e46f",
"layout": {
"h": 5,
"i": "d00679d1-d426-4848-ad94-c51ddae6e46f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 147
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Tablet Meta Read",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(doris_be_meta_request_total{job=\"$cluster_name\", type=\"read\"}[$interval])",
"legend": "{{instance}}-rate",
"refId": "B"
},
{
"expr": "doris_be_meta_request_duration{job=\"$cluster_name\", type=\"read\"} / doris_be_meta_request_total{job=\"$cluster_name\", type=\"read\"}",
"legend": "{{instance}}-latency",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "c0a6e025-7ed2-4a81-9459-ce37715aede9",
"layout": {
"h": 1,
"i": "c0a6e025-7ed2-4a81-9459-ce37715aede9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 152
},
"name": "BE tasks",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "100bce80-d34a-4244-8b85-627499710294",
"layout": {
"h": 6,
"i": "100bce80-d34a-4244-8b85-627499710294",
"isResizable": true,
"w": 8,
"x": 0,
"y": 153
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Tablets Report",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"report_all_tablets\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"report_all_tablets\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "62b0082f-e377-43f5-a0e5-98049379d152",
"layout": {
"h": 6,
"i": "62b0082f-e377-43f5-a0e5-98049379d152",
"isResizable": true,
"w": 8,
"x": 8,
"y": 153
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Single Tablet Report",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"report_tablet\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"report_tablet\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "bd866f1a-7bfb-44d0-a4b4-4f4f2b2cd267",
"layout": {
"h": 6,
"i": "bd866f1a-7bfb-44d0-a4b4-4f4f2b2cd267",
"isResizable": true,
"w": 8,
"x": 16,
"y": 153
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Finish task report",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"finish_task\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"finish_task\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "b22c2e79-a7fa-4675-a256-6e032ecd9b74",
"layout": {
"h": 6,
"i": "b22c2e79-a7fa-4675-a256-6e032ecd9b74",
"isResizable": true,
"w": 8,
"x": 0,
"y": 159
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Push Task",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(doris_be_push_requests_total{job=\"$cluster_name\", status=\"SUCCESS\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_push_requests_total{job=\"$cluster_name\", status=\"FAIL\"}[$interval])",
"legend": "{{instance}}-failed",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "The average cost time of push tasks on each Backend.",
"id": "94ca89bb-8c46-4f30-8ac3-14981b09208d",
"layout": {
"h": 6,
"i": "94ca89bb-8c46-4f30-8ac3-14981b09208d",
"isResizable": true,
"w": 8,
"x": 8,
"y": 159
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Push Task Cost Time",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(doris_be_push_request_duration_us{job=\"$cluster_name\"}[$interval])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "d2df89c0-808c-4032-8f69-473e02645208",
"layout": {
"h": 6,
"i": "d2df89c0-808c-4032-8f69-473e02645208",
"isResizable": true,
"w": 8,
"x": 16,
"y": 159
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Delete",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"delete\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"delete\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "360dd4ef-b945-4948-8c3c-73311392380d",
"layout": {
"h": 6,
"i": "360dd4ef-b945-4948-8c3c-73311392380d",
"isResizable": true,
"w": 8,
"x": 0,
"y": 165
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Base Compaction",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"base_compaction\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"base_compaction\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "f532a20d-6207-4287-a656-a86347f84e69",
"layout": {
"h": 6,
"i": "f532a20d-6207-4287-a656-a86347f84e69",
"isResizable": true,
"w": 8,
"x": 8,
"y": 165
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Cumulative Compaction",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"cumulative_compaction\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"cumulative_compaction\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "e38c661a-ebd5-461d-8821-e2a29fc5fffb",
"layout": {
"h": 6,
"i": "e38c661a-ebd5-461d-8821-e2a29fc5fffb",
"isResizable": true,
"w": 8,
"x": 16,
"y": 165
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Clone",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"clone\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"clone\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "6f07f466-a195-4232-ba97-4694e4d74f4c",
"layout": {
"h": 6,
"i": "6f07f466-a195-4232-ba97-4694e4d74f4c",
"isResizable": true,
"w": 8,
"x": 0,
"y": 171
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Create rollup",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"create_rollup\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"create_rollup\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "f79f8151-34dc-4f8c-a08c-1c0af04cd0b7",
"layout": {
"h": 6,
"i": "f79f8151-34dc-4f8c-a08c-1c0af04cd0b7",
"isResizable": true,
"w": 8,
"x": 8,
"y": 171
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Schema change",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"schema_change\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"schema_change\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_DORIS}",
"description": "Left Y axes indicates the failure rate of specified tasks. Normally, it should be 0.\nRight Y axes indicates the total number of specified tasks in all Backends.",
"id": "fe07e069-210e-410c-9781-d71bf4086803",
"layout": {
"h": 6,
"i": "fe07e069-210e-410c-9781-d71bf4086803",
"isResizable": true,
"w": 8,
"x": 16,
"y": 171
},
"links": [],
"maxPerRow": 4,
"name": "[$cluster_name] Create tablet",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "SUM(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"create_tablet\", status=\"total\"})",
"legend": "Total",
"refId": "A"
},
{
"expr": "irate(doris_be_engine_requests_total{job=\"$cluster_name\", type=\"create_tablet\", status=\"failed\"}[$interval])",
"legend": "{{instance}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_DORIS",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_DORIS}"
},
"definition": "label_values(up, job)",
"hide": false,
"multi": false,
"name": "cluster_name",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_DORIS}"
},
"definition": "query_result(node_info{group=\"fe\", job=\"$cluster_name\", type=\"is_master\"})",
"hide": false,
"multi": false,
"name": "fe_master",
"reg": "/instance=\"(.+:\\d+)\"/",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_DORIS}"
},
"definition": "up{group=\"fe\", job=\"$cluster_name\"}",
"hide": false,
"multi": false,
"name": "fe_instance",
"reg": "/instance=\"(.+:\\d+)/",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_DORIS}"
},
"definition": "up{group=\"be\", job=\"$cluster_name\"}",
"hide": false,
"multi": false,
"name": "be_instance",
"reg": "/instance=\"(.+:\\d+)/",
"type": "query"
},
{
"definition": "1s,5s,1m,5m,1h,6h,1d",
"hide": false,
"name": "interval",
"type": "custom"
}
],
"version": "3.0.0"
},
"uuid": 1731586085431795000
}
================================================
FILE: integrations/Doris/markdown/README.md
================================================
# Doris
Doris 的 FE 和 BE 进程都会暴露 `/metrics` 接口,并通过该接口提供 Prometheus 协议的监控数据。
## 采集配置
采集配置写在 categraf 的 `conf/input.prometheus/prometheus.toml` 中。因为 Doris 暴露的是 Prometheus 协议的监控数据,所以使用 categraf 的 prometheus 插件即可采集。
```toml
# doris_fe
[[instances]]
urls = [
"http://127.0.0.1:8030/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = { group = "fe",job = "doris_cluster01"}
# doris_be
[[instances]]
urls = [
"http://127.0.0.1:8040/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
labels = { group = "be",job = "doris_cluster01"}
```
================================================
FILE: integrations/Elasticsearch/alerts/elasticsearch_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elastic Cluster Red status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327354100000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elastic Cluster Yellow status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327354616000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_in_bytes * 100 \u003c 10",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327355180000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch disk space low of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_in_bytes * 100 \u003c 20",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327355686000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 \u003e 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327356156000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327356669000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_initializing_shards \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327357308000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch no new documents of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327357752000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327358275000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_relocating_shards \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327358785000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_unassigned_shards \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327359272000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes \u003c number_of_data_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327359819000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_number_of_nodes \u003c number_of_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327360313000
}
]
================================================
FILE: integrations/Elasticsearch/alerts/elasticsearch_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elastic Cluster Red status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_status{color=\"red\"} == 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterRed"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327361785000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elastic Cluster Yellow status",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_status{color=\"yellow\"} == 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchClusterYellow"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327362329000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch disk out of space of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 \u003c 10",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskOutOfSpace"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327362792000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch disk space low of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 \u003c 20",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchDiskSpaceLow"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327363280000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Heap Usage Too High of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 \u003e 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageTooHigh"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327363865000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Heap Usage warning of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"}) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHeapUsageWarning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327364319000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch initializing shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_initializing_shards \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchInitializingShards"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327364769000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch no new documents of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(elasticsearch_indices_docs{es_data_node=\"true\"}[5m]) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchNoNewDocuments"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327365271000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch pending tasks of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_pending_tasks \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchPendingTasks"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327365726000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch relocation shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_relocating_shards \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchRelocationShards"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327366204000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch unassigned shards of the instance",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_unassigned_shards \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchUnassignedShards"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327366970000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Unhealthy Data Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "elasticsearch_cluster_health_number_of_data_nodes \u003c number_of_data_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyDataNodes"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327367398000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Elasticsearch Unhealthy Nodes",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": " elasticsearch_cluster_health_number_of_nodes \u003c number_of_nodes",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=ElasticsearchHealthyNodes"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327367874000
}
]
================================================
FILE: integrations/Elasticsearch/collect/elasticsearch/elasticsearch.toml
================================================
# # collect interval
# interval = 15
############################################################################
# !!! uncomment [[instances]] to enable this plugin
[[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# append some labels to metrics
# labels = { cluster="cloud-n9e-es" }
## specify a list of one or more Elasticsearch servers
# servers = ["http://localhost:9200"]
servers = []
## Timeout for HTTP requests to the Elasticsearch server(s)
http_timeout = "10s"
# When local is true, only the stats of the node being queried are read (/_nodes/_local/stats); when false, stats of all nodes in the cluster are read (/_nodes/stats)
local = false
## Set cluster_health to true when you want to obtain cluster health stats
cluster_health = true
## Adjust cluster_health_level when you want to obtain detailed health stats
## The options are
## - indices (default)
## - cluster
cluster_health_level = "cluster"
## Set cluster_stats to true when you want to obtain cluster stats.
cluster_stats = true
## Indices to collect; can be one or more indices names or _all
## Use of wildcards is allowed. Use a wildcard at the end to retrieve index names that end with a changing value, like a date.
# indices_include = ["zipkin*"]
## Set to "shards" to collect shard-level stats, or leave as a blank string for index-level stats
indices_level = ""
## node_stats is the list of sub-stats to gather. Valid options
## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http",
## "breaker". By default, all stats are gathered.
node_stats = ["jvm", "breaker", "process", "os", "fs", "indices", "thread_pool", "transport"]
## HTTP Basic Authentication username and password.
username = "elastic"
password = "password"
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
## Sets the number of most recent indices to return for indices that are configured with a date-stamped suffix.
## Each 'indices_include' entry ending with a wildcard (*) or glob matching pattern groups together all indices that match it and
## sorts them by the date or number after the wildcard. Metrics are then gathered only for the 'num_most_recent_indices' most
## recent indices in each group.
num_most_recent_indices = 1
================================================
FILE: integrations/Elasticsearch/dashboards/elasticsearch_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ElasticSearch",
"ident": "",
"tags": "ElasticSearch Prometheus",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"layout": {
"h": 3,
"i": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"isResizable": true,
"w": 4,
"x": 0,
"y": 0
},
"name": "",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"text": "N/A"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "#417505",
"text": "Green"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "#f5a623",
"text": "Yellow"
},
"type": "special"
},
{
"match": {
"special": 3
},
"result": {
"color": "#d0021b",
"text": "Red"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_status_code{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7dafe232-ee30-479b-a2f1-e1064572c154",
"layout": {
"h": 3,
"i": "7dafe232-ee30-479b-a2f1-e1064572c154",
"isResizable": true,
"w": 4,
"x": 4,
"y": 0
},
"name": "Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_nodes{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "447fb784-a7e4-41cf-820f-6086837590e6",
"layout": {
"h": 3,
"i": "c6953ef5-3e29-44dc-bf9e-74905934e9df",
"isResizable": true,
"w": 4,
"x": 8,
"y": 0
},
"name": "Data Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_data_nodes{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f0375f72-4ca1-474f-81e9-ce6b64f22204",
"layout": {
"h": 3,
"i": "e6cf29e7-bb5d-4c8f-8aa6-67a63fc325c7",
"isResizable": true,
"w": 4,
"x": 12,
"y": 0
},
"name": "CPU Util Percent",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_process_cpu_percent{service =\"$service\", node_host=~\"$node_host\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "288ee5f1-b484-43f5-86bf-5b81c01b3c2c",
"layout": {
"h": 3,
"i": "34ae1975-6acb-48a7-adce-1d67b7c581ec",
"isResizable": true,
"w": 4,
"x": 16,
"y": 0
},
"name": "JVM Heap Util Percent",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_jvm_mem_heap_used_percent{service =\"$service\", node_host=~\"$node_host\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4dd345c1-2bc1-474e-83b1-153be10a5b5b",
"layout": {
"h": 3,
"i": "01c403f3-c3b0-4910-84df-a50d4968bcd6",
"isResizable": true,
"w": 4,
"x": 20,
"y": 0
},
"name": "Pending Tasks",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_number_of_pending_tasks{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"layout": {
"h": 1,
"i": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 3
},
"name": "Breakers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.04,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"layout": {
"h": 4,
"i": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"name": "Tripped for breakers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum({__name__=~\"elasticsearch_breakers_.+_tripped\", service =\"$service\", node_host=~\"$node_host\"}) by (node_host)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.04,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8adbc8e4-f630-4a25-98e3-ee03dec92011",
"layout": {
"h": 4,
"i": "d05d16d0-022d-49f8-9b55-2388c4cbb2b1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"name": "Estimated size in bytes of breaker",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_breakers_.+_size_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"layout": {
"h": 1,
"i": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 8
},
"name": "Shards",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7aec074e-1672-4dbb-8529-28292f9a4221",
"layout": {
"h": 3,
"i": "7aec074e-1672-4dbb-8529-28292f9a4221",
"isResizable": true,
"w": 4,
"x": 0,
"y": 9
},
"name": "Active shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f138daa7-b98f-4575-89e3-42363a8102c9",
"layout": {
"h": 3,
"i": "fe82bb33-7b8d-4909-adda-64a4121f29fd",
"isResizable": true,
"w": 4,
"x": 4,
"y": 9
},
"name": "Active primary shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_primary_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7412543a-dba5-4624-96ff-11e30b7e8ff4",
"layout": {
"h": 3,
"i": "de2c46fd-dcfd-43a3-847b-9fd1320dfaa7",
"isResizable": true,
"w": 4,
"x": 8,
"y": 9
},
"name": "Initializing shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_initializing_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2f26f24f-2a79-4552-b79d-60b41fa3aee6",
"layout": {
"h": 3,
"i": "4403206d-a491-4564-9f61-db25a6beb356",
"isResizable": true,
"w": 4,
"x": 12,
"y": 9
},
"name": "Relocating shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_relocating_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "394a83cc-f4e1-467e-83fa-b77d2c2be907",
"layout": {
"h": 3,
"i": "e24c847a-5704-4b7c-861e-75dd4e4b59d8",
"isResizable": true,
"w": 4,
"x": 16,
"y": 9
},
"name": "Delayed Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_delayed_unassigned_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "720b9719-5c37-44d9-bce8-539308afa6ae",
"layout": {
"h": 3,
"i": "d82314d5-028c-41fb-a79f-34699d56d17a",
"isResizable": true,
"w": 4,
"x": 20,
"y": 9
},
"name": "Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_unassigned_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"layout": {
"h": 1,
"i": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "JVM",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"layout": {
"h": 4,
"i": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "GC counts / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_count{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "old gc {{node_host}}",
"refId": "A"
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_count{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "young gc {{node_host}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5c361278-8a94-4b16-afdd-e6def804b9ff",
"layout": {
"h": 4,
"i": "4f21ebfc-b51c-469b-b149-479966750920",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "GC time in millis",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_time_in_millis{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "old gc {{node_host}}",
"refId": "A"
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_time_in_millis{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "young gc {{node_host}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ff81d109-79e5-4909-8765-857a75cebf17",
"layout": {
"h": 4,
"i": "5105f1dc-26cb-4818-a04d-90f2e5803da2",
"isResizable": true,
"w": 6,
"x": 0,
"y": 17
},
"name": "Yong Used(Bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_pools_young_used_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d7c76456-8f34-4e1b-843b-9d174bbdfcee",
"layout": {
"h": 4,
"i": "86841663-2a17-4858-a9e1-13c296b3bb76",
"isResizable": true,
"w": 6,
"x": 6,
"y": 17
},
"name": "Old Used(Bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_pools_old_used_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a0096936-3790-40a1-b2ad-d7805945b948",
"layout": {
"h": 4,
"i": "14f655ac-9c1c-40fa-bfef-158cc8601ead",
"isResizable": true,
"w": 12,
"x": 12,
"y": 17
},
"name": "Committed Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_.+_committed_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"layout": {
"h": 1,
"i": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "Translog",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45aafb11-c694-4686-89ab-685068f91560",
"layout": {
"h": 4,
"i": "45aafb11-c694-4686-89ab-685068f91560",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"name": "Total translog operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_operations{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "09ca6329-8eec-4a61-b19e-9bbeea2b9712",
"layout": {
"h": 4,
"i": "56806f8a-525a-4ab4-a9d3-c83559ae4828",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"name": "Total translog size in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"layout": {
"h": 1,
"i": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 26
},
"name": "Disk and Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"layout": {
"h": 4,
"i": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"isResizable": true,
"w": 12,
"x": 0,
"y": 27
},
"name": "Disk usage %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "1-(elasticsearch_fs_total_available_in_bytes{service =\"$service\",node_host=~\"$node_host\"}/elasticsearch_fs_total_total_in_bytes{service =\"$service\",node_host=~\"$node_host\"})",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"layout": {
"h": 4,
"i": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"isResizable": true,
"w": 12,
"x": 12,
"y": 27
},
"name": "Network usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_transport_tx_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}: sent",
"refId": "A"
},
{
"expr": "-irate(elasticsearch_transport_rx_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}: received",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"layout": {
"h": 1,
"i": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 31
},
"name": "Documents",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"layout": {
"h": 4,
"i": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "Documents count on node",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_docs_count{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"layout": {
"h": 4,
"i": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "Documents indexed rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_total{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"layout": {
"h": 4,
"i": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 36
},
"name": "Documents deleted rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_docs_deleted{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"layout": {
"h": 4,
"i": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"isResizable": true,
"w": 6,
"x": 12,
"y": 36
},
"name": "Documents merged rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_merges_total_docs{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7cc04ae4-946d-4837-9ea9-764a7cc2eecd",
"layout": {
"h": 4,
"i": "97b5d900-e91e-4e0e-8184-f508a3433bc6",
"isResizable": true,
"w": 6,
"x": 18,
"y": 36
},
"name": "Documents merged bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"layout": {
"h": 1,
"i": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"isResizable": false,
"w": 24,
"x": 0,
"y": 40
},
"name": "Times",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ad0445b0-8539-440d-bbf4-712450132a7a",
"layout": {
"h": 4,
"i": "ad0445b0-8539-440d-bbf4-712450132a7a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 41
},
"name": "Query time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_search_query_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c3cf6c57-c4ce-4bc2-a150-df32c4951144",
"layout": {
"h": 4,
"i": "2af98dc1-f24e-4c7b-bd2c-723224facc5d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 41
},
"name": "Indexing time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "022db454-70ba-49f5-8c11-f89b76d145cb",
"layout": {
"h": 4,
"i": "553c7da8-2d83-4ea0-a6ef-b064a5101633",
"isResizable": true,
"w": 12,
"x": 0,
"y": 45
},
"name": "Merging time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f20bad4f-656c-428a-a1cf-aafb7d92137c",
"layout": {
"h": 4,
"i": "51056e8d-6dc8-4c7f-91e9-9c24c056462d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 45
},
"name": "Indexing throttle time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_throttle_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"layout": {
"h": 1,
"i": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 49
},
"name": "Thread Pool",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"layout": {
"h": 4,
"i": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"isResizable": true,
"w": 6,
"x": 0,
"y": 50
},
"name": "Thread Pool operations rejected",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_rejected\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_rejected\")[5m:])",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0ab67903-16ea-4001-b784-ae04d8b815c0",
"layout": {
"h": 4,
"i": "793e98e7-2729-4106-940c-ecccff1d4b89",
"isResizable": true,
"w": 6,
"x": 6,
"y": 50
},
"name": "Thread Pool threads active",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_active\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_active\")",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "bb5dc07d-673b-4e2d-b44c-441acfa7c27b",
"layout": {
"h": 4,
"i": "9a14c86a-86af-4464-ac82-41f621ce7166",
"isResizable": true,
"w": 6,
"x": 12,
"y": 50
},
"name": "Thread Pool threads queued",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_queue\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_queue\")",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4cac1498-c141-483f-97c6-e1177317a2ea",
"layout": {
"h": 4,
"i": "d5f42ea7-bdb5-44da-9ba6-7c0f09ba7c71",
"isResizable": true,
"w": 6,
"x": 18,
"y": 50
},
"name": "Thread Pool operations completed",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_completed\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_completed\")[5m:])",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"layout": {
"h": 1,
"i": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"isResizable": false,
"w": 24,
"x": 0,
"y": 54
},
"name": "Caches",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5247f393-a934-4d9e-be0f-40b177d2be80",
"layout": {
"h": 4,
"i": "5247f393-a934-4d9e-be0f-40b177d2be80",
"isResizable": true,
"w": 4,
"x": 0,
"y": 55
},
"name": "Field data memory size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_fielddata_memory_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b87c56f7-4e50-4d15-8bcd-1218fee879d9",
"layout": {
"h": 4,
"i": "c33fceb6-df37-483e-ba53-4ffa5f5e5456",
"isResizable": true,
"w": 4,
"x": 4,
"y": 55
},
"name": "Field data evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_fielddata_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ae2d0a7a-b6cd-4fd5-99d4-3c4289b8b5a8",
"layout": {
"h": 4,
"i": "445484f4-32d3-4569-af8d-76790d0aa56b",
"isResizable": true,
"w": 4,
"x": 8,
"y": 55
},
"name": "Query cache size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_query_cache_memory_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78e4badc-8d51-4aa6-81c5-d1c9183810a2",
"layout": {
"h": 4,
"i": "ce9aa255-9d5b-44ed-9071-85e9d95675ec",
"isResizable": true,
"w": 4,
"x": 12,
"y": 55
},
"name": "Query cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_query_cache_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "296b43f1-2f33-492a-bce8-6f0fde1e7b52",
"layout": {
"h": 4,
"i": "b8b2604c-d84f-426f-b033-af9035a9e80d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 55
},
"name": "Request cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_request_cache_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"layout": {
"h": 1,
"i": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"isResizable": false,
"w": 24,
"x": 0,
"y": 59
},
"name": "Segments",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"layout": {
"h": 4,
"i": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"isResizable": true,
"w": 12,
"x": 0,
"y": 60
},
"name": "Count of index segments",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_count{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a1c34fa4-4549-41a6-8d31-d25e7d860106",
"layout": {
"h": 4,
"i": "9c8efed6-7ced-4805-87e0-3da3b18d2989",
"isResizable": true,
"w": 12,
"x": 12,
"y": 60
},
"name": "Current memory size of segments in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_memory_in_bytes{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_up, service)",
"name": "service",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_jvm_uptime_in_millis{service =\"$service\"}, node_host)",
"multi": true,
"name": "node_host",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327369517000
}
================================================
FILE: integrations/Elasticsearch/dashboards/elasticsearch_by_categraf_0.3.102.json
================================================
{
"name": "Elasticsearch Dashboard",
"tags": "Elasticsearch Categraf Categraf_version>0.3.102",
"ident": "",
"uuid": 1755618101588000,
"configs": {
"panels": [
{
"type": "row",
"id": "16c2d8ff-6596-48c6-bd81-56e01024c146",
"name": "KPI",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "16c2d8ff-6596-48c6-bd81-56e01024c146",
"isResizable": false
},
"panels": []
},
{
"type": "stat",
"id": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"layout": {
"h": 3,
"w": 4,
"x": 0,
"y": 1,
"i": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "elasticsearch_cluster_health_status{cluster=\"$cluster\", color=\"green\"} == 1 or (elasticsearch_cluster_health_status{cluster=\"$cluster\", color=\"yellow\"} == 1)+1 or (elasticsearch_cluster_health_status{cluster=\"$cluster\", color=\"red\"} == 1)+2",
"refId": "A",
"maxDataPoints": 240,
"instant": true,
"legend": "{{color}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Cluster health",
"description": "绿色:健康,黄色:存在副本分片异常,红色:存在主分片异常",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {},
"orientation": "horizontal"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(99, 76, 217, 1)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"type": "special",
"result": {
"color": "rgba(65, 117, 5, 1)",
"text": "Green"
},
"match": {
"textValue": "green",
"special": 1
}
},
{
"type": "special",
"result": {
"color": "rgba(245, 166, 35, 1)",
"text": "Yellow"
},
"match": {
"textValue": "yellow",
"special": 2
}
},
{
"type": "special",
"result": {
"color": "rgba(208, 2, 27, 1)",
"text": "Red"
},
"match": {
"textValue": "red",
"special": 3
}
},
{
"type": "specialValue",
"result": {
"color": "rgba(99, 76, 217, 1)",
"text": "N/A"
},
"match": {
"specialValue": "null"
}
}
],
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "B"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": []
}
}
]
},
{
"type": "stat",
"id": "7dafe232-ee30-479b-a2f1-e1064572c154",
"layout": {
"h": 3,
"w": 4,
"x": 4,
"y": 1,
"i": "7dafe232-ee30-479b-a2f1-e1064572c154",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_nodes{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240,
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Nodes",
"description": "集群节点数量",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"value": null
},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "447fb784-a7e4-41cf-820f-6086837590e6",
"layout": {
"h": 3,
"w": 4,
"x": 8,
"y": 1,
"i": "c6953ef5-3e29-44dc-bf9e-74905934e9df",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_data_nodes{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Data Nodes",
"description": "集群数据节点数量",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "f0375f72-4ca1-474f-81e9-ce6b64f22204",
"layout": {
"h": 3,
"w": 4,
"x": 12,
"y": 1,
"i": "e6cf29e7-bb5d-4c8f-8aa6-67a63fc325c7",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_process_cpu_percent{cluster=\"$cluster\", host=~\"$host\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU usage Max",
"links": [],
"description": "",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
],
"standardOptions": {
"util": "percent"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "288ee5f1-b484-43f5-86bf-5b81c01b3c2c",
"layout": {
"h": 3,
"w": 4,
"x": 16,
"y": 1,
"i": "34ae1975-6acb-48a7-adce-1d67b7c581ec",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_jvm_mem_heap_used_percent{cluster=\"$cluster\", host=~\"$host\"})",
"refId": "A",
"maxDataPoints": 240,
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Heap Util Percent Max",
"description": "",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
],
"standardOptions": {
"util": "percent"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "4dd345c1-2bc1-474e-83b1-153be10a5b5b",
"layout": {
"h": 3,
"w": 4,
"x": 20,
"y": 1,
"i": "01c403f3-c3b0-4910-84df-a50d4968bcd6",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_number_of_pending_tasks{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Pending Tasks",
"description": "",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
],
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"collapsed": true,
"id": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 4,
"i": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"isResizable": false
},
"name": "Breakers",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 5,
"i": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "sum(elasticsearch_breakers_tripped{cluster=\"$cluster\", host=~\"$host\"}) by (host, breaker)",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Tripped for breakers",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.04,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "8adbc8e4-f630-4a25-98e3-ee03dec92011",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 5,
"i": "d05d16d0-022d-49f8-9b55-2388c4cbb2b1",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "{__name__=~\"elasticsearch_breakers_.+_size_in_bytes\", cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{__name__}} {{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Estimated size in bytes of breaker",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.04,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 9,
"i": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"isResizable": false
},
"name": "Shards",
"panels": [],
"type": "row"
},
{
"type": "stat",
"id": "7aec074e-1672-4dbb-8529-28292f9a4221",
"layout": {
"h": 3,
"w": 4,
"x": 0,
"y": 10,
"i": "7aec074e-1672-4dbb-8529-28292f9a4221",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_shards{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Active shards",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "f138daa7-b98f-4575-89e3-42363a8102c9",
"layout": {
"h": 3,
"w": 4,
"x": 4,
"y": 10,
"i": "fe82bb33-7b8d-4909-adda-64a4121f29fd",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_primary_shards{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Active primary shards",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "7412543a-dba5-4624-96ff-11e30b7e8ff4",
"layout": {
"h": 3,
"w": 4,
"x": 8,
"y": 10,
"i": "de2c46fd-dcfd-43a3-847b-9fd1320dfaa7",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_initializing_shards{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Initializing shards",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "2f26f24f-2a79-4552-b79d-60b41fa3aee6",
"layout": {
"h": 3,
"w": 4,
"x": 12,
"y": 10,
"i": "4403206d-a491-4564-9f61-db25a6beb356",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_relocating_shards{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Relocating shards",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "394a83cc-f4e1-467e-83fa-b77d2c2be907",
"layout": {
"h": 3,
"w": 4,
"x": 16,
"y": 10,
"i": "e24c847a-5704-4b7c-861e-75dd4e4b59d8",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_delayed_unassigned_shards{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Delayed Unassigned shards",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "720b9719-5c37-44d9-bce8-539308afa6ae",
"layout": {
"h": 3,
"w": 4,
"x": 20,
"y": 10,
"i": "d82314d5-028c-41fb-a79f-34699d56d17a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "max(elasticsearch_cluster_health_unassigned_shards{cluster=\"$cluster\"})",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Unassigned shards",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"collapsed": true,
"id": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 13,
"i": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"isResizable": false
},
"name": "JVM",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 14,
"i": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collection_seconds_count{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_count{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "young gc {{host}}",
"refId": "B",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "GC counts / second",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "5c361278-8a94-4b16-afdd-e6def804b9ff",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 14,
"i": "4f21ebfc-b51c-469b-b149-479966750920",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collection_seconds_sum{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_time_in_millis{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "young gc {{host}}",
"refId": "B",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "GC time in seconds",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "ff81d109-79e5-4909-8765-857a75cebf17",
"layout": {
"h": 4,
"w": 6,
"x": 0,
"y": 18,
"i": "5105f1dc-26cb-4818-a04d-90f2e5803da2",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "elasticsearch_jvm_memory_pools_young_used_in_bytes{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Young Used(Bytes)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "d7c76456-8f34-4e1b-843b-9d174bbdfcee",
"layout": {
"h": 4,
"w": 6,
"x": 6,
"y": 18,
"i": "86841663-2a17-4858-a9e1-13c296b3bb76",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "elasticsearch_jvm_memory_pools_old_used_in_bytes{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Old Used(Bytes)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a0096936-3790-40a1-b2ad-d7805945b948",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 18,
"i": "14f655ac-9c1c-40fa-bfef-158cc8601ead",
"isResizable": true
},
"name": "Committed Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_.+_committed_in_bytes\", cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 22,
"i": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"isResizable": false
},
"name": "Translog",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45aafb11-c694-4686-89ab-685068f91560",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 23,
"i": "45aafb11-c694-4686-89ab-685068f91560",
"isResizable": true
},
"name": "Total translog operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_operations{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "09ca6329-8eec-4a61-b19e-9bbeea2b9712",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 23,
"i": "56806f8a-525a-4ab4-a9d3-c83559ae4828",
"isResizable": true
},
"name": "Total translog size in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_size_in_bytes{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 27,
"i": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"isResizable": false
},
"name": "Disk and Network",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 28,
"i": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "1-(elasticsearch_filesystem_data_free_bytes{cluster=\"$cluster\",host=~\"$host\"}/elasticsearch_filesystem_data_size_in_bytes{cluster=\"$cluster\",host=~\"$host\"})",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Disk usage %",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 28,
"i": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_transport_tx_size_in_bytes_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}: sent",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "-irate(elasticsearch_transport_rx_size_in_bytes_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}: received",
"refId": "B",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Network usage",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 32,
"i": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"isResizable": false
},
"name": "Documents",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 33,
"i": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"isResizable": true
},
"name": "Documents count on node",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_docs_count{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 33,
"i": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"isResizable": true
},
"name": "Documents indexed rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 37,
"i": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"isResizable": true
},
"name": "Documents deleted rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_docs_deleted{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"type": "timeseries",
"id": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"layout": {
"h": 4,
"w": 6,
"x": 12,
"y": 37,
"i": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "rate(elasticsearch_indices_merges_docs_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Documents merged rate",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7cc04ae4-946d-4837-9ea9-764a7cc2eecd",
"layout": {
"h": 4,
"w": 6,
"x": 18,
"y": 37,
"i": "97b5d900-e91e-4e0e-8184-f508a3433bc6",
"isResizable": true
},
"name": "Documents merged bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_size_in_bytes{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 41,
"i": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"isResizable": false
},
"name": "Times",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "ad0445b0-8539-440d-bbf4-712450132a7a",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 42,
"i": "ad0445b0-8539-440d-bbf4-712450132a7a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_indices_search_query_time_seconds{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Query time(Unit: s)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "c3cf6c57-c4ce-4bc2-a150-df32c4951144",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 42,
"i": "2af98dc1-f24e-4c7b-bd2c-723224facc5d",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_time_seconds_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Indexing time(Unit: s)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "022db454-70ba-49f5-8c11-f89b76d145cb",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 46,
"i": "553c7da8-2d83-4ea0-a6ef-b064a5101633",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_time_seconds_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240,
"step": null
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Merging time(Unit: s)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "f20bad4f-656c-428a-a1cf-aafb7d92137c",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 46,
"i": "51056e8d-6dc8-4c7f-91e9-9c24c056462d",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_throttle_time_seconds_total{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Indexing throttle time(Unit: s)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 50,
"i": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"isResizable": false
},
"name": "Thread Pool",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"layout": {
"h": 4,
"w": 6,
"x": 0,
"y": 51,
"i": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_thread_pool_rejected_count{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}: {{type}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Thread Pool operations rejected",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "0ab67903-16ea-4001-b784-ae04d8b815c0",
"layout": {
"h": 4,
"w": 6,
"x": 6,
"y": 51,
"i": "793e98e7-2729-4106-940c-ecccff1d4b89",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "elasticsearch_thread_pool_active_count{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}: {{type}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Thread Pool threads active",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "bb5dc07d-673b-4e2d-b44c-441acfa7c27b",
"layout": {
"h": 4,
"w": 6,
"x": 12,
"y": 51,
"i": "9a14c86a-86af-4464-ac82-41f621ce7166",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "elasticsearch_thread_pool_queue_count{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}: {{type}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Thread Pool threads queued",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "4cac1498-c141-483f-97c6-e1177317a2ea",
"layout": {
"h": 4,
"w": 6,
"x": 18,
"y": 51,
"i": "d5f42ea7-bdb5-44da-9ba6-7c0f09ba7c71",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "irate(elasticsearch_thread_pool_completed_count{cluster=\"$cluster\",host=~\"$host\"}[5m])",
"legend": "{{host}}: {{type}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Thread Pool operations completed",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 55,
"i": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"isResizable": false
},
"name": "Caches",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5247f393-a934-4d9e-be0f-40b177d2be80",
"layout": {
"h": 4,
"w": 4,
"x": 0,
"y": 56,
"i": "5247f393-a934-4d9e-be0f-40b177d2be80",
"isResizable": true
},
"name": "Field data memory size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_fielddata_memory_size_in_bytes{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b87c56f7-4e50-4d15-8bcd-1218fee879d9",
"layout": {
"h": 4,
"w": 4,
"x": 4,
"y": 56,
"i": "c33fceb6-df37-483e-ba53-4ffa5f5e5456",
"isResizable": true
},
"name": "Field data evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_fielddata_evictions{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ae2d0a7a-b6cd-4fd5-99d4-3c4289b8b5a8",
"layout": {
"h": 4,
"w": 4,
"x": 8,
"y": 56,
"i": "445484f4-32d3-4569-af8d-76790d0aa56b",
"isResizable": true
},
"name": "Query cache size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_query_cache_memory_size_in_bytes{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78e4badc-8d51-4aa6-81c5-d1c9183810a2",
"layout": {
"h": 4,
"w": 4,
"x": 12,
"y": 56,
"i": "ce9aa255-9d5b-44ed-9071-85e9d95675ec",
"isResizable": true
},
"name": "Query cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_query_cache_evictions{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "296b43f1-2f33-492a-bce8-6f0fde1e7b52",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 56,
"i": "b8b2604c-d84f-426f-b033-af9035a9e80d",
"isResizable": true
},
"name": "Request cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_request_cache_evictions{cluster=\"$cluster\", host=~\"$host\"}[5m])",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 60,
"i": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"isResizable": false
},
"name": "Segments",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 61,
"i": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"isResizable": true
},
"name": "Count of index segments",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_count{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a1c34fa4-4549-41a6-8d31-d25e7d860106",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 61,
"i": "9c8efed6-7ced-4805-87e0-3da3b18d2989",
"isResizable": true
},
"name": "Current memory size of segments in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_memory_in_bytes{cluster=\"$cluster\", host=~\"$host\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"name": "datasource",
"label": "Datasource",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "cluster",
"label": "Cluster",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_cluster_health_status, cluster)"
},
{
"name": "host",
"label": "Host",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_process_cpu_seconds_total{cluster=\"$cluster\"}, host)",
"multi": true,
"allOption": true,
"allValue": ".*"
},
{
"name": "name",
"label": "Name",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_jvm_gc_collection_seconds_count{cluster=\"$cluster\", host=~\"$host\"}, name)",
"reg": "",
"multi": true,
"allOption": true
}
],
"version": "3.0.0",
"graphTooltip": "default",
"graphZoom": "default"
}
}
================================================
FILE: integrations/Elasticsearch/dashboards/elasticsearch_by_categraf_a.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ElasticSearch, group by service",
"ident": "",
"tags": "ElasticSearch Prometheus Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"layout": {
"h": 3,
"i": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"isResizable": true,
"w": 4,
"x": 0,
"y": 0
},
"name": "",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"text": "N/A"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "#417505",
"text": "Green"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "#f5a623",
"text": "Yellow"
},
"type": "special"
},
{
"match": {
"special": 3
},
"result": {
"color": "#d0021b",
"text": "Red"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_status_code{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7dafe232-ee30-479b-a2f1-e1064572c154",
"layout": {
"h": 3,
"i": "7dafe232-ee30-479b-a2f1-e1064572c154",
"isResizable": true,
"w": 4,
"x": 4,
"y": 0
},
"name": "Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_nodes{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "447fb784-a7e4-41cf-820f-6086837590e6",
"layout": {
"h": 3,
"i": "c6953ef5-3e29-44dc-bf9e-74905934e9df",
"isResizable": true,
"w": 4,
"x": 8,
"y": 0
},
"name": "Data Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_data_nodes{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f0375f72-4ca1-474f-81e9-ce6b64f22204",
"layout": {
"h": 3,
"i": "e6cf29e7-bb5d-4c8f-8aa6-67a63fc325c7",
"isResizable": true,
"w": 4,
"x": 12,
"y": 0
},
"name": "CPU Util Percent",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_process_cpu_percent{service =\"$service\", node_host=~\"$node_host\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "288ee5f1-b484-43f5-86bf-5b81c01b3c2c",
"layout": {
"h": 3,
"i": "34ae1975-6acb-48a7-adce-1d67b7c581ec",
"isResizable": true,
"w": 4,
"x": 16,
"y": 0
},
"name": "JVM Heap Util Percent",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_jvm_mem_heap_used_percent{service =\"$service\", node_host=~\"$node_host\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4dd345c1-2bc1-474e-83b1-153be10a5b5b",
"layout": {
"h": 3,
"i": "01c403f3-c3b0-4910-84df-a50d4968bcd6",
"isResizable": true,
"w": 4,
"x": 20,
"y": 0
},
"name": "Pending Tasks",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_number_of_pending_tasks{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"layout": {
"h": 1,
"i": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 3
},
"name": "Breakers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.04,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"layout": {
"h": 4,
"i": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"name": "Tripped for breakers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum({__name__=~\"elasticsearch_breakers_.+_tripped\", service =\"$service\", node_host=~\"$node_host\"}) by (node_host)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.04,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8adbc8e4-f630-4a25-98e3-ee03dec92011",
"layout": {
"h": 4,
"i": "d05d16d0-022d-49f8-9b55-2388c4cbb2b1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"name": "Estimated size in bytes of breaker",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_breakers_.+_size_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"layout": {
"h": 1,
"i": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 8
},
"name": "Shards",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7aec074e-1672-4dbb-8529-28292f9a4221",
"layout": {
"h": 3,
"i": "7aec074e-1672-4dbb-8529-28292f9a4221",
"isResizable": true,
"w": 4,
"x": 0,
"y": 9
},
"name": "Active shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f138daa7-b98f-4575-89e3-42363a8102c9",
"layout": {
"h": 3,
"i": "fe82bb33-7b8d-4909-adda-64a4121f29fd",
"isResizable": true,
"w": 4,
"x": 4,
"y": 9
},
"name": "Active primary shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_primary_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7412543a-dba5-4624-96ff-11e30b7e8ff4",
"layout": {
"h": 3,
"i": "de2c46fd-dcfd-43a3-847b-9fd1320dfaa7",
"isResizable": true,
"w": 4,
"x": 8,
"y": 9
},
"name": "Initializing shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_initializing_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2f26f24f-2a79-4552-b79d-60b41fa3aee6",
"layout": {
"h": 3,
"i": "4403206d-a491-4564-9f61-db25a6beb356",
"isResizable": true,
"w": 4,
"x": 12,
"y": 9
},
"name": "Relocating shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_relocating_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "394a83cc-f4e1-467e-83fa-b77d2c2be907",
"layout": {
"h": 3,
"i": "e24c847a-5704-4b7c-861e-75dd4e4b59d8",
"isResizable": true,
"w": 4,
"x": 16,
"y": 9
},
"name": "Delayed Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_delayed_unassigned_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "720b9719-5c37-44d9-bce8-539308afa6ae",
"layout": {
"h": 3,
"i": "d82314d5-028c-41fb-a79f-34699d56d17a",
"isResizable": true,
"w": 4,
"x": 20,
"y": 9
},
"name": "Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_unassigned_shards{service =\"$service\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"layout": {
"h": 1,
"i": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "JVM",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"layout": {
"h": 4,
"i": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "GC counts / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_count{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "old gc {{node_host}}",
"refId": "A"
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_count{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "young gc {{node_host}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5c361278-8a94-4b16-afdd-e6def804b9ff",
"layout": {
"h": 4,
"i": "4f21ebfc-b51c-469b-b149-479966750920",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "GC time in millis",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_time_in_millis{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "old gc {{node_host}}",
"refId": "A"
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_time_in_millis{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "young gc {{node_host}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ff81d109-79e5-4909-8765-857a75cebf17",
"layout": {
"h": 4,
"i": "5105f1dc-26cb-4818-a04d-90f2e5803da2",
"isResizable": true,
"w": 6,
"x": 0,
"y": 17
},
"name": "Young Used(Bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_pools_young_used_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d7c76456-8f34-4e1b-843b-9d174bbdfcee",
"layout": {
"h": 4,
"i": "86841663-2a17-4858-a9e1-13c296b3bb76",
"isResizable": true,
"w": 6,
"x": 6,
"y": 17
},
"name": "Old Used(Bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_pools_old_used_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a0096936-3790-40a1-b2ad-d7805945b948",
"layout": {
"h": 4,
"i": "14f655ac-9c1c-40fa-bfef-158cc8601ead",
"isResizable": true,
"w": 12,
"x": 12,
"y": 17
},
"name": "Committed Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_.+_committed_in_bytes\", service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"layout": {
"h": 1,
"i": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "Translog",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45aafb11-c694-4686-89ab-685068f91560",
"layout": {
"h": 4,
"i": "45aafb11-c694-4686-89ab-685068f91560",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"name": "Total translog operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_operations{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "09ca6329-8eec-4a61-b19e-9bbeea2b9712",
"layout": {
"h": 4,
"i": "56806f8a-525a-4ab4-a9d3-c83559ae4828",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"name": "Total translog size in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"layout": {
"h": 1,
"i": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 26
},
"name": "Disk and Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"layout": {
"h": 4,
"i": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"isResizable": true,
"w": 12,
"x": 0,
"y": 27
},
"name": "Disk usage %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "1-(elasticsearch_fs_total_available_in_bytes{service =\"$service\",node_host=~\"$node_host\"}/elasticsearch_fs_total_total_in_bytes{service =\"$service\",node_host=~\"$node_host\"})",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"layout": {
"h": 4,
"i": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"isResizable": true,
"w": 12,
"x": 12,
"y": 27
},
"name": "Network usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_transport_tx_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}: sent",
"refId": "A"
},
{
"expr": "-irate(elasticsearch_transport_rx_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}: received",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"layout": {
"h": 1,
"i": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 31
},
"name": "Documents",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"layout": {
"h": 4,
"i": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "Documents count on node",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_docs_count{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"layout": {
"h": 4,
"i": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "Documents indexed rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_total{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"layout": {
"h": 4,
"i": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 36
},
"name": "Documents deleted rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_docs_deleted{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"layout": {
"h": 4,
"i": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"isResizable": true,
"w": 6,
"x": 12,
"y": 36
},
"name": "Documents merged rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_merges_total_docs{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7cc04ae4-946d-4837-9ea9-764a7cc2eecd",
"layout": {
"h": 4,
"i": "97b5d900-e91e-4e0e-8184-f508a3433bc6",
"isResizable": true,
"w": 6,
"x": 18,
"y": 36
},
"name": "Documents merged bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_size_in_bytes{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"layout": {
"h": 1,
"i": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"isResizable": false,
"w": 24,
"x": 0,
"y": 40
},
"name": "Times",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ad0445b0-8539-440d-bbf4-712450132a7a",
"layout": {
"h": 4,
"i": "ad0445b0-8539-440d-bbf4-712450132a7a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 41
},
"name": "Query time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_search_query_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c3cf6c57-c4ce-4bc2-a150-df32c4951144",
"layout": {
"h": 4,
"i": "2af98dc1-f24e-4c7b-bd2c-723224facc5d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 41
},
"name": "Indexing time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "022db454-70ba-49f5-8c11-f89b76d145cb",
"layout": {
"h": 4,
"i": "553c7da8-2d83-4ea0-a6ef-b064a5101633",
"isResizable": true,
"w": 12,
"x": 0,
"y": 45
},
"name": "Merging time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f20bad4f-656c-428a-a1cf-aafb7d92137c",
"layout": {
"h": 4,
"i": "51056e8d-6dc8-4c7f-91e9-9c24c056462d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 45
},
"name": "Indexing throttle time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_throttle_time_in_millis{service =\"$service\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"layout": {
"h": 1,
"i": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 49
},
"name": "Thread Pool",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"layout": {
"h": 4,
"i": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"isResizable": true,
"w": 6,
"x": 0,
"y": 50
},
"name": "Thread Pool operations rejected",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_rejected\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_rejected\")[5m:])",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0ab67903-16ea-4001-b784-ae04d8b815c0",
"layout": {
"h": 4,
"i": "793e98e7-2729-4106-940c-ecccff1d4b89",
"isResizable": true,
"w": 6,
"x": 6,
"y": 50
},
"name": "Thread Pool threads active",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_active\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_active\")",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "bb5dc07d-673b-4e2d-b44c-441acfa7c27b",
"layout": {
"h": 4,
"i": "9a14c86a-86af-4464-ac82-41f621ce7166",
"isResizable": true,
"w": 6,
"x": 12,
"y": 50
},
"name": "Thread Pool threads queued",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_queue\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_queue\")",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4cac1498-c141-483f-97c6-e1177317a2ea",
"layout": {
"h": 4,
"i": "d5f42ea7-bdb5-44da-9ba6-7c0f09ba7c71",
"isResizable": true,
"w": 6,
"x": 18,
"y": 50
},
"name": "Thread Pool operations completed",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_completed\", service =\"$service\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_completed\")[5m:])",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"layout": {
"h": 1,
"i": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"isResizable": false,
"w": 24,
"x": 0,
"y": 54
},
"name": "Caches",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5247f393-a934-4d9e-be0f-40b177d2be80",
"layout": {
"h": 4,
"i": "5247f393-a934-4d9e-be0f-40b177d2be80",
"isResizable": true,
"w": 4,
"x": 0,
"y": 55
},
"name": "Field data memory size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_fielddata_memory_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b87c56f7-4e50-4d15-8bcd-1218fee879d9",
"layout": {
"h": 4,
"i": "c33fceb6-df37-483e-ba53-4ffa5f5e5456",
"isResizable": true,
"w": 4,
"x": 4,
"y": 55
},
"name": "Field data evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_fielddata_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ae2d0a7a-b6cd-4fd5-99d4-3c4289b8b5a8",
"layout": {
"h": 4,
"i": "445484f4-32d3-4569-af8d-76790d0aa56b",
"isResizable": true,
"w": 4,
"x": 8,
"y": 55
},
"name": "Query cache size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_query_cache_memory_size_in_bytes{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78e4badc-8d51-4aa6-81c5-d1c9183810a2",
"layout": {
"h": 4,
"i": "ce9aa255-9d5b-44ed-9071-85e9d95675ec",
"isResizable": true,
"w": 4,
"x": 12,
"y": 55
},
"name": "Query cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_query_cache_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "296b43f1-2f33-492a-bce8-6f0fde1e7b52",
"layout": {
"h": 4,
"i": "b8b2604c-d84f-426f-b033-af9035a9e80d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 55
},
"name": "Request cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_request_cache_evictions{service =\"$service\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"layout": {
"h": 1,
"i": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"isResizable": false,
"w": 24,
"x": 0,
"y": 59
},
"name": "Segments",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"layout": {
"h": 4,
"i": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"isResizable": true,
"w": 12,
"x": 0,
"y": 60
},
"name": "Count of index segments",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_count{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a1c34fa4-4549-41a6-8d31-d25e7d860106",
"layout": {
"h": 4,
"i": "9c8efed6-7ced-4805-87e0-3da3b18d2989",
"isResizable": true,
"w": 12,
"x": 12,
"y": 60
},
"name": "Current memory size of segments in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_memory_in_bytes{service =\"$service\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_up, service)",
"name": "service",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_jvm_uptime_in_millis{service =\"$service\"}, node_host)",
"multi": true,
"name": "node_host",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327373675000
}
================================================
FILE: integrations/Elasticsearch/dashboards/elasticsearch_by_categraf_b.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ElasticSearch, group by cluster",
"ident": "",
"tags": "ElasticSearch Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"layout": {
"h": 3,
"i": "f70f4198-dec2-40c0-97d9-6986c7001e73",
"isResizable": true,
"w": 4,
"x": 0,
"y": 0
},
"name": "",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"text": "N/A"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "#417505",
"text": "Green"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "#f5a623",
"text": "Yellow"
},
"type": "special"
},
{
"match": {
"special": 3
},
"result": {
"color": "#d0021b",
"text": "Red"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_status_code{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7dafe232-ee30-479b-a2f1-e1064572c154",
"layout": {
"h": 3,
"i": "7dafe232-ee30-479b-a2f1-e1064572c154",
"isResizable": true,
"w": 4,
"x": 4,
"y": 0
},
"name": "Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_nodes{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "447fb784-a7e4-41cf-820f-6086837590e6",
"layout": {
"h": 3,
"i": "c6953ef5-3e29-44dc-bf9e-74905934e9df",
"isResizable": true,
"w": 4,
"x": 8,
"y": 0
},
"name": "Data Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "min(elasticsearch_cluster_health_number_of_data_nodes{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f0375f72-4ca1-474f-81e9-ce6b64f22204",
"layout": {
"h": 3,
"i": "e6cf29e7-bb5d-4c8f-8aa6-67a63fc325c7",
"isResizable": true,
"w": 4,
"x": 12,
"y": 0
},
"name": "CPU Util Percent",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_process_cpu_percent{cluster=\"$cluster\", node_host=~\"$node_host\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "288ee5f1-b484-43f5-86bf-5b81c01b3c2c",
"layout": {
"h": 3,
"i": "34ae1975-6acb-48a7-adce-1d67b7c581ec",
"isResizable": true,
"w": 4,
"x": 16,
"y": 0
},
"name": "JVM Heap Util Percent",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 50
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 80
},
"result": {
"color": "#f5a623"
},
"type": "range"
},
{
"match": {
"from": 80
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_jvm_mem_heap_used_percent{cluster=\"$cluster\", node_host=~\"$node_host\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4dd345c1-2bc1-474e-83b1-153be10a5b5b",
"layout": {
"h": 3,
"i": "01c403f3-c3b0-4910-84df-a50d4968bcd6",
"isResizable": true,
"w": 4,
"x": 20,
"y": 0
},
"name": "Pending Tasks",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_number_of_pending_tasks{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"layout": {
"h": 1,
"i": "b398d46e-7345-4cc9-90b4-918cbd1e8d1f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 3
},
"name": "Breakers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.04,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"layout": {
"h": 4,
"i": "15882e6f-0585-4035-bfb6-71cb9caaa0a8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"name": "Tripped for breakers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum({__name__=~\"elasticsearch_breakers_.+_tripped\", cluster=\"$cluster\", node_host=~\"$node_host\"}) by (node_host)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.04,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8adbc8e4-f630-4a25-98e3-ee03dec92011",
"layout": {
"h": 4,
"i": "d05d16d0-022d-49f8-9b55-2388c4cbb2b1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"name": "Estimated size in bytes of breaker",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_breakers_.+_size_in_bytes\", cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"layout": {
"h": 1,
"i": "2aadd697-3bed-4f70-bc74-4bc801ef7d1d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 8
},
"name": "Shards",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7aec074e-1672-4dbb-8529-28292f9a4221",
"layout": {
"h": 3,
"i": "7aec074e-1672-4dbb-8529-28292f9a4221",
"isResizable": true,
"w": 4,
"x": 0,
"y": 9
},
"name": "Active shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_shards{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f138daa7-b98f-4575-89e3-42363a8102c9",
"layout": {
"h": 3,
"i": "fe82bb33-7b8d-4909-adda-64a4121f29fd",
"isResizable": true,
"w": 4,
"x": 4,
"y": 9
},
"name": "Active primary shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_active_primary_shards{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7412543a-dba5-4624-96ff-11e30b7e8ff4",
"layout": {
"h": 3,
"i": "de2c46fd-dcfd-43a3-847b-9fd1320dfaa7",
"isResizable": true,
"w": 4,
"x": 8,
"y": 9
},
"name": "Initializing shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_initializing_shards{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2f26f24f-2a79-4552-b79d-60b41fa3aee6",
"layout": {
"h": 3,
"i": "4403206d-a491-4564-9f61-db25a6beb356",
"isResizable": true,
"w": 4,
"x": 12,
"y": 9
},
"name": "Relocating shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_relocating_shards{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "394a83cc-f4e1-467e-83fa-b77d2c2be907",
"layout": {
"h": 3,
"i": "e24c847a-5704-4b7c-861e-75dd4e4b59d8",
"isResizable": true,
"w": 4,
"x": 16,
"y": 9
},
"name": "Delayed Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_delayed_unassigned_shards{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "720b9719-5c37-44d9-bce8-539308afa6ae",
"layout": {
"h": 3,
"i": "d82314d5-028c-41fb-a79f-34699d56d17a",
"isResizable": true,
"w": 4,
"x": 20,
"y": 9
},
"name": "Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(elasticsearch_cluster_health_unassigned_shards{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"layout": {
"h": 1,
"i": "a1bc2be7-723b-4fe4-b217-bfdd8248559e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "JVM",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"layout": {
"h": 4,
"i": "ee0c56e0-8f8e-4cbe-ac41-de2afad7b75a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "GC counts / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_count{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "old gc {{node_host}}",
"refId": "A"
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_count{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "young gc {{node_host}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5c361278-8a94-4b16-afdd-e6def804b9ff",
"layout": {
"h": 4,
"i": "4f21ebfc-b51c-469b-b149-479966750920",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "GC time in millis",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collectors_old_collection_time_in_millis{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "old gc {{node_host}}",
"refId": "A"
},
{
"expr": "irate(elasticsearch_jvm_gc_collectors_young_collection_time_in_millis{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "young gc {{node_host}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ff81d109-79e5-4909-8765-857a75cebf17",
"layout": {
"h": 4,
"i": "5105f1dc-26cb-4818-a04d-90f2e5803da2",
"isResizable": true,
"w": 6,
"x": 0,
"y": 17
},
"name": "Young Used(Bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_pools_young_used_in_bytes\", cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d7c76456-8f34-4e1b-843b-9d174bbdfcee",
"layout": {
"h": 4,
"i": "86841663-2a17-4858-a9e1-13c296b3bb76",
"isResizable": true,
"w": 6,
"x": 6,
"y": 17
},
"name": "Old Used(Bytes)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_pools_old_used_in_bytes\", cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a0096936-3790-40a1-b2ad-d7805945b948",
"layout": {
"h": 4,
"i": "14f655ac-9c1c-40fa-bfef-158cc8601ead",
"isResizable": true,
"w": 12,
"x": 12,
"y": 17
},
"name": "Committed Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "{__name__=~\"elasticsearch_jvm_mem_.+_committed_in_bytes\", cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{__name__}} {{node_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"layout": {
"h": 1,
"i": "e1d04a8c-81ee-4949-87de-3b70bc637584",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "Translog",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45aafb11-c694-4686-89ab-685068f91560",
"layout": {
"h": 4,
"i": "45aafb11-c694-4686-89ab-685068f91560",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"name": "Total translog operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_operations{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "09ca6329-8eec-4a61-b19e-9bbeea2b9712",
"layout": {
"h": 4,
"i": "56806f8a-525a-4ab4-a9d3-c83559ae4828",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"name": "Total translog size in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_size_in_bytes{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"layout": {
"h": 1,
"i": "d9694c3f-9a14-4bde-9427-88531b0ea3a6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 26
},
"name": "Disk and Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"layout": {
"h": 4,
"i": "6c0d9b3c-dda5-4da9-825e-33f650dbb008",
"isResizable": true,
"w": 12,
"x": 0,
"y": 27
},
"name": "Disk usage %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "1-(elasticsearch_fs_total_available_in_bytes{cluster=\"$cluster\",node_host=~\"$node_host\"}/elasticsearch_fs_total_total_in_bytes{cluster=\"$cluster\",node_host=~\"$node_host\"})",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"layout": {
"h": 4,
"i": "4f7ce5a7-2771-4cbf-a569-b1a90b070b93",
"isResizable": true,
"w": 12,
"x": 12,
"y": 27
},
"name": "Network usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_transport_tx_size_in_bytes{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}: sent",
"refId": "A"
},
{
"expr": "-irate(elasticsearch_transport_rx_size_in_bytes{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}: received",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"layout": {
"h": 1,
"i": "292b6c24-5471-4eeb-9d65-1a1e7a684fe3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 31
},
"name": "Documents",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"layout": {
"h": 4,
"i": "e98839c4-e3f3-4e6e-be3a-c44b70e6072c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "Documents count on node",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_docs_count{cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"layout": {
"h": 4,
"i": "45c4e3d9-90f1-41bd-8169-1d8c0a921ba9",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "Documents indexed rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_total{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"layout": {
"h": 4,
"i": "3b2a922d-4423-4845-8cfc-95970f3300d6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 36
},
"name": "Documents deleted rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_docs_deleted{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"layout": {
"h": 4,
"i": "764fbcf7-3056-41ef-b62a-51813a6c315f",
"isResizable": true,
"w": 6,
"x": 12,
"y": 36
},
"name": "Documents merged rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_merges_total_docs{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7cc04ae4-946d-4837-9ea9-764a7cc2eecd",
"layout": {
"h": 4,
"i": "97b5d900-e91e-4e0e-8184-f508a3433bc6",
"isResizable": true,
"w": 6,
"x": 18,
"y": 36
},
"name": "Documents merged bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_size_in_bytes{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"layout": {
"h": 1,
"i": "a0545cbd-6df5-4845-90e0-88a710f738ba",
"isResizable": false,
"w": 24,
"x": 0,
"y": 40
},
"name": "Times",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ad0445b0-8539-440d-bbf4-712450132a7a",
"layout": {
"h": 4,
"i": "ad0445b0-8539-440d-bbf4-712450132a7a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 41
},
"name": "Query time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_search_query_time_in_millis{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c3cf6c57-c4ce-4bc2-a150-df32c4951144",
"layout": {
"h": 4,
"i": "2af98dc1-f24e-4c7b-bd2c-723224facc5d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 41
},
"name": "Indexing time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_time_in_millis{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "022db454-70ba-49f5-8c11-f89b76d145cb",
"layout": {
"h": 4,
"i": "553c7da8-2d83-4ea0-a6ef-b064a5101633",
"isResizable": true,
"w": 12,
"x": 0,
"y": 45
},
"name": "Merging time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_time_in_millis{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f20bad4f-656c-428a-a1cf-aafb7d92137c",
"layout": {
"h": 4,
"i": "51056e8d-6dc8-4c7f-91e9-9c24c056462d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 45
},
"name": "Indexing throttle time(Unit: ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_throttle_time_in_millis{cluster=\"$cluster\",node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"layout": {
"h": 1,
"i": "2c56fb7a-85a0-4396-a317-6754d761cff2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 49
},
"name": "Thread Pool",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"layout": {
"h": 4,
"i": "cefafeb9-fc8a-4c73-92b3-648cd6f08b11",
"isResizable": true,
"w": 6,
"x": 0,
"y": 50
},
"name": "Thread Pool operations rejected",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_rejected\", cluster=\"$cluster\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_rejected\")[5m:])",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0ab67903-16ea-4001-b784-ae04d8b815c0",
"layout": {
"h": 4,
"i": "793e98e7-2729-4106-940c-ecccff1d4b89",
"isResizable": true,
"w": 6,
"x": 6,
"y": 50
},
"name": "Thread Pool threads active",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_active\", cluster=\"$cluster\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_active\")",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "bb5dc07d-673b-4e2d-b44c-441acfa7c27b",
"layout": {
"h": 4,
"i": "9a14c86a-86af-4464-ac82-41f621ce7166",
"isResizable": true,
"w": 6,
"x": 12,
"y": 50
},
"name": "Thread Pool threads queued",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_queue\", cluster=\"$cluster\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_queue\")",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4cac1498-c141-483f-97c6-e1177317a2ea",
"layout": {
"h": 4,
"i": "d5f42ea7-bdb5-44da-9ba6-7c0f09ba7c71",
"isResizable": true,
"w": 6,
"x": 18,
"y": 50
},
"name": "Thread Pool operations completed",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate\n(label_replace({__name__=~\"elasticsearch_thread_pool_(.*)_completed\", cluster=\"$cluster\", node_host=~\"$node_host\"}, \"type\", \"$1\", \"__name__\", \"elasticsearch_thread_pool_(.*)_completed\")[5m:])",
"legend": "{{node_host}}: {{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"layout": {
"h": 1,
"i": "a5c3b529-c329-4a66-aab0-6caebba8be96",
"isResizable": false,
"w": 24,
"x": 0,
"y": 54
},
"name": "Caches",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5247f393-a934-4d9e-be0f-40b177d2be80",
"layout": {
"h": 4,
"i": "5247f393-a934-4d9e-be0f-40b177d2be80",
"isResizable": true,
"w": 4,
"x": 0,
"y": 55
},
"name": "Field data memory size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_fielddata_memory_size_in_bytes{cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b87c56f7-4e50-4d15-8bcd-1218fee879d9",
"layout": {
"h": 4,
"i": "c33fceb6-df37-483e-ba53-4ffa5f5e5456",
"isResizable": true,
"w": 4,
"x": 4,
"y": 55
},
"name": "Field data evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_fielddata_evictions{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ae2d0a7a-b6cd-4fd5-99d4-3c4289b8b5a8",
"layout": {
"h": 4,
"i": "445484f4-32d3-4569-af8d-76790d0aa56b",
"isResizable": true,
"w": 4,
"x": 8,
"y": 55
},
"name": "Query cache size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_query_cache_memory_size_in_bytes{cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78e4badc-8d51-4aa6-81c5-d1c9183810a2",
"layout": {
"h": 4,
"i": "ce9aa255-9d5b-44ed-9071-85e9d95675ec",
"isResizable": true,
"w": 4,
"x": 12,
"y": 55
},
"name": "Query cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_query_cache_evictions{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "296b43f1-2f33-492a-bce8-6f0fde1e7b52",
"layout": {
"h": 4,
"i": "b8b2604c-d84f-426f-b033-af9035a9e80d",
"isResizable": true,
"w": 8,
"x": 16,
"y": 55
},
"name": "Request cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_request_cache_evictions{cluster=\"$cluster\", node_host=~\"$node_host\"}[5m])",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"layout": {
"h": 1,
"i": "d3161bf5-27a7-4552-a7d6-7b2b7d46b611",
"isResizable": false,
"w": 24,
"x": 0,
"y": 59
},
"name": "Segments",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"layout": {
"h": 4,
"i": "1537acaa-d5ce-48c5-b740-26fd543eb120",
"isResizable": true,
"w": 12,
"x": 0,
"y": 60
},
"name": "Count of index segments",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_count{cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a1c34fa4-4549-41a6-8d31-d25e7d860106",
"layout": {
"h": 4,
"i": "9c8efed6-7ced-4805-87e0-3da3b18d2989",
"isResizable": true,
"w": 12,
"x": 12,
"y": 60
},
"name": "Current memory size of segments in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_memory_in_bytes{cluster=\"$cluster\", node_host=~\"$node_host\"}",
"legend": "{{node_host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_up, cluster)",
"name": "cluster",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(elasticsearch_jvm_uptime_in_millis{cluster=\"$cluster\"}, node_host)",
"multi": true,
"name": "node_host",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327376754000
}
================================================
FILE: integrations/Elasticsearch/dashboards/elasticsearch_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "ElasticSearch By Exporter",
"ident": "",
"tags": "Prometheus ElasticSearch ES",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "7d06624d-28be-4586-a804-11ca2f036964",
"layout": {
"h": 1,
"i": "7d06624d-28be-4586-a804-11ca2f036964",
"w": 24,
"x": 0,
"y": 0
},
"name": "KPI",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "集群数据节点数量",
"id": "ae1ebc12-8639-4812-b5ae-9391a0ec4dbc",
"layout": {
"h": 3,
"i": "ae1ebc12-8639-4812-b5ae-9391a0ec4dbc",
"w": 3,
"x": 6,
"y": 1
},
"name": "Data Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_number_of_data_nodes{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "集群节点数量",
"id": "8648c6f9-ab82-4067-a02c-eebe84ae3a96",
"layout": {
"h": 3,
"i": "8648c6f9-ab82-4067-a02c-eebe84ae3a96",
"w": 3,
"x": 3,
"y": 1
},
"name": "Nodes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_number_of_nodes{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "平均CPU使用率",
"id": "20bf6dec-36cb-4e3a-a096-275a33e791b9",
"layout": {
"h": 3,
"i": "20bf6dec-36cb-4e3a-a096-275a33e791b9",
"w": 3,
"x": 9,
"y": 1
},
"name": "CPU usage Avg",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": 80,
"special": 80
},
"result": {
"color": "#f90606"
},
"type": "range"
},
{
"match": {
"from": 70,
"special": 70
},
"result": {
"color": "#f5ac0f"
},
"type": "range"
},
{
"match": {
"to": 70
},
"result": {
"color": "#21c00c"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum (elasticsearch_process_cpu_percent{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"} ) / count (elasticsearch_process_cpu_percent{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"} )",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "平均JVM内存使用率",
"id": "6380e6f2-3aec-4d56-a07a-efe6106c492f",
"layout": {
"h": 3,
"i": "6380e6f2-3aec-4d56-a07a-efe6106c492f",
"w": 3,
"x": 12,
"y": 1
},
"name": "JVM memory used Avg",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": 80,
"special": 80
},
"result": {
"color": "#f12c09"
},
"type": "range"
},
{
"match": {
"from": 70,
"to": 80
},
"result": {
"color": "#fbca18"
},
"type": "range"
},
{
"match": {
"to": 70
},
"result": {
"color": "#21c00c"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum (elasticsearch_jvm_memory_used_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}) / sum (elasticsearch_jvm_memory_max_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}) * 100",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "de70ff1e-5ad6-4d68-b961-10eb78840d90",
"layout": {
"h": 3,
"i": "de70ff1e-5ad6-4d68-b961-10eb78840d90",
"w": 3,
"x": 15,
"y": 1
},
"name": "Open file descriptors",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum (elasticsearch_process_open_files_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "集群断路器阻断总数",
"id": "88631ba2-b3d4-45e1-966a-9c3eb12a1d49",
"layout": {
"h": 3,
"i": "88631ba2-b3d4-45e1-966a-9c3eb12a1d49",
"w": 3,
"x": 18,
"y": 1
},
"name": "Tripped for breakers",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#21c00c",
"text": ""
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(elasticsearch_breakers_tripped{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "等待执行的集群变更任务数量",
"id": "7b85a99f-9cee-4cda-936c-eedcd2409e42",
"layout": {
"h": 3,
"i": "7b85a99f-9cee-4cda-936c-eedcd2409e42",
"w": 3,
"x": 21,
"y": 1
},
"name": "Pending tasks",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 5
},
"result": {
"color": "#f24207"
},
"type": "range"
},
{
"match": {
"from": 1,
"to": 5
},
"result": {
"color": "#f9a006"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#21c00c"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "elasticsearch_cluster_health_number_of_pending_tasks{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "绿色:健康,黄色:存在副本分片异常,红色:存在主分片异常",
"id": "a9c1c7fa-2985-4a14-bbe9-2ee7db630e94",
"layout": {
"h": 3,
"i": "a9c1c7fa-2985-4a14-bbe9-2ee7db630e94",
"w": 3,
"x": 0,
"y": 1
},
"name": "Cluster health",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {},
"result": {
"text": "N/A"
},
"type": "special"
},
{
"match": {
"special": 5
},
"result": {
"color": "#21c00c",
"text": "Green"
},
"type": "special"
},
{
"match": {
"special": 3
},
"result": {
"color": "#f9e406",
"text": "Yellow"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "#f43606",
"text": "Red"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "elasticsearch_cluster_health_status{instance=\"$instance\",cluster=\"$cluster\",color=\"red\"}==1 or (elasticsearch_cluster_health_status{instance=\"$instance\",cluster=\"$cluster\",color=\"green\"}==1)+4 or (elasticsearch_cluster_health_status{instance=\"$instance\",cluster=\"$cluster\",color=\"yellow\"}==1)+2",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "12024030-3476-44b0-94b1-194629ce4efa",
"layout": {
"h": 1,
"i": "12024030-3476-44b0-94b1-194629ce4efa",
"w": 24,
"x": 0,
"y": 4
},
"name": "Shards",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "活跃分片数\nAggregate total of all shards across all indices, which includes replica shards",
"id": "717067e8-5f9c-4485-8158-d977c217d620",
"layout": {
"h": 3,
"i": "717067e8-5f9c-4485-8158-d977c217d620",
"w": 4,
"x": 0,
"y": 5
},
"name": "Active shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_active_shards{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "活跃主分片数\nThe number of primary shards in your cluster. This is an aggregate total across all indices.",
"id": "03e4fbef-93dd-488c-bb52-3eac620cc331",
"layout": {
"h": 3,
"i": "03e4fbef-93dd-488c-bb52-3eac620cc331",
"w": 4,
"x": 4,
"y": 5
},
"name": "Active primary shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_active_primary_shards{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "创建中分片数量\nCount of shards that are being freshly created",
"id": "90b642aa-4815-492b-a821-94a8120f2dc9",
"layout": {
"h": 3,
"i": "90b642aa-4815-492b-a821-94a8120f2dc9",
"w": 4,
"x": 8,
"y": 5
},
"name": "Initializing shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_initializing_shards{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "迁移中分片数量\nThe number of shards that are currently moving from one node to another node.",
"id": "684d1be6-ebdf-4075-a7bc-6580c148b997",
"layout": {
"h": 3,
"i": "684d1be6-ebdf-4075-a7bc-6580c148b997",
"w": 4,
"x": 12,
"y": 5
},
"name": "Relocating shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_relocating_shards{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "未分配的分片数量\nThe number of shards that exist in the cluster state, but cannot be found in the cluster itself",
"id": "df959515-14fa-44e5-a16c-f40f87eacb01",
"layout": {
"h": 3,
"i": "df959515-14fa-44e5-a16c-f40f87eacb01",
"w": 4,
"x": 20,
"y": 5
},
"name": "Unassigned shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_unassigned_shards{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "暂缓分配的分片数量,当节点丢失,会发生选主和数据拷贝。为了减少网络抖动等原因导致的重分配情况,配置delay参数,该值为等待delay到期将被重分配的分片数量\nShards delayed to reduce reallocation overhead",
"id": "f307f045-cfcf-4cd9-a39f-10d4360a85b2",
"layout": {
"h": 3,
"i": "f307f045-cfcf-4cd9-a39f-10d4360a85b2",
"w": 4,
"x": 16,
"y": 5
},
"name": "Delayed shards",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "elasticsearch_cluster_health_delayed_unassigned_shards{instance=\"$instance\",cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ff67d655-1664-4432-bcef-52c00f77ef33",
"layout": {
"h": 1,
"i": "ff67d655-1664-4432-bcef-52c00f77ef33",
"w": 24,
"x": 0,
"y": 8
},
"name": "JVM Garbage Collection",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "GC运行次数",
"id": "f39502b5-a6a8-4bed-93d7-be83096eecbf",
"layout": {
"h": 7,
"i": "f39502b5-a6a8-4bed-93d7-be83096eecbf",
"w": 12,
"x": 0,
"y": 9
},
"name": "GC count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collection_seconds_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}} - {{gc}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "GC运行耗时(秒)",
"id": "df97e013-e810-4b78-8253-0a1fe7c96b3a",
"layout": {
"h": 7,
"i": "df97e013-e810-4b78-8253-0a1fe7c96b3a",
"w": 12,
"x": 12,
"y": 9
},
"name": "GC time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_jvm_gc_collection_seconds_sum{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}} - {{gc}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "674eb58f-4afe-440b-81eb-2c007c5bf598",
"layout": {
"h": 1,
"i": "674eb58f-4afe-440b-81eb-2c007c5bf598",
"w": 24,
"x": 0,
"y": 16
},
"name": "Translog",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "translog操作数",
"id": "47583195-3a48-4d76-8b36-c08c29be936e",
"layout": {
"h": 7,
"i": "47583195-3a48-4d76-8b36-c08c29be936e",
"w": 12,
"x": 0,
"y": 17
},
"name": "Total translog operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_operations{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "translog大小(byte)",
"id": "dca13d7f-90ca-4302-afed-753c2c181cf5",
"layout": {
"h": 7,
"i": "dca13d7f-90ca-4302-afed-753c2c181cf5",
"w": 12,
"x": 12,
"y": 17
},
"name": "Total translog size in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_translog_size_in_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6c7202e2-b991-482b-a6d4-ad26cc97f6d2",
"layout": {
"h": 1,
"i": "6c7202e2-b991-482b-a6d4-ad26cc97f6d2",
"w": 24,
"x": 0,
"y": 24
},
"name": "Breakers",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "节点断路器阻断总数",
"id": "7040d182-228f-4a58-b4ce-1eb4addeb88f",
"layout": {
"h": 7,
"i": "7040d182-228f-4a58-b4ce-1eb4addeb88f",
"w": 12,
"x": 0,
"y": 25
},
"name": "Tripped for breakers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_breakers_tripped{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}: {{breaker}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "预估内存大小和限制内存大小",
"id": "31d1efe8-d58a-4f1c-b510-1854ec1f491a",
"layout": {
"h": 7,
"i": "31d1efe8-d58a-4f1c-b510-1854ec1f491a",
"w": 12,
"x": 12,
"y": 25
},
"name": "Estimated size in bytes of breaker",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_breakers_estimated_size_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}: {{breaker}}",
"refId": "A"
},
{
"expr": "elasticsearch_breakers_limit_size_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}: limit for {{breaker}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "0df4ccb6-d6c1-4235-a186-6b596f68dc61",
"layout": {
"h": 1,
"i": "0df4ccb6-d6c1-4235-a186-6b596f68dc61",
"w": 24,
"x": 0,
"y": 32
},
"name": "Cpu and Memory",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "1m/5m/15m系统负载",
"id": "1295b795-0e8a-4db7-afc1-5248e386062d",
"layout": {
"h": 7,
"i": "1295b795-0e8a-4db7-afc1-5248e386062d",
"w": 12,
"x": 0,
"y": 33
},
"name": "Load average",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_os_load1{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "load1: {{name}}",
"refId": "A"
},
{
"expr": "elasticsearch_os_load5{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "load5: {{name}}",
"refId": "B"
},
{
"expr": "elasticsearch_os_load15{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "load15: {{name}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "进程CPU占比",
"id": "356d363a-682b-41f9-9fdd-7212e943fc49",
"layout": {
"h": 7,
"i": "356d363a-682b-41f9-9fdd-7212e943fc49",
"w": 12,
"x": 12,
"y": 33
},
"name": "CPU usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_process_cpu_percent{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "进程内存占用/限制/峰值(byte)",
"id": "f0216c7e-ade7-47c0-aec1-6efab9543e58",
"layout": {
"h": 7,
"i": "f0216c7e-ade7-47c0-aec1-6efab9543e58",
"w": 12,
"x": 0,
"y": 35
},
"name": "JVM memory usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_jvm_memory_used_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}} used: {{area}}",
"refId": "A"
},
{
"expr": "elasticsearch_jvm_memory_max_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}} max: {{area}}",
"refId": "B"
},
{
"expr": "elasticsearch_jvm_memory_pool_peak_used_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}} peak used pool: {{pool}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "JVM申请/限制内存(byte)",
"id": "12bce6cc-a0fc-4e98-ba19-5cf5fdeb6332",
"layout": {
"h": 7,
"i": "12bce6cc-a0fc-4e98-ba19-5cf5fdeb6332",
"w": 12,
"x": 12,
"y": 35
},
"name": "JVM memory committed",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_jvm_memory_committed_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}} committed: {{area}}",
"refId": "A"
},
{
"expr": "elasticsearch_jvm_memory_max_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}} max: {{area}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "df751a4b-72b5-4e4b-bc33-b89f8b4b8cac",
"layout": {
"h": 1,
"i": "df751a4b-72b5-4e4b-bc33-b89f8b4b8cac",
"w": 24,
"x": 0,
"y": 42
},
"name": "Disk and Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "磁盘使用率",
"id": "5de089e7-c5cd-4a01-9cd7-70e39feef131",
"layout": {
"h": 7,
"i": "5de089e7-c5cd-4a01-9cd7-70e39feef131",
"w": 12,
"x": 0,
"y": 43
},
"name": "Disk usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "1-(elasticsearch_filesystem_data_available_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}/elasticsearch_filesystem_data_size_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"})",
"legend": "{{name}}: {{path}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "网络流量(byte)",
"id": "957402f9-2e17-4c67-8d6b-02a8cc6789d9",
"layout": {
"h": 7,
"i": "957402f9-2e17-4c67-8d6b-02a8cc6789d9",
"w": 12,
"x": 12,
"y": 43
},
"name": "Network usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_transport_tx_size_bytes_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: sent",
"refId": "A"
},
{
"expr": "-irate(elasticsearch_transport_rx_size_bytes_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: received",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "f26bc1c8-5528-4faa-950d-075d39c023a8",
"layout": {
"h": 1,
"i": "f26bc1c8-5528-4faa-950d-075d39c023a8",
"w": 24,
"x": 0,
"y": 50
},
"name": "Documents",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "节点文档总数,不包含已删除文档和子文档以及刚索引的文档",
"id": "6220138b-3452-4529-a008-c78e20672169",
"layout": {
"h": 7,
"i": "6220138b-3452-4529-a008-c78e20672169",
"w": 12,
"x": 0,
"y": 51
},
"name": "Documents count on node",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_docs{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "平均每秒索引文档数",
"id": "242dff44-09b1-4468-856d-642e76048296",
"layout": {
"h": 7,
"i": "242dff44-09b1-4468-856d-642e76048296",
"w": 12,
"x": 12,
"y": 51
},
"name": "Documents indexed rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "平均每秒删除文档数",
"id": "400efbe3-a4c5-49af-a0bf-eabf2511ebf5",
"layout": {
"h": 7,
"i": "400efbe3-a4c5-49af-a0bf-eabf2511ebf5",
"w": 8,
"x": 0,
"y": 53
},
"name": "Documents deleted rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_docs_deleted{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "平均每秒合并文档数",
"id": "0b2d6480-6f50-413f-8552-f49109d2be59",
"layout": {
"h": 7,
"i": "0b2d6480-6f50-413f-8552-f49109d2be59",
"w": 8,
"x": 8,
"y": 53
},
"name": "Documents merged rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_merges_docs_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "平均每秒合并文档大小",
"id": "981c2d24-56df-437c-bc43-77682008dd01",
"layout": {
"h": 7,
"i": "981c2d24-56df-437c-bc43-77682008dd01",
"w": 8,
"x": 16,
"y": 53
},
"name": "Documents merged bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_size_bytes_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "301ccb76-3b75-48a3-b37c-0ab035c2c2e9",
"layout": {
"h": 1,
"i": "301ccb76-3b75-48a3-b37c-0ab035c2c2e9",
"w": 24,
"x": 0,
"y": 60
},
"name": "Times",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "查询操作耗时(秒)",
"id": "e217b136-6021-473a-859e-7ac4218c4e86",
"layout": {
"h": 7,
"i": "e217b136-6021-473a-859e-7ac4218c4e86",
"w": 12,
"x": 0,
"y": 61
},
"name": "Query time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_search_query_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "索引操作耗时(秒)",
"id": "d51a4ef1-1fc5-4d1b-ba6c-a2fd483295ee",
"layout": {
"h": 7,
"i": "d51a4ef1-1fc5-4d1b-ba6c-a2fd483295ee",
"w": 12,
"x": 12,
"y": 61
},
"name": "Indexing time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "合并操作耗时(秒)",
"id": "20751b64-38da-4263-a055-02bc844bd416",
"layout": {
"h": 7,
"i": "20751b64-38da-4263-a055-02bc844bd416",
"w": 12,
"x": 0,
"y": 63
},
"name": "Merging time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_merges_total_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "索引存储限制耗时(秒)",
"id": "d688b715-5959-433a-89b2-51bcd2b0b1f3",
"layout": {
"h": 7,
"i": "d688b715-5959-433a-89b2-51bcd2b0b1f3",
"w": 12,
"x": 12,
"y": 63
},
"name": "Throttle time for index store",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_store_throttle_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "f83e7128-21d7-4d5d-9956-61c9537f9b20",
"layout": {
"h": 1,
"i": "f83e7128-21d7-4d5d-9956-61c9537f9b20",
"w": 24,
"x": 0,
"y": 70
},
"name": "Total Operations stats",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9be3061f-ade2-43a5-ab5f-bfdc8861e4be",
"layout": {
"h": 7,
"i": "9be3061f-ade2-43a5-ab5f-bfdc8861e4be",
"w": 12,
"x": 0,
"y": 71
},
"name": "Total Operations time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_indices_indexing_index_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: indexing",
"refId": "A"
},
{
"expr": "irate(elasticsearch_indices_search_query_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: query",
"refId": "B"
},
{
"expr": "irate(elasticsearch_indices_search_fetch_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: fetch",
"refId": "C"
},
{
"expr": "irate(elasticsearch_indices_merges_total_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: merges",
"refId": "D"
},
{
"expr": "irate(elasticsearch_indices_refresh_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: refresh",
"refId": "E"
},
{
"expr": "irate(elasticsearch_indices_flush_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: flush",
"refId": "F"
},
{
"expr": "irate(elasticsearch_indices_get_exists_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get_exists",
"refId": "G"
},
{
"expr": "irate(elasticsearch_indices_get_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get_time",
"refId": "H"
},
{
"expr": "irate(elasticsearch_indices_get_missing_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get_missing",
"refId": "I"
},
{
"expr": "irate(elasticsearch_indices_indexing_delete_time_seconds_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: indexing_delete",
"refId": "K"
},
{
"expr": "irate(elasticsearch_indices_get_time_seconds{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get",
"refId": "L"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "55cf0c8e-bf4f-430c-bc68-1eca72d7055d",
"layout": {
"h": 7,
"i": "55cf0c8e-bf4f-430c-bc68-1eca72d7055d",
"w": 12,
"x": 12,
"y": 71
},
"name": "Total Operations rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_indexing_index_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: indexing",
"refId": "A"
},
{
"expr": "rate(elasticsearch_indices_search_query_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: query",
"refId": "B"
},
{
"expr": "rate(elasticsearch_indices_search_fetch_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: fetch",
"refId": "C"
},
{
"expr": "rate(elasticsearch_indices_merges_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: merges",
"refId": "D"
},
{
"expr": "rate(elasticsearch_indices_refresh_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: refresh",
"refId": "E"
},
{
"expr": "rate(elasticsearch_indices_flush_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: flush",
"refId": "F"
},
{
"expr": "rate(elasticsearch_indices_get_exists_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get_exists",
"refId": "G"
},
{
"expr": "rate(elasticsearch_indices_get_missing_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get_missing",
"refId": "H"
},
{
"expr": "rate(elasticsearch_indices_get_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: get",
"refId": "I"
},
{
"expr": "rate(elasticsearch_indices_indexing_delete_total{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: indexing_delete",
"refId": "K"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "7f948b8c-2881-416c-83b7-0819823c7b70",
"layout": {
"h": 1,
"i": "7f948b8c-2881-416c-83b7-0819823c7b70",
"w": 24,
"x": 0,
"y": 78
},
"name": "Thread Pool",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "线程池reject次数",
"id": "d337f12c-6fbe-4463-9181-faecdc612a90",
"layout": {
"h": 7,
"i": "d337f12c-6fbe-4463-9181-faecdc612a90",
"w": 6,
"x": 0,
"y": 79
},
"name": "Thread Pool operations rejected",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_thread_pool_rejected_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: {{ type }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "活跃线程数",
"id": "1053fab0-f105-4a6b-9508-dc87c4f78956",
"layout": {
"h": 7,
"i": "1053fab0-f105-4a6b-9508-dc87c4f78956",
"w": 6,
"x": 6,
"y": 79
},
"name": "Thread Pool threads active",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_thread_pool_active_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}: {{ type }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "排队等待线程任务数量",
"id": "aad51280-5725-4485-841d-c08f0277508f",
"layout": {
"h": 7,
"i": "aad51280-5725-4485-841d-c08f0277508f",
"w": 6,
"x": 12,
"y": 79
},
"name": "Thread Pool threads queued",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_thread_pool_queue_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}: {{ type }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "线程池complete次数",
"id": "76538cda-67f3-4040-b42e-be27078440e4",
"layout": {
"h": 7,
"i": "76538cda-67f3-4040-b42e-be27078440e4",
"w": 6,
"x": 18,
"y": 79
},
"name": "Thread Pool operations completed",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(elasticsearch_thread_pool_completed_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}: {{ type }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2f6f4c26-2203-4f94-a105-a166f35608da",
"layout": {
"h": 1,
"i": "2f6f4c26-2203-4f94-a105-a166f35608da",
"w": 24,
"x": 0,
"y": 86
},
"name": "Caches",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "fielddata cache内存占用(byte)",
"id": "100f8d6d-073b-46f4-a664-db11e889a111",
"layout": {
"h": 7,
"i": "100f8d6d-073b-46f4-a664-db11e889a111",
"w": 12,
"x": 0,
"y": 87
},
"name": "Field data memory size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_fielddata_memory_size_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "fielddata cache平均每秒内存剔除次数",
"id": "4af09644-306e-404d-a1a0-174397b26c4e",
"layout": {
"h": 7,
"i": "4af09644-306e-404d-a1a0-174397b26c4e",
"w": 12,
"x": 12,
"y": 87
},
"name": "Field data evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_fielddata_evictions{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "query cache内存占用(byte)",
"id": "e9632f58-8186-43bf-81b1-c57e91078e35",
"layout": {
"h": 7,
"i": "e9632f58-8186-43bf-81b1-c57e91078e35",
"w": 8,
"x": 0,
"y": 89
},
"name": "Query cache size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_query_cache_memory_size_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "query cache平均每秒内存剔除次数",
"id": "e1145b0d-b839-4189-bd02-d955ae3dbcf3",
"layout": {
"h": 7,
"i": "e1145b0d-b839-4189-bd02-d955ae3dbcf3",
"w": 8,
"x": 8,
"y": 89
},
"name": "Query cache evictions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_query_cache_evictions{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "老版本的filter cache内存剔除次数",
"id": "14ce9ebf-5ff7-42da-9628-8746e18fe32b",
"layout": {
"h": 7,
"i": "14ce9ebf-5ff7-42da-9628-8746e18fe32b",
"w": 8,
"x": 16,
"y": 89
},
"name": "Evictions from filter cache",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(elasticsearch_indices_filter_cache_evictions{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}[5m])",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "f645ba3f-d2eb-4335-a2ff-56037184c779",
"layout": {
"h": 1,
"i": "f645ba3f-d2eb-4335-a2ff-56037184c779",
"w": 24,
"x": 0,
"y": 96
},
"name": "Segments",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "segment个数",
"id": "7f3f74ca-a8ca-4d07-b17a-72bb198f9726",
"layout": {
"h": 7,
"i": "7f3f74ca-a8ca-4d07-b17a-72bb198f9726",
"w": 12,
"x": 0,
"y": 97
},
"name": "Count of index segments",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_count{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "segment内存占用(byte)",
"id": "0d2bbc28-2211-4097-acbc-e1f72b3cad88",
"layout": {
"h": 7,
"i": "0d2bbc28-2211-4097-acbc-e1f72b3cad88",
"w": 12,
"x": 12,
"y": 97
},
"name": "Current memory size of segments in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segments_memory_bytes{instance=\"$instance\",cluster=\"$cluster\",name=~\"$name\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6e28b8b4-dc24-45fb-9d60-351f08716296",
"layout": {
"h": 1,
"i": "6e28b8b4-dc24-45fb-9d60-351f08716296",
"w": 24,
"x": 0,
"y": 104
},
"name": "Indices: Count of documents and Total size",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片文档数",
"id": "8620f336-4e26-40db-8c85-6a336a5fb7c9",
"layout": {
"h": 7,
"i": "8620f336-4e26-40db-8c85-6a336a5fb7c9",
"w": 24,
"x": 0,
"y": 105
},
"name": "Count of documents with only primary shards",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_docs_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片索引容量(byte)",
"id": "d4541f1a-136f-4b3e-b5e9-8cc92267aa8a",
"layout": {
"h": 7,
"i": "d4541f1a-136f-4b3e-b5e9-8cc92267aa8a",
"w": 24,
"x": 0,
"y": 107
},
"name": "Total size of stored index data in bytes with only primary shards on all nodes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_store_size_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片索引容量(byte)",
"id": "66926f5a-d562-428e-b48b-d69b4d705b8f",
"layout": {
"h": 7,
"i": "66926f5a-d562-428e-b48b-d69b4d705b8f",
"w": 24,
"x": 0,
"y": 109
},
"name": "Total size of stored index data in bytes with all shards on all nodes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_store_size_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c92992cf-3b56-4472-9278-9103b8fe55b7",
"layout": {
"h": 1,
"i": "c92992cf-3b56-4472-9278-9103b8fe55b7",
"w": 24,
"x": 0,
"y": 116
},
"name": "Indices: Index writer",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片索引写入数据量(byte)",
"id": "7837ae60-c490-4e8a-a604-133cdeb5a3e7",
"layout": {
"h": 7,
"i": "7837ae60-c490-4e8a-a604-133cdeb5a3e7",
"w": 24,
"x": 0,
"y": 117
},
"name": "Index writer with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_index_writer_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片索引写入数据量(byte)",
"id": "8f200ba5-8c9b-4a26-b7b3-307cdb504316",
"layout": {
"h": 7,
"i": "8f200ba5-8c9b-4a26-b7b3-307cdb504316",
"w": 24,
"x": 0,
"y": 119
},
"name": "Index writer with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_index_writer_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fb8887c4-c0a4-44c1-83da-f077fa0feb9d",
"layout": {
"h": 1,
"i": "fb8887c4-c0a4-44c1-83da-f077fa0feb9d",
"w": 24,
"x": 0,
"y": 126
},
"name": "Indices: Segments",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片segment数",
"id": "d20fb2f9-0e51-4bd1-a580-1308349e0b71",
"layout": {
"h": 7,
"i": "d20fb2f9-0e51-4bd1-a580-1308349e0b71",
"w": 24,
"x": 0,
"y": 127
},
"name": "Segments with only primary shards on all nodes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_count_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片segment总数",
"id": "b8f5151d-d7d9-4cf2-822e-56ce5b0f58b2",
"layout": {
"h": 7,
"i": "b8f5151d-d7d9-4cf2-822e-56ce5b0f58b2",
"w": 24,
"x": 0,
"y": 129
},
"name": "Segments with all shards on all nodes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_count_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片segment容量",
"id": "c6c1eee0-4c93-462e-8c53-f369fee48e46",
"layout": {
"h": 7,
"i": "c6c1eee0-4c93-462e-8c53-f369fee48e46",
"w": 24,
"x": 0,
"y": 131
},
"name": "Size of segments with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片segment容量",
"id": "d9b36d80-c753-4bde-8e6d-29af8c5a7e83",
"layout": {
"h": 7,
"i": "d9b36d80-c753-4bde-8e6d-29af8c5a7e83",
"w": 24,
"x": 0,
"y": 133
},
"name": "Size of segments with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "abf4df7c-975a-493e-8951-be33c7d3ec6b",
"layout": {
"h": 1,
"i": "abf4df7c-975a-493e-8951-be33c7d3ec6b",
"w": 24,
"x": 0,
"y": 140
},
"name": "Indices: Doc values",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片doc value内存占用(byte)",
"id": "205f362a-ad57-4cb8-ac3f-7afdb5dc87d6",
"layout": {
"h": 7,
"i": "205f362a-ad57-4cb8-ac3f-7afdb5dc87d6",
"w": 24,
"x": 0,
"y": 141
},
"name": "Doc values with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_doc_values_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片doc value内存占用(byte)",
"id": "ea838448-825c-4627-a09d-19db6c1e4a14",
"layout": {
"h": 7,
"i": "ea838448-825c-4627-a09d-19db6c1e4a14",
"w": 24,
"x": 0,
"y": 143
},
"name": "Doc values with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_doc_values_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "43768cea-7ee8-430a-9aa3-e8c38817de55",
"layout": {
"h": 1,
"i": "43768cea-7ee8-430a-9aa3-e8c38817de55",
"w": 24,
"x": 0,
"y": 150
},
"name": "Indices: Fields",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "分片field内存占用(byte)",
"id": "6f8efb01-aac1-427c-bf7b-d6b90a968185",
"layout": {
"h": 7,
"i": "6f8efb01-aac1-427c-bf7b-d6b90a968185",
"w": 24,
"x": 0,
"y": 151
},
"name": "Size of fields with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_fields_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片field内存占用(byte)",
"id": "b82b390b-8f72-427b-9d1f-37d279f94366",
"layout": {
"h": 7,
"i": "b82b390b-8f72-427b-9d1f-37d279f94366",
"w": 24,
"x": 0,
"y": 153
},
"name": "Size of fields with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_fields_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "41285e05-696c-4a94-946f-f75021316665",
"layout": {
"h": 1,
"i": "41285e05-696c-4a94-946f-f75021316665",
"w": 24,
"x": 0,
"y": 160
},
"name": "Indices: Fixed bit",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片fixed bit set内存占用(byte)",
"id": "707cfc5f-cdbe-43dc-9d1f-f79e306ca7ff",
"layout": {
"h": 7,
"i": "707cfc5f-cdbe-43dc-9d1f-f79e306ca7ff",
"w": 24,
"x": 0,
"y": 161
},
"name": "Size of fixed bit with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_fixed_bit_set_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片fixed bit set内存占用(byte)",
"id": "ee949fc8-2c50-4ca3-944f-b42030439968",
"layout": {
"h": 7,
"i": "ee949fc8-2c50-4ca3-944f-b42030439968",
"w": 24,
"x": 0,
"y": 163
},
"name": "Size of fixed bit with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_fixed_bit_set_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "23141b77-a525-44b1-a3b8-9ff4a65af597",
"layout": {
"h": 1,
"i": "23141b77-a525-44b1-a3b8-9ff4a65af597",
"w": 24,
"x": 0,
"y": 170
},
"name": "Indices: Norms",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片normalization factor内存占用(byte)",
"id": "167f10ca-179d-4dfd-ba1d-e2184be993be",
"layout": {
"h": 7,
"i": "167f10ca-179d-4dfd-ba1d-e2184be993be",
"w": 24,
"x": 0,
"y": 171
},
"name": "Size of norms with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_norms_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片normalization factor内存占用(byte)",
"id": "41fba22d-bbb0-4cbd-a8cd-c7910991cecf",
"layout": {
"h": 7,
"i": "41fba22d-bbb0-4cbd-a8cd-c7910991cecf",
"w": 24,
"x": 0,
"y": 173
},
"name": "Size of norms with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_norms_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "1f40b15b-8f0e-4c57-a1fd-602242195863",
"layout": {
"h": 1,
"i": "1f40b15b-8f0e-4c57-a1fd-602242195863",
"w": 24,
"x": 0,
"y": 180
},
"name": "Indices: Points",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片point内存占用(byte)",
"id": "24e451cb-3835-4172-926b-54a33aa3b9b4",
"layout": {
"h": 7,
"i": "24e451cb-3835-4172-926b-54a33aa3b9b4",
"w": 24,
"x": 0,
"y": 181
},
"name": "Size of points with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_points_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片point内存占用(byte)",
"id": "79665ebe-5804-4809-af68-15d3a72a2c2c",
"layout": {
"h": 7,
"i": "79665ebe-5804-4809-af68-15d3a72a2c2c",
"w": 24,
"x": 0,
"y": 183
},
"name": "Size of points with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_points_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a62b3a9f-f3ac-4c4e-9d20-bde6c9eb3372",
"layout": {
"h": 1,
"i": "a62b3a9f-f3ac-4c4e-9d20-bde6c9eb3372",
"w": 24,
"x": 0,
"y": 190
},
"name": "Indices: Terms",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "主分片term内存占用(byte)",
"id": "c464e467-a6cf-4d60-8108-fb0359c801d8",
"layout": {
"h": 7,
"i": "c464e467-a6cf-4d60-8108-fb0359c801d8",
"w": 24,
"x": 0,
"y": 191
},
"name": "Size of terms with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_terms_memory_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片term内存占用(byte)",
"id": "b0befc26-33c8-44f3-aa3e-80a9ed99180a",
"layout": {
"h": 7,
"i": "b0befc26-33c8-44f3-aa3e-80a9ed99180a",
"w": 24,
"x": 0,
"y": 193
},
"name": "Number of terms with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_terms_memory_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "df60be90-b370-4995-93c6-b26ee8584e6f",
"layout": {
"h": 1,
"i": "df60be90-b370-4995-93c6-b26ee8584e6f",
"w": 24,
"x": 0,
"y": 200
},
"name": "Indices: Terms",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片version map内存占用(byte)",
"id": "a2778c3a-6473-4482-bade-398904922e2f",
"layout": {
"h": 7,
"i": "a2778c3a-6473-4482-bade-398904922e2f",
"w": 24,
"x": 0,
"y": 201
},
"name": "Size of version map with only primary shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_version_map_memory_bytes_primary{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有分片version map内存占用(byte)",
"id": "322b0c1b-982a-4654-8552-7f246f496480",
"layout": {
"h": 7,
"i": "322b0c1b-982a-4654-8552-7f246f496480",
"w": 24,
"x": 0,
"y": 203
},
"name": "Size of version map with all shards on all nodes in bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "elasticsearch_indices_segment_version_map_memory_bytes_total{instance=~\"$instance\"}",
"legend": "{{index}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(elasticsearch_indices_docs,cluster)",
"name": "cluster",
"options": [
"elasticsearch-cluster"
],
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(elasticsearch_indices_docs{cluster=\"$cluster\", name!=\"\"},instance)",
"name": "instance",
"options": [
"10.206.0.7:9200"
],
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(elasticsearch_indices_docs{instance=\"$instance\",cluster=\"$cluster\", name!=\"\"},name)",
"multi": true,
"name": "name",
"options": [
"node-2",
"node-1",
"node-3"
],
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327380871000
}
================================================
FILE: integrations/Elasticsearch/markdown/README.md
================================================
# elasticsearch plugin
ElasticSearch 通过 HTTP JSON 的方式暴露了自身的监控指标,通过 categraf 的 [elasticsearch](https://github.com/flashcatcloud/categraf/tree/main/inputs/elasticsearch) 插件抓取。
如果是小规模集群,设置 `local=false`,从集群中某一个节点抓取数据,即可拿到整个集群所有节点的监控数据。如果是大规模集群,建议设置 `local=true`,在集群的每个节点上都部署抓取器,抓取本地 elasticsearch 进程的监控数据。
## 配置示例
categraf 配置文件:`conf/input.elasticsearch/elasticsearch.toml`
```toml
[[instances]]
servers = ["http://192.168.11.177:9200"]
http_timeout = "10s"
local = false
cluster_health = true
cluster_health_level = "cluster"
cluster_stats = true
indices_level = ""
node_stats = ["jvm", "breaker", "process", "os", "fs", "indices", "thread_pool", "transport"]
username = "elastic"
password = "xxxxxxxx"
num_most_recent_indices = 1
labels = { service="es" }
```
================================================
FILE: integrations/Elasticsearch/metrics/categraf-base.json
================================================
[
{
"id": 0,
"uuid": 1717556327385727000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health delayed unassigned 的分片数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_delayed_unassigned_shards",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health delayed unassigned 的分片数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Cluster Health delayed unassigned shards",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327389271000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health Pending task 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_number_of_pending_tasks",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health Pending task 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Cluster Health Pending tasks quantity",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327391502000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health relocating 的分片数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_relocating_shards",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health relocating 的分片数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of shards for Cluster Health relocating",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327393576000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health unassigned 的分片数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_unassigned_shards",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health unassigned 的分片数",
"note": ""
},
{
"lang": "en_US",
"name": "Cluster Health unassigned number of shards",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327396682000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health 健康度状态码",
"unit": "none",
"note": "- 1:Green,绿色状态,表示所有分片都正常\n- 2:Yellow,黄色状态,主分片都正常,从分片有不正常的\n- 3:Red,红色状态,有些主分片不正常",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_status_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health 健康度状态码",
"note": "- 1:Green,绿色状态,表示所有分片都正常\n- 2:Yellow,黄色状态,主分片都正常,从分片有不正常的\n- 3:Red,红色状态,有些主分片不正常"
},
{
"lang": "en_US",
"name": "Cluster Health health status code",
"note": "-1: Green, Green state, indicating that all shards are normal \n-2: Yellow, Yellow state, the main shard is normal, the slave shard is abnormal \n-3: Red, Red state, some main shards are abnormal"
}
]
},
{
"id": 0,
"uuid": 1717556327398665000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health 数据节点数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_number_of_data_nodes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health 数据节点数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Cluster Health data nodes",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327400525000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health 正在初始化的分片数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_initializing_shards",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health 正在初始化的分片数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of shards being initialized by Cluster Health",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327402553000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health 活跃主分片数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_active_primary_shards",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health 活跃主分片数",
"note": ""
},
{
"lang": "en_US",
"name": "Cluster Health Number of active primary shards",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327404570000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health 活跃分片数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_active_shards",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health 活跃分片数",
"note": ""
},
{
"lang": "en_US",
"name": "Cluster Health Active Shards",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327406404000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Cluster Health 节点数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_cluster_health_number_of_nodes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Cluster Health 节点数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Cluster Health nodes",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327408587000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Indexing 平均耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_indexing_index_time_in_millis[3m])\n/\nirate(elasticsearch_indices_indexing_index_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Indexing 平均耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Indexing average time consumption",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327410419000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Merge 平均耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_merges_total_time_in_millis[3m])\n/\nirate(elasticsearch_indices_merges_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Merge 平均耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Average time consumed by Merge",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327413133000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "Query 平均耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_search_query_time_in_millis[3m])\n/\nirate(elasticsearch_indices_search_query_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Query 平均耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Query average time consumption",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327415242000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "每秒 indexing 数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_indexing_index_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每秒 indexing 数量",
"note": ""
},
{
"lang": "en_US",
"name": "indexing per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327417739000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "每秒 merge 大小",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_merges_total_size_in_bytes[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每秒 merge 大小",
"note": ""
},
{
"lang": "en_US",
"name": "merge size per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327419933000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "每秒 merge 数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_merges_total_docs[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每秒 merge 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of merges per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327421867000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "每秒删除 doc 数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_indices_docs_deleted[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "每秒删除 doc 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of docs deleted per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327424001000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "硬盘使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "100 - 100 * elasticsearch_fs_total_available_in_bytes / elasticsearch_fs_total_total_in_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘使用率",
"note": ""
},
{
"lang": "en_US",
"name": "Hard Drive Usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327425727000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "网络流量 - 入向每秒流量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_transport_rx_size_in_bytes[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网络流量 - 入向每秒流量",
"note": ""
},
{
"lang": "en_US",
"name": "Network traffic-inbound traffic per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327428683000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "网络流量 - 出向每秒流量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_transport_tx_size_in_bytes[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网络流量 - 出向每秒流量",
"note": ""
},
{
"lang": "en_US",
"name": "Network traffic-outbound traffic per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327434651000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程 CPU 使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_process_cpu_percent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 CPU 使用率",
"note": ""
},
{
"lang": "en_US",
"name": "Process CPU usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327437231000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程 JVM Heap 使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_jvm_mem_heap_used_percent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 JVM Heap 使用率",
"note": ""
},
{
"lang": "en_US",
"name": "Process JVM Heap Usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327439234000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程 JVM Heap 区 committed 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_jvm_mem_heap_committed_in_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 JVM Heap 区 committed 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Process JVM Heap area committed size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327441202000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程 JVM Non Heap 区 committed 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_jvm_mem_non_heap_committed_in_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 JVM Non Heap 区 committed 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Process JVM Non Heap area committed size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327443058000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程 JVM Old 内存池 used 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_jvm_mem_pools_old_used_in_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 JVM Old 内存池 used 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Process JVM Old memory pool used size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327444862000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程 JVM Young 内存池 used 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "elasticsearch_jvm_mem_pools_young_used_in_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 JVM Young 内存池 used 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Process JVM Young memory pool used size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327447174000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程新生代每秒 GC 次数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_jvm_gc_collectors_young_collection_count[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程新生代每秒 GC 次数",
"note": ""
},
{
"lang": "en_US",
"name": "Process young generation GC count per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327449234000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程新生代每秒 GC 耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_jvm_gc_collectors_young_collection_time_in_millis[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程新生代每秒 GC 耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Process young generation GC time per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327451371000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程老生代每秒 GC 次数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_jvm_gc_collectors_old_collection_count[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程老生代每秒 GC 次数",
"note": ""
},
{
"lang": "en_US",
"name": "Process old generation GC count per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327459172000,
"collector": "Categraf",
"typ": "Elasticsearch",
"name": "进程老生代每秒 GC 耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(elasticsearch_jvm_gc_collectors_old_collection_time_in_millis[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程老生代每秒 GC 耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Process old generation GC time per second",
"note": ""
}
]
}
]
================================================
FILE: integrations/Exec/collect/exec/exec.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# # commands, support glob
commands = [
# "/opt/categraf/scripts/*.sh"
]
# # timeout for each command to complete
# timeout = 5
# # interval = global.interval * interval_times
# interval_times = 1
# # choices: influx prometheus falcon
# # influx stdout example: measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3
# data_format = "influx"
================================================
FILE: integrations/Exec/markdown/README.md
================================================
# 应用场景
```
应用于input插件库exec目录之外的特殊或自定义实现指定业务的监控。
监控脚本采集到监控数据之后通过相应的格式输出到stdout,categraf截获stdout内容,解析之后传给服务端,
脚本的输出格式支持3种:influx、falcon、prometheus,通过 exec.toml 的 `data_format` 配置告诉 Categraf。
data_format有3个值,其用法为:
```
## influx
influx 格式的内容规范:
```
measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3
```
- 首先measurement,表示一个类别的监控指标,比如 connections;
- measurement后面是逗号,逗号后面是标签,如果没有标签,则measurement后面不需要逗号
- 标签是k=v的格式,多个标签用逗号分隔,比如region=beijing,env=test
- 标签后面是空格
- 空格后面是属性字段,多个属性字段用逗号分隔
- 属性字段是字段名=值的格式,在categraf里值只能是数字
最终,measurement和各个属性字段名称拼接成metric名字
## falcon
Open-Falcon的格式如下,举例:
```json
[
{
"endpoint": "test-endpoint",
"metric": "test-metric",
"timestamp": 1658490609,
"step": 60,
"value": 1,
"counterType": "GAUGE",
"tags": "idc=lg,loc=beijing"
},
{
"endpoint": "test-endpoint",
"metric": "test-metric2",
"timestamp": 1658490609,
"step": 60,
"value": 2,
"counterType": "GAUGE",
"tags": "idc=lg,loc=beijing"
}
]
```
timestamp、step、counterType,这三个字段在categraf处理的时候会直接忽略掉,endpoint会放到labels里上报。
## prometheus
prometheus 格式大家不陌生了,比如我这里准备一个监控脚本,输出 prometheus 的格式数据:
```shell
#!/bin/sh
echo '# HELP demo_http_requests_total Total number of http api requests'
echo '# TYPE demo_http_requests_total counter'
echo 'demo_http_requests_total{api="add_product"} 4633433'
```
其中 `#` 注释的部分,其实会被 categraf 忽略,不要也罢,prometheus 协议的数据具体的格式,请大家参考 prometheus 官方文档
# 部署场景
一般在复合型用途或独立的虚拟机启用此插件。
# 前置条件
```
1.需使用人解读每个脚本或程序的逻辑,其脚本或程序顶部有大概作用的描述。
```
# 配置场景
本配置启用或数据定义如下功能:
增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。
响应超时时间为5秒。
commands 字段需正确填写脚本所在位置。
# 修改exec.toml文件配置
```
[root@aliyun input.exec]# vi exec.toml
# # collect interval
# interval = 15
[[instances]]
# # commands, support glob
commands = [
"/opt/categraf/scripts/*/collect_*.sh"
#"/opt/categraf/scripts/*/collect_*.py"
#"/opt/categraf/scripts/*/collect_*.go"
#"/opt/categraf/scripts/*/collect_*.lua"
#"/opt/categraf/scripts/*/collect_*.java"
#"/opt/categraf/scripts/*/collect_*.bat"
#"/opt/categraf/scripts/*/collect_*.cmd"
#"/opt/categraf/scripts/*/collect_*.ps1"
]
# # timeout for each command to complete
# timeout = 5
# # interval = global.interval * interval_times
# interval_times = 1
# # measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3
data_format = "influx"
```
# 测试配置
```
以cert/collect_cert_expiretime.sh为例:
sh /opt/categraf/scripts/cert/collect_cert_expiretime.sh 出现:
cert,cloud=huaweicloud,region=huabei-beijing-4,azone=az1,product=cert,domain_name=www.baidu.com expire_days=163
cert,cloud=huaweicloud,region=huabei-beijing-4,azone=az1,product=cert,domain_name=www.weibo.com expire_days=85
cert,cloud=huaweicloud,region=huabei-beijing-4,azone=az1,product=cert,domain_name=www.csdn.net expire_days=281
```
# 重启服务
```
重启categraf服务生效
systemctl daemon-reload && systemctl restart categraf && systemctl status categraf
查看启动日志是否有错误
journalctl -f -n 500 -u categraf | grep -E 'E!|W!'
```
# 检查数据呈现
如图:

================================================
FILE: integrations/Filecount/collect/filecount/filecount.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Directories to gather stats about.
## This accepts standard unix glob matching rules, but with the addition of
## ** as a "super asterisk". ie:
## /var/log/** -> recursively find all directories in /var/log and count files in each directory
## /var/log/*/* -> find all directories with a parent dir in /var/log and count files in each directory
## /var/log -> count all files in /var/log and all of its subdirectories
## directories = ["/var/cache/apt", "/tmp"]
directories = ["/tmp"]
## Only count files that match the name pattern. Defaults to "*".
file_name = "*"
## Count files in subdirectories. Defaults to true.
recursive = true
## Only count regular files. Defaults to true.
regular_only = true
## Follow all symlinks while walking the directory tree. Defaults to false.
follow_symlinks = false
## Only count files that are at least this size. If size is
## a negative number, only count files that are smaller than the
## absolute value of size. Acceptable units are B, KiB, MiB, KB, ...
## Without quotes and units, interpreted as size in bytes.
size = "0B"
## Only count files that have not been touched for at least this
## duration. If mtime is negative, only count files that have been
## touched in this duration. Defaults to "0s".
mtime = "0s"
================================================
FILE: integrations/Filecount/markdown/README.md
================================================
# Filecount Input Plugin
forked from telegraf/inputs.filecount
Reports the number and total size of files in specified directories.
## Configuration
```toml filecount.toml
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Directories to gather stats about.
## This accepts standard unix glob matching rules, but with the addition of
## ** as a "super asterisk". ie:
## /var/log/** -> recursively find all directories in /var/log and count files in each directory
## /var/log/*/* -> find all directories with a parent dir in /var/log and count files in each directory
## /var/log -> count all files in /var/log and all of its subdirectories
## directories = ["/var/cache/apt", "/tmp"]
directories = ["/tmp", "/root"]
## Only count files that match the name pattern. Defaults to "*".
file_name = "*"
## Count files in subdirectories. Defaults to true.
recursive = true
## Only count regular files. Defaults to true.
regular_only = true
## Follow all symlinks while walking the directory tree. Defaults to false.
follow_symlinks = false
## Only count files that are at least this size. If size is
## a negative number, only count files that are smaller than the
## absolute value of size. Acceptable units are B, KiB, MiB, KB, ...
## Without quotes and units, interpreted as size in bytes.
size = "0B"
## Only count files that have not been touched for at least this
## duration. If mtime is negative, only count files that have been
## touched in this duration. Defaults to "0s".
mtime = "0s"
```
## Metrics
- filecount
- tags:
- directory (the directory path)
- fields:
- count (integer)
- size_bytes (integer)
- oldest_file_timestamp (int, unix time nanoseconds)
- newest_file_timestamp (int, unix time nanoseconds)
## Example Output
```text
13:25:07 filecount_count agent_hostname=host1 directory=/tmp 319
13:25:07 filecount_size_bytes agent_hostname=host1 directory=/tmp 83196547
13:25:07 filecount_oldest_file_timestamp agent_hostname=host1 directory=/tmp 0
13:25:07 filecount_newest_file_timestamp agent_hostname=host1 directory=/tmp 1692336254306413522
```
================================================
FILE: integrations/Gitlab/alerts/gitlab_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "HighgRPCResourceExhaustedRate",
"note": "High gRPC ResourceExhausted error rate",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total{grpc_code=\"ResourceExhausted\"}[5m])\n) / sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total[5m])\n) * 100 \u003e 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total{grpc_code=\"ResourceExhausted\"}[5m])\n) / sum without (grpc_code, grpc_method, grpc_service, grpc_type) (\n rate(grpc_server_handled_total[5m])\n) * 100 \u003e 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327473722000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "HighPumaUtilization",
"note": "Puma instance {{ $labels.instance }} has more than 90% thread utilization ({{ $value | printf \"%.1f\" }}%) over the last 60 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "instance:puma_utilization:ratio * 100 \u003e 90",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "instance:puma_utilization:ratio * 100 \u003e 90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327474171000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PostgresDatabaseDeadlockCancels",
"note": "Postgres database has queries canceled due to deadlocks",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327474653000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PostgresDatabaseDeadlocks",
"note": "Postgres database has deadlocks",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(pg_stat_database_deadlocks[5m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327475111000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PostgresDown",
"note": "The Postgres service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(pg_up[5m]) * 100 \u003c 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(pg_up[5m]) * 100 \u003c 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327475499000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PumaQueueing",
"note": "Puma instance {{ $labels.instance }} is queueing requests with an average of {{ $value | printf \"%.1f\" }} over the last 30 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(puma_queued_connections[30m]) \u003e 1",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(puma_queued_connections[30m]) \u003e 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327475980000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "RedisDown",
"note": "The Redis service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(redis_up[5m]) * 100 \u003c 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(redis_up[5m]) * 100 \u003c 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327476393000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ServiceDown",
"note": "The service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "avg_over_time(up[5m]) * 100 \u003c 50",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(up[5m]) * 100 \u003c 50",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327476862000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "SidekiqJobsQueuing",
"note": "Sidekiq has jobs queued",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum by (name) (sidekiq_queue_size) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by (name) (sidekiq_queue_size) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327477358000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "WorkhorseHighErrorRate",
"note": "Workhorse has high error rates",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "(\n sum without (job, code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~\"5..\"}\n ) /\n sum without (job,code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m\n ) \u003c 10\n) * 100 \u003e 50\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n sum without (job, code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m{code=~\"5..\"}\n ) /\n sum without (job,code) (\n job_route_method_code:gitlab_workhorse_http_request_duration_seconds_count:rate5m\n ) \u003c 10\n) * 100 \u003e 50\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327477956000
}
]
================================================
FILE: integrations/Gitlab/dashboards/MachinePerformance.json
================================================
{
"id": 0,
"group_id": 0,
"name": "GitLab - Machine Performance",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "a4389d18-8aed-4207-8fa9-4b25da036d6a",
"layout": {
"h": 3,
"i": "a4389d18-8aed-4207-8fa9-4b25da036d6a",
"isResizable": true,
"w": 5,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"decimals": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "node_boot_time_seconds{instance=~\"$instance\"} / 60 / 60 /24 / 30 / 365",
"legend": "Uptime",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "service"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fe9d5645-e775-446e-876b-35852b751961",
"layout": {
"h": 3,
"i": "fe9d5645-e775-446e-876b-35852b751961",
"isResizable": true,
"w": 2,
"x": 5,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Running",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "node_boot_time_seconds{instance=~\"$instance\"} / 60 / 60 /24 / 30 / 365",
"legend": "Uptime",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "avg",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "3190bf4b-c7a3-4044-955c-f488b31a200b",
"layout": {
"h": 3,
"i": "3190bf4b-c7a3-4044-955c-f488b31a200b",
"isResizable": true,
"w": 5,
"x": 7,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "NTP",
"options": {
"standardOptions": {
"util": "milliseconds"
},
"valueMappings": []
},
"targets": [
{
"expr": "node_timex_sync_status{instance=\"$instance\"}",
"legend": "NTP Enabled",
"refId": "A"
},
{
"expr": "node_timex_offset_seconds{instance=\"$instance\"}",
"legend": "NTP Offset",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "f1c435fa-8c9d-40cc-96ab-54889f32a178",
"layout": {
"h": 3,
"i": "f1c435fa-8c9d-40cc-96ab-54889f32a178",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "System Info",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "count(count(node_cpu_seconds_total{instance=~\"$instance\"}) by (cpu))",
"legend": "Core",
"refId": "A"
},
{
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "Mem",
"refId": "B"
},
{
"expr": "node_memory_SwapTotal_bytes{instance=~\"$instance\"}",
"legend": "Swap",
"refId": "C"
},
{
"expr": "sum(node_filesystem_size_bytes{instance=~\"$instance\"}) ",
"legend": "Disk",
"refId": "D"
},
{
"expr": "node_filefd_allocated{instance=~\"$instance\"}",
"legend": "Openfiles",
"refId": "E"
},
{
"expr": "rate(node_context_switches_total{instance=~\"$instance\"}[1m])",
"legend": "ContextSwitch",
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "d0add8e0-c815-4442-8a3b-d86d3bc3d8fd",
"layout": {
"h": 7,
"i": "d0add8e0-c815-4442-8a3b-d86d3bc3d8fd",
"isResizable": true,
"w": 9,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Load Average",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(node_load1{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "1m",
"refId": "A"
},
{
"expr": "max(node_load5{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "5m",
"refId": "B"
},
{
"expr": "max(node_load15{instance=~\"$instance\"}) / max(count(node_cpu_seconds_total{instance=~\"$instance\", mode=~\"system\"}))",
"legend": "15m",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "83f3134f-1fdb-4bd6-bc2e-7d32222c2660",
"layout": {
"h": 7,
"i": "83f3134f-1fdb-4bd6-bc2e-7d32222c2660",
"isResizable": true,
"w": 15,
"x": 9,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Resource Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "1 - (avg(irate(node_cpu_seconds_total{instance=~\"$instance\",mode=~\"idle\"}[2m])))",
"legend": "CPU",
"refId": "A"
},
{
"expr": "(node_memory_MemTotal_bytes{instance=~\"$instance\"} - (node_memory_MemFree_bytes{instance=~\"$instance\"} + node_memory_Buffers_bytes{instance=~\"$instance\"} + node_memory_Cached_bytes{instance=~\"$instance\"})) / node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "MEM",
"refId": "B"
},
{
"expr": "avg(irate(node_cpu_seconds_total{instance=~\"$instance\",mode=~\"iowait\"}[2m]))",
"legend": "IOWAIT",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2a57ae69-c0e1-447c-825d-bcee2bcb6d3f",
"layout": {
"h": 5,
"i": "2a57ae69-c0e1-447c-825d-bcee2bcb6d3f",
"isResizable": true,
"w": 24,
"x": 0,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Memory Usage",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "node_memory_MemAvailable_bytes{instance=~\"$instance\"}",
"legend": "Available",
"refId": "A"
},
{
"expr": "node_memory_Buffers_bytes{instance=~\"$instance\"} + node_memory_Cached_bytes{instance=~\"$instance\"}",
"legend": "Buffers/Cached",
"refId": "B"
},
{
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"} - node_memory_MemAvailable_bytes{instance=~\"$instance\"}",
"legend": "Used",
"refId": "C"
},
{
"expr": "node_memory_MemFree_bytes{instance=~\"$instance\"}",
"legend": "Free",
"refId": "D"
},
{
"expr": "node_memory_MemTotal_bytes{instance=~\"$instance\"}",
"legend": "Total",
"refId": "E"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "f291673f-3b9d-4b4f-a2b1-2ed7131e57f0",
"layout": {
"h": 9,
"i": "f291673f-3b9d-4b4f-a2b1-2ed7131e57f0",
"isResizable": true,
"w": 15,
"x": 0,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Disk IO",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum by (instance) (irate(node_disk_reads_completed_total{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}}-Reads",
"refId": "A"
},
{
"expr": "sum by (instance) (irate(node_disk_writes_completed_total{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}}-Writes",
"refId": "B"
},
{
"expr": "sum by (instance) (node_disk_io_now{instance=~\"$instance\"})",
"legend": "{{instance}}-CurrentIO",
"refId": "C"
},
{
"expr": "sum by (instance) (irate(node_disk_read_time_seconds_total{instance=~\"$instance\"}[5m])) / sum by (instance) (irate(node_disk_reads_completed_total{instance=~\"$instance\"}[5m]))",
"legend": "{{instance}}-Read-consuming",
"refId": "D"
},
{
"expr": "sum by (instance) (irate(node_disk_write_time_seconds_total{instance=~\"$instance\"}[5m])) / sum by (instance) (irate(node_disk_writes_completed_total{instance=~\"$instance\"}[5m]))",
"legend": "{{instance}}-Write-consuming",
"refId": "E"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "8cb012a2-113c-4d06-8bea-da7d14a870d2",
"layout": {
"h": 9,
"i": "8cb012a2-113c-4d06-8bea-da7d14a870d2",
"isResizable": true,
"w": 9,
"x": 15,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Disk Usage",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "node_filesystem_avail_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"}",
"instant": false,
"legend": "{{device}}",
"refId": "A"
},
{
"expr": "1-(node_filesystem_free_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"} / node_filesystem_size_bytes{instance=\"$instance\",fstype=~\"ext4|xfs|nfs\"})",
"legend": "{{device}}",
"refId": "B"
},
{
"expr": "sum(node_filesystem_size_bytes{instance=~\"$instance\",fstype=~\"ext4|xfs|nfs\"})",
"legend": "{{device}}",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2f11831d-6bdd-4cfa-9b3f-3bc5b1722185",
"layout": {
"h": 10,
"i": "2f11831d-6bdd-4cfa-9b3f-3bc5b1722185",
"isResizable": true,
"w": 24,
"x": 0,
"y": 24
},
"links": [],
"maxPerRow": 4,
"name": "Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (instance) (irate(node_network_receive_bytes_total{instance=~\"$instance\",device!~\"tap.*|veth.*|br.*|docker.*|virbr*|lo*|nointernet*\"}[5m]))",
"legend": "{{instance}}-receive",
"refId": "A"
},
{
"expr": "sum by (instance) (irate(node_network_transmit_bytes_total{instance=~\"$instance\",device!~\"tap.*|veth.*|br.*|docker.*|virbr*|lo*|nointernet*\"}[5m]))",
"legend": "{{instance}}-transmit",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "query_result(node_boot_time_seconds)",
"multi": false,
"name": "instance",
"reg": "/instance=\"(?\u003ctext\u003e[^\"]*)/",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327479287000
}
================================================
FILE: integrations/Gitlab/dashboards/NGINXVTS.json
================================================
{
"id": 0,
"group_id": 0,
"name": "GitLab - NGINX VTS",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "ad321a25-c895-4cf0-b5ce-2a60c7f035ac",
"layout": {
"h": 4,
"i": "ad321a25-c895-4cf0-b5ce-2a60c7f035ac",
"w": 7,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Nginx Info",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "topk(1, count by (hostname) (nginx_vts_info{instance=~\"$instance\"}))",
"legend": "hostname: {{hostname}}",
"refId": "B"
},
{
"expr": "topk(1, count by (version) (nginx_vts_info{instance=~\"$instance\"}))",
"legend": "nginx: {{version}}",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "f6643654-98f5-4d4a-8dd2-943a2f4f8e68",
"layout": {
"h": 12,
"i": "f6643654-98f5-4d4a-8dd2-943a2f4f8e68",
"w": 17,
"x": 7,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Latency",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum by (instance) (\n rate(nginx_vts_upstream_request_seconds_total{instance=~\"$instance\"}[5m])\n) /\nsum by (instance) (\n rate(nginx_vts_upstream_requests_total{instance=~\"$instance\"}[5m])\n)",
"instant": false,
"legend": "{{instance}}",
"refId": "A",
"step": 15,
"time": {
"end": "now",
"start": "now-6h"
}
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "cb24fbce-8bb5-425e-89f3-c32e22532b5d",
"layout": {
"h": 8,
"i": "cb24fbce-8bb5-425e-89f3-c32e22532b5d",
"w": 7,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Shared Memory Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (shared) (nginx_vts_main_shm_usage_bytes{instance=~\"$instance\", job=~\"nginx\"})",
"legend": "{{shared}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "54c5264a-a596-486f-a4a8-4ab451d91870",
"layout": {
"h": 8,
"i": "54c5264a-a596-486f-a4a8-4ab451d91870",
"w": 12,
"x": 0,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Server - Current total of incoming / outgoing bytes",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum without (instance) (\n rate(nginx_vts_server_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{direction}}-{{host}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "d0efa52f-862e-49b0-aa8c-7a9ba4f2af0b",
"layout": {
"h": 8,
"i": "d0efa52f-862e-49b0-aa8c-7a9ba4f2af0b",
"w": 12,
"x": 12,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Upstream - Current total of incoming / outgoing bytes",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum without (instance) (\n rate(nginx_vts_upstream_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{direction}}-{{backend}}-{{upstream}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "712b82cd-633a-4088-8695-fa2d75ab37ca",
"layout": {
"h": 8,
"i": "712b82cd-633a-4088-8695-fa2d75ab37ca",
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Server - Requests by HTTP code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum without (instance) (\n rate(nginx_vts_server_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{code}} {{host}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "09ca96b6-4e23-4db2-bc14-79c5f965d16f",
"layout": {
"h": 8,
"i": "09ca96b6-4e23-4db2-bc14-79c5f965d16f",
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Upstream - Requests by HTTP code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum without (instance) (\n rate(nginx_vts_upstream_requests_total{instance=~\"$instance\"}[5m])\n)",
"legend": "{{code}} {{backend}} {{upstream}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"nginx\"}, instance)",
"multi": true,
"name": "instance",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327480972000
}
================================================
FILE: integrations/Gitlab/dashboards/Overview.json
================================================
{
"id": 0,
"group_id": 0,
"name": "GitLab - Overview",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"title": "GitLab Docs",
"url": "https://docs.gitlab.com/"
}
],
"panels": [
{
"custom": {
"alignItems": "center",
"bgColor": "#FFFFFF",
"content": "\u003cbr\u003e\u003ch1\u003e\u003ci\u003e\u003cfont color=#5991A7\u003e\u003cb\u003eGitLab Service Status\u003c/b\u003e\u003c/font\u003e\u003c/i\u003e\u003c/h1\u003e\n",
"justifyContent": "center",
"textColor": "#000000",
"textSize": 12
},
"id": "ffcfeb75-3a21-40b1-8fe7-313aa3e5f4e3",
"layout": {
"h": 3,
"i": "ffcfeb75-3a21-40b1-8fe7-313aa3e5f4e3",
"isResizable": true,
"w": 24,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "",
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"alignItems": "center",
"bgColor": "#FFFFFF",
"content": "\u003cp style=\"text-align:center;\"\u003e\u003cimg src=\"https://www.cloudfoundry.org/wp-content/uploads/2017/10/icon_gitlab_cf@2x.png\" width=80px/\u003e\u003c/p\u003e",
"justifyContent": "center",
"textColor": "#000000",
"textSize": 12
},
"id": "9991440f-1e01-4807-8911-2619329af244",
"layout": {
"h": 3,
"i": "9991440f-1e01-4807-8911-2619329af244",
"isResizable": true,
"w": 2,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "GitLab",
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"calc": "first",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "version"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "f3fcab1f-d198-48c6-bf1d-44e481a9fa7a",
"layout": {
"h": 3,
"i": "f3fcab1f-d198-48c6-bf1d-44e481a9fa7a",
"isResizable": true,
"w": 5,
"x": 2,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "GitLab Version",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
]
},
"targets": [
{
"expr": "topk(1, count by (version) (gitlab_build_info{job=~\"gitlab-workhorse\"}))",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#f0ee6e",
"#6ba261",
"#306d52"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7a4c7be6-56de-4c76-8c2f-cef5a80e84b5",
"layout": {
"h": 3,
"i": "7a4c7be6-56de-4c76-8c2f-cef5a80e84b5",
"isResizable": true,
"w": 17,
"x": 7,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Service Status",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(up{job!~\"gitlab_exporter.*|node\",instance!~\".*:9168\",service=~\"gitlab\"}) by (job) / count(up{job!~\"gitlab_exporter.*|node\",instance!~\".*:9168\",service=~\"gitlab\"}) by (job) * 100",
"legend": "{{job}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"content": "\u003cbr\u003e\u003ch1\u003e\u003ci\u003e\u003cfont color=#5991A7\u003e\u003cb\u003eSidekiq Statistics\u003c/b\u003e\u003c/font\u003e\u003c/i\u003e\u003c/h1\u003e\n",
"version": "3.0.0"
},
"id": "9efa19b0-18fd-4f4d-abee-cebe09a36803",
"layout": {
"h": 2,
"i": "9efa19b0-18fd-4f4d-abee-cebe09a36803",
"isResizable": true,
"w": 24,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [],
"style": "line"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [],
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fd0e96e0-7577-4475-8287-143d9d7e0bc7",
"layout": {
"h": 4,
"i": "fd0e96e0-7577-4475-8287-143d9d7e0bc7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Background Jobs",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(sidekiq_jobs_processed_total)",
"legend": "Processed",
"refId": "A"
},
{
"expr": "max(sidekiq_jobs_failed_total)",
"legend": "Failed",
"refId": "B"
},
{
"expr": "max(sidekiq_jobs_enqueued_size)",
"legend": "Enqueued",
"refId": "C"
},
{
"expr": "max(sidekiq_jobs_scheduled_size)",
"legend": "Scheduled",
"refId": "D"
},
{
"expr": "max(sidekiq_jobs_retry_size)",
"legend": "Retry",
"refId": "E"
},
{
"expr": "max(sidekiq_jobs_dead_size)",
"legend": "Dead",
"refId": "F"
},
{
"expr": "max(sidekiq_processes_size)",
"legend": "Processes",
"refId": "G"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "1645b734-ef79-4ff5-8be4-a24747cf4e6e",
"layout": {
"h": 4,
"i": "1645b734-ef79-4ff5-8be4-a24747cf4e6e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Queue Size Top 10",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "topk(10, avg_over_time(sidekiq_queue_size{}[5m]))",
"legend": "{{name}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"content": "\u003cbr\u003e\u003ch1\u003e\u003ci\u003e\u003cfont color=#5991A7\u003e\u003cb\u003eWorkhorse Statistics\u003c/b\u003e\u003c/font\u003e\u003c/i\u003e\u003c/h1\u003e",
"version": "3.0.0"
},
"id": "e25e6cfb-020b-4743-8de9-ef6240144a94",
"layout": {
"h": 2,
"i": "e25e6cfb-020b-4743-8de9-ef6240144a94",
"isResizable": true,
"w": 24,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [],
"style": "line"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [],
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "dd030993-481d-43e4-887a-3c0dee279ea0",
"layout": {
"h": 9,
"i": "dd030993-481d-43e4-887a-3c0dee279ea0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "HTTP Request Total",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (code) (\n rate(gitlab_workhorse_http_requests_total{instance=~\".*:9229\"}[5m])\n)",
"legend": "{{code}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "77343eca-60d3-4be9-b80d-6db471399339",
"layout": {
"h": 9,
"i": "77343eca-60d3-4be9-b80d-6db471399339",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "Workhorse Latency",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (le) (\n rate(gitlab_workhorse_http_request_duration_seconds_bucket{instance=~\".*:9229\"}[5m])\n)",
"legend": "{{le}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327482615000
}
================================================
FILE: integrations/Gitlab/dashboards/PostgreSQL.json
================================================
{
"id": 0,
"group_id": 0,
"name": "GitLab - PostgreSQL",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "start time of the process",
"id": "dbbd49e7-f2e4-49e1-ad75-1010e5c24266",
"layout": {
"h": 4,
"i": "dbbd49e7-f2e4-49e1-ad75-1010e5c24266",
"w": 5,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Start Time",
"options": {
"standardOptions": {
"util": "datetimeMilliseconds"
},
"thresholds": {
"steps": [
{
"color": "#56A64B",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "min(pg_postmaster_start_time_seconds{instance=~\"$instance\"} * 1000)",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "0a2e21b9-2cb0-46b4-99f3-141896ddce93",
"layout": {
"h": 4,
"i": "0a2e21b9-2cb0-46b4-99f3-141896ddce93",
"w": 15,
"x": 5,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Memory Status",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "pg_settings_shared_buffers_bytes{instance=~\"$instance\"}",
"legend": "Shared Buffers",
"refId": "A"
},
{
"expr": "pg_settings_wal_buffers_bytes{instance=~\"$instance\"}",
"legend": "WAL Buffers",
"refId": "B"
},
{
"expr": "pg_settings_work_mem_bytes{instance=~\"$instance\"}",
"legend": "Work Mem",
"refId": "D"
},
{
"expr": "pg_settings_maintenance_work_mem_bytes{instance=~\"$instance\"}",
"legend": "Maintenance Work Mem",
"refId": "E"
},
{
"expr": "pg_settings_temp_buffers_bytes{instance=~\"$instance\"}",
"legend": "Temp Buffers",
"refId": "F"
},
{
"expr": "pg_settings_wal_buffers_bytes{instance=~\"$instance\"}",
"legend": "WAL Buffers",
"refId": "G"
},
{
"expr": "pg_settings_wal_segment_size_bytes{instance=~\"$instance\"}",
"legend": "WAL Segment Size",
"refId": "H"
},
{
"expr": "pg_settings_wal_keep_segments{instance=~\"$instance\"}",
"legend": "WAL Keep Segments",
"refId": "I"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "87c6f320-cf0c-4b71-9a95-53144518cb4a",
"layout": {
"h": 4,
"i": "87c6f320-cf0c-4b71-9a95-53144518cb4a",
"w": 4,
"x": 20,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Worker",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "pg_settings_max_worker_processes{instance=~\"$instance\"}",
"legend": "Max Workers",
"refId": "A"
},
{
"expr": "pg_settings_max_parallel_workers{instance=~\"$instance\"}",
"legend": "Max Parallel Workers",
"refId": "B"
},
{
"expr": "pg_settings_max_parallel_workers_per_gather{instance=~\"$instance\"}",
"legend": "Max Parallel Workers Per Gather",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "6e5f1534-1f9f-401f-b1bb-b42690b59b76",
"layout": {
"h": 4,
"i": "6e5f1534-1f9f-401f-b1bb-b42690b59b76",
"w": 4,
"x": 0,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Current Conn",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(pg_stat_activity_count{datname=~\"$datname\", instance=~\"$instance\"})",
"legend": " Current Conn",
"refId": "A"
},
{
"expr": "pg_settings_max_connections{instance=~\"$instance\"}",
"legend": " {{instance}}-Max Conn",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Lag behind master in seconds.\n\nOnly available on a standby System.",
"id": "af3c4f59-a47b-4556-bcd5-e1ca89057e86",
"layout": {
"h": 4,
"i": "af3c4f59-a47b-4556-bcd5-e1ca89057e86",
"w": 3,
"x": 4,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Replication Lag ",
"options": {
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 1
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "max(pg_replication_lag{instance=\"$instance\"})",
"legend": "",
"refId": "A",
"step": 120,
"time": {
"end": "now",
"start": "now-15m"
}
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "c923682c-4337-4d20-82f5-db463714234b",
"layout": {
"h": 4,
"i": "c923682c-4337-4d20-82f5-db463714234b",
"w": 8,
"x": 7,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "SQL",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(pg_stat_database_tup_inserted{datname=~\"$datname\", instance=~\"$instance\"})",
"legend": "Tuples Inserted",
"refId": "A"
},
{
"expr": "sum(pg_stat_database_tup_updated{datname=~\"$datname\", instance=~\"$instance\"})",
"legend": "Tuples Updated",
"refId": "B"
},
{
"expr": "sum(pg_stat_database_tup_deleted{datname=~\"$datname\", instance=~\"$instance\"})",
"legend": "Tuples Deleted",
"refId": "C"
},
{
"expr": "sum(pg_stat_database_tup_fetched{datname=~\"$datname\", instance=~\"$instance\"})",
"legend": "Tuples Fetched",
"refId": "D"
},
{
"expr": "sum(pg_stat_database_tup_returned{datname=~\"$datname\", instance=~\"$instance\"})",
"legend": "Tuples Returned",
"refId": "E"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "d8c5523d-0fa2-417f-8739-d20e5d48e234",
"layout": {
"h": 4,
"i": "d8c5523d-0fa2-417f-8739-d20e5d48e234",
"w": 9,
"x": 15,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Database Size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(pg_database_size_bytes{instance=~\"$instance\",datname!~\"template.*|postgres.*\"}) by (datname)",
"legend": " {{datname}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "08ab8c71-d04a-463e-9857-e52c05f08877",
"layout": {
"h": 8,
"i": "08ab8c71-d04a-463e-9857-e52c05f08877",
"w": 12,
"x": 0,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Connections",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(pg_stat_activity_count{instance=~\"$instance\"})",
"legend": "CurrConn",
"refId": "A"
},
{
"expr": "sum(pg_settings_max_connections{instance=~\"$instance\"})",
"legend": "MaxConn",
"refId": "B"
},
{
"expr": "sum(pg_stat_activity_count{instance=~\"$instance\"}) * 100 / sum(pg_settings_max_connections{instance=~\"$instance\"})",
"legend": "RatioConn",
"refId": "C"
},
{
"expr": "sum(pg_stat_activity_count{instance=~\"$instance\",state!~\"idle\"}) * 100 / sum(pg_stat_activity_count{instance=~\"$instance\"})",
"legend": "SaturationConn",
"refId": "D"
},
{
"expr": "sum(pg_stat_activity_count{instance=~\"$instance\"}) by (state)",
"legend": "{{state}}",
"refId": "E"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "9f4089ed-bd47-47a5-a26d-17ab97262ee0",
"layout": {
"h": 8,
"i": "9f4089ed-bd47-47a5-a26d-17ab97262ee0",
"w": 12,
"x": 12,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Cache Hit Rate",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "pg_stat_database_blks_hit{instance=~\"$instance\",datname=~\"$datname\"} / (pg_stat_database_blks_read{instance=~\"$instance\"} + pg_stat_database_blks_hit{instance=~\"$instance\"})",
"legend": "Cache Hit Ratio - {{datname}} ",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "1-minute rate of transactions committed or rolled back.",
"id": "16780d96-8d43-439a-acae-cdf763074884",
"layout": {
"h": 9,
"i": "16780d96-8d43-439a-acae-cdf763074884",
"w": 12,
"x": 0,
"y": 18
},
"links": [],
"maxPerRow": 4,
"name": "Transactions",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(pg_stat_database_xact_commit{instance=~\"$instance\"}[1m])) by (datname) !=0",
"legend": "commit-{{datname}}",
"refId": "A"
},
{
"expr": "sum(rate(pg_stat_database_xact_rollback{instance=~\"$instance\"}[1m])) by (datname) !=0",
"legend": "rollback-{{datname}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "bb023ef8-c926-4380-8c40-a972a5d69dd3",
"layout": {
"h": 9,
"i": "bb023ef8-c926-4380-8c40-a972a5d69dd3",
"w": 12,
"x": 12,
"y": 18
},
"links": [],
"maxPerRow": 4,
"name": "Longest Transaction",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(pg_stat_activity_max_tx_duration{instance=~\"$instance\"}) by (datname)",
"legend": "{{datname}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Should be 0 \n\nSource: pg_stat_database\n\nWith log_lock_waits turned on, deadlocks will be logged to the PostgreSQL Logfiles.",
"id": "2db8688d-c88a-4d8c-93cf-5889de67e3db",
"layout": {
"h": 10,
"i": "2db8688d-c88a-4d8c-93cf-5889de67e3db",
"w": 12,
"x": 0,
"y": 27
},
"links": [],
"maxPerRow": 4,
"name": "Deadlocks",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum by (datname) (pg_stat_database_deadlocks{instance=~\"$instance\"})",
"legend": "{{datname}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Source: pg_locks",
"id": "c96d2e4e-1e72-484f-bbbe-7194ac306d97",
"layout": {
"h": 10,
"i": "c96d2e4e-1e72-484f-bbbe-7194ac306d97",
"w": 12,
"x": 12,
"y": 27
},
"links": [],
"maxPerRow": 4,
"name": "Locks by state",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum by (mode) (pg_locks_count{instance=~\"$instance\",datname=~\"$datname\"})",
"legend": "{{mode}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "* blk_read_time: Time spent reading data file blocks by backends in this database, in milliseconds\n* blk_write_time: Time spent writing data file blocks by backends in this database, in milliseconds\n\ntrack_io_timing needs to be activated",
"id": "bc11b349-8f36-4606-9ac1-a229ebe9a709",
"layout": {
"h": 9,
"i": "bc11b349-8f36-4606-9ac1-a229ebe9a709",
"w": 12,
"x": 0,
"y": 38
},
"links": [],
"maxPerRow": 4,
"name": "I/O Read/Write Time",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum (pg_stat_database_blk_read_time{instance=~\"$instance\",datname=~\"$datname\"})",
"legend": "blk_read_time",
"refId": "A"
},
{
"expr": "sum (pg_stat_database_blk_write_time{instance=~\"$instance\",datname=~\"$datname\"})",
"legend": "blk_write_time",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "a67d7b7a-c34a-4277-b3fc-fc10b4b88ec5",
"layout": {
"h": 9,
"i": "a67d7b7a-c34a-4277-b3fc-fc10b4b88ec5",
"w": 12,
"x": 12,
"y": 38
},
"links": [],
"maxPerRow": 4,
"name": "Checkpoint Stats",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "irate(pg_stat_bgwriter_checkpoint_write_time_total{instance=~\"$instance\"}[5m])",
"legend": "write_time",
"refId": "B"
},
{
"expr": "irate(pg_stat_bgwriter_checkpoint_sync_time_total{instance=~\"$instance\"}[5m])",
"legend": "sync_time",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=~'postgres'},instance)",
"multi": false,
"name": "instance",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(datname)",
"multi": true,
"name": "datname",
"reg": "/^(?!template)/",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327484272000
}
================================================
FILE: integrations/Gitlab/dashboards/Redis.json
================================================
{
"id": 0,
"group_id": 0,
"name": "GitLab - Redis",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "4ff9943b-6edd-4b6d-960c-0b992b8e1fd3",
"layout": {
"h": 3,
"i": "4ff9943b-6edd-4b6d-960c-0b992b8e1fd3",
"w": 4,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"decimals": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "avg(time() - redis_start_time_seconds{instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "38a6c015-4dae-4765-8f5f-9961ca302e18",
"layout": {
"h": 3,
"i": "38a6c015-4dae-4765-8f5f-9961ca302e18",
"w": 4,
"x": 4,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Clients",
"options": {
"standardOptions": {
"decimals": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum(\n avg_over_time(redis_connected_clients{instance=~\"$instance\"}[5m])\n)",
"legend": "Clients",
"refId": "A"
},
{
"expr": "sum(\n avg_over_time(redis_connected_slaves{instance=~\"$instance\"}[5m])\n)",
"legend": "Slaves",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "0739f047-3fcd-4bf0-ac63-09c0a4170452",
"layout": {
"h": 6,
"i": "0739f047-3fcd-4bf0-ac63-09c0a4170452",
"w": 8,
"x": 8,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Commands Executed",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(\n rate(redis_commands_processed_total{instance=~\"$instance\"}[5m])\n)",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7b23fa98-74eb-49b5-a29b-cb68fbd6d4d9",
"layout": {
"h": 6,
"i": "7b23fa98-74eb-49b5-a29b-cb68fbd6d4d9",
"w": 8,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Hits, Misses per Second",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(\n rate(redis_keyspace_hits_total{instance=~\"$instance\"}[5m])\n)",
"legend": "hits",
"refId": "A"
},
{
"expr": "sum(\n rate(redis_keyspace_misses_total{instance=~\"$instance\"}[5m])\n)",
"legend": "misses",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "8a29cc91-697c-48d7-a613-14e81aeb6772",
"layout": {
"h": 10,
"i": "8a29cc91-697c-48d7-a613-14e81aeb6772",
"w": 8,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Memory Usage",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "redis_memory_used_bytes{instance=~\"$instance\"}",
"legend": "used - {{instance}}",
"refId": "A"
},
{
"expr": "redis_config_maxmemory{instance=~\"$instance\"} \u003e 0",
"legend": "max - {{instance}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "34e0f67f-acf1-49bb-b13c-ceebd0b17a7e",
"layout": {
"h": 7,
"i": "34e0f67f-acf1-49bb-b13c-ceebd0b17a7e",
"w": 8,
"x": 8,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Expired / Evicted",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(redis_expired_keys_total{instance=~\"$instance\"}[5m]))",
"legend": "expired",
"refId": "A"
},
{
"expr": "sum(rate(redis_evicted_keys_total{instance=~\"$instance\"}[5m]))",
"legend": "evicted",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "634f53c0-31d8-4a9f-9cfc-a7c241797359",
"layout": {
"h": 7,
"i": "634f53c0-31d8-4a9f-9cfc-a7c241797359",
"w": 8,
"x": 16,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Network I/O",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(\n rate(redis_net_input_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "In",
"refId": "A"
},
{
"expr": "sum(\n rate(redis_net_output_bytes_total{instance=~\"$instance\"}[5m])\n)",
"legend": "Out",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "9267bb6a-edc7-4088-913e-5902e81cf736",
"layout": {
"h": 7,
"i": "9267bb6a-edc7-4088-913e-5902e81cf736",
"w": 16,
"x": 0,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Command Calls / sec",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum without (instance) (\n rate(redis_commands_total{instance=~\"$instance\"}[5m])\n) \u003e 0",
"legend": "{{ cmd }}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "51d28f91-c2f3-4f22-859e-bdc88f25ead2",
"layout": {
"h": 14,
"i": "51d28f91-c2f3-4f22-859e-bdc88f25ead2",
"w": 8,
"x": 16,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Expiring vs Not-Expiring Keys",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(redis_db_keys{instance=~\"$instance\"} - redis_db_keys_expiring{instance=~\"$instance\"}) ",
"legend": "not expiring",
"refId": "A"
},
{
"expr": "sum(redis_db_keys_expiring{instance=~\"$instance\"})",
"legend": "expiring",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "1c0cd7d9-5899-49c0-8939-a361b76f60ac",
"layout": {
"h": 7,
"i": "1c0cd7d9-5899-49c0-8939-a361b76f60ac",
"w": 16,
"x": 0,
"y": 20
},
"links": [],
"maxPerRow": 4,
"name": "Items per DB",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum by (db) (\n redis_db_keys{instance=~\"$instance\"}\n)",
"legend": "{{ db }} ",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(up{job=\"redis\"}, instance)",
"multi": true,
"name": "instance",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327485951000
}
================================================
FILE: integrations/Gitlab/markdown/README.md
================================================
# Gitlab
Gitlab 默认提供 Prometheus 协议的监控数据,参考:[Monitoring GitLab with Prometheus](https://docs.gitlab.com/ee/administration/monitoring/prometheus/)。所以,使用 categraf 的 prometheus 插件即可采集。
## 采集配置
配置文件:categraf 的 `conf/input.prometheus/prometheus.toml`
```toml
[[instances]]
urls = [
"http://192.168.11.77:9236/metrics"
]
labels = {service="gitlab", job="gitaly"}
[[instances]]
urls = [
"http://192.168.11.77:9168/sidekiq"
]
labels = {service="gitlab", job="gitlab-exporter-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:9168/database"
]
labels = {service="gitlab",job="gitlab-exporter-database"}
[[instances]]
urls = [
"http://192.168.11.77:8082/metrics"
]
labels = {service="gitlab", job="gitlab-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:8082/metrics"
]
labels = {service="gitlab", job="gitlab-sidekiq"}
[[instances]]
urls = [
"http://192.168.11.77:9229/metrics"
]
labels = {service="gitlab",job="gitlab-workhorse"}
[[instances]]
urls = [
"http://192.168.11.77:9100/metrics"
]
labels = {service="gitlab", job="node"}
[[instances]]
urls = [
"http://192.168.11.77:9187/metrics"
]
labels = {service="gitlab", job="postgres"}
[[instances]]
urls = [
"http://192.168.11.77:9121/metrics"
]
labels = {service="gitlab", job="redis"}
[[instances]]
urls = [
"http://192.168.11.77:9999/metrics"
]
labels = {service="gitlab", job="nginx"}
```
================================================
FILE: integrations/GoogleCloud/collect/googlecloud/gcp.toml
================================================
#interval=60
#[[instances]]
#project_id="your-project-id"
#credentials_file="/path/to/your/key.json"
#delay="2m"
#period="1m"
#filter="metric.type=\"compute.googleapis.com/instance/cpu/utilization\" AND resource.labels.zone=\"asia-northeast1-a\""
#timeout="5s"
#cache_ttl="1h"
#gce_host_tag="xxx"
#request_inflight=30
================================================
FILE: integrations/GoogleCloud/markdown/README.md
================================================
# GCP 指标获取插件
## 需要权限
```shell
https://www.googleapis.com/auth/monitoring.read
```
## 配置
```toml
#采集周期,建议 >= 1分钟
interval=60
[[instances]]
#配置 project_id
project_id="your-project-id"
#配置认证的key文件
credentials_file="/path/to/your/key.json"
#或者配置认证的JSON
credentials_json="xxx"
# 指标的end time = now - delay
#delay="2m"
# 指标的start time = now - delay - period
#period="1m"
# 过滤器
#filter="metric.type=\"compute.googleapis.com/instance/cpu/utilization\" AND resource.labels.zone=\"asia-northeast1-a\""
# 请求超时时间
#timeout="5s"
# 指标列表的缓存时长 ,filter为空时 启用
#cache_ttl="1h"
# 给gce的instance_name 取个别名,放到label中
#gce_host_tag="xxx"
# 每次最多有多少请求同时发起
#request_inflight=30
# request_inflight 取值(0,100]
# 想配置更大的值 ,前提是你知道你在做什么
force_request_inflight= 200
```
================================================
FILE: integrations/HAProxy/collect/haproxy/haproxy.toml
================================================
[[instances]]
# URI on which to scrape HAProxy.
# e.g.
# uri = "http://localhost:5000/baz?stats;csv"
# uri = "http://user:pass@haproxy.example.com/haproxy?stats;csv"
# uri = "unix:/run/haproxy/admin.sock"
uri = ""
# Flag that enables SSL certificate verification for the scrape URI
ssl_verify = false
# Comma-separated list of exported server metrics. See http://cbonte.github.io/haproxy-dconv/configuration-1.5.html#9.1
server_metric_fields = ""
# Comma-separated list of exported server states to exclude. See https://cbonte.github.io/haproxy-dconv/1.8/management.html#9.1, field 17 status
server_exclude_states = ""
# Timeout for trying to get stats from HAProxy.
timeout = "5s"
# Flag that enables using HTTP proxy settings from environment variables ($http_proxy, $https_proxy, $no_proxy)
proxy_from_env = false
================================================
FILE: integrations/HAProxy/dashboards/dashboard.json
================================================
{
"id": 0,
"group_id": 0,
"name": "HAProxy By Categraf",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "GitHub",
"url": "https://github.com/rfmoz/grafana-dashboards"
},
{
"targetBlank": true,
"title": "Grafana",
"url": "https://grafana.com/grafana/dashboards/12693-haproxy-2-full/"
}
],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3895acd5-4825-4ea6-b120-383b9b96d8de",
"layout": {
"h": 6,
"i": "3895acd5-4825-4ea6-b120-383b9b96d8de",
"isResizable": true,
"w": 2,
"x": 0,
"y": 0
},
"links": [],
"name": "Stats API UP?",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#ce4f52",
"text": "DOWN"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "haproxy_up",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"backend",
"value"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f98cd0f9-9979-4f4d-807f-757241d72d06",
"layout": {
"h": 6,
"i": "39b7c4ea-e19d-45b2-8a02-35b6cd1a0348",
"isResizable": true,
"w": 5,
"x": 2,
"y": 0
},
"links": [],
"name": "Backends UP?",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"to": 0
},
"result": {
"color": "#e91515",
"text": "DOWN"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "haproxy_backend_up{ident=\"$ident\"}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"backend",
"server",
"value"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fb61f168-1f3e-4385-93e8-2bb168e01945",
"layout": {
"h": 6,
"i": "18fdab37-131a-4f24-8f0f-c173d5c8c9f9",
"isResizable": true,
"w": 5,
"x": 7,
"y": 0
},
"links": [],
"name": "Servers UP?",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0,
"to": 0
},
"result": {
"color": "#e91515",
"text": "DOWN"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "haproxy_server_up{ident=\"$ident\"}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "aeeb951c-a4d6-4b0e-918e-2f81c2f3b3be",
"layout": {
"h": 6,
"i": "aeeb951c-a4d6-4b0e-918e-2f81c2f3b3be",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"name": "Active sessions",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(haproxy_frontend_current_sessions{frontend=~\"$frontend\",ident=\"$ident\"}) by (frontend)",
"legend": "Front-{{frontend}}",
"refId": "B"
},
{
"expr": "sum(haproxy_backend_current_sessions{backend=~\"$backend\",ident=\"$ident\"}) by (backend)",
"legend": "Back-{{backend}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "96ca72c3-745e-458a-902f-fd3841b5453d",
"layout": {
"h": 6,
"i": "96ca72c3-745e-458a-902f-fd3841b5453d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 6
},
"links": [],
"name": "Frontend responses by code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(haproxy_frontend_http_responses_total{frontend=~\"$frontend\",code=~\"$code\",ident=\"$ident\"}[5m])) by (code)",
"legend": "{{ code }}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "cbadd1e1-25e2-41bd-9fe2-f5e932753c02",
"layout": {
"h": 6,
"i": "e9d01556-c4d0-4c2e-b555-97f0d99f7f6a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 6
},
"links": [],
"name": "Backend responses by code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(haproxy_backend_http_responses_total{backend=~\"$backend\",code=~\"$code\",ident=\"$ident\"}[5m])) by (code)",
"legend": "{{ code }}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "bb9db80d-7f9c-4b6b-8434-24e3ec480d5b",
"layout": {
"h": 6,
"i": "bb9db80d-7f9c-4b6b-8434-24e3ec480d5b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"links": [],
"name": "Incoming / Outgoing bits (Frontend)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bitsIEC"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(haproxy_frontend_bytes_in_total{frontend=~\"$frontend\",ident=\"$ident\"}[5m])*8) by (frontend)",
"legend": "IN {{frontend}}",
"refId": "A"
},
{
"expr": "-sum(rate(haproxy_frontend_bytes_out_total{frontend=~\"$frontend\",ident=\"$ident\"}[5m])*8) by (frontend)",
"legend": "OUT {{frontend}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "6ab18da8-c354-4349-b25b-a50cdb02aae8",
"layout": {
"h": 6,
"i": "299e6721-990a-4070-afdd-6f6e402ca3af",
"isResizable": true,
"w": 12,
"x": 12,
"y": 12
},
"links": [],
"name": "Incoming / Outgoing bits(Backend)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bitsIEC"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(haproxy_backend_bytes_in_total{backend=~\"$backend\",ident=\"$ident\"}[5m])*8) by (backend)",
"legend": "IN {{backend}}",
"refId": "A"
},
{
"expr": "-sum(rate(haproxy_backend_bytes_out_total{backend=~\"$backend\",ident=\"$ident\"}[5m])*8) by (backend)",
"legend": "OUT {{backend}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "5805c4b5-66c9-40d2-846d-4acc8a434ecd",
"layout": {
"h": 7,
"i": "5805c4b5-66c9-40d2-846d-4acc8a434ecd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 18
},
"links": [],
"name": "Frontend Requests",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(haproxy_frontend_http_requests_total{frontend=~\"$frontend\",ident=\"$ident\"}[5m])) by (frontend)",
"legend": "Front requests - {{frontend}}",
"refId": "A"
},
{
"expr": "sum(rate(haproxy_frontend_request_errors_total{frontend=~\"$frontend\",ident=\"$ident\"}[5m])) by (frontend)",
"legend": "Front requests errors - {{frontend}}",
"refId": "C"
},
{
"expr": "sum(rate(haproxy_frontend_requests_denied_total{frontend=~\"$frontend\",ident=\"$ident\"}[5m])) by (frontend)",
"legend": "Front request denied - {{frontend}}",
"refId": "F"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "d1d2d177-eab6-43f5-a778-7999c6cd646c",
"layout": {
"h": 7,
"i": "5530a396-b52e-491b-8ca2-4c18ebf9c3f2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 18
},
"links": [],
"name": "Backend Requests",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(haproxy_backend_redispatch_warnings_total{backend=~\"$backend\",ident=\"$ident\"}[5m])) by (backend)",
"legend": "Back redispatch warnings + {{backend}}",
"refId": "D"
},
{
"expr": "sum(rate(haproxy_backend_retry_warnings_total{backend=~\"$backend\",ident=\"$ident\"}[5m])) by (backend)",
"legend": "Back retry warnings + {{backend}}",
"refId": "E"
},
{
"expr": "sum(rate(haproxy_backend_response_errors_total{backend=~\"$backend\",ident=\"$ident\"}[5m])) by (backend)",
"legend": "Back response errors + {{backend}}",
"refId": "I"
},
{
"expr": "sum(haproxy_backend_current_queue{backend=~\"$backend\",ident=\"$host\"}) by (backend)",
"legend": "Back queued requests + {{backend}}",
"refId": "G"
},
{
"expr": "sum(rate(haproxy_backend_connection_errors_total{backend=~\"$backend\", ident=\"$ident\"}[5m])) by (backend)",
"legend": "Back conn errors + {{backend}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(haproxy_up,ident)",
"multi": false,
"name": "ident",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(haproxy_frontend_bytes_in_total{ident=\"$ident\"}, frontend)",
"multi": true,
"name": "frontend",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(haproxy_backend_bytes_in_total{ident=\"$ident\"}, backend)",
"multi": true,
"name": "backend",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(haproxy_server_bytes_in_total{ident=\"$ident\", backend=~\"$backend\"}, server)",
"multi": true,
"name": "server",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(haproxy_server_http_responses_total{ident=\"$ident\",backend=~\"$backend\", server=~\"$server\"}, code)",
"multi": true,
"name": "code",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327492256000
}
================================================
FILE: integrations/HAProxy/markdown/README.md
================================================
# HAProxy
forked from [haproxy_exporter](https://github.com/prometheus/haproxy_exporter)
Note: since HAProxy 2.0.0, the official source includes a Prometheus exporter module that can be built into your binary with a single flag during build time and offers an exporter-free Prometheus endpoint.
haproxy configurations for `/stats`:
```
frontend stats
bind *:8404
stats enable
stats uri /stats
stats refresh 10s
```
================================================
FILE: integrations/HTTP_Response/alerts/http_response_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "http detect failed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "http_response_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327498098000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "https certificate will expire within 7 days",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(http_response_cert_expire_timestamp - time())/86400 \u003c= 7",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327498589000
}
]
================================================
FILE: integrations/HTTP_Response/collect/http_response/http_response.toml
================================================
## collect interval
# interval = 15
## Set the mapping of extra tags in batches
[mappings]
# "http://localhost" = { "job" = "local" }
# "https://www.baidu.com" = { "job" = "baidu" }
[[instances]]
targets = [
# "http://localhost",
# "https://www.baidu.com"
]
## append some labels for series
# labels = { region="cloud", product="n9e" }
## interval = global.interval * interval_times
# interval_times = 1
## Set http_proxy (categraf uses the system-wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
## Interface to use when dialing an address
# interface = "eth0"
## HTTP Request Method
# method = "GET"
## Set response_timeout (default 5 seconds)
# response_timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
## Optional headers
# headers = ["Header-Key-1", "Header-Value-1", "Header-Key-2", "Header-Value-2"]
## Optional HTTP Request Body
# body = '''
# {'fake':'data'}
# '''
## Optional substring or regular expression match in body of the response(substring case sensitive).
## When both of the following parameters are set, matching either one is sufficient.
# expect_response_substring = "ok"
# expect_response_regular_expression = "green|yellow"
## Optional expected response status codes.
## "expect_response_status_codes" Supports adding multiple codes by delimiter("|" or ",").
## When both of the following parameters are set, matching either one is sufficient.
# expect_response_status_code = 0
# expect_response_status_codes = "200|301"
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: integrations/HTTP_Response/dashboards/http_response_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "HTTP detect by UlricQin",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"aggrDimension": "target",
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"showHeader": true,
"sortColumn": "target",
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"id": "3674dbfa-243a-49f6-baa5-b7f887c1afb0",
"layout": {
"h": 15,
"i": "3674dbfa-243a-49f6-baa5-b7f887c1afb0",
"isResizable": true,
"w": 24,
"x": 0,
"y": 0
},
"name": "URL Details",
"options": {
"standardOptions": {},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"value": "A"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#417505",
"text": "UP"
},
"type": "special"
},
{
"match": {
"from": 1,
"special": 1
},
"result": {
"color": "#e90f0f",
"text": "DOWN"
},
"type": "range"
}
]
}
},
{
"matcher": {
"value": "D"
},
"properties": {
"standardOptions": {
"util": "humantimeSeconds"
},
"valueMappings": [
{
"match": {
"to": 604800
},
"result": {
"color": "#f60c0c"
},
"type": "range"
},
{
"match": {
"to": 2592000
},
"result": {
"color": "#ffae39"
},
"type": "range"
}
]
},
"type": "special"
},
{
"matcher": {
"value": "B"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 399
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"to": 499
},
"result": {
"color": "#ff656b"
},
"type": "range"
},
{
"match": {
"from": 500
},
"result": {
"color": "#f10808"
},
"type": "range"
}
]
},
"type": "special"
},
{
"matcher": {
"value": "C"
},
"properties": {
"standardOptions": {
"util": "milliseconds"
},
"valueMappings": [
{
"match": {
"to": 400
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 400
},
"result": {
"color": "#ff656b"
},
"type": "range"
},
{
"match": {
"from": 2000
},
"result": {
"color": "#f11313"
},
"type": "range"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "max(http_response_result_code) by (target)",
"instant": true,
"legend": "UP?",
"refId": "A"
},
{
"expr": "max(http_response_response_code) by (target)",
"instant": true,
"legend": "status code",
"refId": "B"
},
{
"expr": "max(http_response_response_time) by (target) *1000",
"instant": true,
"legend": "latency",
"refId": "C"
},
{
"expr": "max(http_response_cert_expire_timestamp) by (target) - time()",
"instant": true,
"legend": "cert expire",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "Datasource",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327500066000
}
================================================
FILE: integrations/HTTP_Response/markdown/http.md
================================================
# http_response plugin
HTTP 探测插件,用于检测 HTTP 地址的连通性、延迟、HTTPS 证书过期时间。因为 Prometheus 生态的时序库只能存储 float64 类型的值,所以 HTTP 地址探测的结果也是 float64 类型的值,但是这个值的含义是不同的,具体含义如下:
```
Success = 0
ConnectionFailed = 1
Timeout = 2
DNSError = 3
AddressError = 4
BodyMismatch = 5
CodeMismatch = 6
```
如果一切正常,这个值是 0,如果有异常,这个值是 1-6 之间的值,具体含义如上。这个值对应的指标名字是 `http_response_result_code`。
## Configuration
categraf 的 `conf/input.http_response/http_response.toml`。最核心的配置就是 targets 配置,配置目标地址,比如想要监控两个地址:
```toml
[[instances]]
targets = [
"http://localhost:8080",
"https://www.baidu.com"
]
```
instances 下面的所有 targets 共享同一个 `[[instances]]` 下面的配置,比如超时时间,HTTP方法等,如果有些配置不同,可以拆成多个不同的 `[[instances]]`,比如:
```toml
[[instances]]
targets = [
"http://localhost:8080",
"https://www.baidu.com"
]
method = "GET"
[[instances]]
targets = [
"http://localhost:9090"
]
method = "POST"
```
完整的带有注释的配置如下:
```toml
[[instances]]
targets = [
# "http://localhost",
# "https://www.baidu.com"
]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Set http_proxy (categraf uses the system-wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
## Interface to use when dialing an address
# interface = "eth0"
## HTTP Request Method
# method = "GET"
## Set response_timeout (default 5 seconds)
# response_timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
## Optional headers
# headers = ["Header-Key-1", "Header-Value-1", "Header-Key-2", "Header-Value-2"]
## Optional HTTP Request Body
# body = '''
# {'fake':'data'}
# '''
## Optional substring match in body of the response (case sensitive)
# expect_response_substring = "ok"
## Optional expected response status code.
# expect_response_status_code = 0
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
================================================
FILE: integrations/HTTP_Response/metrics/categraf.json
================================================
[
{
"id": 0,
"uuid": 1717556327501087000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "HTTP 探测响应码",
"unit": "none",
"note": "如果没有拿到 response,这个指标就没有值了",
"lang": "zh_CN",
"expression": "http_response_response_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "HTTP 探测响应码",
"note": "如果没有拿到 response,这个指标就没有值了"
},
{
"lang": "en_US",
"name": "HTTP probe response code",
"note": "If you don't get response, this indicator has no value"
}
]
},
{
"id": 0,
"uuid": 1717556327503611000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "HTTP 探测结果状态码",
"unit": "none",
"note": "0 值表示正常,大于 0 就是异常,各个值的含义如下:\n\n```\nSuccess = 0\nConnectionFailed = 1\nTimeout = 2\nDNSError = 3\nAddressError = 4\nBodyMismatch = 5\nCodeMismatch = 6\n```",
"lang": "zh_CN",
"expression": "http_response_result_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "HTTP 探测结果状态码",
"note": "0 值表示正常,大于 0 就是异常,各个值的含义如下:\n\n```\nSuccess = 0\nConnectionFailed = 1\nTimeout = 2\nDNSError = 3\nAddressError = 4\nBodyMismatch = 5\nCodeMismatch = 6\n```"
},
{
"lang": "en_US",
"name": "HTTP probe result status code",
"note": "A value of 0 means normal, and a value greater than 0 means abnormal. The meanings of each value are as follows: \n \n``` \nSuccess = 0 \nConnectionFailed = 1 \nTimeout = 2 \nDNSError = 3 \nAddressError = 4 \nBodyMismatch = 5 \nCodeMismatch = 6 \n```"
}
]
},
{
"id": 0,
"uuid": 1717556327506135000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "HTTP 探测耗时",
"unit": "seconds",
"note": "",
"lang": "zh_CN",
"expression": "http_response_response_time",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "HTTP 探测耗时",
"note": ""
},
{
"lang": "en_US",
"name": "HTTP probe time-consuming",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327508519000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "HTTP 证书过期时间",
"unit": "datetimeSeconds",
"note": "",
"lang": "zh_CN",
"expression": "http_response_cert_expire_timestamp",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "HTTP 证书过期时间",
"note": ""
},
{
"lang": "en_US",
"name": "HTTP certificate expiration time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327511202000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - DNS 请求耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "cdn_dns_request",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - DNS 请求耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Dial test-DNS request time-consuming",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327514018000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - TCP建连耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "cdn_tcp_connect",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - TCP建连耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Dial test-TCP connection establishment time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327516118000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - TLS握手耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "cdn_tls_handshake",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - TLS握手耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Dial test-TLS handshake time-consuming",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327518519000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - 探测结果状态码",
"unit": "none",
"note": "探测结果,0 是正常,其他数字有不同含义\n- 0:成功\n- 1:连接失败\n- 2:监测超时\n- 3:DNS解析失败\n- 4:地址格式错误\n- 5:返回内容不匹配\n- 6:返回码不匹配\n- 其他数字为未知错误",
"lang": "zh_CN",
"expression": "cdn_probe_result_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - 探测结果状态码",
"note": "探测结果,0 是正常,其他数字有不同含义\n- 0:成功\n- 1:连接失败\n- 2:监测超时\n- 3:DNS解析失败\n- 4:地址格式错误\n- 5:返回内容不匹配\n- 6:返回码不匹配\n- 其他数字为未知错误"
},
{
"lang": "en_US",
"name": "Dial test-detection result status code",
"note": "Detection result, 0 is normal, other numbers have different meanings \n-0: Success \n-1: Connection failed \n-2: Monitoring timeout \n-3: DNS resolution failed \n-4: Address format is wrong \n-5: Return content does not match \n-6: Return code mismatch \n-Other numbers are unknown error"
}
]
},
{
"id": 0,
"uuid": 1717556327521098000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - 整体耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "cdn_total_cost",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - 整体耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Dial test-overall time-consuming",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327523493000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - 返回状态码",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "cdn_response_status_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - 返回状态码",
"note": ""
},
{
"lang": "en_US",
"name": "Dial test-Return status code",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327525787000,
"collector": "Categraf",
"typ": "HTTP_Response",
"name": "拨测 - 首包耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "cdn_first_byte",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "拨测 - 首包耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Dial test-first package time-consuming",
"note": ""
}
]
}
]
================================================
FILE: integrations/IPMI/alerts/alerts.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CPU温度超过90",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ipmi_cpu1_temp{} \u003e 90 or ipmi_cpu2_temp{} \u003e 90",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327537842000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CPU电压大于10",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ipmi_vcpu1 \u003e 10 or ipmi_vcpu1 \u003e 10",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327538351000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CPU风扇转速超过1000",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ipmi_fan1 \u003e 1000 or ipmi_fan2 \u003e 1000 or ipmi_fan3 \u003e 1000 or ipmi_fan4 \u003e 1000 or ipmi_fan5 \u003e 1000 or ipmi_fan6 \u003e 1000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327538796000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "主板温度超过90",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "ipmi_system_temp \u003e 90 or ipmi_pch_temp \u003e 90",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327539285000
}
]
================================================
FILE: integrations/IPMI/collect/ipmi/conf.toml
================================================
# Read metrics from the bare metal servers via freeipmi
[[instances]]
# target指定是本地采集还是远程采集
#target="localhost"
# 指定采集使用的用户名和密码。请务必确认使用该账号执行ipmi命令能得到正确输出,并不是从网上随便查到一组用户名和密码就可以。
#user = "user"
#pass = "1234"
# ipmi协议版本,支持1.5 和 2.0
#driver = "LAN_2_0"
# 指定权限级别(privilege level),例如 "user";注意这不是用户名
#privilege = "user"
## session-timeout, ms
#timeout = 100000
# 支持的采集器:bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, sm-lan-mode
# 默认使用 bmc, ipmi, chassis 和 dcmi;建议保持下面的配置,以便仪表盘更好地展示
collectors = [ "bmc", "ipmi", "chassis", "sel", "dcmi"]
# 不关注的传感器,指定id 排除掉
#exclude_sensor_ids = [ 2, 29, 32, 50, 52, 55 ]
# 如果你想使用定制化的参数覆盖内置的命令,可以修改以下内容; 建议保持注释
#[instances.collector_cmd]
#ipmi = "sudo"
#sel = "sudo"
#[instances.default_args]
#ipmi = [ "--bridge-sensors" ]
#[instances.custom_args]
#ipmi = [ "--bridge-sensors" ]
#sel = [ "ipmi-sel" ]
================================================
FILE: integrations/IPMI/dashboards/IPMI.json
================================================
{
"id": 0,
"group_id": 0,
"name": "IPMI",
"ident": "",
"tags": "Categraf latest",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a9c9b473-8182-4f84-9083-e96d656ac4fe",
"layout": {
"h": 4,
"i": "a9c9b473-8182-4f84-9083-e96d656ac4fe",
"w": 4,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Power Status",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"0": {
"color": "red",
"index": 1,
"text": "Powered Off"
},
"1": {
"color": "dark-green",
"index": 0,
"text": "Powered On"
}
},
"type": "value"
}
]
},
"targets": [
{
"expr": "ipmi_chassis_power_state{target=\"$instance\"}",
"legend": "{{target}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"data_center",
"dept",
"env",
"firmware_revision",
"manufacturer_id",
"target"
],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "47c94f54-43a2-46d1-b549-d199b30736a3",
"layout": {
"h": 4,
"i": "47c94f54-43a2-46d1-b549-d199b30736a3",
"w": 12,
"x": 4,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Machine Info",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_bmc_info{target=\"$instance\"}",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c782803d-a0b5-4d13-a6ed-ae6dcceb6049",
"layout": {
"h": 9,
"i": "c782803d-a0b5-4d13-a6ed-ae6dcceb6049",
"w": 8,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Fan Speed State",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_fan_speed_state{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "value",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "99586558-1e82-48f0-8bc4-132e54369872",
"layout": {
"h": 5,
"i": "99586558-1e82-48f0-8bc4-132e54369872",
"w": 16,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Fan speed in rotations per minute",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "ipmi_fan_speed_rpm{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "53982e93-520d-41f2-bac4-cfed93fea2aa",
"layout": {
"h": 8,
"i": "53982e93-520d-41f2-bac4-cfed93fea2aa",
"w": 12,
"x": 0,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Power Consumption watts",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "single"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ipmi_dcmi_power_consumption_watts{target=\"$instance\"}",
"legend": "{{target}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6c73328d-c927-4329-8a54-19cdc9c1f056",
"layout": {
"h": 8,
"i": "6c73328d-c927-4329-8a54-19cdc9c1f056",
"w": 4,
"x": 12,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Power State",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_power_state{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "value",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3e7ac15e-4cf6-4472-9b99-f686e337a2c5",
"layout": {
"h": 8,
"i": "3e7ac15e-4cf6-4472-9b99-f686e337a2c5",
"w": 8,
"x": 16,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Power reading in Watts",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "ipmi_power_watts{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"data_center",
"dept",
"env",
"target",
"name",
"type",
"value"
],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "605380bb-763e-4fbd-af57-ad329ba2ac71",
"layout": {
"h": 8,
"i": "605380bb-763e-4fbd-af57-ad329ba2ac71",
"w": 12,
"x": 0,
"y": 17
},
"links": [],
"maxPerRow": 4,
"name": "IPMI Sensors State",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_sensor_state{target=\"$instance\"}",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "68e23440-dc48-4a05-bb59-441349b02afd",
"layout": {
"h": 8,
"i": "68e23440-dc48-4a05-bb59-441349b02afd",
"w": 6,
"x": 12,
"y": 17
},
"links": [],
"maxPerRow": 4,
"name": "Temperature State",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_temperature_state{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "value",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "663a5ef9-b735-4187-8793-074a3d78dd39",
"layout": {
"h": 8,
"i": "663a5ef9-b735-4187-8793-074a3d78dd39",
"w": 6,
"x": 18,
"y": 17
},
"links": [],
"maxPerRow": 4,
"name": "Temperatures",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#8AB8FF",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 70
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "ipmi_temperature_celsius{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorRange": [
"thresholds"
],
"textMode": "valueAndName",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e7027cd8-1ccb-4238-9b91-5a18a3ccd29c",
"layout": {
"h": 3,
"i": "e7027cd8-1ccb-4238-9b91-5a18a3ccd29c",
"w": 24,
"x": 0,
"y": 25
},
"links": [],
"maxPerRow": 4,
"name": "Reported state of a voltage sensor",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ipmi_voltage_state{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "value",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "38437e8e-5c8c-4fc2-a274-515e8cc38ea0",
"layout": {
"h": 5,
"i": "38437e8e-5c8c-4fc2-a274-515e8cc38ea0",
"w": 24,
"x": 0,
"y": 28
},
"links": [],
"maxPerRow": 4,
"name": "Voltage reading in Volts",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "ipmi_voltage_volts{target=\"$instance\"}",
"legend": "{{name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
}
],
"var": [
{
"defaultValue": 1,
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(ipmi_bmc_info, target)",
"hide": false,
"multi": false,
"name": "instance",
"reg": "/.*/",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327540521000
}
================================================
FILE: integrations/IPMI/dashboards/IPMI_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "IPMI by Categraf",
"ident": "",
"tags": "Categraf ~v0.3.44-pre Exporter",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"graphMode": "area",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f7d72708-f857-4d67-b9ab-1df6464bc685",
"layout": {
"h": 4,
"i": "f7d72708-f857-4d67-b9ab-1df6464bc685",
"isResizable": true,
"w": 4,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Power Status",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 1
},
"options": {
"0": {
"color": "red",
"index": 1,
"text": "Powered Off"
},
"1": {
"color": "dark-green",
"index": 0,
"text": "Powered On"
}
},
"result": {
"color": "rgba(44, 157, 61, 1)",
"text": "Powered On"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "rgba(206, 79, 82, 1)",
"text": "Powered Off"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "ipmi_chassis_power_state",
"legend": "{{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0d86649f-8b64-4a3c-8794-984ab92052a2",
"layout": {
"h": 4,
"i": "0d86649f-8b64-4a3c-8794-984ab92052a2",
"isResizable": true,
"w": 12,
"x": 4,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Machine Info",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_bmc_info",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"__name__": true,
"value": true
}
}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "seriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": false
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "20e67b9f-3d7b-4915-86a9-abbc74a11b57",
"layout": {
"h": 8,
"i": "20e67b9f-3d7b-4915-86a9-abbc74a11b57",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Fan Speed State",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "rgba(44, 157, 61, 1)",
"text": "Normal"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "rgba(255, 174, 57, 1)",
"text": "Warning"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "rgba(206, 79, 82, 1)",
"text": "Critical"
},
"type": "special"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_fan_speed_state{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#c7f1ff",
"#42a1fa",
"#083294"
],
"reverseColorOrder": false,
"textMode": "valueAndName",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "62decb7d-fcdb-4865-87eb-38217ceaaddf",
"layout": {
"h": 4,
"i": "62decb7d-fcdb-4865-87eb-38217ceaaddf",
"isResizable": true,
"w": 16,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Fan speed in rotations per minute",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ipmi_fan_speed_rpm{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "This chart is only provided if the dcmi collector is enabled.",
"id": "a9796ec2-79dd-4ba9-946b-2849434523e0",
"layout": {
"h": 4,
"i": "a9796ec2-79dd-4ba9-946b-2849434523e0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Power Consumption watts",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "single"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "ipmi_dcmi_power_consumption_watts{ident=\"$ident\"}",
"legend": "{{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "seriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": false
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0ad29227-43f9-4649-9170-3930103c4c38",
"layout": {
"h": 4,
"i": "0ad29227-43f9-4649-9170-3930103c4c38",
"isResizable": true,
"w": 4,
"x": 12,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Power State",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "rgba(44, 157, 61, 1)",
"text": "Normal"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "rgba(255, 153, 25, 1)",
"text": "Warning"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "rgba(206, 79, 82, 1)",
"text": "Critical"
},
"type": "special"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_power_state{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#ffeda0",
"#fc4e2a",
"#800026"
],
"reverseColorOrder": false,
"textMode": "value",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "38e2b452-3d6d-4b50-b821-6bafdcb6b1ba",
"layout": {
"h": 4,
"i": "38e2b452-3d6d-4b50-b821-6bafdcb6b1ba",
"isResizable": true,
"w": 8,
"x": 16,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Power reading in Watts",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "ipmi_power_watts{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "726d9793-87a9-4942-ad36-8359f56cdbf2",
"layout": {
"h": 8,
"i": "726d9793-87a9-4942-ad36-8359f56cdbf2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "IPMI Sensors State",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "rgba(44, 157, 61, 1)",
"text": "Normal"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "rgba(255, 174, 57, 1)",
"text": "Warning"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "rgba(206, 79, 82, 1)",
"text": "Critical"
},
"type": "special"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_sensor_state{ident=\"$ident\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"__name__": true,
"id": true
},
"indexByName": {
"__name__": 1,
"id": 2,
"ident": 0,
"name": 3,
"type": 4,
"value": 5
},
"renameByName": {
"value": "state"
}
}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "seriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "59080d53-8b88-4f32-8d66-f4a3f7b092b4",
"layout": {
"h": 8,
"i": "59080d53-8b88-4f32-8d66-f4a3f7b092b4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Temperature State",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "rgba(44, 157, 61, 1)",
"text": "Normal"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "rgba(255, 153, 25, 1)",
"text": "Warning"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "rgba(206, 79, 82, 1)",
"text": "Critical"
},
"type": "special"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
}
}
],
"targets": [
{
"expr": "ipmi_temperature_state{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "value",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4c20f19c-878e-4dba-a6b2-b2af197ceee1",
"layout": {
"h": 8,
"i": "4c20f19c-878e-4dba-a6b2-b2af197ceee1",
"isResizable": true,
"w": 6,
"x": 18,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Temperatures",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "rgba(138, 202, 255, 1)",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 70
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "ipmi_temperature_celsius{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "25c1201e-a1b8-4c65-a00e-3b6322aa4da5",
"layout": {
"h": 7,
"i": "25c1201e-a1b8-4c65-a00e-3b6322aa4da5",
"isResizable": true,
"w": 24,
"x": 0,
"y": 20
},
"links": [],
"maxPerRow": 4,
"name": "Reported state of a voltage sensor",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "rgba(44, 157, 61, 1)",
"text": "Normal"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "rgba(255, 174, 57, 1)",
"text": "Warning"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "rgba(206, 79, 82, 1)",
"text": "Critical"
},
"type": "special"
}
]
},
"overrides": [
{
"matcher": {},
"properties": {
"valueMappings": []
}
}
],
"targets": [
{
"expr": "ipmi_voltage_state{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"__name__": true,
"ident": true
}
}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "valueAndName",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4373d423-43e7-4001-b1f9-6ac53a4a1ae6",
"layout": {
"h": 4,
"i": "4373d423-43e7-4001-b1f9-6ac53a4a1ae6",
"isResizable": true,
"w": 24,
"x": 0,
"y": 27
},
"links": [],
"maxPerRow": 4,
"name": "Voltage reading in Volts",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "ipmi_voltage_volts{ident=\"$ident\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(ipmi_bmc_info, ident)",
"hide": false,
"multi": false,
"name": "ident",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327542077000
}
================================================
FILE: integrations/IPMI/dashboards/IPMI_by_prometheus.json
================================================
{
"name": "IPMI for Prometheus",
"ident": "",
"configs": {
"version": "2.0.0",
"links": [],
"var": [
{
"name": "node",
"type": "query",
"datasource": {
"cate": "prometheus"
},
"definition": "label_values(ipmi_bmc_info, ident)",
"reg": "",
"multi": false
}
],
"panels": [
{
"type": "gauge",
"id": "f975fded-f57e-4a6e-80b4-50d5be6dd84c",
"layout": {
"h": 7,
"w": 24,
"x": 0,
"y": 0,
"i": "f975fded-f57e-4a6e-80b4-50d5be6dd84c",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_temperature_celsius{ident='$node'}",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Temperatures",
"links": [],
"custom": {
"textMode": "valueAndName",
"calc": "avg"
},
"options": {
"valueMappings": [],
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null,
"type": "base"
},
{
"color": "red",
"value": 80
}
]
}
}
},
{
"type": "timeseries",
"id": "681f1191-4777-4377-8b77-404d9f036406",
"layout": {
"h": 5,
"w": 12,
"x": 0,
"y": 7,
"i": "681f1191-4777-4377-8b77-404d9f036406",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_power_watts{ident='$node'}",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Power",
"links": [],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "feede24c-8296-4127-982e-08cfc4151933",
"layout": {
"h": 5,
"w": 12,
"x": 12,
"y": 7,
"i": "feede24c-8296-4127-982e-08cfc4151933",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_power_watts{ident='$node'} * 30 * 24 ",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Power usage 30d",
"links": [],
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "9e11e7f5-ed3c-49eb-8a72-ee76c8700c24",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 12,
"i": "9e11e7f5-ed3c-49eb-8a72-ee76c8700c24",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_temperature_celsius{ident='$node'}",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Temperatures",
"links": [],
"description": "",
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null,
"type": "base"
},
{
"color": "red",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "95c734f7-26cb-41a7-8376-49332cc220c2",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 12,
"i": "95c734f7-26cb-41a7-8376-49332cc220c2",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_power_watts{ident='$node'}",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Power",
"links": [],
"description": "",
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null,
"type": "base"
},
{
"color": "red",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.01,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "0313f34f-afcf-41e9-8f69-9a3dbd4b2e56",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 19,
"i": "0313f34f-afcf-41e9-8f69-9a3dbd4b2e56",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_fan_speed_rpm{ident='$node'}",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Fans",
"links": [],
"description": "",
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null,
"type": "base"
},
{
"color": "red",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
},
{
"type": "timeseries",
"id": "29ee004d-a95c-405d-97d1-d715fab4e1de",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 19,
"i": "29ee004d-a95c-405d-97d1-d715fab4e1de",
"isResizable": true
},
"version": "2.0.0",
"datasourceCate": "prometheus",
"targets": [
{
"refId": "A",
"expr": "ipmi_voltage_volts{ident='$node',name!~\"Voltage 1|Voltage 2\"}",
"legend": "{{name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Voltages",
"links": [],
"description": "",
"options": {
"tooltip": {
"mode": "multi"
},
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "green",
"value": null,
"type": "base"
},
{
"color": "red",
"value": 80
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 1,
"fillOpacity": 0.5,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
}
}
]
},
"uuid": 1727587308068775000
}
================================================
FILE: integrations/IPMI/markdown/README.md
================================================
# IPMI plugin
ipmi插件是从ipmi exporter迁移过来的。基本原理是执行ipmi的一系列命令,并将命令输出转换为指标。如果ipmi没有配置好,是无法采集到指标的,请务必先将ipmi配置好。
categraf的ipmi插件配置举例如下:
```toml
# Read metrics from the bare metal servers via freeipmi
[[instances]]
# target指定是本地采集还是远程采集
#target="localhost"
# 指定采集使用的用户名和密码。请务必确认使用该账号执行ipmi命令能得到正确输出,并不是从网上随便查到一组用户名和密码就可以。
#user = "user"
#pass = "1234"
# ipmi协议版本,支持1.5 和 2.0
#driver = "LAN_2_0"
# 指定权限级别(privilege level),例如 "user";注意这不是用户名
#privilege = "user"
## session-timeout, ms
#timeout = 100000
# 支持的采集器:bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, sm-lan-mode
# 默认使用 bmc, ipmi, chassis 和 dcmi;建议保持下面的配置,以便仪表盘更好地展示
collectors = [ "bmc", "ipmi", "chassis", "sel", "dcmi"]
# 不关注的传感器,指定id 排除掉
#exclude_sensor_ids = [ 2, 29, 32, 50, 52, 55 ]
# 如果你想使用定制化的参数覆盖内置的命令,可以修改以下内容; 建议保持注释
#[instances.collector_cmd]
#ipmi = "sudo"
#sel = "sudo"
#[instances.default_args]
#ipmi = [ "--bridge-sensors" ]
#[instances.custom_args]
#ipmi = [ "--bridge-sensors" ]
#sel = [ "ipmi-sel" ]
```
================================================
FILE: integrations/IPVS/collect/ipvs/ipvs.toml
================================================
# Collect virtual and real server stats from Linux IPVS
# no configuration
================================================
FILE: integrations/IPVS/markdown/README.md
================================================
# ipvs
Forked from telegraf. The IPVS input plugin uses the linux kernel netlink socket interface to gather
metrics about ipvs virtual and real servers.
**Supported Platforms:** Linux
### Permissions
Assuming you installed the telegraf package via one of the published packages,
the process will be running as the `telegraf` user. However, in order for this
plugin to communicate over netlink sockets it needs the telegraf process to be
running as `root` (or some user with `CAP_NET_ADMIN` and `CAP_NET_RAW`). Be sure
to ensure these permissions before running telegraf with this plugin included.
## Configuration
```
# Collect virtual and real server stats from Linux IPVS
# no configuration
```
## Metrics
Server will contain tags identifying how it was configured, using one of
`address` + `port` + `protocol` *OR* `fwmark`. This is how one would normally
configure a virtual server using `ipvsadm`.
- ipvs_virtual_server
- tags:
- sched (the scheduler in use)
- netmask (the mask used for determining affinity)
- address_family (inet/inet6)
- address
- port
- protocol
- fwmark
- fields:
- connections
- pkts_in
- pkts_out
- bytes_in
- bytes_out
- pps_in
- pps_out
- cps
- ipvs_real_server
- tags:
- address
- port
- address_family (inet/inet6)
- virtual_address
- virtual_port
- virtual_protocol
- virtual_fwmark
- fields:
- active_connections
- inactive_connections
- connections
- pkts_in
- pkts_out
- bytes_in
- bytes_out
- pps_in
- pps_out
- cps
## Example Output
Virtual server is configured using `proto+addr+port` and backed by 2 real servers:
```shell
ipvs_virtual_server,address=172.18.64.234,address_family=inet,netmask=32,port=9000,protocol=tcp,sched=rr bytes_in=0i,bytes_out=0i,pps_in=0i,pps_out=0i,cps=0i,connections=0i,pkts_in=0i,pkts_out=0i 1541019340000000000
ipvs_real_server,address=172.18.64.220,address_family=inet,port=9000,virtual_address=172.18.64.234,virtual_port=9000,virtual_protocol=tcp active_connections=0i,inactive_connections=0i,pkts_in=0i,bytes_out=0i,pps_out=0i,connections=0i,pkts_out=0i,bytes_in=0i,pps_in=0i,cps=0i 1541019340000000000
ipvs_real_server,address=172.18.64.219,address_family=inet,port=9000,virtual_address=172.18.64.234,virtual_port=9000,virtual_protocol=tcp active_connections=0i,inactive_connections=0i,pps_in=0i,pps_out=0i,connections=0i,pkts_in=0i,pkts_out=0i,bytes_in=0i,bytes_out=0i,cps=0i 1541019340000000000
```
Virtual server is configured using `fwmark` and backed by 2 real
servers:
```shell
ipvs_virtual_server,address_family=inet,fwmark=47,netmask=32,sched=rr cps=0i,connections=0i,pkts_in=0i,pkts_out=0i,bytes_in=0i,bytes_out=0i,pps_in=0i,pps_out=0i 1541019340000000000
ipvs_real_server,address=172.18.64.220,address_family=inet,port=9000,virtual_fwmark=47 inactive_connections=0i,pkts_out=0i,bytes_out=0i,pps_in=0i,cps=0i,active_connections=0i,pkts_in=0i,bytes_in=0i,pps_out=0i,connections=0i 1541019340000000000
ipvs_real_server,address=172.18.64.219,address_family=inet,port=9000,virtual_fwmark=47 cps=0i,active_connections=0i,inactive_connections=0i,connections=0i,pkts_in=0i,bytes_out=0i,pkts_out=0i,bytes_in=0i,pps_in=0i,pps_out=0i 1541019340000000000
```
================================================
FILE: integrations/Java/dashboards/jmx_by_exporter.json
================================================
{
"name": "JMX",
"tags": "Prometheus JMX",
"ident": "",
"uuid": 1760503245274000,
"configs": {
"panels": [
{
"collapsed": true,
"id": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"layout": {
"h": 1,
"i": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "0721ee76-816b-469f-9c49-2bef94a9299e",
"layout": {
"h": 3,
"i": "0721ee76-816b-469f-9c49-2bef94a9299e",
"w": 6,
"x": 0,
"y": 1
},
"name": "Status",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#1eac02",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f00a0a",
"text": "DOWN"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "up{job=\"$job\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "a55c40fc-dc25-4d2a-8e99-928e02c5ff5d",
"layout": {
"h": 3,
"i": "a55c40fc-dc25-4d2a-8e99-928e02c5ff5d",
"w": 6,
"x": 6,
"y": 1
},
"name": "Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"targets": [
{
"expr": "time() - process_start_time_seconds{job=\"$job\",instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "60c3389c-808d-4412-b74b-cb762e89a8ad",
"layout": {
"h": 3,
"i": "60c3389c-808d-4412-b74b-cb762e89a8ad",
"w": 6,
"x": 12,
"y": 1
},
"name": "Available CPUs",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "os_available_processors{job=\"$job\",instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1c9a8cca-3578-485e-837d-21618d383065",
"layout": {
"h": 3,
"i": "1c9a8cca-3578-485e-837d-21618d383065",
"w": 6,
"x": 18,
"y": 1
},
"name": "Open file descriptors",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "os_open_file_descriptor_count{job=\"$job\",instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"layout": {
"h": 1,
"i": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"w": 24,
"x": 0,
"y": 4
},
"name": "JVM Memory",
"type": "row"
},
{
"type": "timeseries",
"id": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"layout": {
"h": 7,
"i": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"w": 12,
"x": 0,
"y": 5
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_used_bytes{area=\"heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_bytes_max{area=\"heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "B",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Memory(heap)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "765b22a9-1ddc-4c08-8758-684e3c13252b",
"layout": {
"h": 7,
"i": "765b22a9-1ddc-4c08-8758-684e3c13252b",
"w": 12,
"x": 12,
"y": 5
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_used_bytes{area=\"nonheap\",job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_bytes_max{area=\"nonheap\",job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "B",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Memory(nonheap)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "c43aa6f5-7252-400f-bb9f-8c96e436151c",
"layout": {
"h": 1,
"i": "c43aa6f5-7252-400f-bb9f-8c96e436151c",
"w": 24,
"x": 0,
"y": 12
},
"name": "Memory Pool",
"type": "row"
},
{
"type": "timeseries",
"id": "5ab2434c-a905-43c1-a563-4cee2dc9dce9",
"layout": {
"h": 7,
"i": "5ab2434c-a905-43c1-a563-4cee2dc9dce9",
"w": 6,
"x": 0,
"y": 13
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"CodeHeap 'non-nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"CodeHeap 'non-nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"CodeHeap 'non-nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed",
"refId": "C",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CodeHeap 'non-nmethods'",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "bfe16d07-91ff-44e6-87bc-9d5d93d2ebd6",
"layout": {
"h": 7,
"i": "bfe16d07-91ff-44e6-87bc-9d5d93d2ebd6",
"w": 6,
"x": 6,
"y": 13
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"CodeHeap 'profiled nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"CodeHeap 'profiled nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"CodeHeap 'profiled nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed",
"refId": "C",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CodeHeap 'profiled nmethods'",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "18d10f97-5ab2-41c4-a3ad-09f2c7a03e1a",
"layout": {
"h": 7,
"i": "18d10f97-5ab2-41c4-a3ad-09f2c7a03e1a",
"w": 6,
"x": 12,
"y": 13
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"CodeHeap 'non-profiled nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"CodeHeap 'non-profiled nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"CodeHeap 'non-profiled nmethods'\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed",
"refId": "C",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CodeHeap 'non-profiled nmethods'",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "314a3893-c1d4-4f85-bce0-33ecfda2f521",
"layout": {
"h": 7,
"i": "314a3893-c1d4-4f85-bce0-33ecfda2f521",
"w": 6,
"x": 18,
"y": 13
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=~\"G1 Eden Space|Eden Space|PS Eden Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max {{pool}}",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=~\"G1 Eden Space|Eden Space|PS Eden Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used {{pool}}",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=~\"G1 Eden Space|Eden Space|PS Eden Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed {{pool}}",
"refId": "C",
"maxDataPoints": 240
}
],
"name": "Eden Space",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "1e5f03e7-af5d-447b-9c1b-23d81915e8df",
"layout": {
"h": 7,
"i": "1e5f03e7-af5d-447b-9c1b-23d81915e8df",
"w": 6,
"x": 0,
"y": 15
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"Compressed Class Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"Compressed Class Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"Compressed Class Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed",
"refId": "C",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Compressed Class Space",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "86a68ff6-238c-4fc9-b77e-3b964e564500",
"layout": {
"h": 7,
"i": "86a68ff6-238c-4fc9-b77e-3b964e564500",
"w": 6,
"x": 6,
"y": 15
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=~\"Survivor Space|PS Survivor Space|G1 Survivor Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max {{pool}}",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=~\"Survivor Space|PS Survivor Space|G1 Survivor Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used {{pool}}",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=~\"Survivor Space|PS Survivor Space|G1 Survivor Space\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed {{pool}}",
"refId": "C",
"maxDataPoints": 240
}
],
"name": "Survivor Space",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "595af7d1-e53c-43b5-8f62-ddb9b3a4ffcb",
"layout": {
"h": 7,
"i": "595af7d1-e53c-43b5-8f62-ddb9b3a4ffcb",
"w": 6,
"x": 12,
"y": 15
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=~\"PS Old Gen|G1 Old Gen|Tenured Gen\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max {{pool}}",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=~\"PS Old Gen|G1 Old Gen|Tenured Gen\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used {{pool}}",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=~\"PS Old Gen|G1 Old Gen|Tenured Gen\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed {{pool}}",
"refId": "C",
"maxDataPoints": 240
}
],
"name": "Old Gen",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "380fdfcb-16a6-4131-abaa-a3911b7de6fa",
"layout": {
"h": 7,
"i": "380fdfcb-16a6-4131-abaa-a3911b7de6fa",
"w": 6,
"x": 18,
"y": 15
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"Metaspace\", job=\"$job\",instance=\"$instance\"}",
"legend": "Max",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"Metaspace\", job=\"$job\",instance=\"$instance\"}",
"legend": "Used",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"Metaspace\", job=\"$job\",instance=\"$instance\"}",
"legend": "Committed",
"refId": "C",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Metaspace",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"layout": {
"h": 1,
"i": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"w": 24,
"x": 0,
"y": 22
},
"name": "GC",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"layout": {
"h": 7,
"i": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"w": 8,
"x": 0,
"y": 23
},
"name": "过去一分钟GC耗时(秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(jvm_gc_collection_seconds_sum{job=\"$job\",instance=~\"$instance\"}[1m])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cf410459-b5df-4aca-a410-ecda091d6097",
"layout": {
"h": 7,
"i": "cf410459-b5df-4aca-a410-ecda091d6097",
"w": 8,
"x": 8,
"y": 23
},
"name": "过去一分钟GC次数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(jvm_gc_collection_seconds_count{job=\"$job\",instance=\"$instance\"}[1m])",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "30feb928-b7c3-4e71-aeeb-cc10994b313c",
"layout": {
"h": 7,
"i": "30feb928-b7c3-4e71-aeeb-cc10994b313c",
"w": 8,
"x": 16,
"y": 23
},
"name": "过去一分钟每次GC平均耗时(秒)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "increase(jvm_gc_collection_seconds_sum{job=\"$job\",instance=\"$instance\"}[1m])/increase(jvm_gc_collection_seconds_count{job=\"$job\",instance=\"$instance\"}[1m])",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fd6d0772-40d7-4211-b9bb-601e35fb6431",
"layout": {
"h": 1,
"i": "fd6d0772-40d7-4211-b9bb-601e35fb6431",
"w": 24,
"x": 0,
"y": 30
},
"name": "Threads and Class loading",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "65c74a2b-5f01-4491-b45a-dffe4a9b678a",
"layout": {
"h": 7,
"i": "65c74a2b-5f01-4491-b45a-dffe4a9b678a",
"w": 12,
"x": 0,
"y": 31
},
"name": "Threads",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "jvm_threads_current{job=\"$job\",instance=\"$instance\"}",
"legend": "current",
"refId": "A"
},
{
"expr": "jvm_threads_daemon{job=\"$job\",instance=\"$instance\"}",
"legend": "daemon",
"refId": "B"
},
{
"expr": "jvm_threads_deadlocked{job=\"$job\",instance=\"$instance\"}",
"legend": "deadlocked",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"type": "timeseries",
"id": "2da16907-adf7-4561-9338-4254c89a311b",
"layout": {
"h": 7,
"i": "2da16907-adf7-4561-9338-4254c89a311b",
"w": 12,
"x": 12,
"y": 31
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_classes_loaded_total{job=\"$job\", instance=\"$instance\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Class loading",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "12fe119e-54f0-4219-9846-ac982c1e9b4d",
"layout": {
"h": 1,
"i": "12fe119e-54f0-4219-9846-ac982c1e9b4d",
"w": 24,
"x": 0,
"y": 38
},
"name": "Physical memory",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5a859147-edfc-4dac-9457-8a928213bc00",
"layout": {
"h": 7,
"i": "5a859147-edfc-4dac-9457-8a928213bc00",
"w": 24,
"x": 0,
"y": 39
},
"name": "Physical memory",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "os_total_physical_memory_bytes{job=\"$job\",instance=\"$instance\"}",
"legend": "Total physical memory",
"refId": "A"
},
{
"expr": "os_committed_virtual_memory_bytes{job=\"$job\",instance=\"$instance\"}",
"legend": "Committed virtual memory",
"refId": "B"
},
{
"expr": "os_free_physical_memory_bytes{job=\"$job\",instance=\"$instance\"}",
"legend": "Free physical memory",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"name": "job",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jmx_exporter_build_info,job)"
},
{
"name": "instance",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jmx_exporter_build_info{job=\"$job\"},instance)"
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Java/dashboards/jmx_by_kubernetes.json
================================================
{
"name": "JMX - Kubernetes",
"tags": "Prometheus JMX Kubernetes",
"configs": {
"panels": [
{
"collapsed": true,
"id": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"layout": {
"h": 1,
"i": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"orientation": "auto",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "0721ee76-816b-469f-9c49-2bef94a9299e",
"layout": {
"h": 3,
"i": "0721ee76-816b-469f-9c49-2bef94a9299e",
"w": 6,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "Status",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#1eac02",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f00a0a",
"text": "DOWN"
},
"type": "special"
}
]
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
}
}
}
],
"targets": [
{
"expr": "up{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.1.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"orientation": "auto",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "a55c40fc-dc25-4d2a-8e99-928e02c5ff5d",
"layout": {
"h": 3,
"i": "a55c40fc-dc25-4d2a-8e99-928e02c5ff5d",
"w": 6,
"x": 6,
"y": 1
},
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
}
}
}
],
"targets": [
{
"expr": "time() - process_start_time_seconds{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.1.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"orientation": "auto",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "60c3389c-808d-4412-b74b-cb762e89a8ad",
"layout": {
"h": 3,
"i": "60c3389c-808d-4412-b74b-cb762e89a8ad",
"w": 6,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "Container CPU Limit",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
}
}
}
],
"targets": [
{
"expr": "container_spec_cpu_quota{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}/container_spec_cpu_period{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.1.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"orientation": "auto",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1c9a8cca-3578-485e-837d-21618d383065",
"layout": {
"h": 3,
"i": "1c9a8cca-3578-485e-837d-21618d383065",
"w": 6,
"x": 18,
"y": 1
},
"maxPerRow": 4,
"name": "Container Open File Descriptors",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
}
}
}
],
"targets": [
{
"expr": "container_file_descriptors{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.1.0"
},
{
"collapsed": true,
"id": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"layout": {
"h": 1,
"i": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"w": 24,
"x": 0,
"y": 4
},
"name": "JVM Memory",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"layout": {
"h": 7,
"i": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"w": 12,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "JVM Memory(heap)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_used_bytes{area=\"heap\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_bytes_max{area=\"heap\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "765b22a9-1ddc-4c08-8758-684e3c13252b",
"layout": {
"h": 7,
"i": "765b22a9-1ddc-4c08-8758-684e3c13252b",
"w": 12,
"x": 12,
"y": 5
},
"maxPerRow": 4,
"name": "JVM Memory(nonheap)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_used_bytes{area=\"nonheap\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_bytes_max{area=\"nonheap\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"collapsed": true,
"id": "c43aa6f5-7252-400f-bb9f-8c96e436151c",
"layout": {
"h": 1,
"i": "c43aa6f5-7252-400f-bb9f-8c96e436151c",
"w": 24,
"x": 0,
"y": 12
},
"name": "Memory Pool",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5ab2434c-a905-43c1-a563-4cee2dc9dce9",
"layout": {
"h": 7,
"i": "5ab2434c-a905-43c1-a563-4cee2dc9dce9",
"w": 6,
"x": 0,
"y": 13
},
"maxPerRow": 4,
"name": "CodeHeap 'non-nmethods'",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"CodeHeap 'non-nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"CodeHeap 'non-nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"CodeHeap 'non-nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "bfe16d07-91ff-44e6-87bc-9d5d93d2ebd6",
"layout": {
"h": 7,
"i": "bfe16d07-91ff-44e6-87bc-9d5d93d2ebd6",
"w": 6,
"x": 6,
"y": 13
},
"maxPerRow": 4,
"name": "CodeHeap 'profiled nmethods'",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"CodeHeap 'profiled nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"CodeHeap 'profiled nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"CodeHeap 'profiled nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "18d10f97-5ab2-41c4-a3ad-09f2c7a03e1a",
"layout": {
"h": 7,
"i": "18d10f97-5ab2-41c4-a3ad-09f2c7a03e1a",
"w": 6,
"x": 12,
"y": 13
},
"maxPerRow": 4,
"name": "CodeHeap 'non-profiled nmethods'",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"CodeHeap 'non-profiled nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"CodeHeap 'non-profiled nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"CodeHeap 'non-profiled nmethods'\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "314a3893-c1d4-4f85-bce0-33ecfda2f521",
"layout": {
"h": 7,
"i": "314a3893-c1d4-4f85-bce0-33ecfda2f521",
"w": 6,
"x": 18,
"y": 13
},
"maxPerRow": 4,
"name": "G1 Eden Space",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"G1 Eden Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"G1 Eden Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"G1 Eden Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1e5f03e7-af5d-447b-9c1b-23d81915e8df",
"layout": {
"h": 7,
"i": "1e5f03e7-af5d-447b-9c1b-23d81915e8df",
"w": 6,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "Compressed Class Space",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"Compressed Class Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"Compressed Class Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"Compressed Class Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "86a68ff6-238c-4fc9-b77e-3b964e564500",
"layout": {
"h": 7,
"i": "86a68ff6-238c-4fc9-b77e-3b964e564500",
"w": 6,
"x": 6,
"y": 15
},
"maxPerRow": 4,
"name": "G1 Survivor Space",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"G1 Survivor Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"G1 Survivor Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"G1 Survivor Space\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "595af7d1-e53c-43b5-8f62-ddb9b3a4ffcb",
"layout": {
"h": 7,
"i": "595af7d1-e53c-43b5-8f62-ddb9b3a4ffcb",
"w": 6,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "G1 Old Gen",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"G1 Old Gen\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"G1 Old Gen\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"G1 Old Gen\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "380fdfcb-16a6-4131-abaa-a3911b7de6fa",
"layout": {
"h": 7,
"i": "380fdfcb-16a6-4131-abaa-a3911b7de6fa",
"w": 6,
"x": 18,
"y": 15
},
"maxPerRow": 4,
"name": "Metaspace",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_memory_pool_max_bytes{pool=\"Metaspace\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Max",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_memory_pool_bytes_used{pool=\"Metaspace\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Used",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_memory_pool_bytes_committed{pool=\"Metaspace\", namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"collapsed": true,
"id": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"layout": {
"h": 1,
"i": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"w": 24,
"x": 0,
"y": 22
},
"name": "GC",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"layout": {
"h": 7,
"i": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"w": 8,
"x": 0,
"y": 23
},
"maxPerRow": 4,
"name": "过去一分钟GC耗时(秒)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(jvm_gc_collection_seconds_sum{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}[1m])",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cf410459-b5df-4aca-a410-ecda091d6097",
"layout": {
"h": 7,
"i": "cf410459-b5df-4aca-a410-ecda091d6097",
"w": 8,
"x": 8,
"y": 23
},
"maxPerRow": 4,
"name": "过去一分钟GC次数",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(jvm_gc_collection_seconds_count{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}[1m])",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "30feb928-b7c3-4e71-aeeb-cc10994b313c",
"layout": {
"h": 7,
"i": "30feb928-b7c3-4e71-aeeb-cc10994b313c",
"w": 8,
"x": 16,
"y": 23
},
"maxPerRow": 4,
"name": "过去一分钟每次GC平均耗时(秒)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(jvm_gc_collection_seconds_sum{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}[1m])/increase(jvm_gc_collection_seconds_count{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}[1m])",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"collapsed": true,
"id": "fd6d0772-40d7-4211-b9bb-601e35fb6431",
"layout": {
"h": 1,
"i": "fd6d0772-40d7-4211-b9bb-601e35fb6431",
"w": 24,
"x": 0,
"y": 30
},
"name": "Threads and Class loading",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "65c74a2b-5f01-4491-b45a-dffe4a9b678a",
"layout": {
"h": 7,
"i": "65c74a2b-5f01-4491-b45a-dffe4a9b678a",
"w": 12,
"x": 0,
"y": 31
},
"maxPerRow": 4,
"name": "Threads",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_threads_current{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "current",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "jvm_threads_daemon{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "daemon",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "jvm_threads_deadlocked{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "deadlocked",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2da16907-adf7-4561-9338-4254c89a311b",
"layout": {
"h": 7,
"i": "2da16907-adf7-4561-9338-4254c89a311b",
"w": 12,
"x": 12,
"y": 31
},
"maxPerRow": 4,
"name": "Class loading",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_classes_loaded_total{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"collapsed": true,
"id": "12fe119e-54f0-4219-9846-ac982c1e9b4d",
"layout": {
"h": 1,
"i": "12fe119e-54f0-4219-9846-ac982c1e9b4d",
"w": 24,
"x": 0,
"y": 38
},
"name": "Container memory",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5a859147-edfc-4dac-9457-8a928213bc00",
"layout": {
"h": 7,
"i": "5a859147-edfc-4dac-9457-8a928213bc00",
"w": 24,
"x": 0,
"y": 39
},
"maxPerRow": 4,
"name": "Container Memory Limit",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "os_total_physical_memory_bytes{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Total physical memory",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "os_committed_virtual_memory_bytes{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Committed virtual memory",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "os_free_physical_memory_bytes{namespace=\"$namespace\", container=\"$service\", pod=\"$pod\"}",
"legend": "Free physical memory",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jmx_exporter_build_info, namespace)",
"hide": false,
"name": "namespace",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jmx_exporter_build_info{namespace=\"$namespace\"},container)",
"hide": false,
"name": "service",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jmx_exporter_build_info{namespace=\"$namespace\", container=\"$service\"},pod)",
"hide": false,
"name": "pod",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1755595969673000
}
================================================
FILE: integrations/Java/dashboards/jvm_by_opentelementry.json
================================================
{
"name": "JVM by OpenTelemetry",
"tags": "Prometheus OpenTelemetry",
"ident": "",
"uuid": 1749052689795000,
"configs": {
"panels": [
{
"collapsed": true,
"id": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "a26c5c3d-7b60-4746-bd1f-ca95581cf2fd",
"isResizable": false
},
"name": "CPU Info",
"type": "row"
},
{
"type": "stat",
"id": "a55c40fc-dc25-4d2a-8e99-928e02c5ff5d",
"layout": {
"h": 4,
"w": 8,
"x": 0,
"y": 1,
"i": "a55c40fc-dc25-4d2a-8e99-928e02c5ff5d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_cpu_count{job=\"$job\",instance=~\"$instance\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU Count",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 1,
"textSize": {
"value": 64
},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": ""
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "timeseries",
"id": "0721ee76-816b-469f-9c49-2bef94a9299e",
"layout": {
"h": 4,
"w": 8,
"x": 8,
"y": 1,
"i": "0721ee76-816b-469f-9c49-2bef94a9299e",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "((increase(jvm_cpu_time_seconds_total{instance=~\"$instance\",job=\"$job\"}[1m]) /60)/jvm_cpu_count{job=\"$job\",instance=~\"$instance\"}) *100",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM CPU 每分钟平均使用率",
"description": "CPU 每分钟平均使用率,已做归一化处理,即不管几个核,CPU使用率最大值为 100%",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.01,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "9d27fdb7-9ab5-41ce-9266-c97860244283",
"layout": {
"h": 4,
"w": 8,
"x": 16,
"y": 1,
"i": "83786365-c832-479c-ae25-e9ff69c5245f",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_cpu_recent_utilization_ratio{instance=~\"$instance\",job=\"$job\"}*100",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM CPU 瞬时使用率",
"links": [],
"description": "JVM 进程在最近系统采样周期内(通常是几百毫秒) CPU 的使用率\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.01,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 5,
"i": "705c90e0-e8b6-4f1c-b35c-c8a785009a20",
"isResizable": false
},
"name": "JVM Memory",
"type": "row",
"panels": []
},
{
"type": "timeseries",
"id": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 6,
"i": "5455e2f2-f6bb-4888-9d88-240d7e12cce2",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_committed_bytes{jvm_memory_type=\"heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Commit Memory(heap)",
"description": "已提交内存\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "765b22a9-1ddc-4c08-8758-684e3c13252b",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 6,
"i": "765b22a9-1ddc-4c08-8758-684e3c13252b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_committed_bytes{jvm_memory_type=\"non_heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Commit Memory(nonheap)",
"description": "已提交内存\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "6d4caef8-cd49-4fd5-9bb5-feadeb4f777e",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 13,
"i": "af0eee36-6791-48a1-8715-39e57848e1b8",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_limit_bytes{jvm_memory_type=\"heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Memory Limit(heap)",
"description": "最大可获取内存\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "aee4fa22-73ac-4991-ab6c-935999ac4729",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 13,
"i": "d2c78767-39bb-4ac4-858f-cc5501311f0b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_limit_bytes{jvm_memory_type=\"non_heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Memory Limit(nonheap)",
"description": "最大可获取内存\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "5a96cae3-a0f5-4b8d-b76a-063efd592060",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 20,
"i": "0381f655-147e-4cfb-84b4-46ea8a1ae4b2",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_used_bytes{jvm_memory_type=\"heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Memory Used(heap)",
"description": "已使用内存\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "ef4c8781-8876-4e59-8b6d-781813ae433e",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 20,
"i": "be93dba2-55c5-49a3-a511-ef0c9835843c",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_used_bytes{jvm_memory_type=\"non_heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "JVM Memory Used(nonheap)",
"description": "已使用内存\nhttps://opentelemetry.io/docs/specs/semconv/runtime/jvm-metrics/",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "29b53a9d-b38a-4462-a4f4-d4c1db300596",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 27,
"i": "5b7c0a0d-d887-4edb-a025-ec3bb2298f1c",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_used_after_last_gc_bytes{jvm_memory_type=\"heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "上次GC后 JVM Memory Used(heap)",
"description": "最近一次GC后的内存使用量",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "d50ff8ea-d8c1-4f87-b34c-64661cca7700",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 27,
"i": "66887b68-53ea-4d9b-96f6-1cbcf8531c92",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_memory_used_after_last_gc_bytes{jvm_memory_type=\"non_heap\",job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_memory_pool_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "上次GC后 JVM Memory Used(nonheap)",
"description": "最近一次GC后的内存使用量",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 34,
"i": "0aaf3516-4938-41e3-b7cb-323de6de75d9",
"isResizable": false
},
"name": "GC",
"type": "row",
"panels": []
},
{
"type": "timeseries",
"id": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 35,
"i": "5303bda0-47c2-4aca-bb12-1da512500f4a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "increase(jvm_gc_duration_seconds_sum{job=\"$job\",instance=~\"$instance\"}[1m])",
"refId": "A",
"maxDataPoints": 240,
"legend": "{{jvm_gc_action}} - {{jvm_gc_name}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "过去一分钟GC总耗时(秒)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "cf410459-b5df-4aca-a410-ecda091d6097",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 35,
"i": "cf410459-b5df-4aca-a410-ecda091d6097",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "increase(jvm_gc_duration_seconds_count{job=\"$job\",instance=\"$instance\"}[1m])",
"legend": "{{jvm_gc_action}} - {{jvm_gc_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "过去一分钟GC次数",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "30feb928-b7c3-4e71-aeeb-cc10994b313c",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 35,
"i": "30feb928-b7c3-4e71-aeeb-cc10994b313c",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "histogram_quantile(\n 0.95,\n sum by(le, jvm_gc_action, jvm_gc_name) (\n rate(jvm_gc_duration_seconds_bucket{instance=\"$instance\", job=\"$job\"}[1m])\n )\n)",
"legend": "{{jvm_gc_action}} - {{jvm_gc_name}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "过去1分钟单次GC耗时95分位值(秒)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "bars",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "fd6d0772-40d7-4211-b9bb-601e35fb6431",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 42,
"i": "fd6d0772-40d7-4211-b9bb-601e35fb6431",
"isResizable": false
},
"name": "Threads and Class Loading",
"type": "row",
"panels": []
},
{
"type": "timeseries",
"id": "65c74a2b-5f01-4491-b45a-dffe4a9b678a",
"layout": {
"h": 4,
"w": 6,
"x": 0,
"y": 43,
"i": "65c74a2b-5f01-4491-b45a-dffe4a9b678a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_thread_count{job=\"$job\",instance=\"$instance\"}",
"legend": "{{jvm_thread_daemon}} - {{jvm_thread_state}}",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "jvm_threads_daemon{job=\"$job\",instance=\"$instance\"}",
"legend": "daemon",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "jvm_threads_deadlocked{job=\"$job\",instance=\"$instance\"}",
"legend": "deadlocked",
"refId": "C",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Threads",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "list",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "2da16907-adf7-4561-9338-4254c89a311b",
"layout": {
"h": 4,
"w": 6,
"x": 6,
"y": 43,
"i": "2da16907-adf7-4561-9338-4254c89a311b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_class_count{job=\"$job\", instance=\"$instance\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Class Count",
"description": "Number of classes currently loaded.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "2eaccbc6-0084-40e8-8c4d-ea12fd562ba0",
"layout": {
"h": 4,
"w": 6,
"x": 12,
"y": 43,
"i": "7b261291-6429-4bb4-a433-3266495f10ce",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_class_loaded_total{job=\"$job\", instance=\"$instance\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Class Loaded Count",
"description": "Number of classes loaded since JVM start.\n",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "ce49545c-3cd3-4a9f-ac3d-163e1aafa3f8",
"layout": {
"h": 4,
"w": 6,
"x": 18,
"y": 43,
"i": "94b94ee3-a907-4815-b8c6-80ffa8502df1",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "jvm_class_unloaded_total{job=\"$job\", instance=\"$instance\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Class Unload Count",
"description": "Number of classes unloaded since JVM start.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"name": "job",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jvm_class_count, job)",
"multi": false
},
{
"name": "instance",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(jvm_class_count{job=\"$job\"}, instance)",
"multi": false
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Jenkins/collect/jenkins/jenkins.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# Address (host:port) of jenkins server.
# jenkins_url = "http://my-jenkins-instance:8080"
#jenkins_username = "admin"
#jenkins_password = ""
#response_timeout = "5s"
================================================
FILE: integrations/Jenkins/markdown/README.md
================================================
## Jenkins
Jenkins 采集插件, 采集 Jenkins 数据
## Configuration
```toml
# # collect interval
# interval = 15
[[instances]]
# Address (host:port) of jenkins server.
# jenkins_url = "http://my-jenkins-instance:8080"
#jenkins_username = "admin"
#jenkins_password = ""
#response_timeout = "5s"
```
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/activemq.toml
================================================
## JolokiaAgent is bundled with ActiveMQ
[[instances]]
urls = ["http://localhost:8161/api/jolokia"]
metrics_name_prefix = "activemq_"
username = "admin"
password = "admin"
### JVM Generic
[[instances.metric]]
name = "OperatingSystem"
mbean = "java.lang:type=OperatingSystem"
paths = ["ProcessCpuLoad","SystemLoadAverage","SystemCpuLoad"]
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "jvm_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
tag_prefix = "pool_"
### ACTIVEMQ
[[instances.metric]]
name = "queue"
mbean = "org.apache.activemq:brokerName=*,destinationName=*,destinationType=Queue,type=Broker"
paths = ["QueueSize","EnqueueCount","ConsumerCount","DispatchCount","DequeueCount","ProducerCount","InFlightCount"]
tag_keys = ["brokerName","destinationName"]
[[instances.metric]]
name = "topic"
mbean = "org.apache.activemq:brokerName=*,destinationName=*,destinationType=Topic,type=Broker"
paths = ["ProducerCount","DequeueCount","ConsumerCount","QueueSize","EnqueueCount"]
tag_keys = ["brokerName","destinationName"]
[[instances.metric]]
name = "broker"
mbean = "org.apache.activemq:brokerName=*,type=Broker"
paths = ["TotalConsumerCount","TotalMessageCount","TotalEnqueueCount","TotalDequeueCount","MemoryLimit","MemoryPercentUsage","StoreLimit","StorePercentUsage","TempPercentUsage","TempLimit"]
tag_keys = ["brokerName"]
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/bitbucket.toml
================================================
[[instances]]
urls = ["http://localhost:8778/jolokia"]
metrics_name_prefix = "bitbucket_"
[[instances.metric]]
name = "jvm_operatingsystem"
mbean = "java.lang:type=OperatingSystem"
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
[[instances.metric]]
name = "jvm_thread"
mbean = "java.lang:type=Threading"
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
[[instances.metric]]
name = "jvm_class_loading"
mbean = "java.lang:type=ClassLoading"
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:type=MemoryPool,name=*"
[[instances.metric]]
name = "webhooks"
mbean = "com.atlassian.webhooks:name=*"
[[instances.metric]]
name = "atlassian"
mbean = "com.atlassian.bitbucket:name=*"
[[instances.metric]]
name = "thread_pools"
mbean = "com.atlassian.bitbucket.thread-pools:name=*"
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/cassandra.toml
================================================
[[instances]]
urls = ["http://localhost:8778/jolokia"]
metrics_name_prefix = "java_"
[[instances.metric]]
name = "Memory"
mbean = "java.lang:type=Memory"
[[instances.metric]]
name = "GarbageCollector"
mbean = "java.lang:name=*,type=GarbageCollector"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances]]
urls = ["http://localhost:8778/jolokia"]
metrics_name_prefix = "cassandra_"
[[instances.metric]]
name = "Cache"
mbean = "org.apache.cassandra.metrics:name=*,scope=*,type=Cache"
tag_keys = ["name", "scope"]
field_prefix = "$1_"
[[instances.metric]]
name = "Client"
mbean = "org.apache.cassandra.metrics:name=*,type=Client"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "ClientRequestMetrics"
mbean = "org.apache.cassandra.metrics:name=*,type=ClientRequestMetrics"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "ClientRequest"
mbean = "org.apache.cassandra.metrics:name=*,scope=*,type=ClientRequest"
tag_keys = ["name", "scope"]
field_prefix = "$1_"
[[instances.metric]]
name = "ColumnFamily"
mbean = "org.apache.cassandra.metrics:keyspace=*,name=*,scope=*,type=ColumnFamily"
tag_keys = ["keyspace", "name", "scope"]
field_prefix = "$2_"
[[instances.metric]]
name = "CommitLog"
mbean = "org.apache.cassandra.metrics:name=*,type=CommitLog"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "Compaction"
mbean = "org.apache.cassandra.metrics:name=*,type=Compaction"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "CQL"
mbean = "org.apache.cassandra.metrics:name=*,type=CQL"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "DroppedMessage"
mbean = "org.apache.cassandra.metrics:name=*,scope=*,type=DroppedMessage"
tag_keys = ["name", "scope"]
field_prefix = "$1_"
[[instances.metric]]
name = "FileCache"
mbean = "org.apache.cassandra.metrics:name=*,type=FileCache"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "ReadRepair"
mbean = "org.apache.cassandra.metrics:name=*,type=ReadRepair"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "Storage"
mbean = "org.apache.cassandra.metrics:name=*,type=Storage"
tag_keys = ["name"]
field_prefix = "$1_"
[[instances.metric]]
name = "ThreadPools"
mbean = "org.apache.cassandra.metrics:name=*,path=*,scope=*,type=ThreadPools"
tag_keys = ["name", "path", "scope"]
field_prefix = "$1_"
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/hadoop-hdfs.toml
================================================
################
# NAMENODE #
################
[[instances]]
urls = ["http://localhost:8778/jolokia"]
metrics_name_prefix = "hadoop_hdfs_namenode_"
[[instances.metric]]
name = "FSNamesystem"
mbean = "Hadoop:name=FSNamesystem,service=NameNode"
paths = ["CapacityTotal", "CapacityRemaining", "CapacityUsedNonDFS", "NumLiveDataNodes", "NumDeadDataNodes", "NumInMaintenanceDeadDataNodes", "NumDecomDeadDataNodes"]
[[instances.metric]]
name = "FSNamesystemState"
mbean = "Hadoop:name=FSNamesystemState,service=NameNode"
paths = ["VolumeFailuresTotal", "UnderReplicatedBlocks", "BlocksTotal"]
[[instances.metric]]
name = "OperatingSystem"
mbean = "java.lang:type=OperatingSystem"
paths = ["ProcessCpuLoad", "SystemLoadAverage", "SystemCpuLoad"]
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "jvm_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
tag_prefix = "pool_"
################
# DATANODE #
################
[[instances]]
urls = ["http://localhost:7778/jolokia"]
metrics_name_prefix = "hadoop_hdfs_datanode_"
[[instances.metric]]
name = "FSDatasetState"
mbean = "Hadoop:name=FSDatasetState,service=DataNode"
paths = ["Capacity", "DfsUsed", "Remaining", "NumBlocksFailedToUnCache", "NumBlocksFailedToCache", "NumBlocksCached"]
[[instances.metric]]
name = "OperatingSystem"
mbean = "java.lang:type=OperatingSystem"
paths = ["ProcessCpuLoad", "SystemLoadAverage", "SystemCpuLoad"]
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "jvm_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
tag_prefix = "pool_"
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/java.toml
================================================
[[instances]]
urls = ["http://localhost:8080/jolokia"]
[[instances.metric]]
name = "java_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "java_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "java_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "java_last_garbage_collection"
mbean = "java.lang:name=G1 Young Generation,type=GarbageCollector"
paths = ["LastGcInfo/duration", "LastGcInfo/GcThreadCount", "LastGcInfo/memoryUsageAfterGc"]
[[instances.metric]]
name = "java_threading"
mbean = "java.lang:type=Threading"
paths = ["TotalStartedThreadCount", "ThreadCount", "DaemonThreadCount", "PeakThreadCount"]
[[instances.metric]]
name = "java_class_loading"
mbean = "java.lang:type=ClassLoading"
paths = ["LoadedClassCount", "UnloadedClassCount", "TotalLoadedClassCount"]
[[instances.metric]]
name = "java_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/jboss.toml
================================================
[[instances]]
urls = ["http://localhost:8080/jolokia"]
metrics_name_prefix = "jboss_"
### JVM Generic
[[instances.metric]]
name = "OperatingSystem"
mbean = "java.lang:type=OperatingSystem"
paths = ["ProcessCpuLoad","SystemLoadAverage","SystemCpuLoad"]
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "jvm_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
tag_prefix = "pool_"
### JBOSS
[[instances.metric]]
name = "connectors.http"
mbean = "jboss.as:https-listener=*,server=*,subsystem=undertow"
paths = ["bytesReceived","bytesSent","errorCount","requestCount"]
tag_keys = ["server","https-listener"]
[[instances.metric]]
name = "connectors.http"
mbean = "jboss.as:http-listener=*,server=*,subsystem=undertow"
paths = ["bytesReceived","bytesSent","errorCount","requestCount"]
tag_keys = ["server","http-listener"]
[[instances.metric]]
name = "datasource.jdbc"
mbean = "jboss.as:data-source=*,statistics=jdbc,subsystem=datasources"
paths = ["PreparedStatementCacheAccessCount","PreparedStatementCacheHitCount","PreparedStatementCacheMissCount"]
tag_keys = ["data-source"]
[[instances.metric]]
name = "datasource.pool"
mbean = "jboss.as:data-source=*,statistics=pool,subsystem=datasources"
paths = ["AvailableCount","ActiveCount","MaxUsedCount"]
tag_keys = ["data-source"]
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/kafka-connect.toml
================================================
[[instances]]
urls = ["http://localhost:8080/jolokia"]
metrics_name_prefix = "kafka_connect_"
[[processor_enum]]
metrics = ["status"]
[processor_enum.value_mappings]
paused = 0
running = 1
unassigned = 2
failed = 3
destroyed = 4
[instances.labels]
input_type = "kafka-connect"
# https://kafka.apache.org/documentation/#connect_monitoring
[[instances.metric]]
name = "connectWorkerMetrics"
mbean = "kafka.connect:type=connect-worker-metrics"
paths = ["connector-count", "connector-startup-attempts-total", "connector-startup-failure-percentage", "connector-startup-failure-total", "connector-startup-success-percentage", "connector-startup-success-total", "task-count", "task-startup-attempts-total", "task-startup-failure-percentage", "task-startup-failure-total", "task-startup-success-percentage", "task-startup-success-total"]
[[instances.metric]]
name = "connectWorkerMetrics"
mbean = "kafka.connect:type=connect-worker-metrics,connector=*"
paths = ["connector-destroyed-task-count", "connector-failed-task-count", "connector-paused-task-count", "connector-running-task-count", "connector-total-task-count", "connector-unassigned-task-count"]
tag_keys = ["connector"]
[[instances.metric]]
name = "connectWorkerRebalanceMetrics"
mbean = "kafka.connect:type=connect-worker-rebalance-metrics"
paths = ["completed-rebalances-total", "connect-protocol", "epoch", "leader-name", "rebalance-avg-time-ms", "rebalance-max-time-ms", "rebalancing", "time-since-last-rebalance-ms"]
[[instances.metric]]
name = "connectorMetrics"
mbean = "kafka.connect:type=connector-metrics,connector=*"
paths = ["connector-class", "connector-version", "connector-type", "status"]
tag_keys = ["connector"]
[[instances.metric]]
name = "connectorTaskMetrics"
mbean = "kafka.connect:type=connector-task-metrics,connector=*,task=*"
paths = ["batch-size-avg", "batch-size-max", "offset-commit-avg-time-ms", "offset-commit-failure-percentage", "offset-commit-max-time-ms", "offset-commit-success-percentage", "pause-ratio", "running-ratio", "status"]
tag_keys = ["connector", "task"]
[[instances.metric]]
name = "sinkTaskMetrics"
mbean = "kafka.connect:type=sink-task-metrics,connector=*,task=*"
paths = ["offset-commit-completion-rate", "offset-commit-completion-total", "offset-commit-seq-no", "offset-commit-skip-rate", "offset-commit-skip-total", "partition-count", "put-batch-avg-time-ms", "put-batch-max-time-ms", "sink-record-active-count", "sink-record-active-count-avg", "sink-record-active-count-max", "sink-record-lag-max", "sink-record-read-rate", "sink-record-read-total", "sink-record-send-rate", "sink-record-send-total"]
tag_keys = ["connector", "task"]
[[instances.metric]]
name = "sourceTaskMetrics"
mbean = "kafka.connect:type=source-task-metrics,connector=*,task=*"
paths = ["poll-batch-avg-time-ms", "poll-batch-max-time-ms", "source-record-active-count", "source-record-active-count-avg", "source-record-active-count-max", "source-record-poll-rate", "source-record-poll-total", "source-record-write-rate", "source-record-write-total"]
tag_keys = ["connector", "task"]
[[instances.metric]]
name = "taskErrorMetrics"
mbean = "kafka.connect:type=task-error-metrics,connector=*,task=*"
paths = ["deadletterqueue-produce-failures", "deadletterqueue-produce-requests", "last-error-timestamp", "total-errors-logged", "total-record-errors", "total-record-failures", "total-records-skipped", "total-retries"]
tag_keys = ["connector", "task"]
# https://kafka.apache.org/documentation/#selector_monitoring
[[instances.metric]]
name = "connectMetrics"
mbean = "kafka.connect:type=connect-metrics,client-id=*"
paths = ["connection-close-rate", "connection-close-total", "connection-creation-rate", "connection-creation-total", "network-io-rate", "network-io-total", "outgoing-byte-rate", "outgoing-byte-total", "request-rate", "request-total", "request-size-avg", "request-size-max", "incoming-byte-rate", "incoming-byte-total", "response-rate", "response-total", "select-rate", "select-total", "io-wait-time-ns-avg", "io-wait-ratio", "io-time-ns-avg", "io-ratio", "connection-count", "successful-authentication-rate", "successful-authentication-total", "failed-authentication-rate", "failed-authentication-total", "successful-reauthentication-rate", "successful-reauthentication-total", "reauthentication-latency-max", "reauthentication-latency-avg", "failed-reauthentication-rate", "failed-reauthentication-total", "successful-authentication-no-reauth-total"]
tag_keys = ["client-id"]
# https://kafka.apache.org/documentation/#common_node_monitoring
[[instances.metric]]
name = "connectNodeMetrics"
mbean = "kafka.connect:type=connect-node-metrics,client-id=*,node-id=*"
paths = ["outgoing-byte-rate", "outgoing-byte-total", "request-rate", "request-total", "request-size-avg", "request-size-max", "incoming-byte-rate", "incoming-byte-total", "request-latency-avg", "request-latency-max", "response-rate", "response-total"]
tag_keys = ["client-id", "node-id"]
[[instances.metric]]
name = "appInfo"
mbean = "kafka.connect:type=app-info,client-id=*"
paths = ["start-time-ms", "commit-id", "version"]
tag_keys = ["client-id"]
[[instances.metric]]
name = "connectCoordinatorMetrics"
mbean = "kafka.connect:type=connect-coordinator-metrics,client-id=*"
paths = ["join-time-max", "failed-rebalance-rate-per-hour", "rebalance-latency-total", "sync-time-avg", "join-rate", "sync-rate", "failed-rebalance-total", "rebalance-total", "last-heartbeat-seconds-ago", "heartbeat-rate", "join-time-avg", "sync-total", "rebalance-latency-max", "sync-time-max", "last-rebalance-seconds-ago", "rebalance-rate-per-hour", "assigned-connectors", "heartbeat-total", "assigned-tasks", "heartbeat-response-time-max", "rebalance-latency-avg", "join-total"]
tag_keys = ["client-id"]
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/kafka.toml
================================================
[[instances]]
metrics_name_prefix = "kafka_"
## If you intend to use "non_negative_derivative(1s)" with "*.count" fields, you don't need precalculated fields.
# fielddrop = [
# "*.EventType",
# "*.FifteenMinuteRate",
# "*.FiveMinuteRate",
# "*.MeanRate",
# "*.OneMinuteRate",
# "*.RateUnit",
# "*.LatencyUnit",
# "*.50thPercentile",
# "*.75thPercentile",
# "*.95thPercentile",
# "*.98thPercentile",
# "*.99thPercentile",
# "*.999thPercentile",
# "*.Min",
# "*.Mean",
# "*.Max",
# "*.StdDev"
# ]
urls = ["http://localhost:8080/jolokia"]
[[instances.metric]]
name = "controller"
mbean = "kafka.controller:name=*,type=*"
field_prefix = "$1."
[[instances.metric]]
name = "replica_manager"
mbean = "kafka.server:name=*,type=ReplicaManager"
field_prefix = "$1."
[[instances.metric]]
name = "purgatory"
mbean = "kafka.server:delayedOperation=*,name=*,type=DelayedOperationPurgatory"
field_prefix = "$1."
field_name = "$2"
[[instances.metric]]
name = "zookeeper"
mbean = "kafka.server:name=*,type=SessionExpireListener"
field_prefix = "$1."
[[instances.metric]]
name = "user"
mbean = "kafka.server:user=*,type=Request"
field_prefix = ""
tag_keys = ["user"]
[[instances.metric]]
name = "request"
mbean = "kafka.network:name=*,request=*,type=RequestMetrics"
field_prefix = "$1."
tag_keys = ["request"]
[[instances.metric]]
name = "topics"
mbean = "kafka.server:name=*,type=BrokerTopicMetrics"
field_prefix = "$1."
[[instances.metric]]
name = "topic"
mbean = "kafka.server:name=*,topic=*,type=BrokerTopicMetrics"
field_prefix = "$1."
tag_keys = ["topic"]
[[instances.metric]]
name = "partition"
mbean = "kafka.log:name=*,partition=*,topic=*,type=Log"
field_name = "$1"
tag_keys = ["topic", "partition"]
[[instances.metric]]
name = "partition"
mbean = "kafka.cluster:name=UnderReplicated,partition=*,topic=*,type=Partition"
field_name = "UnderReplicatedPartitions"
tag_keys = ["topic", "partition"]
## If you have multiple instances of Kafka on the server, use 'jolokia_agent_url' as identity of each instance
# [[processors.rename]]
# namepass = ["kafka_*"]
# order = 1
# [[processors.rename.replace]]
# tag = "jolokia_agent_url"
# dest = "instance"
#
# [[processors.regex]]
# namepass = ["kafka_*"]
# order = 2
# [[processors.regex.tags]]
# key = "instance"
# pattern = "^.+:8080/.+$"
# replacement = "0"
# [[processors.regex.tags]]
# key = "instance"
# pattern = "^.+:8081/.+$"
# replacement = "1"
# [[processors.regex.tags]]
# key = "instance"
# pattern = "^.+:8082/.+$"
# replacement = "2"
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/tomcat.toml
================================================
[[instances]]
urls = ["http://localhost:8080/jolokia"]
metrics_name_prefix = "tomcat_"
### JVM Generic
[[instances.metric]]
name = "OperatingSystem"
mbean = "java.lang:type=OperatingSystem"
paths = ["ProcessCpuLoad","SystemLoadAverage","SystemCpuLoad"]
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "jvm_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
tag_prefix = "pool_"
### TOMCAT
[[instances.metric]]
name = "GlobalRequestProcessor"
mbean = "Catalina:name=*,type=GlobalRequestProcessor"
paths = ["requestCount","bytesReceived","bytesSent","processingTime","errorCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "JspMonitor"
mbean = "Catalina:J2EEApplication=*,J2EEServer=*,WebModule=*,name=jsp,type=JspMonitor"
paths = ["jspReloadCount","jspCount","jspUnloadCount"]
tag_keys = ["J2EEApplication","J2EEServer","WebModule"]
[[instances.metric]]
name = "ThreadPool"
mbean = "Catalina:name=*,type=ThreadPool"
paths = ["maxThreads","currentThreadCount","currentThreadsBusy"]
tag_keys = ["name"]
[[instances.metric]]
name = "Servlet"
mbean = "Catalina:J2EEApplication=*,J2EEServer=*,WebModule=*,j2eeType=Servlet,name=*"
paths = ["processingTime","errorCount","requestCount"]
tag_keys = ["name","J2EEApplication","J2EEServer","WebModule"]
[[instances.metric]]
name = "Cache"
mbean = "Catalina:context=*,host=*,name=Cache,type=WebResourceRoot"
paths = ["hitCount","lookupCount"]
tag_keys = ["context","host"]
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/weblogic.toml
================================================
[[instances]]
urls = ["http://localhost:8080/jolokia"]
metrics_name_prefix = "weblogic_"
### JVM Generic
[[instances.metric]]
name = "OperatingSystem"
mbean = "java.lang:type=OperatingSystem"
paths = ["ProcessCpuLoad","SystemLoadAverage","SystemCpuLoad"]
[[instances.metric]]
name = "jvm_runtime"
mbean = "java.lang:type=Runtime"
paths = ["Uptime"]
[[instances.metric]]
name = "jvm_memory"
mbean = "java.lang:type=Memory"
paths = ["HeapMemoryUsage", "NonHeapMemoryUsage", "ObjectPendingFinalizationCount"]
[[instances.metric]]
name = "jvm_garbage_collector"
mbean = "java.lang:name=*,type=GarbageCollector"
paths = ["CollectionTime", "CollectionCount"]
tag_keys = ["name"]
[[instances.metric]]
name = "jvm_memory_pool"
mbean = "java.lang:name=*,type=MemoryPool"
paths = ["Usage", "PeakUsage", "CollectionUsage"]
tag_keys = ["name"]
tag_prefix = "pool_"
### WLS
[[instances.metric]]
name = "JTARuntime"
mbean = "com.bea:Name=JTARuntime,ServerRuntime=*,Type=JTARuntime"
paths = ["SecondsActiveTotalCount","TransactionRolledBackTotalCount","TransactionRolledBackSystemTotalCount","TransactionRolledBackAppTotalCount","TransactionRolledBackResourceTotalCount","TransactionHeuristicsTotalCount","TransactionAbandonedTotalCount","TransactionTotalCount","TransactionRolledBackTimeoutTotalCount","ActiveTransactionsTotalCount","TransactionCommittedTotalCount"]
tag_keys = ["ServerRuntime"]
tag_prefix = "wls_"
[[instances.metric]]
name = "ThreadPoolRuntime"
mbean = "com.bea:Name=ThreadPoolRuntime,ServerRuntime=*,Type=ThreadPoolRuntime"
paths = ["StuckThreadCount","CompletedRequestCount","ExecuteThreadTotalCount","ExecuteThreadIdleCount","StandbyThreadCount","Throughput","HoggingThreadCount","PendingUserRequestCount"]
tag_keys = ["ServerRuntime"]
tag_prefix = "wls_"
[[instances.metric]]
name = "JMSRuntime"
mbean = "com.bea:Name=*.jms,ServerRuntime=*,Type=JMSRuntime"
paths = ["ConnectionsCurrentCount","ConnectionsHighCount","ConnectionsTotalCount","JMSServersCurrentCount","JMSServersHighCount","JMSServersTotalCount"]
tag_keys = ["name","ServerRuntime"]
tag_prefix = "wls_"
================================================
FILE: integrations/Jolokia_Agent/collect/jolokia_agent/zookeeper.toml
================================================
[[instances]]
urls = ["http://localhost:8080/jolokia"]
metrics_name_prefix = "zk_"
[[instances.metric]]
name = "quorum"
mbean = "org.apache.ZooKeeperService:name0=*"
tag_keys = ["name0"]
[[instances.metric]]
name = "leader"
mbean = "org.apache.ZooKeeperService:name0=*,name1=*,name2=Leader"
tag_keys = ["name1"]
[[instances.metric]]
name = "follower"
mbean = "org.apache.ZooKeeperService:name0=*,name1=*,name2=Follower"
tag_keys = ["name1"]
================================================
FILE: integrations/Jolokia_Agent/markdown/README.md
================================================
# Jolokia Agent
forked from telegraf/inputs.jolokia2_agent
## 停用该插件
- 方法一:把 `input.jolokia_agent_misc` 目录改个别的名字,不用 `input.` 打头
- 方法二:xx.toml 中的配置留空
================================================
FILE: integrations/Kafka/alerts/kafka_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "kafka 数据有丢失风险-副本数小于3",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) \u003c 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka",
"type=categraf"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327567317000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "kafka 服务宕机",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kafka_broker_info{service=~\"kafka\"} \u003c 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"type=categraf",
"service=kafka"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327568065000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "kafka 消费能力不足-延迟超过5分钟",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max(kafka_consumer_lag_millis) by (topic, consumergroup) / 1000 \u003e 300",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka",
"type=categraf"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327568624000
}
]
================================================
FILE: integrations/Kafka/alerts/kafka_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Insufficient consumption ability - delay exceeds 5 minutes - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kafka_consumer_lag_millis / 1000 \u003e 300",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327569664000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
52
],
"cluster": "",
"name": "Risk of data loss - number of replicas less than 3 - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) \u003c 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327570285000
}
]
================================================
FILE: integrations/Kafka/collect/kafka/kafka.toml
================================================
# # collect interval
# interval = 15
############################################################################
# !!! uncomment [[instances]] to enable this plugin
[[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# append some labels to metrics
# cluster is a preferred tag with the cluster name. If none is provided, the first of kafka_uris will be used
labels = { cluster="kafka-cluster-01" }
# log level only for kafka exporter
log_level = "error"
# Address (host:port) of Kafka server.
# kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"]
kafka_uris = []
# Connect using SASL/PLAIN
# Default is false
# use_sasl = false
# Only set this to false if using a non-Kafka SASL proxy
# Default is true
# use_sasl_handshake = false
# SASL user name
# sasl_username = "username"
# SASL user password
# sasl_password = "password"
# The SASL SCRAM SHA algorithm sha256 or sha512 as mechanism
# sasl_mechanism = ""
# Connect using TLS
# use_tls = false
# The optional certificate authority file for TLS client authentication
# ca_file = ""
# The optional certificate file for TLS client authentication
# cert_file = ""
# The optional key file for TLS client authentication
# key_file = ""
# If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure
# insecure_skip_verify = true
# Kafka broker version
# Default is 2.0.0
# kafka_version = "2.0.0"
# if you need to use a group from zookeeper
# Default is false
# use_zookeeper_lag = false
# Address array (hosts) of zookeeper server.
# zookeeper_uris = []
# Metadata refresh interval
# Default is 1m
# metadata_refresh_interval = "1m"
# Whether to show the offset/lag for all consumer groups; if false, only connected consumer groups are shown
# Default is true
# offset_show_all = true
# If true, every scrape triggers its own Kafka operations; otherwise, scrapes share results. WARN: This should be disabled on large clusters
# Default is false
# allow_concurrency = false
# Maximum number of offsets to store in the interpolation table for a partition
# Default is 1000
# max_offsets = 1000
# How frequently should the interpolation table be pruned, in seconds.
# Default is 30
# prune_interval_seconds = 30
# Regex filter for topics to be monitored
# Default is ".*"
# topics_filter_regex = ".*"
# Regex filter for consumer groups to be monitored
# Default is ".*"
# groups_filter_regex = ".*"
# Whether to rename kafka_consumergroup_uncommitted_offsets to kafka_consumergroup_lag
# Default is false
# rename_uncommit_offset_to_lag = false
# Whether to disable the lag rate calculation
# Default is false
# disable_calculate_lag_rate = false
================================================
FILE: integrations/Kafka/dashboards/kafka_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kafka By Categraf",
"ident": "",
"tags": "Kafka Prometheus Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "文档",
"url": "https://github.com/ccfos/nightingale/tree/main/integrations/kafka/markdown/"
}
],
"panels": [
{
"collapsed": true,
"id": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"layout": {
"h": 1,
"i": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "overview",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e2c1d271-ec43-4821-aa19-451e856af755",
"layout": {
"h": 3,
"i": "e2c1d271-ec43-4821-aa19-451e856af755",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "brokers",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "kafka_brokers{cluster=\"$cluster\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98",
"layout": {
"h": 3,
"i": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "topics",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "count(count by (topic) (kafka_topic_partitions{cluster=\"$cluster\"}))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e228d857-746b-41b6-8d2d-0152453c46f4",
"layout": {
"h": 3,
"i": "e228d857-746b-41b6-8d2d-0152453c46f4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "partitions",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kafka_topic_partitions{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "85438099-8d6b-4817-b9b9-1d0ed36029cd",
"layout": {
"h": 3,
"i": "85438099-8d6b-4817-b9b9-1d0ed36029cd",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "Replicas",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kafka_topic_partition_replicas{cluster=\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"layout": {
"h": 1,
"i": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"isResizable": false,
"w": 24,
"x": 0,
"y": 4
},
"name": "throughput",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c2ec4036-3081-45cc-b672-024c6df93833",
"layout": {
"h": 7,
"i": "c2ec4036-3081-45cc-b672-024c6df93833",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"name": "Messages produced per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(kafka_topic_partition_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7ad651a6-c12c-4d46-8d01-749fa776faef",
"layout": {
"h": 7,
"i": "7ad651a6-c12c-4d46-8d01-749fa776faef",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"name": "Messages consumed per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(kafka_consumergroup_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a",
"layout": {
"h": 7,
"i": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a",
"isResizable": true,
"w": 8,
"x": 16,
"y": 5
},
"name": "Latency by Consumer Group",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "humantimeMilliseconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(kafka_consumer_lag_millis{cluster=\"$cluster\"}) by (consumergroup, topic)",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "20166830-7f85-4665-8f39-bf904267af29",
"layout": {
"h": 1,
"i": "20166830-7f85-4665-8f39-bf904267af29",
"isResizable": false,
"w": 24,
"x": 0,
"y": 18
},
"name": "patition/replicate",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b",
"layout": {
"h": 7,
"i": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"name": "Partitions per Topic",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "kafka_topic_partitions{cluster=\"$cluster\"}",
"legend": "{{topic}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker\\brokers that are not replicating some of the partitions.",
"id": "dd615767-dda7-4da6-b37f-0d484553aac6",
"layout": {
"h": 7,
"i": "dd615767-dda7-4da6-b37f-0d484553aac6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"name": "Partitions Under Replicated",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "kafka_topic_partition_under_replicated_partition{cluster=\"$cluster\"}",
"legend": "{{topic}}-{{partition}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kafka_brokers, cluster)",
"name": "cluster",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327571507000
}
================================================
FILE: integrations/Kafka/dashboards/kafka_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kafka - exporter",
"ident": "",
"tags": "Kafka Prometheus ",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "a3ac9979-6e3a-42ae-9d52-ebddb8960dc4",
"layout": {
"h": 1,
"i": "a3ac9979-6e3a-42ae-9d52-ebddb8960dc4",
"w": 24,
"x": 0,
"y": 0
},
"name": "overview",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ed68dc7b-4f01-4aef-ab10-20158aadfab7",
"layout": {
"h": 3,
"i": "ed68dc7b-4f01-4aef-ab10-20158aadfab7",
"w": 8,
"x": 8,
"y": 1
},
"name": "topics",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "count(count by (topic) (kafka_topic_partitions))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "3678c9d7-cb0a-4114-a0cd-7a06b976f6b8",
"layout": {
"h": 3,
"i": "3678c9d7-cb0a-4114-a0cd-7a06b976f6b8",
"w": 8,
"x": 0,
"y": 1
},
"name": "brokers",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "kafka_brokers",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8adb0df0-13bc-452a-ac63-209ae3748d77",
"layout": {
"h": 3,
"i": "8adb0df0-13bc-452a-ac63-209ae3748d77",
"w": 8,
"x": 16,
"y": 1
},
"name": "partitions",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kafka_topic_partitions)",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "7071dc1f-9410-4899-9c43-206a11bfaab2",
"layout": {
"h": 1,
"i": "7071dc1f-9410-4899-9c43-206a11bfaab2",
"w": 24,
"x": 0,
"y": 4
},
"name": "throughput",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b68719ad-ba54-4326-a956-43acaef10e2e",
"layout": {
"h": 7,
"i": "b68719ad-ba54-4326-a956-43acaef10e2e",
"w": 12,
"x": 0,
"y": 5
},
"name": "Message in per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(kafka_topic_partition_current_offset{instance=\"$instance\"}[1m])) by (topic)"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "bfd08ec7-a539-4c5e-8499-4e5c437b97d7",
"layout": {
"h": 7,
"i": "bfd08ec7-a539-4c5e-8499-4e5c437b97d7",
"w": 12,
"x": 0,
"y": 7
},
"name": "Latency by Consumer Group",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "humantimeMilliseconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(kafka_consumer_lag_millis{instance=\"$instance\"}) by (consumergroup, topic) ",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9a42427a-0e01-432e-838d-a6baca6c42b2",
"layout": {
"h": 7,
"i": "9a42427a-0e01-432e-838d-a6baca6c42b2",
"w": 12,
"x": 12,
"y": 5
},
"name": "Message consume per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(kafka_consumergroup_current_offset{instance=\"$instance\"}[1m])) by (topic)"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "7324f196-467b-4590-ae47-d56be683a0c3",
"layout": {
"h": 7,
"i": "7324f196-467b-4590-ae47-d56be683a0c3",
"w": 12,
"x": 12,
"y": 7
},
"name": "Lag by Consumer Group",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(kafka_topic_partition_current_offset{instance=\"$instance\"}) by (topic) - sum(kafka_consumergroup_current_offset{instance=\"$instance\"}) by (topic) ",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "bd4d2d51-7b4d-4523-b586-0bf2b248d4d4",
"layout": {
"h": 1,
"i": "bd4d2d51-7b4d-4523-b586-0bf2b248d4d4",
"w": 24,
"x": 0,
"y": 14
},
"name": "patition/replicate",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "04d1f6cc-40ec-4584-be17-a4d10cd5b6e9",
"layout": {
"h": 7,
"i": "04d1f6cc-40ec-4584-be17-a4d10cd5b6e9",
"w": 12,
"x": 0,
"y": 15
},
"name": "Partitions per Topic",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "kafka_topic_partitions{instance=\"$instance\"}",
"legend": "{{topic}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker\\brokers that are not replicating some of the partitions.",
"id": "5b589c1c-fd35-4ce5-8b24-c0e05d307345",
"layout": {
"h": 7,
"i": "5b589c1c-fd35-4ce5-8b24-c0e05d307345",
"w": 12,
"x": 12,
"y": 15
},
"name": "Under Replicated",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "kafka_topic_partition_under_replicated_partition",
"legend": "{{topic}}-{{partition}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(kafka_brokers, instance)",
"name": "instance",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(kafka_brokers, job)",
"name": "job",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327573482000
}
================================================
FILE: integrations/Kafka/markdown/README.md
================================================
# kafka plugin
Kafka 的核心指标,其实都是通过 JMX 的方式暴露的。对于 JMX 暴露的指标,使用 jolokia 或者使用 jmx_exporter 那个 jar 包来采集即可,不需要本插件。
本插件主要是采集的消费者延迟数据,这个数据无法通过 Kafka 服务端的 JMX 拿到。
本插件 fork 自 [https://github.com/davidmparrott/kafka_exporter](https://github.com/davidmparrott/kafka_exporter)(以下简称 davidmparrott 版本),davidmparrott 版本 fork 自 [https://github.com/danielqsj/kafka_exporter](https://github.com/danielqsj/kafka_exporter)(以下简称 danielqsj 版本)。
danielqsj 版本作为原始版本, github 版本也相对活跃, prometheus 生态使用较多。davidmparrott 版本与 danielqsj 版本相比, 有以下 metric 名字不同:
| davidmparrott 版本 | danielqsj 版本 |
| ---- | ---- |
| kafka_consumergroup_uncommitted_offsets | kafka_consumergroup_lag |
| kafka_consumergroup_uncommitted_offsets_sum | kafka_consumergroup_lag_sum |
| kafka_consumergroup_uncommitted_offsets_zookeeper | kafka_consumergroup_lag_zookeeper |
如果想使用 danielqsj 版本的 metric, 在 `[[instances]]` 中进行如下配置:
```toml
rename_uncommit_offset_to_lag = true
```
davidmparrott 版本比 danielqsj 版本多了以下 metric,这些指标是对延迟速率做了预估计算:
- kafka_consumer_lag_millis
- kafka_consumer_lag_interpolation
- kafka_consumer_lag_extrapolation
为什么要计算速率?因为 lag 很大,但是消费很快,是不会积压的,而 lag 很小,消费很慢,仍然会积压,所以,通过 lag 大小是没法判断积压风险的。通过计算历史消费速率,来判断积压风险会更为合理。要计算这个速率,需要占用较多内存,可以通过如下配置关闭这个计算逻辑:
```toml
disable_calculate_lag_rate = true
```
## 采集配置
categraf 配置文件:`conf/input.kafka/kafka.toml`。配置样例如下:
```toml
[[instances]]
log_level = "error"
kafka_uris = ["192.168.0.250:9092"]
labels = { cluster="kafka-cluster-01", service="kafka" }
```
完整的带有注释的配置如下:
```toml
[[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# append some labels to metrics
# cluster is a preferred tag with the cluster name. If none is provided, the first of kafka_uris will be used
labels = { cluster="kafka-cluster-01" }
# log level only for kafka exporter
log_level = "error"
# Address (host:port) of Kafka server.
# kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"]
kafka_uris = []
# Connect using SASL/PLAIN
# Default is false
# use_sasl = false
# Only set this to false if using a non-Kafka SASL proxy
# Default is true
# use_sasl_handshake = false
# SASL user name
# sasl_username = "username"
# SASL user password
# sasl_password = "password"
# The SASL SCRAM SHA algorithm sha256 or sha512 as mechanism
# sasl_mechanism = ""
# Connect using TLS
# use_tls = false
# The optional certificate authority file for TLS client authentication
# ca_file = ""
# The optional certificate file for TLS client authentication
# cert_file = ""
# The optional key file for TLS client authentication
# key_file = ""
# If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure
# insecure_skip_verify = true
# Kafka broker version
# Default is 2.0.0
# kafka_version = "2.0.0"
# if you need to use a group from zookeeper
# Default is false
# use_zookeeper_lag = false
# Address array (hosts) of zookeeper server.
# zookeeper_uris = []
# Metadata refresh interval
# Default is 1m
# metadata_refresh_interval = "1m"
# Whether to show the offset/lag for all consumer groups; if false, only connected consumer groups are shown
# Default is true
# offset_show_all = true
# If true, every scrape triggers its own Kafka operations; otherwise, scrapes share results. WARN: This should be disabled on large clusters
# Default is false
# allow_concurrency = false
# Maximum number of offsets to store in the interpolation table for a partition
# Default is 1000
# max_offsets = 1000
# How frequently should the interpolation table be pruned, in seconds.
# Default is 30
# prune_interval_seconds = 30
# Regex filter for topics to be monitored
# Default is ".*"
# topics_filter_regex = ".*"
# Regex filter for consumer groups to be monitored
# Default is ".*"
# groups_filter_regex = ".*"
# Whether to rename kafka_consumergroup_uncommitted_offsets to kafka_consumergroup_lag
# Default is false
# rename_uncommit_offset_to_lag = false
# Whether to disable the lag rate calculation
# Default is false
# disable_calculate_lag_rate = false
```
================================================
FILE: integrations/Kafka/metrics/categraf-base.json
================================================
[
{
"id": 0,
"uuid": 1717556327574937000,
"collector": "Categraf",
"typ": "Kafka",
"name": "Broker 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "kafka_brokers",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Broker 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Brokers",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327578367000,
"collector": "Categraf",
"typ": "Kafka",
"name": "Partition 副本不同步的数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "kafka_topic_partition_under_replicated_partition",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Partition 副本不同步的数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of out-of-sync copies of Partition",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327581728000,
"collector": "Categraf",
"typ": "Kafka",
"name": "Partition 副本数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "kafka_topic_partition_replicas",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Partition 副本数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Partition copies",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327584595000,
"collector": "Categraf",
"typ": "Kafka",
"name": "各个 Topic 每秒消费消息量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(kafka_consumergroup_current_offset[3m])) without (partition)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "各个 Topic 每秒消费消息量",
"note": ""
},
{
"lang": "en_US",
"name": "Each Topic consumes messages per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327590335000,
"collector": "Categraf",
"typ": "Kafka",
"name": "各个 Topic 每秒生产消息量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(kafka_topic_partition_current_offset[3m])) without (partition)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "各个 Topic 每秒生产消息量",
"note": ""
},
{
"lang": "en_US",
"name": "Production message volume per second per Topic",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327592951000,
"collector": "Categraf",
"typ": "Kafka",
"name": "各个 Topic 的 Partition 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "kafka_topic_partitions",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "各个 Topic 的 Partition 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of Partitions for each Topic",
"note": ""
}
]
}
]
================================================
FILE: integrations/Kubernetes/alerts/apiserver.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeClientCertificateExpiration-S2",
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} \u003e 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) \u003c 604800\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327602560000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeClientCertificateExpiration-S1",
"note": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} \u003e 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) \u003c 86400\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327603535000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "AggregatedAPIErrors",
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) \u003e 2\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327604347000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "AggregatedAPIDown",
"note": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 \u003c 85\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327605135000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeAPIDown",
"note": "KubeAPI has disappeared from Prometheus target discovery.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"apiserver\"} == 1)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327606255000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeAPIErrorBudgetBurn-S1-120秒",
"note": "The API server is burning too much error budget.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "sum(apiserver_request:burnrate1h) \u003e (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) \u003e (14.40 * 0.01000)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1h",
"short=5m"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327608028000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeAPIErrorBudgetBurn-S1-900秒",
"note": "The API server is burning too much error budget.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "sum(apiserver_request:burnrate6h) \u003e (6.00 * 0.01000)\nand\nsum(apiserver_request:burnrate30m) \u003e (6.00 * 0.01000)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=6h",
"short=30m"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327608676000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeAPIErrorBudgetBurn-S2-3600秒",
"note": "The API server is burning too much error budget.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "sum(apiserver_request:burnrate1d) \u003e (3.00 * 0.01000)\nand\nsum(apiserver_request:burnrate2h) \u003e (3.00 * 0.01000)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1d",
"short=2h"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327609366000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "KubeAPIErrorBudgetBurn-S2-10800秒",
"note": "The API server is burning too much error budget.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 10800,
"prom_ql": "sum(apiserver_request:burnrate3d) \u003e (1.00 * 0.01000)\nand\nsum(apiserver_request:burnrate6h) \u003e (1.00 * 0.01000)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=3d",
"short=6h"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327609980000
}
]
================================================
FILE: integrations/Kubernetes/alerts/kube-controller-plane.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CPUThrottlingHigh",
"note": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n \u003e ( 25 / 100 )\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n \u003e ( 25 / 100 )\n",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327613181000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeAggregatedAPIDown",
"note": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job=\"apiserver\"}[10m]))) * 100 \u003c 85\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job=\"apiserver\"}[10m]))) * 100 \u003c 85\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327613934000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeAggregatedAPIErrors",
"note": "Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job=\"apiserver\"}[10m])) \u003e 4\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job=\"apiserver\"}[10m])) \u003e 4\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327614501000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeAPIDown",
"note": "KubeAPI has disappeared from Prometheus target discovery.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"apiserver\"} == 1)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "absent(up{job=\"apiserver\"} == 1)\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327615035000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeAPIErrorBudgetBurn",
"note": "The API server is burning too much error budget.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "sum(apiserver_request:burnrate1h) \u003e (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) \u003e (14.40 * 0.01000)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(apiserver_request:burnrate1h) \u003e (14.40 * 0.01000)\nand\nsum(apiserver_request:burnrate5m) \u003e (14.40 * 0.01000)\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"long=1h",
"short=5m"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327615643000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeAPITerminatedRequests",
"note": "The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / ( sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) ) \u003e 0.20\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / ( sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) ) \u003e 0.20\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327616331000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeClientCertificateExpiration",
"note": "A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} \u003e 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) \u003c 604800\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} \u003e 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) \u003c 604800\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327616904000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeClientErrors",
"note": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(sum(rate(rest_client_requests_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (cluster, instance, job, namespace)\n /\nsum(rate(rest_client_requests_total{job=\"apiserver\"}[5m])) by (cluster, instance, job, namespace))\n\u003e 0.01\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum(rate(rest_client_requests_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (cluster, instance, job, namespace)\n /\nsum(rate(rest_client_requests_total{job=\"apiserver\"}[5m])) by (cluster, instance, job, namespace))\n\u003e 0.01\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327617388000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeContainerWaiting",
"note": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327618152000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeControllerManagerDown",
"note": "KubeControllerManager has disappeared from Prometheus target discovery.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"kube-controller-manager\"} == 1)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "absent(up{job=\"kube-controller-manager\"} == 1)\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327618705000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeCPUOvercommit",
"note": "Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{job=\"kube-state-metrics\",}) by (cluster) - (sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) \u003e 0\nand\n(sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{job=\"kube-state-metrics\",}) by (cluster) - (sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) \u003e 0\nand\n(sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster) - max(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"cpu\"}) by (cluster)) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327619215000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeCPUQuotaOvercommit",
"note": "Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})) by (cluster)\n /\nsum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"}) by (cluster)\n \u003e 1.5\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})) by (cluster)\n /\nsum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"}) by (cluster)\n \u003e 1.5\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327619730000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeDaemonSetMisScheduled",
"note": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327620239000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeDaemonSetNotScheduled",
"note": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327620796000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeDaemonSetRolloutStuck",
"note": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n (\n kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"}\n !=\n 0\n ) or (\n kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_status_updated_number_scheduled{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327621274000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeDeploymentGenerationMismatch",
"note": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327621811000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeDeploymentReplicasMismatch",
"note": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n \u003e\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n \u003e\n kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n) and (\n changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327622401000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeDeploymentRolloutStuck",
"note": "Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_deployment_status_condition{condition=\"Progressing\", status=\"false\",job=\"kube-state-metrics\"}\n!= 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_deployment_status_condition{condition=\"Progressing\", status=\"false\",job=\"kube-state-metrics\"}\n!= 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327622995000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeHpaMaxedOut",
"note": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327623537000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeHpaReplicasMismatch",
"note": "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n \u003e\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n \u003c\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n \u003e\nkube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}\n \u003c\nkube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327624174000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeJobFailed",
"note": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_job_failed{job=\"kube-state-metrics\"} \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_job_failed{job=\"kube-state-metrics\"} \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327624761000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeJobNotCompleted",
"note": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ \"43200\" | humanizeDuration }} to complete.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"}\n and\nkube_job_status_active{job=\"kube-state-metrics\"} \u003e 0) \u003e 43200\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"}\n and\nkube_job_status_active{job=\"kube-state-metrics\"} \u003e 0) \u003e 43200\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327625347000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletClientCertificateExpiration",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds \u003c 604800\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds \u003c 604800\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327625941000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletClientCertificateRenewalErrors",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327626429000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletDown",
"note": "Kubelet has disappeared from Prometheus target discovery.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"kubelet\", metrics_path=\"/metrics\"} == 1)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "absent(up{job=\"kubelet\", metrics_path=\"/metrics\"} == 1)\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327626961000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletPlegDurationHigh",
"note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} \u003e= 10\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} \u003e= 10\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327627451000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletPodStartUpLatencyHigh",
"note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 60\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 60\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327628011000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletServerCertificateExpiration",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds \u003c 604800\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds \u003c 604800\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327628498000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletServerCertificateRenewalErrors",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327629074000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeletTooManyPods",
"note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "count by(cluster, node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) \u003e 0.95\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "count by(cluster, node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(cluster, node) (\n kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1\n) \u003e 0.95\n",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327629530000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeMemoryOvercommit",
"note": "Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) \u003e 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) \u003e 0\nand\n(sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster) - max(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327630107000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeMemoryQuotaOvercommit",
"note": "Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) by (cluster)\n /\nsum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)\n \u003e 1.5\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) by (cluster)\n /\nsum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) by (cluster)\n \u003e 1.5\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327630617000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeNodeNotReady",
"note": "{{ $labels.node }} has been unready for more than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327631263000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeNodeReadinessFlapping",
"note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "sum(changes(kube_node_status_condition{job=\"kube-state-metrics\",status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) \u003e 2\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(changes(kube_node_status_condition{job=\"kube-state-metrics\",status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) \u003e 2\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327631823000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeNodeUnreachable",
"note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327632304000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubePersistentVolumeErrors",
"note": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} \u003e 0\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327632772000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubePersistentVolumeFillingUp",
"note": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) \u003c 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) \u003c 0.03\nand\nkubelet_volume_stats_used_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327633208000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubePersistentVolumeInodesFillingUp",
"note": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "(\n kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) \u003c 0.03\nand\nkubelet_volume_stats_inodes_used{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n kubelet_volume_stats_inodes_free{job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_inodes{job=\"kubelet\", metrics_path=\"/metrics\"}\n) \u003c 0.03\nand\nkubelet_volume_stats_inodes_used{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 0\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1\nunless on(namespace, persistentvolumeclaim)\nkube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327633654000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubePodCrashLooping",
"note": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: \"CrashLoopBackOff\").",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "max_over_time(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", job=\"kube-state-metrics\"}[5m]) \u003e= 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max_over_time(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", job=\"kube-state-metrics\"}[5m]) \u003e= 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327634127000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubePodNotReady",
"note": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "sum by (namespace, pod, cluster) (\n max by(namespace, pod, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown|Failed\"}\n ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (\n 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum by (namespace, pod, cluster) (\n max by(namespace, pod, cluster) (\n kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown|Failed\"}\n ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (\n 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"})\n )\n) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327634578000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeQuotaAlmostFull",
"note": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} \u003e 0)\n \u003e 0.9 \u003c 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} \u003e 0)\n \u003e 0.9 \u003c 1\n",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327634994000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeQuotaExceeded",
"note": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} \u003e 0)\n \u003e 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} \u003e 0)\n \u003e 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327635438000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeQuotaFullyUsed",
"note": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} \u003e 0)\n == 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} \u003e 0)\n == 1\n",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327635894000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeSchedulerDown",
"note": "KubeScheduler has disappeared from Prometheus target discovery.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"kube-scheduler\"} == 1)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "absent(up{job=\"kube-scheduler\"} == 1)\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327636372000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeStatefulSetGenerationMismatch",
"note": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match. This indicates that the StatefulSet has failed but has not been rolled back.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327636798000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeStatefulSetReplicasMismatch",
"note": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327637207000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeStatefulSetUpdateNotRolledOut",
"note": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(\n max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327637666000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "KubeVersionMismatch",
"note": "There are {{ $value }} different semantic versions of Kubernetes components running.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) \u003e 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) \u003e 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327638202000
}
]
================================================
FILE: integrations/Kubernetes/alerts/kubelet.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "Node状态异常",
"note": "{{ $labels.node }} has been unready for more than 15 minutes.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327639942000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "Node不可达",
"note": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327640501000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "Node运行太多Pod",
"note": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "count by(node) (\n (kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})\n)\n/\nmax by(node) (\n kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1\n) \u003e 0.95\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327641040000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "Node状态抖动",
"note": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (node) \u003e 2\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327641505000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "PLEG耗时高",
"note": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} \u003e= 10\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327642077000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "Pod启动耗时高",
"note": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\", metrics_path=\"/metrics\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\", metrics_path=\"/metrics\"} \u003e 60\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327642559000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "客户端证书过期-S2",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds \u003c 604800\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327643034000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "客户端证书过期-S1",
"note": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_client_ttl_seconds \u003c 86400\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327643517000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "服务端证书过期-S2",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds \u003c 604800\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327643966000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "服务端证书过期-S1",
"note": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "kubelet_certificate_manager_server_ttl_seconds \u003c 86400\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327644422000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "客户端证书续签错误",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) \u003e 0\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327644887000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "服务端证书续签错误",
"note": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "increase(kubelet_server_expiration_renew_errors[5m]) \u003e 0\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327645326000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "kubelet故障",
"note": "Kubelet has disappeared from Prometheus target discovery.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 900,
"prom_ql": "absent(up{job=\"kubelet\"} == 1)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327645784000
}
]
================================================
FILE: integrations/Kubernetes/alerts/node-exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机内存不足",
"note": "节点内存不足 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327647038000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机内存有压力",
"note": "节点内存压力大 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "rate(node_vmstat_pgmajfault[1m]) \u003e 1000",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327647631000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机入口网络吞吐量异常",
"note": "主机异常网络吞吐量 入 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 \u003e 100",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327648186000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机出口网络吞吐量异常",
"note": "主机异常网络吞吐量 出 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 \u003e 100",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327648692000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机磁盘读取速率异常",
"note": "主机异常磁盘读取率 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 \u003e 50",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327649279000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机磁盘写入速率异常",
"note": "主机异常磁盘写入率 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 \u003e 50",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327649827000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机磁盘空间不足",
"note": "主机磁盘空间不足 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327650331000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机磁盘将在 24 小时内填满",
"note": "主机磁盘将在 24 小时内填满 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~\"tmpfs\"}[1h], 24 * 3600) \u003c 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327650785000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "文件系统inode使用率高",
"note": "主机 inode 不足 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "node_filesystem_files_free{mountpoint =\"/rootfs\"} / node_filesystem_files{mountpoint=\"/rootfs\"} * 100 \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint=\"/rootfs\"} == 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327651197000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "文件系统inode将在24小时内填满",
"note": "主机 inode 将在 24 小时内填满 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "node_filesystem_files_free{mountpoint =\"/rootfs\"} / node_filesystem_files{mountpoint=\"/rootfs\"} * 100 \u003c 10 and predict_linear(node_filesystem_files_free{mountpoint=\"/rootfs\"}[1h], 24 * 3600) \u003c 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint=\"/rootfs\"} == 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327651641000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机磁盘读取延迟异常",
"note": "主机异常磁盘读取延迟 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) \u003e 0.1 and rate(node_disk_reads_completed_total[1m]) \u003e 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327652097000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机磁盘写入延迟异常",
"note": "主机异常磁盘写入延迟 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) \u003e 0.1 and rate(node_disk_writes_completed_total[1m]) \u003e 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327652522000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点cpu负载高",
"note": "主机 CPU 负载高 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100) \u003e 80",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327652932000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "有其他云主机窃取cpu",
"note": "主机 CPU 被其他云主机窃取 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "avg by(instance) (rate(node_cpu_seconds_total{mode=\"steal\"}[5m])) * 100 \u003e 10",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327653518000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机上下文切换异常",
"note": "主机上下文切换异常 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode=\"idle\"})) \u003e 1000",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327653948000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机交换内存快满了",
"note": "主机交换内存使用率超过 80% (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 \u003e 80",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327654459000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机系统服务崩溃",
"note": "主机 systemd 服务崩溃 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "node_systemd_unit_state{state=\"failed\"} == 1",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327654942000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机内核版本偏差",
"note": "主机内核版本偏差 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 21600,
"prom_ql": "count(sum(label_replace(node_uname_info, \"kernel\", \"$1\", \"release\", \"([0-9]+.[0-9]+.[0-9]+).*\")) by (kernel)) \u003e 1",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327655373000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "检测到 HostOomKill",
"note": "检测到主机 OOM 终止 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "increase(node_vmstat_oom_kill[1m]) \u003e 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327655821000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机网络接收错误",
"note": "主机网络接收错误 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) \u003e 0.01",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327656340000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机网络出口错误",
"note": "主机网络传输错误 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) \u003e 0.01",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327656756000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机网络接口饱和",
"note": "主机网络接口饱和 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "(rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8 \u003c 10000",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327657159000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "conntrack数量接近极限",
"note": "主机 conntrack 连接数接近上限 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327657626000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机时钟偏差",
"note": "主机时钟偏差 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "(node_timex_offset_seconds \u003e 0.05 and deriv(node_timex_offset_seconds[5m]) \u003e= 0) or (node_timex_offset_seconds \u003c -0.05 and deriv(node_timex_offset_seconds[5m]) \u003c= 0)",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327658114000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "主机时钟不同步",
"note": "主机时钟不同步 (instance {{ $labels.instance }})",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds \u003e= 16",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327658621000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点文件系统空间填满-S2",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) \u003c 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327659253000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点文件系统空间填满-S1",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 15\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) \u003c 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327659712000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点磁盘快满了-S2-space-5",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327660233000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点磁盘快满了-S1-space-3",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327660700000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点磁盘快满了-S2-inodes-40",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 24*60*60) \u003c 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327661181000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点磁盘快满了-S1-inodes-20",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"}[6h], 4*60*60) \u003c 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327661660000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点磁盘快满了-S2-inodes-5",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327662110000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点磁盘快满了-S1-inodes-3",
"note": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": null,
"disabled": 0,
"prom_for_duration": 3600,
"prom_ql": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\"} * 100 \u003c 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\"} == 0\n)\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327662492000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "使用的节点Conntrack条目数量高",
"note": "{{ $value | humanizePercentage }} of conntrack entries are used.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) \u003e 0.75\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327662954000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点文本文件收集器抓取错误",
"note": "Node Exporter text file collector failed to scrape.",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "node_textfile_scrape_error{job=\"node-exporter\"} == 1\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327663328000
},
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "节点网络接口抖动",
"note": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}\"",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "changes(node_network_up{job=\"node-exporter\",device!~\"veth.+\"}[2m]) \u003e 2\n",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327663731000
}
]
================================================
FILE: integrations/Kubernetes/alerts/prometheus-operator.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ConfigReloaderSidecarErrors",
"note": "Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.\nAs a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327664922000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorListErrors",
"note": "Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) \u003e 0.4\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[10m]))) \u003e 0.4\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327665326000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorNodeLookupErrors",
"note": "Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) \u003e 0.1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) \u003e 0.1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327665709000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorNotReady",
"note": "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) == 0)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) == 0)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327666207000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorReconcileErrors",
"note": "{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "(sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) \u003e 0.1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) \u003e 0.1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327666637000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorRejectedResources",
"note": "Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf \"%0.0f\" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "min_over_time(prometheus_operator_managed_resources{state=\"rejected\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "min_over_time(prometheus_operator_managed_resources{state=\"rejected\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327667061000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorSyncFailed",
"note": "Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "min_over_time(prometheus_operator_syncs{status=\"failed\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "min_over_time(prometheus_operator_syncs{status=\"failed\",job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327667462000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOperatorWatchErrors",
"note": "Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) \u003e 0.4\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator\",namespace=\"monitoring\"}[5m]))) \u003e 0.4\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327667840000
}
]
================================================
FILE: integrations/Kubernetes/alerts/prometheus.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ConfigReloaderSidecarErrors",
"note": "Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.\nAs a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max_over_time(reloader_last_reload_successful{namespace=~\".+\"}[5m]) == 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327669056000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusBadConfig",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_config_last_reload_successful{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) == 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_config_last_reload_successful{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) == 0\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327669467000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusDuplicateTimestamps",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327669963000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusErrorSendingAlertsToAnyAlertmanager",
"note": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n\u003e 3\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\",alertmanager!~``}[5m])\n)\n* 100\n\u003e 3\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327670315000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusErrorSendingAlertsToSomeAlertmanagers",
"note": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n\u003e 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n* 100\n\u003e 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327670910000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusHighQueryLoad",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "avg_over_time(prometheus_engine_queries{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0.8\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(prometheus_engine_queries{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0.8\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327671338000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusLabelLimitHit",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327671805000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusMissingRuleEvaluations",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327672340000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusNotConnectedToAlertmanagers",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003c 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003c 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327672820000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusNotificationQueueRunningFull",
"note": "Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n\u003e\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m], 60 * 30)\n\u003e\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327673317000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusNotIngestingSamples",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003c= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) \u003e 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) \u003e 0\n )\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003c= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-k8s\",namespace=\"monitoring\"}) \u003e 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-k8s\",namespace=\"monitoring\"}) \u003e 0\n )\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327673794000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusOutOfOrderTimestamps",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with timestamps arriving out of order.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327674275000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusRemoteStorageFailures",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n\u003e 1\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n/\n (\n (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n +\n (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]))\n )\n)\n* 100\n\u003e 1\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327674647000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusRemoteWriteBehind",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n\u003e 120\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n\u003e 120\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327675106000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusRemoteWriteDesiredShards",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus-k8s\",namespace=\"monitoring\"}` $labels.instance | query | first | value }}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n\u003e\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n\u003e\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m])\n)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327675490000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusRuleFailures",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf \"%.0f\" $value }} rules in the last 5m.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327675864000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusScrapeBodySizeLimitHit",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf \"%.0f\" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327676304000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusScrapeSampleLimitHit",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf \"%.0f\" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_target_scrapes_exceeded_sample_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327676668000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusSDRefreshFailure",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to refresh SD with mechanism {{$labels.mechanism}}.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1200,
"prom_ql": "increase(prometheus_sd_refresh_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[10m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_sd_refresh_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[10m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327677046000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusTargetLimitHit",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 900,
"prom_ql": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[5m]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327677488000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusTargetSyncFailure",
"note": "{{ printf \"%.0f\" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "increase(prometheus_target_sync_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[30m]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_target_sync_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[30m]) \u003e 0\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327677914000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusTSDBCompactionsFailing",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 14400,
"prom_ql": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327678338000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PrometheusTSDBReloadsFailing",
"note": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 14400,
"prom_ql": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) \u003e 0\n",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-k8s\",namespace=\"monitoring\"}[3h]) \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327678713000
}
]
================================================
FILE: integrations/Kubernetes/dashboards/APIServer.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kubernetes / API Server",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的实例健康状态,0表示down,1表示up",
"id": "98f46bc1-c078-40f2-915c-f0836957bf2f",
"layout": {
"h": 8,
"i": "98f46bc1-c078-40f2-915c-f0836957bf2f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"links": [],
"name": "API Server - Health Status",
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"options": {
"0": {
"text": "DOWN"
},
"1": {
"text": "UP"
}
},
"type": "value"
},
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f80202",
"text": "DOWN"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "up{job=\"apiserver\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"group",
"version",
"resource",
"removed_release"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前版本apiserver使用,未来版本中要移除的资源",
"id": "73beb13a-bd10-4a68-bb9e-5b9ab63da154",
"layout": {
"h": 8,
"i": "73beb13a-bd10-4a68-bb9e-5b9ab63da154",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"name": "Deprecated Kubernetes Resources",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "apiserver_requested_deprecated_apis{job=\"apiserver\"}",
"legend": "",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "按照返回码分类统计apiserver请求数",
"id": "1cfa42b1-9dcf-471c-90ff-8ffe656d4b11",
"layout": {
"h": 8,
"i": "1cfa42b1-9dcf-471c-90ff-8ffe656d4b11",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"links": [],
"name": "API Server - HTTP Requests by code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (instance,code) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }} {{ code }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "按照请求动作分类统计apiserver的请求数",
"id": "94def0cb-0b86-42f7-a4b2-dde714bbb918",
"layout": {
"h": 8,
"i": "94def0cb-0b86-42f7-a4b2-dde714bbb918",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"links": [],
"name": "API Server - HTTP Requests by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by (instance,verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }} {{ verb }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前并发请求apiserver的数量",
"id": "fb6266a3-3da0-4310-bfe8-c64a53db5db3",
"layout": {
"h": 8,
"i": "fb6266a3-3da0-4310-bfe8-c64a53db5db3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 16
},
"links": [],
"name": "API Server - Current Inflight Requests by kind",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "apiserver_current_inflight_requests{job=\"apiserver\"}",
"legend": "{{ instance }} {{ request_kind }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的响应延迟,按请求动作分类统计",
"id": "045dca2d-d69b-47a7-b25e-656adb357e11",
"layout": {
"h": 8,
"i": "045dca2d-d69b-47a7-b25e-656adb357e11",
"isResizable": true,
"w": 12,
"x": 12,
"y": 16
},
"links": [],
"name": "API Server - HTTP Requests Latency by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) by (instance,verb,le))*1000",
"legend": "{{ instance }} {{ verb }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的响应延迟(非watch请求)",
"id": "1e775704-9ee4-45ce-9d24-b49af89fb5c7",
"layout": {
"h": 8,
"i": "1e775704-9ee4-45ce-9d24-b49af89fb5c7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 24
},
"links": [],
"name": "API Server - HTTP Requests Latency by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket {job=\"apiserver\",verb!=\"WATCH\"}[5m])) by (instance,le))*1000",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的5xx错误率,按请求动作分类统计",
"id": "1ca62e0b-72df-47d1-93ba-048ed49e9cb5",
"layout": {
"h": 8,
"i": "1ca62e0b-72df-47d1-93ba-048ed49e9cb5",
"isResizable": true,
"w": 12,
"x": 12,
"y": 24
},
"links": [],
"name": "API Server - Errors by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by(instance,verb) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\"}[5m]))\n / sum by(instance,verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }} {{ verb }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的5xx 错误率(5xx请求数/总请求数)",
"id": "92a209a1-7d30-4627-9ae1-55ded5095ed7",
"layout": {
"h": 8,
"i": "92a209a1-7d30-4627-9ae1-55ded5095ed7",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"links": [],
"name": "API Server - Errors by Instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by(instance) (rate(apiserver_request_total{code=~\"5..\", job=\"apiserver\"}[5m]))\n / sum by(instance) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver工作队列深度,越接近0越好",
"id": "83f22cf4-9c65-4ad3-900b-fa6fc914dd88",
"layout": {
"h": 8,
"i": "83f22cf4-9c65-4ad3-900b-fa6fc914dd88",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"links": [],
"name": "API Server - Work Queue by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(workqueue_depth{job=\"apiserver\"}[5m])) by (instance,name)",
"legend": "{{ instance }} {{ name }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内apiserver的请求数统计",
"id": "3e9f9df7-d9fb-4791-b3b2-2c52678f060f",
"layout": {
"h": 8,
"i": "3e9f9df7-d9fb-4791-b3b2-2c52678f060f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 40
},
"links": [],
"name": "API Server - HTTP Requests by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (instance)",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的cpu使用率",
"id": "3d5c1ae5-e640-4986-9202-78258169bffb",
"layout": {
"h": 8,
"i": "3d5c1ae5-e640-4986-9202-78258169bffb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 40
},
"links": [],
"name": "API Server - CPU Usage by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"apiserver\"}[5m])",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "apiserver的内存使用量",
"id": "1550a2d5-c808-4174-865a-a41b2c16b486",
"layout": {
"h": 8,
"i": "1550a2d5-c808-4174-865a-a41b2c16b486",
"isResizable": true,
"w": 12,
"x": 0,
"y": 48
},
"links": [],
"name": "API Server - Memory Usage by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"apiserver\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327680034000
}
================================================
FILE: integrations/Kubernetes/dashboards/ControllerManager.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kubernetes / Controller Manager",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager健康状态",
"id": "5d6560c5-6137-4632-bb88-ff8c9cf42e9d",
"layout": {
"h": 6,
"i": "5d6560c5-6137-4632-bb88-ff8c9cf42e9d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"links": [],
"name": "Controller Manager - Health Status",
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"options": {
"0": {
"text": "DOWN"
},
"1": {
"text": "UP"
}
},
"type": "value"
},
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f60707",
"text": "DOWN"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "up{job=\"controller-manager\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 主从状态",
"id": "62e3b249-fefe-4f32-8baf-394eac053f2a",
"layout": {
"h": 6,
"i": "a5d1ef0c-83e3-4194-b242-d5c51ba4bdd2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"name": "Controller Manager - Member Status",
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"options": {
"0": {
"text": "DOWN"
},
"1": {
"text": "UP"
}
},
"result": {
"text": "val"
},
"type": "value"
},
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "MASTER"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#9470ff",
"text": "BACKUP"
},
"type": "special"
}
]
},
"overrides": [
{
"properties": {
"valueMappings": []
}
}
],
"targets": [
{
"expr": "leader_election_master_status{job=\"controller-manager\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 请求量 按返回码统计",
"id": "94713dc3-acb7-43b5-ae2f-399b2da61763",
"layout": {
"h": 8,
"i": "94713dc3-acb7-43b5-ae2f-399b2da61763",
"isResizable": true,
"w": 12,
"x": 0,
"y": 6
},
"links": [],
"name": "Controller Manager - Requests by code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"controller-manager\"}[5m])) by (instance,code)",
"legend": "{{ instance }} {{ code }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager请求量,按请求类型统计",
"id": "a6928b49-cf0a-443e-a8fd-b999685df0be",
"layout": {
"h": 8,
"i": "a6928b49-cf0a-443e-a8fd-b999685df0be",
"isResizable": true,
"w": 12,
"x": 12,
"y": 6
},
"links": [],
"name": "Controller Manager - Requests by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"controller-manager\"}[5m])) by (instance,method)",
"legend": "{{ instance }} {{ method }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager请求(apiserver)量",
"id": "69690063-d044-4547-9f5f-126e5f8bf55a",
"layout": {
"h": 8,
"i": "69690063-d044-4547-9f5f-126e5f8bf55a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"links": [],
"name": "Controller Manager -Requests by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"controller-manager\"}[5m])) by (instance)",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 90分位请求apiserver延迟,按请求类型统计",
"id": "053d10f3-1113-40e0-85aa-dfbabb706995",
"layout": {
"h": 8,
"i": "d5a67103-9930-46e0-97e1-296e0d71e30e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"links": [],
"name": "Controller Manager - Requests Latancy by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{job=\"controller-manager\"}[5m])) by (instance,verb,le))*1000",
"legend": "{{ instance }} {{ verb }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager90分位请求延迟,按url统计",
"id": "c86ed101-a91c-4478-b67a-7182a5e856d1",
"layout": {
"h": 8,
"i": "2924bb3f-20c3-4f56-96ff-76d473743d8b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"name": "Controller Manager - Requests Latancy by url",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{job=\"controller-manager\"}[5m])) by (instance,url,verb,le))*1000",
"legend": "{{ instance }} {{ verb }} {{ url }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager90分位请求延迟,按实例统计",
"id": "80bd434e-21dc-4864-97c6-bfd1e2e27bbe",
"layout": {
"h": 8,
"i": "75671720-bca1-449f-9c68-bf562f105b66",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"name": "Controller Manager - Requests Latancy by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{job=\"controller-manager\"}[5m])) by (instance,le)) * 1000",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager请求5xx,按请求类型统计",
"id": "32a09298-d0f8-4d54-808e-d223d0a428ff",
"layout": {
"h": 8,
"i": "cfc389ad-5648-4107-a5bd-1680f6ede2ed",
"isResizable": true,
"w": 12,
"x": 0,
"y": 30
},
"links": [],
"name": "Controller Manager - Errors by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by(instance,method) (rate(rest_client_requests_total{code=~\"5..\",job=\"controller-manager\"}[5m]))\n / sum by(instance,method) (rate(rest_client_requests_total{job=\"controller-manager\"}[5m]))",
"legend": "{{ instance }} {{ method }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager请求5xx,按实例统计",
"id": "b6931f1f-6c43-478e-bcc7-26d1b121bceb",
"layout": {
"h": 8,
"i": "152b5817-ad87-44d0-a71f-5fbd0fc10ca3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 30
},
"links": [],
"name": "Controller Manager - Errors by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by(instance) (rate(rest_client_requests_total{code=~\"5..\",job=\"controller-manager\"}[5m]))\n / sum by(instance) (rate(rest_client_requests_total{job=\"controller-manager\"}[5m]))",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager workqueue 添加任务数的速率",
"id": "0c8ac9ee-2a3e-4e7e-b338-748d79f6cbb6",
"layout": {
"h": 8,
"i": "0c8ac9ee-2a3e-4e7e-b338-748d79f6cbb6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 38
},
"links": [],
"name": "Controller Manager - Average Enqueue Rate by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "avg(rate(workqueue_adds_total{job=\"controller-manager\"}[5m])) by (instance,name)",
"legend": "{{ instance }} {{ name }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "各个controller 队列深度",
"id": "b15af6b1-107d-4246-9eec-06fd370d4d35",
"layout": {
"h": 8,
"i": "b15af6b1-107d-4246-9eec-06fd370d4d35",
"isResizable": true,
"w": 12,
"x": 12,
"y": 38
},
"links": [],
"name": "Controller Manager - WorkQueue Depth by name",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(workqueue_depth{job=\"controller-manager\"}[5m])) by(instance,name)",
"legend": "{{ instance }} {{ name }} workqueue depth",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "任务在队列中的90分位等待耗时",
"id": "1b4705d3-cb3c-49c9-b60d-69edd28b662c",
"layout": {
"h": 8,
"i": "1f2a42da-cdf8-4ce9-830f-92dabeb387be",
"isResizable": true,
"w": 12,
"x": 0,
"y": 46
},
"links": [],
"name": "Controller Manager - WorkQueue Queue Time by name",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"controller-manager\"}[5m])) by (instance,name,le))*1000",
"legend": "{{ instance }} {{ name }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 90分位 任务出队到完成的耗时",
"id": "1779a666-696e-4f07-b93f-f4b5bdd9d102",
"layout": {
"h": 8,
"i": "30b7d514-fc44-4e24-9379-da697ceba79a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 46
},
"links": [],
"name": "Controller Manager - WorkQueue Work Time by name",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(workqueue_work_duration_seconds_bucket{job=\"controller-manager\"}[5m])) by (instance,name,le))*1000",
"legend": "{{ instance }} {{ name }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 任务进入队列的重试次数",
"id": "084ccfb0-b7a1-4865-825d-f913bbc8456f",
"layout": {
"h": 8,
"i": "4841fbd4-8393-412f-ba40-7e63d79827e8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 54
},
"links": [],
"name": "Controller Manager - WorkQueue Retries Total",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(workqueue_retries_total{job=\"controller-manager\"}[5m])) by (instance,name)",
"legend": "{{ instance }} {{ name }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "队列中最耗时任务的处理时间(500ms采样周期,未完成的任务-任务开始处理即出队的时间)",
"id": "a1644e81-5aee-409b-be7a-6445f9478373",
"layout": {
"h": 8,
"i": "5c576ba5-0e85-4401-9e44-346b987eb8ba",
"isResizable": true,
"w": 12,
"x": 12,
"y": 54
},
"links": [],
"name": "Controller Manager - WorkQueue longest running time",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "workqueue_longest_running_processor_seconds{job=\"controller-manager\"}*1000",
"legend": "{{ instance }} {{ name }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 绑定的pv数目",
"id": "d17fab46-1a62-47a5-9666-ee083f2ec9e5",
"layout": {
"h": 8,
"i": "c28c023d-08e3-4614-8d20-0f0f3f5c7044",
"isResizable": true,
"w": 12,
"x": 0,
"y": 62
},
"links": [],
"name": "Controller Manager - Bound PV Count",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "pv_collector_bound_pv_count{job=\"controller-manager\"}",
"legend": "{{ instance }} {{ storage_class }} bound pv ",
"refId": "A"
},
{
"expr": "pv_collector_total_pv_count{job=\"controller-manager\"}",
"legend": "{{ instance }} total pv count",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager zone中的node数据",
"id": "ce8b2909-215d-4a2a-a2ea-5faff34cf4ef",
"layout": {
"h": 8,
"i": "8ccebf05-e22a-475e-9c4b-308887757855",
"isResizable": true,
"w": 12,
"x": 12,
"y": 62
},
"links": [],
"name": "Controller Manager - Zone Size",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "node_collector_zone_size{job=\"controller-manager\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager cpu使用率",
"id": "9c187c1c-f5cd-4aab-af81-09169948ab82",
"layout": {
"h": 8,
"i": "9c187c1c-f5cd-4aab-af81-09169948ab82",
"isResizable": true,
"w": 12,
"x": 0,
"y": 70
},
"links": [],
"name": "Controller Manager - CPU Usage by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"util": "percentUnit"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"controller-manager\"}[5m])",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 绑定/非绑定的pvc数目",
"id": "92b035eb-2bd3-417d-b82a-f5734b8a4aec",
"layout": {
"h": 8,
"i": "6ad946ea-8cee-42f5-9863-3a76d53340a4",
"isResizable": true,
"w": 12,
"x": 12,
"y": 70
},
"links": [],
"name": "Controller Manager - PVC Count",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "pv_collector_bound_pvc_count{job=\"controller-manager\"}",
"legend": "{{ instance }} {{ namespace }} bound pvc",
"refId": "A"
},
{
"expr": "pv_collector_unbound_pvc_count{job=\"controller-manager\"}",
"legend": "{{ instance }} {{ namespace }} unbound pvc",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager打开的fd数量",
"id": "94bb09a7-dbf7-41d8-b6a4-16b262365474",
"layout": {
"h": 8,
"i": "e438aed2-6d4a-4254-a8ec-26752385dc74",
"isResizable": true,
"w": 12,
"x": 0,
"y": 78
},
"links": [],
"name": "Controller Manager - Open fds by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_open_fds{job=\"controller-manager\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "controller manager 内存使用量",
"id": "8cff2618-b2d4-4fb4-bfc2-d1d4c4f1b35c",
"layout": {
"h": 8,
"i": "8cff2618-b2d4-4fb4-bfc2-d1d4c4f1b35c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 78
},
"links": [],
"name": "Controller Manager - Memory Usage by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"controller-manager\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327685813000
}
================================================
FILE: integrations/Kubernetes/dashboards/DeploymentContainer.json
================================================
{
"name": "Kubernetes / Deployment / Container",
"tags": "Categraf",
"configs": {
"panels": [
{
"collapsed": true,
"id": "79d7e3b0-b64f-4591-b5dd-994ce16b68ca",
"layout": {
"h": 1,
"i": "79d7e3b0-b64f-4591-b5dd-994ce16b68ca",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "整体概况",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用满4核,容器配置2核,使用率200%表示当前用满2核。",
"id": "860c1484-1f83-497e-a061-a50fbb3ff1dc",
"layout": {
"h": 7,
"i": "860c1484-1f83-497e-a061-a50fbb3ff1dc",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "容器CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率100%表示当前用满4核,容器配置2核,使用率100%表示当前用满2核,如果容器没有配置cpu limit,则不会显示该数值",
"id": "bb66b2cc-3658-4cae-817a-61ec3fbb93e4",
"layout": {
"h": 7,
"i": "bb66b2cc-3658-4cae-817a-61ec3fbb93e4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "容器CPU归一化后使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name)))",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "container_memory_rss + container_memory_cache + container_memory_swap + kernel memory",
"id": "4d5f9cbb-3b78-4fe6-8a4e-59ca52a49666",
"layout": {
"h": 7,
"i": "4d5f9cbb-3b78-4fe6-8a4e-59ca52a49666",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "容器内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "978e93a0-770e-42f5-a374-cafb5b4fc585",
"layout": {
"h": 7,
"i": "978e93a0-770e-42f5-a374-cafb5b4fc585",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "文件系统写入速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_writes_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "f385736d-fd05-4705-a27d-41e67fb6c843",
"layout": {
"h": 7,
"i": "f385736d-fd05-4705-a27d-41e67fb6c843",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"name": "文件系统读取速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络发送的字节数(1 分钟内)",
"id": "f9b140e8-0b44-4b32-9d54-9360b06faa48",
"layout": {
"h": 7,
"i": "f9b140e8-0b44-4b32-9d54-9360b06faa48",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"name": "网络发送速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络接收的字节数(1 分钟内)",
"id": "8f98a264-7058-4fa0-8efc-3c87954a0370",
"layout": {
"h": 7,
"i": "8f98a264-7058-4fa0-8efc-3c87954a0370",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"name": "网络接收速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1868973e-c3d3-4fd4-83b0-36ec2e06dfe0",
"layout": {
"h": 7,
"i": "1868973e-c3d3-4fd4-83b0-36ec2e06dfe0",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"name": "容器启动时长",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"targets": [
{
"expr": "sum((time()-container_start_time_seconds{pod=\"$pod_name\", image!~\".*pause.*\"})) by (name)",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8a4942a2-81a3-4473-81be-79b3541e09a3",
"layout": {
"h": 1,
"i": "8a4942a2-81a3-4473-81be-79b3541e09a3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 15
},
"name": "CPU",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用满4核,容器配置2核,使用率200%表示当前用满2核。",
"id": "54650fe9-007f-4b16-a523-baf2e91ef823",
"layout": {
"h": 7,
"i": "54650fe9-007f-4b16-a523-baf2e91ef823",
"isResizable": true,
"w": 6,
"x": 0,
"y": 16
},
"name": "容器CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率100%表示当前用满4核,容器配置2核,使用率100%表示当前用满2核,如果容器没有配置cpu limit,则不会显示该数值",
"id": "715c1e5a-c504-4f2c-a790-dad1c73aae29",
"layout": {
"h": 7,
"i": "715c1e5a-c504-4f2c-a790-dad1c73aae29",
"isResizable": true,
"w": 6,
"x": 6,
"y": 16
},
"name": "容器CPU归一化后使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name)))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min 内核态CPU平均使用率,如果pod内有多个容器,会分别显示各个容器内核态CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前内核态用满4核,容器配置2核,使用率200%表示当前内核态用满2核。",
"id": "d23a509d-3bb0-4680-b579-b89a411830a4",
"layout": {
"h": 7,
"i": "d23a509d-3bb0-4680-b579-b89a411830a4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 16
},
"name": "容器内核态CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_system_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min 用户态CPU平均使用率,如果pod内有多个容器,会分别显示各个容器用户态CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用户态用满4核,容器配置2核,使用率200%表示当前用户态用满2核。",
"id": "2921e6d0-1b9c-449d-9dd8-fb29dce8ca7d",
"layout": {
"h": 7,
"i": "2921e6d0-1b9c-449d-9dd8-fb29dce8ca7d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 16
},
"name": "容器用户态CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "该值大于0,说明CPU在运行期间发生throttle情况,即容器设置的CPU规格,不满足容器当前对CPU的实际需求,在这种情况下,往往需要调大容器CPU规格,或者优化程序,降低CPU开销。",
"id": "b302c1a0-a499-4a99-aff2-d460685846ab",
"layout": {
"h": 7,
"i": "b302c1a0-a499-4a99-aff2-d460685846ab",
"isResizable": true,
"w": 6,
"x": 0,
"y": 23
},
"name": "容器发生CPU throttle的比率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_cfs_throttled_periods_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m]))by(name) *100",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器CPU Limit,2代表容器CPU Limit为2核, 0.2代表容器CPU Limit为0.2核, 没有数据表明没有设置Limit值",
"id": "cd5e8b99-8a76-4dea-b8b2-3bff825a7f8d",
"layout": {
"h": 7,
"i": "cd5e8b99-8a76-4dea-b8b2-3bff825a7f8d",
"isResizable": true,
"w": 6,
"x": 6,
"y": 23
},
"name": "容器CPU Limit",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "(sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "过去10s的CPU负载",
"id": "408c29b9-2a49-49a2-a98b-3ad7da9c57bd",
"layout": {
"h": 7,
"i": "408c29b9-2a49-49a2-a98b-3ad7da9c57bd",
"isResizable": true,
"w": 6,
"x": 12,
"y": 23
},
"name": "容器CPU load 10",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_cpu_load_average_10s{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fe62f7db-4f87-4da1-bfc3-60ed5039e31e",
"layout": {
"h": 7,
"i": "fe62f7db-4f87-4da1-bfc3-60ed5039e31e",
"isResizable": true,
"w": 6,
"x": 18,
"y": 23
},
"name": "uninterruptible task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"uninterruptible\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7152f66a-d907-4ee9-afd3-a04b12f1019b",
"layout": {
"h": 7,
"i": "7152f66a-d907-4ee9-afd3-a04b12f1019b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 30
},
"name": "running task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"running\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d2627af4-5753-4384-880b-84bead73002a",
"layout": {
"h": 7,
"i": "d2627af4-5753-4384-880b-84bead73002a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 30
},
"name": "iowaiting task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"iowaiting\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78801242-2aae-48dc-a3c3-3d0050fac92c",
"layout": {
"h": 7,
"i": "78801242-2aae-48dc-a3c3-3d0050fac92c",
"isResizable": true,
"w": 6,
"x": 12,
"y": 30
},
"name": "sleeping task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"sleeping\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "723a4b98-52e8-4284-b193-98a470189415",
"layout": {
"h": 7,
"i": "723a4b98-52e8-4284-b193-98a470189415",
"isResizable": true,
"w": 6,
"x": 18,
"y": 30
},
"name": "stopped task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"stopped\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "39a8bcef-7b91-4ec4-8644-4e5674267437",
"layout": {
"h": 1,
"i": "39a8bcef-7b91-4ec4-8644-4e5674267437",
"isResizable": false,
"w": 24,
"x": 0,
"y": 37
},
"name": "内存",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "(container_memory_rss + container_memory_cache + container_memory_swap + kernel memory)/ (memory limit), 没有数据表明容器配置没有设置mem limit",
"id": "e7b80412-d9b2-4589-8175-abf93a62a524",
"layout": {
"h": 7,
"i": "e7b80412-d9b2-4589-8175-abf93a62a524",
"isResizable": true,
"w": 6,
"x": 0,
"y": 38
},
"name": "容器内存使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "((sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) /(sum(container_spec_memory_limit_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)))*100",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "container_memory_rss + container_memory_cache + container_memory_swap + kernel memory",
"id": "4c2e1b54-1adb-480e-a581-1763bec5113f",
"layout": {
"h": 7,
"i": "4c2e1b54-1adb-480e-a581-1763bec5113f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 38
},
"name": "容器内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器cache 占用大小",
"id": "429e0214-7b71-4fc0-a771-945363948999",
"layout": {
"h": 7,
"i": "429e0214-7b71-4fc0-a771-945363948999",
"isResizable": true,
"w": 6,
"x": 12,
"y": 38
},
"name": "容器cache使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_cache{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器RSS内存占用大小。",
"id": "27c1a22c-bdf3-4a8d-bd31-16c66a307f3f",
"layout": {
"h": 7,
"i": "27c1a22c-bdf3-4a8d-bd31-16c66a307f3f",
"isResizable": true,
"w": 6,
"x": 18,
"y": 38
},
"name": "容器RSS内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_rss{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器内存Limit配置,0表示没有设置内存 Limit",
"id": "13173c71-14a9-43a0-9cbe-34c1bb453f2b",
"layout": {
"h": 7,
"i": "13173c71-14a9-43a0-9cbe-34c1bb453f2b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 45
},
"name": "容器内存 Limit",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_spec_memory_limit_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d3a7153b-f174-473c-a5ae-1ec8ff4ef318",
"layout": {
"h": 7,
"i": "d3a7153b-f174-473c-a5ae-1ec8ff4ef318",
"isResizable": true,
"w": 6,
"x": 6,
"y": 45
},
"name": "容器发生OOM次数",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(container_oom_events_total{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收错误数(1分钟内)",
"id": "271d8ff6-2382-488d-b479-83e38b247012",
"layout": {
"h": 7,
"i": "271d8ff6-2382-488d-b479-83e38b247012",
"isResizable": true,
"w": 6,
"x": 12,
"y": 45
},
"name": "网络接收错误数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_errors_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收丢包数(1分钟内)",
"id": "e7a0879b-82a9-4518-8c5c-93bc73548401",
"layout": {
"h": 7,
"i": "e7a0879b-82a9-4518-8c5c-93bc73548401",
"isResizable": true,
"w": 6,
"x": 18,
"y": 45
},
"name": "网络接收丢包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_packets_dropped_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8d49afd3-d8fa-4714-aaa2-b10e045c8976",
"layout": {
"h": 1,
"i": "8d49afd3-d8fa-4714-aaa2-b10e045c8976",
"isResizable": false,
"w": 24,
"x": 0,
"y": 52
},
"name": "磁盘",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef7e1b00-2a77-4a34-a138-5c8207088476",
"layout": {
"h": 7,
"i": "ef7e1b00-2a77-4a34-a138-5c8207088476",
"isResizable": true,
"w": 6,
"x": 0,
"y": 53
},
"name": "文件系统读取速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e55b7db3-2b13-43f9-aa2d-e9a0a929df4b",
"layout": {
"h": 7,
"i": "e55b7db3-2b13-43f9-aa2d-e9a0a929df4b",
"isResizable": true,
"w": 6,
"x": 6,
"y": 53
},
"name": "文件系统写入速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_writes_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a2d7ba64-fd4d-4713-9295-8663fa72f675",
"layout": {
"h": 7,
"i": "a2d7ba64-fd4d-4713-9295-8663fa72f675",
"isResizable": true,
"w": 6,
"x": 12,
"y": 53
},
"name": "容器I/O",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_io_current{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2b4e94ef-c994-4910-b72b-428bf5156a21",
"layout": {
"h": 7,
"i": "2b4e94ef-c994-4910-b72b-428bf5156a21",
"isResizable": true,
"w": 6,
"x": 18,
"y": 53
},
"name": "inode数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_inodes_total{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3abade99-8f34-4a1e-bb48-c5ba1e128749",
"layout": {
"h": 7,
"i": "3abade99-8f34-4a1e-bb48-c5ba1e128749",
"isResizable": true,
"w": 6,
"x": 0,
"y": 60
},
"name": "容器已使用的文件系统大小",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收数据包数(1分钟内)",
"id": "8b0bac93-8bd0-4452-a200-a9cea74345b3",
"layout": {
"h": 7,
"i": "8b0bac93-8bd0-4452-a200-a9cea74345b3",
"isResizable": true,
"w": 6,
"x": 6,
"y": 60
},
"name": "网络接收数据包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_packets_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fde213bc-fe49-431a-9495-2219fcaaa01a",
"layout": {
"h": 1,
"i": "fde213bc-fe49-431a-9495-2219fcaaa01a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 67
},
"name": "网络",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络发送的字节数(1 分钟内)",
"id": "4e70bc47-1510-41fe-a924-3ffa36d11f0f",
"layout": {
"h": 7,
"i": "4e70bc47-1510-41fe-a924-3ffa36d11f0f",
"isResizable": true,
"w": 6,
"x": 0,
"y": 68
},
"name": "网络发送速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送数据包数(1分钟内)",
"id": "00ca29e5-7c97-491d-a28f-0f2d66ea47c5",
"layout": {
"h": 7,
"i": "00ca29e5-7c97-491d-a28f-0f2d66ea47c5",
"isResizable": true,
"w": 6,
"x": 6,
"y": 68
},
"name": "网络发送数据包",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_packets_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送错误数(1分钟内)",
"id": "8cd76b5f-5bba-421a-9fda-42f406f54ed5",
"layout": {
"h": 7,
"i": "8cd76b5f-5bba-421a-9fda-42f406f54ed5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 68
},
"name": "网络发送错误数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_errors_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送丢包数(1分钟内)",
"id": "58aaebbb-b1cf-4089-bcb5-70b03c05318d",
"layout": {
"h": 7,
"i": "58aaebbb-b1cf-4089-bcb5-70b03c05318d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 68
},
"name": "网络发送丢包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_packets_dropped_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络接收的字节数(1 分钟内)",
"id": "120d3540-cce5-4b08-b596-0ec6a5b2d497",
"layout": {
"h": 7,
"i": "120d3540-cce5-4b08-b596-0ec6a5b2d497",
"isResizable": true,
"w": 6,
"x": 0,
"y": 75
},
"name": "网络接收速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kube_deployment_labels, deployment)",
"hide": false,
"multi": false,
"name": "deployment",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kube_pod_status_ready{pod=~\"$deployment.*\"}, pod)",
"hide": false,
"multi": false,
"name": "pod_name",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1727335102129685000
}
================================================
FILE: integrations/Kubernetes/dashboards/KubeStateMetrics.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kubernetes / Kube State Metrics",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "0786021a-c1e7-4425-89b5-221c548e66ac",
"layout": {
"h": 1,
"i": "0786021a-c1e7-4425-89b5-221c548e66ac",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Node",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "65cefb81-39fa-455a-b514-6c7de7b666b1",
"layout": {
"h": 3,
"i": "65cefb81-39fa-455a-b514-6c7de7b666b1",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"name": "Total Node",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_node_status_condition{cluster=~\"$cluster\", condition=\"Ready\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2bc28c00-c417-43c0-b577-2d86e42e7cc7",
"layout": {
"h": 3,
"i": "2bc28c00-c417-43c0-b577-2d86e42e7cc7",
"isResizable": true,
"w": 4,
"x": 4,
"y": 1
},
"name": "Not Ready Node",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_node_status_condition{cluster=~\"$cluster\",condition=\"Ready\", status!=\"true\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5ea68c0a-2a9a-4dac-a134-9768bdbdc6eb",
"layout": {
"h": 3,
"i": "968c87d3-6a9b-401e-895f-55e929eb37ac",
"isResizable": true,
"w": 4,
"x": 8,
"y": 1
},
"name": "有磁盘压力",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_node_status_condition{cluster=~\"$cluster\",condition=\"DiskPressure\", status=\"true\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "72462b79-e531-4bbb-a43e-26b8e40e40fd",
"layout": {
"h": 3,
"i": "fd201859-06bf-42c8-b4d8-e38bd1382bbe",
"isResizable": true,
"w": 4,
"x": 12,
"y": 1
},
"name": "有内存压力",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_node_status_condition{cluster=~\"$cluster\",condition=\"MemoryPressure\", status=\"true\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "66324dd4-c937-44c2-8e71-9c97d13d2e07",
"layout": {
"h": 3,
"i": "e6280126-40bf-4683-95b8-7a0a38bbd943",
"isResizable": true,
"w": 4,
"x": 16,
"y": 1
},
"name": "有网络压力",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_node_status_condition{cluster=~\"$cluster\",condition=\"NetworkUnavailable\", status=\"true\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "caa0c577-12d8-4c4d-92ac-99ba3e38995e",
"layout": {
"h": 3,
"i": "9488bd89-3527-453a-aca7-3fb779a182b1",
"isResizable": true,
"w": 4,
"x": 20,
"y": 1
},
"name": "有PID压力",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_node_status_condition{cluster=~\"$cluster\",condition=\"PIDPressure\", status=\"true\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a028f8d2-b1fd-47d7-8331-e12df56c8ae6",
"layout": {
"h": 3,
"i": "a028f8d2-b1fd-47d7-8331-e12df56c8ae6",
"isResizable": true,
"w": 8,
"x": 0,
"y": 4
},
"name": "集群容量:CPU Cores",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_node_status_capacity{cluster=~\"$cluster\", resource=\"cpu\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1bdfd329-01a4-453a-b071-b14d18f943df",
"layout": {
"h": 3,
"i": "81f63499-c085-44e4-a8e9-e49212c7bdcb",
"isResizable": true,
"w": 8,
"x": 8,
"y": 4
},
"name": "集群容量:Memory",
"options": {
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "sum(kube_node_status_capacity{cluster=~\"$cluster\", resource=\"memory\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ffa6374c-d8ef-442b-bc3b-2fa4a0e4cc94",
"layout": {
"h": 3,
"i": "c1a1d33c-2d35-41fd-a32c-6297ae8a6912",
"isResizable": true,
"w": 8,
"x": 16,
"y": 4
},
"name": "集群容量:Ephemeral Storage",
"options": {
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "sum(kube_node_status_capacity{cluster=~\"$cluster\", resource=\"ephemeral_storage\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "f03009d7-e495-43e0-9200-2026a84ec54b",
"layout": {
"h": 1,
"i": "f03009d7-e495-43e0-9200-2026a84ec54b",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "Daemonset",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "57d6e18b-d083-4732-86ca-e266db3191f4",
"layout": {
"h": 3,
"i": "57d6e18b-d083-4732-86ca-e266db3191f4",
"isResizable": true,
"w": 4,
"x": 0,
"y": 14
},
"name": "Desired Number Scheduled",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_daemonset_status_desired_number_scheduled{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "43fb02d1-1647-421d-a5c7-f8f9c6bc5cdd",
"layout": {
"h": 3,
"i": "67af9443-9f8e-4400-90e2-7af3b6fc2f0c",
"isResizable": true,
"w": 4,
"x": 4,
"y": 14
},
"name": "Current Number Scheduled",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_daemonset_status_current_number_scheduled{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ba24545a-3c6d-475c-b7e6-aeb4367a03e3",
"layout": {
"h": 3,
"i": "fd32a463-738b-44b8-91b8-395058f177f7",
"isResizable": true,
"w": 4,
"x": 8,
"y": 14
},
"name": "Ready",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_daemonset_status_number_ready{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0a918de7-32e1-4a68-b954-81b1307d3ef2",
"layout": {
"h": 3,
"i": "9b509227-dbc4-41b1-8ba1-6c9b94405cf0",
"isResizable": true,
"w": 4,
"x": 12,
"y": 14
},
"name": "Available",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_daemonset_status_number_available{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9fee3cc3-cdd2-4597-8aab-a32f5b99b7fe",
"layout": {
"h": 3,
"i": "79b65e9e-f4f7-4e2b-8fa9-977b093eac11",
"isResizable": true,
"w": 4,
"x": 16,
"y": 14
},
"name": "Unavailable",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_daemonset_status_number_unavailable{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e8dab8ca-d0ba-456d-a358-f32b39d564eb",
"layout": {
"h": 3,
"i": "eb2a28b2-fb09-4273-8cc6-76a19f96ab06",
"isResizable": true,
"w": 4,
"x": 20,
"y": 14
},
"name": "Misscheduled",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_daemonset_status_number_misscheduled{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": false,
"id": "ec1c6595-4fab-4d00-aab6-2e7e34d5d208",
"layout": {
"h": 1,
"i": "ec1c6595-4fab-4d00-aab6-2e7e34d5d208",
"isResizable": false,
"w": 24,
"x": 0,
"y": 20
},
"name": "Deployment",
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6712f9ba-c40c-4f11-8565-485d3321b434",
"layout": {
"h": 3,
"i": "9e867792-c82e-4b06-8384-fb351b1da247",
"isResizable": true,
"w": 6,
"x": 0,
"y": 12
},
"name": "Replicas",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_deployment_status_replicas{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "363ed020-5a36-4322-9f8a-1a4aa6507684",
"layout": {
"h": 3,
"i": "75631d60-84c9-482f-a2e4-3aaaa5738512",
"isResizable": true,
"w": 6,
"x": 6,
"y": 12
},
"name": "Replicas Available",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_deployment_status_replicas_available{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e859dd8b-8b78-4950-b406-b2b8aed3a095",
"layout": {
"h": 3,
"i": "c916543c-d212-4dbc-8d53-9d46a914fdbe",
"isResizable": true,
"w": 6,
"x": 12,
"y": 12
},
"name": "Replicas Ready",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_deployment_status_replicas_ready{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "abd803ab-e282-4211-adb8-036154e81b4b",
"layout": {
"h": 3,
"i": "6d642aff-058a-45ed-93e2-409dcfaf32d4",
"isResizable": true,
"w": 6,
"x": 18,
"y": 12
},
"name": "Unavailable",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": null,
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_deployment_status_replicas_unavailable{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "b72d699a-d900-449c-ae7e-d52ba70128fe",
"layout": {
"h": 1,
"i": "b72d699a-d900-449c-ae7e-d52ba70128fe",
"isResizable": false,
"w": 24,
"x": 0,
"y": 24
},
"name": "Statefulset",
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d77f502d-881f-4a45-bbea-dbf0acc18611",
"layout": {
"h": 3,
"i": "5d69ceaa-8e93-4b1d-8a51-f7b5d295507b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 16
},
"name": "Replicas",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_statefulset_status_replicas{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cdbcf704-c5c5-45ec-b392-0e53d9b177a1",
"layout": {
"h": 3,
"i": "5c915ccd-0096-4812-a1d5-0772ecacf435",
"isResizable": true,
"w": 6,
"x": 6,
"y": 16
},
"name": "Replicas Available",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_statefulset_status_replicas_available{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "db5d5574-9f9d-4856-b4d0-5f313e7466d9",
"layout": {
"h": 3,
"i": "c01d88bb-9747-4f7f-ad38-5b055967732c",
"isResizable": true,
"w": 6,
"x": 12,
"y": 16
},
"name": "Replicas Current",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_statefulset_status_replicas_current{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0fbe57c3-58f5-4a8d-9d71-fc20b2ffe762",
"layout": {
"h": 3,
"i": "cf027f9e-9257-4e5d-8548-f6c30698859f",
"isResizable": true,
"w": 6,
"x": 18,
"y": 16
},
"name": "Replicas Ready",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_statefulset_status_replicas_ready{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "3c515e1b-cc46-4bd3-84dc-a0ba893a6af6",
"layout": {
"h": 1,
"i": "3c515e1b-cc46-4bd3-84dc-a0ba893a6af6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 28
},
"name": "Pod",
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a3dd94c4-06e1-4425-ab32-d1a4eb173eba",
"layout": {
"h": 3,
"i": "a4dfbc69-3688-4bc8-b1fb-7d5853bc4da4",
"isResizable": true,
"w": 6,
"x": 0,
"y": 20
},
"name": "Running Pods",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_pod_status_phase{cluster=~\"$cluster\", phase=\"Running\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b7379a63-cdb3-4fdb-9067-def62580822d",
"layout": {
"h": 3,
"i": "34d7f46e-7efb-44d1-a0be-cfdb659b6784",
"isResizable": true,
"w": 6,
"x": 6,
"y": 20
},
"name": "Pending Pods",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_pod_status_phase{cluster=~\"$cluster\", phase=\"Pending\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fa974aab-dd2a-46ee-bb00-10461f396240",
"layout": {
"h": 3,
"i": "fa974aab-dd2a-46ee-bb00-10461f396240",
"isResizable": true,
"w": 6,
"x": 12,
"y": 20
},
"name": "Failed | Unknown Pods",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#eb0909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(kube_pod_status_phase{cluster=~\"$cluster\", phase=~\"Failed|Unknown\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "245eee0d-e0c1-4ff3-801d-3de31c74bdcb",
"layout": {
"h": 3,
"i": "3a314f6b-7da2-40e1-8e6b-f32da4295f8e",
"isResizable": true,
"w": 6,
"x": 18,
"y": 20
},
"name": "Restarts in last 5min",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#eb0909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(increase(kube_pod_container_status_restarts_total{cluster=~\"$cluster\"}[5m]))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "3789e988-a132-48f6-9953-c2985769e23d",
"layout": {
"h": 1,
"i": "3789e988-a132-48f6-9953-c2985769e23d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "Job",
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9bb45396-e0bc-4823-a663-d6fbdeb545a1",
"layout": {
"h": 3,
"i": "9bb45396-e0bc-4823-a663-d6fbdeb545a1",
"isResizable": true,
"w": 6,
"x": 0,
"y": 24
},
"name": "Complete",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_job_complete{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9660891f-767e-4e5c-bddc-638c13be6e54",
"layout": {
"h": 3,
"i": "b126a364-4822-4de4-a676-56e6e7e95e87",
"isResizable": true,
"w": 6,
"x": 6,
"y": 24
},
"name": "Active",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_job_status_active{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c41b07f9-b513-4bd4-b0dc-43dbc4bbce6d",
"layout": {
"h": 3,
"i": "b8afaed1-7ffc-42bb-85c2-b4aa5fa2c116",
"isResizable": true,
"w": 6,
"x": 12,
"y": 24
},
"name": "Succeeded",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_job_status_succeeded{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b65a3ee3-7e07-4347-b04f-0f77c67877e6",
"layout": {
"h": 3,
"i": "52c5a536-bc77-41e0-99ec-81e769065feb",
"isResizable": true,
"w": 6,
"x": 18,
"y": 24
},
"name": "Failed",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kube_job_status_failed{cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
}
],
"type": "row"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kube_node_info, cluster)",
"multi": true,
"name": "cluster",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327688278000
}
================================================
FILE: integrations/Kubernetes/dashboards/KubeletMetrics.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kubernetes / Kubelet Metrics",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d3caf396-b3a1-449b-acec-f550967889e6",
"layout": {
"h": 3,
"i": "d3caf396-b3a1-449b-acec-f550967889e6",
"isResizable": true,
"w": 4,
"x": 0,
"y": 0
},
"name": "Kubelet UP",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(up{source=\"kubelet\", cluster=~\"$cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "38c38b23-a7e3-4177-8c41-3ce955ea0434",
"layout": {
"h": 3,
"i": "38c38b23-a7e3-4177-8c41-3ce955ea0434",
"isResizable": true,
"w": 4,
"x": 4,
"y": 0
},
"name": "Running Pods",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kubelet_running_pods{cluster=~\"$cluster\", instance=~\"$instance\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "525859b9-91d7-4180-b363-bf8ceec977d8",
"layout": {
"h": 3,
"i": "26bf2320-fcff-48f8-a6fc-aa9076bb9329",
"isResizable": true,
"w": 4,
"x": 8,
"y": 0
},
"name": "Running Containers",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(kubelet_running_containers{cluster=~\"$cluster\", instance=~\"$instance\", container_state=\"running\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "84af4617-2ae0-4b30-a82a-6e8586342224",
"layout": {
"h": 3,
"i": "54ae4ab3-e932-418c-a637-f2f515cce1b9",
"isResizable": true,
"w": 4,
"x": 12,
"y": 0
},
"name": "Desired Volumes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(volume_manager_total_volumes{cluster=~\"$cluster\", instance=~\"$instance\", state=\"desired_state_of_world\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d431f4bd-9115-41d2-a494-1d680bdd1e0f",
"layout": {
"h": 3,
"i": "d9de76d7-2203-40e7-a792-9888ec869e82",
"isResizable": true,
"w": 4,
"x": 16,
"y": 0
},
"name": "Actual Volumes",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(volume_manager_total_volumes{cluster=~\"$cluster\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "54de62bc-8af3-4c27-8b8e-1af567b363fc",
"layout": {
"h": 3,
"i": "bf2bbd15-347d-404c-9b8f-e524875befe2",
"isResizable": true,
"w": 4,
"x": 20,
"y": 0
},
"name": "OP Errors in 5min",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#d0021b"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(increase(kubelet_runtime_operations_errors_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "730d4a9b-791f-4aaf-a042-668f66e73814",
"layout": {
"h": 1,
"i": "730d4a9b-791f-4aaf-a042-668f66e73814",
"isResizable": false,
"w": 24,
"x": 0,
"y": 3
},
"name": "Operations",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d26e6818-6704-492a-8cbf-58473dd85716",
"layout": {
"h": 4,
"i": "d26e6818-6704-492a-8cbf-58473dd85716",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"name": "Operations in 5min",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(kubelet_runtime_operations_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "09a6ad5b-8c0e-4f17-b17f-3ebc514f7d20",
"layout": {
"h": 4,
"i": "4e585d2f-c61c-4350-86ec-dca7ddc34ceb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"name": "Operation Errors in 5min",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(kubelet_runtime_operations_errors_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7",
"layout": {
"h": 4,
"i": "b5e56f3e-fa20-4c19-8578-c0610fa0a7e7",
"isResizable": true,
"w": 24,
"x": 0,
"y": 8
},
"name": "Average Operation duration in 1 hour (Unit: Second)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(kubelet_runtime_operations_duration_seconds_sum{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])/increase(kubelet_runtime_operations_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "dd7e84c5-03ce-467c-871a-aa110fe051f4",
"layout": {
"h": 1,
"i": "dd7e84c5-03ce-467c-871a-aa110fe051f4",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "PLEG relist",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f3822da8-a9c9-4db1-ba12-465d3ece823e",
"layout": {
"h": 4,
"i": "f3822da8-a9c9-4db1-ba12-465d3ece823e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "relist rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(kubelet_pleg_relist_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a6e4c914-bfca-4419-a264-f5b1cbab261a",
"layout": {
"h": 4,
"i": "2b4ada76-6c30-42cd-9bd3-c939b4c0139c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "relist duration (Unit: Second)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(kubelet_pleg_relist_duration_seconds_sum{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])/increase(kubelet_pleg_relist_duration_seconds_count{cluster=~\"$cluster\", instance=~\"$instance\"}[1h])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kubelet_running_pods, cluster)",
"multi": true,
"name": "cluster",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kubelet_running_pods{cluster=~\"$cluster\"}, instance)",
"multi": true,
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327690164000
}
================================================
FILE: integrations/Kubernetes/dashboards/Pod.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kubernetes / Pod",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "79d7e3b0-b64f-4591-b5dd-994ce16b68ca",
"layout": {
"h": 1,
"i": "79d7e3b0-b64f-4591-b5dd-994ce16b68ca",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "整体概况",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用满4核,容器配置2核,使用率200%表示当前用满2核。",
"id": "860c1484-1f83-497e-a061-a50fbb3ff1dc",
"layout": {
"h": 7,
"i": "860c1484-1f83-497e-a061-a50fbb3ff1dc",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "容器CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率100%表示当前用满4核,容器配置2核,使用率100%表示当前用满2核,如果容器没有配置cpu limit,则不会显示该数值",
"id": "bb66b2cc-3658-4cae-817a-61ec3fbb93e4",
"layout": {
"h": 7,
"i": "bb66b2cc-3658-4cae-817a-61ec3fbb93e4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "容器CPU归一化后使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name)))",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "container_memory_rss + container_memory_cache + container_memory_swap + kernel memory",
"id": "4d5f9cbb-3b78-4fe6-8a4e-59ca52a49666",
"layout": {
"h": 7,
"i": "4d5f9cbb-3b78-4fe6-8a4e-59ca52a49666",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "容器内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "978e93a0-770e-42f5-a374-cafb5b4fc585",
"layout": {
"h": 7,
"i": "978e93a0-770e-42f5-a374-cafb5b4fc585",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "文件系统写入速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_writes_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "f385736d-fd05-4705-a27d-41e67fb6c843",
"layout": {
"h": 7,
"i": "f385736d-fd05-4705-a27d-41e67fb6c843",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"name": "文件系统读取速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络每秒发送的字节数(1 分钟平均速率)",
"id": "f9b140e8-0b44-4b32-9d54-9360b06faa48",
"layout": {
"h": 7,
"i": "f9b140e8-0b44-4b32-9d54-9360b06faa48",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"name": "网络发送速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络每秒接收的字节数(1 分钟平均速率)",
"id": "8f98a264-7058-4fa0-8efc-3c87954a0370",
"layout": {
"h": 7,
"i": "8f98a264-7058-4fa0-8efc-3c87954a0370",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"name": "网络接收速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1868973e-c3d3-4fd4-83b0-36ec2e06dfe0",
"layout": {
"h": 7,
"i": "1868973e-c3d3-4fd4-83b0-36ec2e06dfe0",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"name": "容器启动时长",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"targets": [
{
"expr": "sum((time()-container_start_time_seconds{pod=\"$pod_name\", image!~\".*pause.*\"})) by (name)",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8a4942a2-81a3-4473-81be-79b3541e09a3",
"layout": {
"h": 1,
"i": "8a4942a2-81a3-4473-81be-79b3541e09a3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 15
},
"name": "CPU",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用满4核,容器配置2核,使用率200%表示当前用满2核。",
"id": "54650fe9-007f-4b16-a523-baf2e91ef823",
"layout": {
"h": 7,
"i": "54650fe9-007f-4b16-a523-baf2e91ef823",
"isResizable": true,
"w": 6,
"x": 0,
"y": 16
},
"name": "容器CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率100%表示当前用满4核,容器配置2核,使用率100%表示当前用满2核,如果容器没有配置cpu limit,则不会显示该数值",
"id": "715c1e5a-c504-4f2c-a790-dad1c73aae29",
"layout": {
"h": 7,
"i": "715c1e5a-c504-4f2c-a790-dad1c73aae29",
"isResizable": true,
"w": 6,
"x": 6,
"y": 16
},
"name": "容器CPU归一化后使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name)))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min 内核态CPU平均使用率,如果pod内有多个容器,会分别显示各个容器内核态CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前内核态用满4核,容器配置2核,使用率200%表示当前内核态用满2核。",
"id": "d23a509d-3bb0-4680-b579-b89a411830a4",
"layout": {
"h": 7,
"i": "d23a509d-3bb0-4680-b579-b89a411830a4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 16
},
"name": "容器内核态CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_system_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min 内核态CPU平均使用率,如果pod内有多个容器,会分别显示各个容器用户态CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用户态用满4核,容器配置2核,使用率200%表示当前用户态用满2核。",
"id": "2921e6d0-1b9c-449d-9dd8-fb29dce8ca7d",
"layout": {
"h": 7,
"i": "2921e6d0-1b9c-449d-9dd8-fb29dce8ca7d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 16
},
"name": "容器用户态CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "该值大于0,说明CPU在运行期间发生throttle情况,即容器设置的CPU规格,不满足容器当前对CPU的实际需求,在这种情况下,往往需要调大容器CPU规格,或者优化程序,降低CPU开销。",
"id": "b302c1a0-a499-4a99-aff2-d460685846ab",
"layout": {
"h": 7,
"i": "b302c1a0-a499-4a99-aff2-d460685846ab",
"isResizable": true,
"w": 6,
"x": 0,
"y": 23
},
"name": "容器发生CPU throttle的比率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_cfs_throttled_periods_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m]))by(name) *100",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器CPU Limit,2代表容器CPU Limit为2核, 0.2代表容器CPU Limit为0.2核, 没有数据表明没有设置Limit值",
"id": "cd5e8b99-8a76-4dea-b8b2-3bff825a7f8d",
"layout": {
"h": 7,
"i": "cd5e8b99-8a76-4dea-b8b2-3bff825a7f8d",
"isResizable": true,
"w": 6,
"x": 6,
"y": 23
},
"name": "容器CPU Limit",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "(sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "过去10s的CPU负载",
"id": "408c29b9-2a49-49a2-a98b-3ad7da9c57bd",
"layout": {
"h": 7,
"i": "408c29b9-2a49-49a2-a98b-3ad7da9c57bd",
"isResizable": true,
"w": 6,
"x": 12,
"y": 23
},
"name": "容器CPU load 10",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_cpu_load_average_10s{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fe62f7db-4f87-4da1-bfc3-60ed5039e31e",
"layout": {
"h": 7,
"i": "fe62f7db-4f87-4da1-bfc3-60ed5039e31e",
"isResizable": true,
"w": 6,
"x": 18,
"y": 23
},
"name": "uninterruptible task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"uninterruptible\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7152f66a-d907-4ee9-afd3-a04b12f1019b",
"layout": {
"h": 7,
"i": "7152f66a-d907-4ee9-afd3-a04b12f1019b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 30
},
"name": "running task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"running\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d2627af4-5753-4384-880b-84bead73002a",
"layout": {
"h": 7,
"i": "d2627af4-5753-4384-880b-84bead73002a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 30
},
"name": "ioawaiting task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"iowaiting\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78801242-2aae-48dc-a3c3-3d0050fac92c",
"layout": {
"h": 7,
"i": "78801242-2aae-48dc-a3c3-3d0050fac92c",
"isResizable": true,
"w": 6,
"x": 12,
"y": 30
},
"name": "sleeping task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"sleeping\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "723a4b98-52e8-4284-b193-98a470189415",
"layout": {
"h": 7,
"i": "723a4b98-52e8-4284-b193-98a470189415",
"isResizable": true,
"w": 6,
"x": 18,
"y": 30
},
"name": "stopped task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"stopped\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "39a8bcef-7b91-4ec4-8644-4e5674267437",
"layout": {
"h": 1,
"i": "39a8bcef-7b91-4ec4-8644-4e5674267437",
"isResizable": false,
"w": 24,
"x": 0,
"y": 37
},
"name": "内存",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "(container_memory_rss + container_memory_cache + container_memory_swap + kernel memory)/ (memory limit), 没有数据表明容器配置没有设置mem limit",
"id": "e7b80412-d9b2-4589-8175-abf93a62a524",
"layout": {
"h": 7,
"i": "e7b80412-d9b2-4589-8175-abf93a62a524",
"isResizable": true,
"w": 6,
"x": 0,
"y": 38
},
"name": "容器内存使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "((sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) /(sum(container_spec_memory_limit_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)))*100",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "container_memory_rss + container_memory_cache + container_memory_swap + kernel memory",
"id": "4c2e1b54-1adb-480e-a581-1763bec5113f",
"layout": {
"h": 7,
"i": "4c2e1b54-1adb-480e-a581-1763bec5113f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 38
},
"name": "容器内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器cache 占用大小",
"id": "429e0214-7b71-4fc0-a771-945363948999",
"layout": {
"h": 7,
"i": "429e0214-7b71-4fc0-a771-945363948999",
"isResizable": true,
"w": 6,
"x": 12,
"y": 38
},
"name": "容器cache使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_cache{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器RSS内存占用大小。",
"id": "27c1a22c-bdf3-4a8d-bd31-16c66a307f3f",
"layout": {
"h": 7,
"i": "27c1a22c-bdf3-4a8d-bd31-16c66a307f3f",
"isResizable": true,
"w": 6,
"x": 18,
"y": 38
},
"name": "容器RSS内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_rss{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器内存Limit配置,0表示没有设置内存 Limit",
"id": "13173c71-14a9-43a0-9cbe-34c1bb453f2b",
"layout": {
"h": 7,
"i": "13173c71-14a9-43a0-9cbe-34c1bb453f2b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 45
},
"name": "容器内存 Limit",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_spec_memory_limit_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d3a7153b-f174-473c-a5ae-1ec8ff4ef318",
"layout": {
"h": 7,
"i": "d3a7153b-f174-473c-a5ae-1ec8ff4ef318",
"isResizable": true,
"w": 6,
"x": 6,
"y": 45
},
"name": "容器发生OOM次数",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(container_oom_events_total{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收错误数(1分钟内)",
"id": "271d8ff6-2382-488d-b479-83e38b247012",
"layout": {
"h": 7,
"i": "271d8ff6-2382-488d-b479-83e38b247012",
"isResizable": true,
"w": 6,
"x": 12,
"y": 45
},
"name": "网络接收错误数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_errors_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收丢包数(1分钟内)",
"id": "e7a0879b-82a9-4518-8c5c-93bc73548401",
"layout": {
"h": 7,
"i": "e7a0879b-82a9-4518-8c5c-93bc73548401",
"isResizable": true,
"w": 6,
"x": 18,
"y": 45
},
"name": "网络接收丢包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_packets_dropped_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8d49afd3-d8fa-4714-aaa2-b10e045c8976",
"layout": {
"h": 1,
"i": "8d49afd3-d8fa-4714-aaa2-b10e045c8976",
"isResizable": false,
"w": 24,
"x": 0,
"y": 52
},
"name": "磁盘",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef7e1b00-2a77-4a34-a138-5c8207088476",
"layout": {
"h": 7,
"i": "ef7e1b00-2a77-4a34-a138-5c8207088476",
"isResizable": true,
"w": 6,
"x": 0,
"y": 53
},
"name": "文件系统读取速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e55b7db3-2b13-43f9-aa2d-e9a0a929df4b",
"layout": {
"h": 7,
"i": "e55b7db3-2b13-43f9-aa2d-e9a0a929df4b",
"isResizable": true,
"w": 6,
"x": 6,
"y": 53
},
"name": "文件系统写入速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_writes_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a2d7ba64-fd4d-4713-9295-8663fa72f675",
"layout": {
"h": 7,
"i": "a2d7ba64-fd4d-4713-9295-8663fa72f675",
"isResizable": true,
"w": 6,
"x": 12,
"y": 53
},
"name": "容器I/O",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_io_current{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2b4e94ef-c994-4910-b72b-428bf5156a21",
"layout": {
"h": 7,
"i": "2b4e94ef-c994-4910-b72b-428bf5156a21",
"isResizable": true,
"w": 6,
"x": 18,
"y": 53
},
"name": "inode数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_inodes_total{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3abade99-8f34-4a1e-bb48-c5ba1e128749",
"layout": {
"h": 7,
"i": "3abade99-8f34-4a1e-bb48-c5ba1e128749",
"isResizable": true,
"w": 6,
"x": 0,
"y": 60
},
"name": "容器已使用的文件系统大小",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收数据包数(1分钟内)",
"id": "8b0bac93-8bd0-4452-a200-a9cea74345b3",
"layout": {
"h": 7,
"i": "8b0bac93-8bd0-4452-a200-a9cea74345b3",
"isResizable": true,
"w": 6,
"x": 6,
"y": 60
},
"name": "网络接收数据包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_packets_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fde213bc-fe49-431a-9495-2219fcaaa01a",
"layout": {
"h": 1,
"i": "fde213bc-fe49-431a-9495-2219fcaaa01a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 67
},
"name": "网络",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络发送的字节数(1 分钟内)",
"id": "4e70bc47-1510-41fe-a924-3ffa36d11f0f",
"layout": {
"h": 7,
"i": "4e70bc47-1510-41fe-a924-3ffa36d11f0f",
"isResizable": true,
"w": 6,
"x": 0,
"y": 68
},
"name": "网络发送速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送数据包数(1分钟内)",
"id": "00ca29e5-7c97-491d-a28f-0f2d66ea47c5",
"layout": {
"h": 7,
"i": "00ca29e5-7c97-491d-a28f-0f2d66ea47c5",
"isResizable": true,
"w": 6,
"x": 6,
"y": 68
},
"name": "网络发送数据包",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_packets_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送错误数(1分钟内)",
"id": "8cd76b5f-5bba-421a-9fda-42f406f54ed5",
"layout": {
"h": 7,
"i": "8cd76b5f-5bba-421a-9fda-42f406f54ed5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 68
},
"name": "网络发送错误数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_errors_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送丢包数(1分钟内)",
"id": "58aaebbb-b1cf-4089-bcb5-70b03c05318d",
"layout": {
"h": 7,
"i": "58aaebbb-b1cf-4089-bcb5-70b03c05318d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 68
},
"name": "网络发送丢包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_packets_dropped_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络接收的字节数(1 分钟内)",
"id": "120d3540-cce5-4b08-b596-0ec6a5b2d497",
"layout": {
"h": 7,
"i": "120d3540-cce5-4b08-b596-0ec6a5b2d497",
"isResizable": true,
"w": 6,
"x": 0,
"y": 75
},
"name": "网络接收速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"name": "datasource",
"type": "datasource",
"definition": "prometheus",
"defaultValue": 40
},
{
"name": "namespace",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(container_cpu_usage_seconds_total, namespace)",
"reg": "",
"multi": false
},
{
"name": "pod_name",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(container_cpu_usage_seconds_total{namespace=\"$namespace\"}, pod)",
"reg": "",
"multi": false
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327681975000
}
================================================
FILE: integrations/Kubernetes/dashboards/Scheduler.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Kubernetes / Scheduler",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler健康状态",
"id": "5d6560c5-6137-4632-bb88-ff8c9cf42e9d",
"layout": {
"h": 6,
"i": "5d6560c5-6137-4632-bb88-ff8c9cf42e9d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"links": [],
"name": "Scheduler - Health Status",
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"options": {
"0": {
"text": "DOWN"
},
"1": {
"text": "UP"
}
},
"type": "value"
},
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f80202",
"text": "DOWN"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "up{job=\"scheduler\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler 主从状态",
"id": "62e3b249-fefe-4f32-8baf-394eac053f2a",
"layout": {
"h": 6,
"i": "a5d1ef0c-83e3-4194-b242-d5c51ba4bdd2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"links": [],
"name": "Scheduler - Member Status",
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"options": {
"0": {
"text": "DOWN"
},
"1": {
"text": "UP"
}
},
"result": {
"text": "val2"
},
"type": "value"
},
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "MASTER"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#9470ff",
"text": "BACKUP"
},
"type": "special"
}
]
},
"overrides": [
{
"properties": {
"valueMappings": []
}
}
],
"targets": [
{
"expr": "leader_election_master_status{job=\"scheduler\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler 请求量 按返回码统计",
"id": "94713dc3-acb7-43b5-ae2f-399b2da61763",
"layout": {
"h": 8,
"i": "94713dc3-acb7-43b5-ae2f-399b2da61763",
"isResizable": true,
"w": 12,
"x": 0,
"y": 6
},
"links": [],
"name": "Scheduler - Requests by code",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"scheduler\"}[5m])) by (instance,code)",
"legend": "{{ instance }} {{ code }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler请求量,按请求类型统计",
"id": "a6928b49-cf0a-443e-a8fd-b999685df0be",
"layout": {
"h": 8,
"i": "a6928b49-cf0a-443e-a8fd-b999685df0be",
"isResizable": true,
"w": 12,
"x": 12,
"y": 6
},
"links": [],
"name": "Scheduler - Requests by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"scheduler\"}[5m])) by (instance,method)",
"legend": "{{ instance }} {{ method }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler请求(apiserver)量",
"id": "69690063-d044-4547-9f5f-126e5f8bf55a",
"layout": {
"h": 8,
"i": "69690063-d044-4547-9f5f-126e5f8bf55a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"links": [],
"name": "Scheduler -Requests by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(rest_client_requests_total{job=\"scheduler\"}[5m])) by (instance)",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler 90分位请求延迟,按请求类型统计",
"id": "053d10f3-1113-40e0-85aa-dfbabb706995",
"layout": {
"h": 8,
"i": "d5a67103-9930-46e0-97e1-296e0d71e30e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"links": [],
"name": "Scheduler - Requests Latancy by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{job=\"scheduler\"}[5m])) by (instance,verb,le))*1000",
"legend": "{{ instance }} {{ verb }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler 90分位请求延迟,按请求类型统计",
"id": "c86ed101-a91c-4478-b67a-7182a5e856d1",
"layout": {
"h": 8,
"i": "2924bb3f-20c3-4f56-96ff-76d473743d8b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"name": "Scheduler - Requests Latancy by url",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{job=\"scheduler\"}[5m])) by (instance,url,verb,le))*1000",
"legend": "{{ instance }} {{ verb }} {{ url }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler 90分位请求延迟,按实例统计",
"id": "80bd434e-21dc-4864-97c6-bfd1e2e27bbe",
"layout": {
"h": 8,
"i": "75671720-bca1-449f-9c68-bf562f105b66",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"name": "Scheduler - Requests Latancy by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(rest_client_request_duration_seconds_bucket{job=\"scheduler\"}[5m])) by (instance,le))*1000",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler请求5xx,按请求类型统计",
"id": "32a09298-d0f8-4d54-808e-d223d0a428ff",
"layout": {
"h": 8,
"i": "cfc389ad-5648-4107-a5bd-1680f6ede2ed",
"isResizable": true,
"w": 12,
"x": 0,
"y": 30
},
"links": [],
"name": "Scheduler - Errors by verb",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by(instance,method) (rate(rest_client_requests_total{code=~\"5..\",job=\"scheduler\"}[5m]))\n / sum by(instance,method) (rate(rest_client_requests_total{job=\"scheduler\"}[5m]))",
"legend": "{{ instance }} {{ method }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler请求5xx,按实例统计",
"id": "b6931f1f-6c43-478e-bcc7-26d1b121bceb",
"layout": {
"h": 8,
"i": "152b5817-ad87-44d0-a71f-5fbd0fc10ca3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 30
},
"links": [],
"name": "Scheduler - Errors by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum by(instance) (rate(rest_client_requests_total{code=~\"5..\",job=\"scheduler\"}[5m]))\n / sum by(instance) (rate(rest_client_requests_total{job=\"scheduler\"}[5m]))",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "pod进入调度队列的平均速率",
"id": "0c8ac9ee-2a3e-4e7e-b338-748d79f6cbb6",
"layout": {
"h": 8,
"i": "0c8ac9ee-2a3e-4e7e-b338-748d79f6cbb6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 38
},
"links": [],
"name": "Scheduler - Average Enqueue Rate by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "avg(rate(scheduler_queue_incoming_pods_total{job=\"scheduler\"}[5m])) by (instance)",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "调度器驱逐容器的次数",
"id": "ed72c89c-9732-4fb0-9187-0cabe9a4a81c",
"layout": {
"h": 8,
"i": "730be715-cfdd-4f2a-b878-1b2fb9a9be45",
"isResizable": true,
"w": 12,
"x": 12,
"y": 38
},
"links": [],
"name": "Scheduler - Preemption Attempts Total by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "avg(rate(scheduler_preemption_attempts_total{job=\"scheduler\"}[5m])) by (instance)",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "调度器cache中pod node和已绑定pod的数量",
"id": "dd189a18-704b-44c0-8a8b-186ccc591c81",
"layout": {
"h": 8,
"i": "668362fe-211a-4286-861c-eeb90907b2fb",
"isResizable": true,
"w": 12,
"x": 0,
"y": 46
},
"links": [],
"name": "Scheduler - Cache Size",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": " scheduler_scheduler_cache_size{job=\"scheduler\"}",
"legend": "{{ instance }} {{ type }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "调度队列中pending pod的数目",
"id": "0e231e69-4651-4728-abcb-46fafd1a7d61",
"layout": {
"h": 8,
"i": "f20223e4-199a-4b08-b5f0-470ac1974afc",
"isResizable": true,
"w": 12,
"x": 12,
"y": 46
},
"links": [],
"name": "Scheduler - Pending Pod",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "scheduler_pending_pods{job=\"scheduler\"}",
"legend": "{{ instance }} {{ queue }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "调度插件在每个扩展点的执行时间,90分位",
"id": "51cb0215-0827-4543-bb58-64eaec6cdc77",
"layout": {
"h": 8,
"i": "ec0db15d-fa63-4448-a961-0b8a7b977dd0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 54
},
"links": [],
"name": "Scheduler - Plugin Execution Duration by plugin",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(scheduler_plugin_execution_duration_seconds_bucket{job=\"scheduler\"}[5m])) by (instance,extension_point,plugin,status,le))*1000",
"legend": "{{ instance }} {{ plugin }} {{ extension_point }} {{ status }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "调度算法90分位耗时",
"id": "bf41347e-c307-4ae1-84d8-6202b563837f",
"layout": {
"h": 8,
"i": "c5c557a6-0f48-4154-91f7-30a300f00813",
"isResizable": true,
"w": 12,
"x": 12,
"y": 54
},
"links": [],
"name": "Scheduler - Scheduling Algorithm Duration by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"scheduler\"}[5m])) by (instance,le))*1000",
"legend": "{{ instance }} ",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "调度算法+绑定 90分位耗时",
"id": "4f2de85b-7fa7-4e06-bcbf-7362fa71f010",
"layout": {
"h": 8,
"i": "68585816-6a08-422d-a7d3-c414ae427380",
"isResizable": true,
"w": 12,
"x": 0,
"y": 62
},
"links": [],
"name": "Scheduler - Scheduling Duration by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job=\"scheduler\"}[5m])) by (instance,le,profile,result))*1000",
"legend": "{{ instance }} {{ profile }} {{ result }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler cpu使用率",
"id": "9c187c1c-f5cd-4aab-af81-09169948ab82",
"layout": {
"h": 8,
"i": "9c187c1c-f5cd-4aab-af81-09169948ab82",
"isResizable": true,
"w": 12,
"x": 12,
"y": 62
},
"links": [],
"name": "Scheduler - CPU Usage by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"util": "percentUnit"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"scheduler\"}[5m])",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler 内存使用量",
"id": "8cff2618-b2d4-4fb4-bfc2-d1d4c4f1b35c",
"layout": {
"h": 8,
"i": "8cff2618-b2d4-4fb4-bfc2-d1d4c4f1b35c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 70
},
"links": [],
"name": "Scheduler - Memory Usage by instance",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"scheduler\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.25,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "scheduler打开的fd数量",
"id": "94bb09a7-dbf7-41d8-b6a4-16b262365474",
"layout": {
"h": 8,
"i": "e438aed2-6d4a-4254-a8ec-26752385dc74",
"isResizable": true,
"w": 12,
"x": 12,
"y": 70
},
"links": [],
"name": "Scheduler - Open fds by instance",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_open_fds{job=\"scheduler\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556327691577000
}
================================================
FILE: integrations/Kubernetes/dashboards/StatefulsetContainer.json
================================================
{
"name": "Kubernetes / Statefulset / Container ",
"tags": "Categraf",
"configs": {
"panels": [
{
"collapsed": true,
"id": "79d7e3b0-b64f-4591-b5dd-994ce16b68ca",
"layout": {
"h": 1,
"i": "79d7e3b0-b64f-4591-b5dd-994ce16b68ca",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "整体概况",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用满4核,容器配置2核,使用率200%表示当前用满2核。",
"id": "860c1484-1f83-497e-a061-a50fbb3ff1dc",
"layout": {
"h": 7,
"i": "860c1484-1f83-497e-a061-a50fbb3ff1dc",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "容器CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率100%表示当前用满4核,容器配置2核,使用率100%表示当前用满2核,如果容器没有配置cpu limit,则不会显示该数值",
"id": "bb66b2cc-3658-4cae-817a-61ec3fbb93e4",
"layout": {
"h": 7,
"i": "bb66b2cc-3658-4cae-817a-61ec3fbb93e4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "容器CPU归一化后使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name)))",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "container_memory_rss + container_memory_cache + container_memory_swap + kernel memory",
"id": "4d5f9cbb-3b78-4fe6-8a4e-59ca52a49666",
"layout": {
"h": 7,
"i": "4d5f9cbb-3b78-4fe6-8a4e-59ca52a49666",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "容器内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "978e93a0-770e-42f5-a374-cafb5b4fc585",
"layout": {
"h": 7,
"i": "978e93a0-770e-42f5-a374-cafb5b4fc585",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "文件系统写入速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_writes_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "f385736d-fd05-4705-a27d-41e67fb6c843",
"layout": {
"h": 7,
"i": "f385736d-fd05-4705-a27d-41e67fb6c843",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"name": "文件系统读取速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"min": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络发送的字节数(1 分钟内)",
"id": "f9b140e8-0b44-4b32-9d54-9360b06faa48",
"layout": {
"h": 7,
"i": "f9b140e8-0b44-4b32-9d54-9360b06faa48",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"name": "网络发送速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络接收的字节数(1 分钟内)",
"id": "8f98a264-7058-4fa0-8efc-3c87954a0370",
"layout": {
"h": 7,
"i": "8f98a264-7058-4fa0-8efc-3c87954a0370",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"name": "网络接收速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1868973e-c3d3-4fd4-83b0-36ec2e06dfe0",
"layout": {
"h": 7,
"i": "1868973e-c3d3-4fd4-83b0-36ec2e06dfe0",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"name": "容器启动时长(小时)",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"targets": [
{
"expr": "sum((time()-container_start_time_seconds{pod=\"$pod_name\", image!~\".*pause.*\"})) by (name)",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8a4942a2-81a3-4473-81be-79b3541e09a3",
"layout": {
"h": 1,
"i": "8a4942a2-81a3-4473-81be-79b3541e09a3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 15
},
"name": "CPU",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用满4核,容器配置2核,使用率200%表示当前用满2核。",
"id": "54650fe9-007f-4b16-a523-baf2e91ef823",
"layout": {
"h": 7,
"i": "54650fe9-007f-4b16-a523-baf2e91ef823",
"isResizable": true,
"w": 6,
"x": 0,
"y": 16
},
"name": "容器CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min CPU平均使用率,如果pod内有多个容器,会分别显示各个容器CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率100%表示当前用满4核,容器配置2核,使用率100%表示当前用满2核,如果容器没有配置cpu limit,则不会显示该数值",
"id": "715c1e5a-c504-4f2c-a790-dad1c73aae29",
"layout": {
"h": 7,
"i": "715c1e5a-c504-4f2c-a790-dad1c73aae29",
"isResizable": true,
"w": 6,
"x": 6,
"y": 16
},
"name": "容器CPU归一化后使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name)))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min 内核态CPU平均使用率,如果pod内有多个容器,会分别显示各个容器内核态CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前内核态用满4核,容器配置2核,使用率200%表示当前内核态用满2核。",
"id": "d23a509d-3bb0-4680-b579-b89a411830a4",
"layout": {
"h": 7,
"i": "d23a509d-3bb0-4680-b579-b89a411830a4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 16
},
"name": "容器内核态CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_system_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器1min 内核态CPU平均使用率,如果pod内有多个容器,会分别显示各个容器用户态CPU使用率(pause容器默认不会显示);如果容器配置了4核,使用率400%表示当前用户态用满4核,容器配置2核,使用率200%表示当前用户态用满2核。",
"id": "2921e6d0-1b9c-449d-9dd8-fb29dce8ca7d",
"layout": {
"h": 7,
"i": "2921e6d0-1b9c-449d-9dd8-fb29dce8ca7d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 16
},
"name": "容器用户态CPU使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_user_seconds_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "该值大于0,说明CPU在运行期间发生throttle情况,即容器设置的CPU规格,不满足容器当前对CPU的实际需求,在这种情况下,往往需要调大容器CPU规格,或者优化程序,降低CPU开销。",
"id": "b302c1a0-a499-4a99-aff2-d460685846ab",
"layout": {
"h": 7,
"i": "b302c1a0-a499-4a99-aff2-d460685846ab",
"isResizable": true,
"w": 6,
"x": 0,
"y": 23
},
"name": "容器发生CPU throttle的比率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_cfs_throttled_periods_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m]))by(name) *100",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器CPU Limit,2代表容器CPU Limit为2核, 0.2代表容器CPU Limit为0.2核, 没有数据表明没有设置Limit值",
"id": "cd5e8b99-8a76-4dea-b8b2-3bff825a7f8d",
"layout": {
"h": 7,
"i": "cd5e8b99-8a76-4dea-b8b2-3bff825a7f8d",
"isResizable": true,
"w": 6,
"x": 6,
"y": 23
},
"name": "容器CPU Limit",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "(sum(container_spec_cpu_quota{pod=\"$pod_name\"}/container_spec_cpu_period{pod=\"$pod_name\"}) by (name))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "过去10s的CPU负载",
"id": "408c29b9-2a49-49a2-a98b-3ad7da9c57bd",
"layout": {
"h": 7,
"i": "408c29b9-2a49-49a2-a98b-3ad7da9c57bd",
"isResizable": true,
"w": 6,
"x": 12,
"y": 23
},
"name": "容器CPU load 10",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_cpu_load_average_10s{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fe62f7db-4f87-4da1-bfc3-60ed5039e31e",
"layout": {
"h": 7,
"i": "fe62f7db-4f87-4da1-bfc3-60ed5039e31e",
"isResizable": true,
"w": 6,
"x": 18,
"y": 23
},
"name": "uninterruptible task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"uninterruptible\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7152f66a-d907-4ee9-afd3-a04b12f1019b",
"layout": {
"h": 7,
"i": "7152f66a-d907-4ee9-afd3-a04b12f1019b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 30
},
"name": "running task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"running\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d2627af4-5753-4384-880b-84bead73002a",
"layout": {
"h": 7,
"i": "d2627af4-5753-4384-880b-84bead73002a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 30
},
"name": "ioawaiting task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"iowaiting\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "78801242-2aae-48dc-a3c3-3d0050fac92c",
"layout": {
"h": 7,
"i": "78801242-2aae-48dc-a3c3-3d0050fac92c",
"isResizable": true,
"w": 6,
"x": 12,
"y": 30
},
"name": "sleeping task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"sleeping\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "723a4b98-52e8-4284-b193-98a470189415",
"layout": {
"h": 7,
"i": "723a4b98-52e8-4284-b193-98a470189415",
"isResizable": true,
"w": 6,
"x": 18,
"y": 30
},
"name": "stopped task 数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_tasks_state{pod=\"$pod_name\", image!~\".*pause.*\", state=\"stopped\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "39a8bcef-7b91-4ec4-8644-4e5674267437",
"layout": {
"h": 1,
"i": "39a8bcef-7b91-4ec4-8644-4e5674267437",
"isResizable": false,
"w": 24,
"x": 0,
"y": 37
},
"name": "内存",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "(container_memory_rss + container_memory_cache + container_memory_swap + kernel memory)/ (memory limit), 没有数据表明容器配置没有设置mem limit",
"id": "e7b80412-d9b2-4589-8175-abf93a62a524",
"layout": {
"h": 7,
"i": "e7b80412-d9b2-4589-8175-abf93a62a524",
"isResizable": true,
"w": 6,
"x": 0,
"y": 38
},
"name": "容器内存使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"max": null,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "((sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) /(sum(container_spec_memory_limit_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)))*100",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "container_memory_rss + container_memory_cache + container_memory_swap + kernel memory",
"id": "4c2e1b54-1adb-480e-a581-1763bec5113f",
"layout": {
"h": 7,
"i": "4c2e1b54-1adb-480e-a581-1763bec5113f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 38
},
"name": "容器内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器cache 占用大小",
"id": "429e0214-7b71-4fc0-a771-945363948999",
"layout": {
"h": 7,
"i": "429e0214-7b71-4fc0-a771-945363948999",
"isResizable": true,
"w": 6,
"x": 12,
"y": 38
},
"name": "容器cache使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_cache{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器RSS内存占用大小。",
"id": "27c1a22c-bdf3-4a8d-bd31-16c66a307f3f",
"layout": {
"h": 7,
"i": "27c1a22c-bdf3-4a8d-bd31-16c66a307f3f",
"isResizable": true,
"w": 6,
"x": 18,
"y": 38
},
"name": "容器RSS内存使用",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(container_memory_rss{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器内存Limit配置,0表示没有设置内存 Limit",
"id": "13173c71-14a9-43a0-9cbe-34c1bb453f2b",
"layout": {
"h": 7,
"i": "13173c71-14a9-43a0-9cbe-34c1bb453f2b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 45
},
"name": "容器内存 Limit",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_spec_memory_limit_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d3a7153b-f174-473c-a5ae-1ec8ff4ef318",
"layout": {
"h": 7,
"i": "d3a7153b-f174-473c-a5ae-1ec8ff4ef318",
"isResizable": true,
"w": 6,
"x": 6,
"y": 45
},
"name": "容器发生OOM次数",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(container_oom_events_total{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收错误数(1分钟内)",
"id": "271d8ff6-2382-488d-b479-83e38b247012",
"layout": {
"h": 7,
"i": "271d8ff6-2382-488d-b479-83e38b247012",
"isResizable": true,
"w": 6,
"x": 12,
"y": 45
},
"name": "网络接收错误数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_errors_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收丢包数(1分钟内)",
"id": "e7a0879b-82a9-4518-8c5c-93bc73548401",
"layout": {
"h": 7,
"i": "e7a0879b-82a9-4518-8c5c-93bc73548401",
"isResizable": true,
"w": 6,
"x": 18,
"y": 45
},
"name": "网络接收丢包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_packets_dropped_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8d49afd3-d8fa-4714-aaa2-b10e045c8976",
"layout": {
"h": 1,
"i": "8d49afd3-d8fa-4714-aaa2-b10e045c8976",
"isResizable": false,
"w": 24,
"x": 0,
"y": 52
},
"name": "磁盘",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef7e1b00-2a77-4a34-a138-5c8207088476",
"layout": {
"h": 7,
"i": "ef7e1b00-2a77-4a34-a138-5c8207088476",
"isResizable": true,
"w": 6,
"x": 0,
"y": 53
},
"name": "文件系统读取速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_reads_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e55b7db3-2b13-43f9-aa2d-e9a0a929df4b",
"layout": {
"h": 7,
"i": "e55b7db3-2b13-43f9-aa2d-e9a0a929df4b",
"isResizable": true,
"w": 6,
"x": 6,
"y": 53
},
"name": "文件系统写入速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_fs_writes_bytes_total{pod=\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a2d7ba64-fd4d-4713-9295-8663fa72f675",
"layout": {
"h": 7,
"i": "a2d7ba64-fd4d-4713-9295-8663fa72f675",
"isResizable": true,
"w": 6,
"x": 12,
"y": 53
},
"name": "容器I/O",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_io_current{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2b4e94ef-c994-4910-b72b-428bf5156a21",
"layout": {
"h": 7,
"i": "2b4e94ef-c994-4910-b72b-428bf5156a21",
"isResizable": true,
"w": 6,
"x": 18,
"y": 53
},
"name": "inode数量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_inodes_total{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3abade99-8f34-4a1e-bb48-c5ba1e128749",
"layout": {
"h": 7,
"i": "3abade99-8f34-4a1e-bb48-c5ba1e128749",
"isResizable": true,
"w": 6,
"x": 0,
"y": 60
},
"name": "容器已使用的文件系统大小",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_fs_usage_bytes{pod=\"$pod_name\", image!~\".*pause.*\"}) by (name) ",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络接收数据包数(1分钟内)",
"id": "8b0bac93-8bd0-4452-a200-a9cea74345b3",
"layout": {
"h": 7,
"i": "8b0bac93-8bd0-4452-a200-a9cea74345b3",
"isResizable": true,
"w": 6,
"x": 6,
"y": 60
},
"name": "网络接收数据包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_packets_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fde213bc-fe49-431a-9495-2219fcaaa01a",
"layout": {
"h": 1,
"i": "fde213bc-fe49-431a-9495-2219fcaaa01a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 67
},
"name": "网络",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络发送的字节数(1 分钟内)",
"id": "4e70bc47-1510-41fe-a924-3ffa36d11f0f",
"layout": {
"h": 7,
"i": "4e70bc47-1510-41fe-a924-3ffa36d11f0f",
"isResizable": true,
"w": 6,
"x": 0,
"y": 68
},
"name": "网络发送速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送数据包数(1分钟内)",
"id": "00ca29e5-7c97-491d-a28f-0f2d66ea47c5",
"layout": {
"h": 7,
"i": "00ca29e5-7c97-491d-a28f-0f2d66ea47c5",
"isResizable": true,
"w": 6,
"x": 6,
"y": 68
},
"name": "网络发送数据包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_packets_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送错误数(1分钟内)",
"id": "8cd76b5f-5bba-421a-9fda-42f406f54ed5",
"layout": {
"h": 7,
"i": "8cd76b5f-5bba-421a-9fda-42f406f54ed5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 68
},
"name": "网络发送错误数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_errors_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "网络发送丢包数(1分钟内)",
"id": "58aaebbb-b1cf-4089-bcb5-70b03c05318d",
"layout": {
"h": 7,
"i": "58aaebbb-b1cf-4089-bcb5-70b03c05318d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 68
},
"name": "网络发送丢包数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_packets_dropped_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "容器网络接收的字节数(1 分钟内)",
"id": "120d3540-cce5-4b08-b596-0ec6a5b2d497",
"layout": {
"h": 7,
"i": "120d3540-cce5-4b08-b596-0ec6a5b2d497",
"isResizable": true,
"w": 6,
"x": 0,
"y": 75
},
"name": "网络接收速率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{pod=\"$pod_name\"}[1m])) by(name, interface)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kube_statefulset_labels,statefulset)",
"hide": false,
"multi": false,
"name": "statefulset",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(kube_pod_status_ready{pod=~\"$statefulset.*\"}, pod)",
"hide": false,
"multi": false,
"name": "pod_name",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1727335306984008000
}
================================================
FILE: integrations/Kubernetes/markdown/README.md
================================================
# Kubernetes
这个插件已经废弃。Kubernetes 监控系列可以参考这个 [文章](https://flashcat.cloud/categories/kubernetes%E7%9B%91%E6%8E%A7%E4%B8%93%E6%A0%8F/)。
不过 Kubernetes 这个类别下的内置告警规则和内置仪表盘都是可以使用的。
---
下面是老插件文档:
forked from telegraf/kubernetes。这个插件的作用是通过 kubelet 提供的 API 获取监控数据,包括系统容器、node、pod 数据卷、pod 网络以及 pod 容器的监控数据。
## Change
增加了一些控制开关:
`gather_system_container_metrics = true`
是否采集 system 容器(kubelet、runtime、misc、pods),比如 kubelet 一般就是静态容器,非业务容器
`gather_node_metrics = true`
是否采集 node 层面的指标。机器层面的指标其实已经由 categraf 采集了,这里理论上不需要再采集,可以设置为 false;继续采集也没问题,数据量并不大
`gather_pod_container_metrics = true`
是否采集 Pod 中的容器的指标,这些 Pod 一般是业务容器
`gather_pod_volume_metrics = true`
是否采集 Pod 的数据卷的指标
`gather_pod_network_metrics = true`
是否采集 Pod 的网络监控数据
## 容器监控
通过这些开关可以看出,kubernetes 这个插件,采集的只是 pod、容器的监控指标,这些指标数据来自 kubelet 的 `/stats/summary` `/pods` 等接口。那么问题来了,容器监控到底是应该读取 `/metrics/cadvisor` 接口还是应该用这个 kubernetes 插件?有几个决策依据:
1. `/metrics/cadvisor` 采集的数据没有业务自定义标签,kubernetes 这个插件会自动带上业务自定义标签。但是业务标签可能比较混乱,建议每个公司制定规范,比如要求业务只能打 project、region、env、service、app、job 等标签,其他标签都过滤掉。通过 kubernetes 插件的 `label_include`、`label_exclude` 配置,可以做标签过滤。
2. kubernetes 这个插件采集的数据比 `/metrics/cadvisor` 吐出的指标要少,不过常见的 cpu、mem、net、volume 相关的也都有。
================================================
FILE: integrations/Kubernetes/metrics/k8s-node.json
================================================
[
{
"uuid": 1745735239727485700,
"collector": "Node",
"typ": "Kubernetes",
"name": "TCP当前连接数",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_netstat_Tcp_CurrEstab * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "TCP当前连接数",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Current TCP Connections",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239701096000,
"collector": "Node",
"typ": "Kubernetes",
"name": "文件描述符使用数",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_filefd_allocated * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "文件描述符使用数",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Number of file descriptors used",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239704160000,
"collector": "Node",
"typ": "Kubernetes",
"name": "文件描述符最大限制",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_filefd_maximum * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "文件描述符最大限制",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "File Descriptor Maximum Limit",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239750006800,
"collector": "Node",
"typ": "Kubernetes",
"name": "文件系统inode使用率",
"unit": "",
"note": "节点指标\n类型: -",
"lang": "zh_CN",
"expression": "100 - (node_filesystem_files_free * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"} / node_filesystem_files * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"} * 100)",
"translation": [
{
"lang": "zh_CN",
"name": "文件系统inode使用率",
"note": "节点指标\n类型: -"
},
{
"lang": "en_US",
"name": "File system inode usage",
"note": "Node indicators \nType: -"
}
]
},
{
"uuid": 1745735239746991600,
"collector": "Node",
"typ": "Kubernetes",
"name": "文件系统使用率",
"unit": "",
"note": "节点指标\n类型: -",
"lang": "zh_CN",
"expression": "100 - ((node_filesystem_avail_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"} * 100) / node_filesystem_size_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "文件系统使用率",
"note": "节点指标\n类型: -"
},
{
"lang": "en_US",
"name": "File system usage",
"note": "Node indicators \nType: -"
}
]
},
{
"uuid": 1745735239753550000,
"collector": "Node",
"typ": "Kubernetes",
"name": "文件系统错误数",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(node_filesystem_device_error * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}) by (mountpoint)",
"translation": [
{
"lang": "zh_CN",
"name": "文件系统错误数",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Number of file system errors",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239743097300,
"collector": "Node",
"typ": "Kubernetes",
"name": "磁盘IO使用率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_disk_io_now[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "磁盘IO使用率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Disk IO usage",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239740169500,
"collector": "Node",
"typ": "Kubernetes",
"name": "磁盘写入IOPS",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_disk_writes_completed_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "磁盘写入IOPS",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Disk Write IOPS",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239734228700,
"collector": "Node",
"typ": "Kubernetes",
"name": "磁盘写入速率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_disk_written_bytes_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "磁盘写入速率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Disk write rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239737122600,
"collector": "Node",
"typ": "Kubernetes",
"name": "磁盘读取IOPS",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_disk_reads_completed_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "磁盘读取IOPS",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Disk Read IOPS",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239730406000,
"collector": "Node",
"typ": "Kubernetes",
"name": "磁盘读取速率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_disk_read_bytes_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "磁盘读取速率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Disk read rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239694202600,
"collector": "Node",
"typ": "Kubernetes",
"name": "系统上下文切换率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_context_switches_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "系统上下文切换率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "System context switching rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239697167400,
"collector": "Node",
"typ": "Kubernetes",
"name": "系统中断率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "rate(node_intr_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "系统中断率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "System interruption rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239724650200,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络发送丢包率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(rate(node_network_transmit_drop_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送丢包率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network transmission packet loss rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239710266000,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络发送带宽",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(rate(node_network_transmit_bytes_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送带宽",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network transmission bandwidth",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239716205000,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络发送错误率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(rate(node_network_transmit_errs_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送错误率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network transmission error rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239721688800,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络接收丢包率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(rate(node_network_receive_drop_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收丢包率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network reception packet loss rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239707241500,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络接收带宽",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(rate(node_network_receive_bytes_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收带宽",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network reception bandwidth",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239713318000,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络接收错误率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(rate(node_network_receive_errs_total[5m]) * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收错误率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network reception error rate",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239783181800,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络连接跟踪条目数",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_nf_conntrack_entries * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "网络连接跟踪条目数",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Number of network connection tracking entries",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239786134000,
"collector": "Node",
"typ": "Kubernetes",
"name": "网络连接跟踪限制",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_nf_conntrack_entries_limit * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "网络连接跟踪限制",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Network connection tracking restrictions",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239675145700,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点 CPU 使用率",
"unit": "",
"note": "节点指标\n类型: by",
"lang": "zh_CN",
"expression": "sum by (instance) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\"}[5m])) * on(instance) group_left(nodename) node_uname_info{nodename=~\"$node_name\"} *100",
"translation": [
{
"lang": "zh_CN",
"name": "节点 CPU 使用率",
"note": "节点指标\n类型: by"
},
{
"lang": "en_US",
"name": "Node CPU usage",
"note": "Node indicators \nType: by"
}
]
},
{
"uuid": 1745735239691192000,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点15分钟负载",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_load15 * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点15分钟负载",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node 15-minute load",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239685264100,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点1分钟负载",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_load1 * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点1分钟负载",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node 1 minute load",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239688232700,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点5分钟负载",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_load5 * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点5分钟负载",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node 5-minute load",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239776256800,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点Swap使用量",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_SwapTotal_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"} - node_memory_SwapFree_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点Swap使用量",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node Swap usage",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239779806500,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点Swap总量",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_SwapTotal_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点Swap总量",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Total Node Swap",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239681529300,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点上运行的Pod数量",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(kube_pod_info * on(node) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "节点上运行的Pod数量",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Number of Pods running on a node",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239678397700,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点内存使用率",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "sum(node_memory_MemTotal_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"} - node_memory_MemAvailable_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}) / sum(node_memory_MemTotal_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"})",
"translation": [
{
"lang": "zh_CN",
"name": "节点内存使用率",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node memory usage",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239760507400,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点内存详细信息 - 可用",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_MemAvailable_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点内存详细信息 - 可用",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node memory details-Available",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239756641800,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点内存详细信息 - 总量",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_MemTotal_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点内存详细信息 - 总量",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node Memory Details-Total",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239772786200,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点内存详细信息 - 空闲",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_MemFree_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点内存详细信息 - 空闲",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node memory details-free",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239769542000,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点内存详细信息 - 缓冲区",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_Buffers_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点内存详细信息 - 缓冲区",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node Memory Details-Buffer",
"note": "Node indicators \nType: *"
}
]
},
{
"uuid": 1745735239764136000,
"collector": "Node",
"typ": "Kubernetes",
"name": "节点内存详细信息 - 缓存",
"unit": "",
"note": "节点指标\n类型: *",
"lang": "zh_CN",
"expression": "node_memory_Cached_bytes * on(instance, cluster) group_left(nodename) node_uname_info{nodename=~\"$node_name\"}",
"translation": [
{
"lang": "zh_CN",
"name": "节点内存详细信息 - 缓存",
"note": "节点指标\n类型: *"
},
{
"lang": "en_US",
"name": "Node Memory Details-Cache",
"note": "Node indicators \nType: *"
}
]
}
]
================================================
FILE: integrations/Kubernetes/metrics/k8s-pod.json
================================================
[
{
"uuid": 1745893024149445000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "Inode数量",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_fs_inodes_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "Inode数量",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Number of Inodes",
"note": "Pod's own indicators \nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024121015300,
"collector": "Pod",
"typ": "Kubernetes",
"name": "不可中断任务数量",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_tasks_state{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\", state=\"uninterruptible\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "不可中断任务数量",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Number of uninterruptible tasks",
"note": "Pod's own indicators \nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024130551800,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器cache使用",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "(sum(container_memory_cache{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"translation": [
{
"lang": "zh_CN",
"name": "容器cache使用",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container cache use",
"note": "Pod's own indicators \nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024108569900,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器CPU Limit",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}/container_spec_cpu_period{namespace=\"$namespace\",",
"lang": "zh_CN",
"expression": "(sum(container_spec_cpu_quota{namespace=\"$namespace\", pod=~\"$pod_name\"}/container_spec_cpu_period{namespace=\"$namespace\", pod=~\"$pod_name\"}) by (name))",
"translation": [
{
"lang": "zh_CN",
"name": "容器CPU Limit",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}/container_spec_cpu_period{namespace=\"$namespace\","
},
{
"lang": "en_US",
"name": "Container CPU Limit",
"note": "Pod's own indicators \nType: pod=~\"$pod_name\"}/container_spec_cpu_period{namespace=\"$namespace\","
}
]
},
{
"uuid": 1745893024112672500,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器CPU load 10",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_cpu_load_average_10s{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器CPU load 10",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container CPU load 10",
"note": "Pod's own indicators \nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024026246700,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器CPU使用率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器CPU使用率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container CPU usage",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024029544000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器CPU归一化后使用率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)/((sum(container_spec_cpu_quota{namespace=\"$namespace\", pod=~\"$pod_name\"}/container_spec_cpu_period{namespace=\"$namespace\", pod=~\"$pod_name\"}) by (name)))",
"translation": [
{
"lang": "zh_CN",
"name": "容器CPU归一化后使用率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container CPU usage after normalization",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024146207700,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器I/O",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_fs_io_current{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器I/O",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container I/O",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024136457000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器RSS内存使用",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "(sum(container_memory_rss{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"translation": [
{
"lang": "zh_CN",
"name": "容器RSS内存使用",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container RSS memory usage",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024139900200,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器内存 Limit",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_spec_memory_limit_bytes{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器内存 Limit",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container Memory Limit",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024032984300,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器内存使用",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "(sum(container_memory_usage_bytes{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name))",
"translation": [
{
"lang": "zh_CN",
"name": "容器内存使用",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container memory usage",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024127585500,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器内存使用率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "((sum(container_memory_usage_bytes{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)) /(sum(container_spec_memory_limit_bytes{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)))*100",
"translation": [
{
"lang": "zh_CN",
"name": "容器内存使用率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container memory utilization",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024093620000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器内核态CPU使用率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_cpu_system_seconds_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器内核态CPU使用率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container kernel mode CPU usage",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024102879000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器发生CPU throttle的比率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m]))by(name) *100",
"translation": [
{
"lang": "zh_CN",
"name": "容器发生CPU throttle的比率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "The rate at which container CPU throttle occurs",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024143177000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器发生OOM次数",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_oom_events_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器发生OOM次数",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Number of OOM occurrences for container",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024083942000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器启动时长(小时)",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum((time()-container_start_time_seconds{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"})) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器启动时长(小时)",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container uptime (hours)",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024152466200,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器已使用的文件系统大小",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(container_fs_usage_bytes{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}) by (name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器已使用的文件系统大小",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "File system size used by the container",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024097849600,
"collector": "Pod",
"typ": "Kubernetes",
"name": "容器用户态CPU使用率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_cpu_user_seconds_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m])*100) by(name)",
"translation": [
{
"lang": "zh_CN",
"name": "容器用户态CPU使用率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "Container user mode CPU usage",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024036896800,
"collector": "Pod",
"typ": "Kubernetes",
"name": "文件系统写入速率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_fs_writes_bytes_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"translation": [
{
"lang": "zh_CN",
"name": "文件系统写入速率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "File system write rate",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024057722000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "文件系统读取速率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\",",
"lang": "zh_CN",
"expression": "sum(rate(container_fs_reads_bytes_total{namespace=\"$namespace\", pod=~\"$pod_name\", image!~\".*pause.*\"}[1m])) by(name)",
"translation": [
{
"lang": "zh_CN",
"name": "文件系统读取速率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\","
},
{
"lang": "en_US",
"name": "File system read rate",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\","
}
]
},
{
"uuid": 1745893024166898000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络发送丢包数",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_transmit_packets_dropped_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送丢包数",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Network transmit packets dropped",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024160266500,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络发送数据包",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_transmit_packets_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送数据包",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Network packets sent",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024069935000,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络发送速率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_transmit_bytes_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送速率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Network transmission rate",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024163721700,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络发送错误数",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_transmit_errors_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络发送错误数",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Number of network transmission errors",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024173485600,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络接收丢包数",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_receive_packets_dropped_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收丢包数",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Network receive packets dropped",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024156389600,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络接收数据包数",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_receive_packets_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收数据包数",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Number of packets received by network",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024075864800,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络接收速率",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收速率",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Network reception rate",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
},
{
"uuid": 1745893024170233300,
"collector": "Pod",
"typ": "Kubernetes",
"name": "网络接收错误数",
"unit": "",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))",
"lang": "zh_CN",
"expression": "sum(rate(container_network_receive_errors_total{namespace=\"$namespace\", pod=~\"$pod_name\"}[1m])) by(name, interface)",
"translation": [
{
"lang": "zh_CN",
"name": "网络接收错误数",
"note": "Pod自身指标\n类型: pod=~\"$pod_name\"}[1m]))"
},
{
"lang": "en_US",
"name": "Number of network reception errors",
"note": "Pod's own metrics\nType: pod=~\"$pod_name\"}[1m]))"
}
]
}
]
================================================
FILE: integrations/Kubernetes/record-rules/kube-controller-plane.json
================================================
[
{
"cluster": "",
"name": ":node_memory_MemAvailable_bytes:sum",
"note": "",
"disabled": 0,
"prom_ql": "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n) by (cluster)\n",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "apiserver_request:availability30d",
"note": "",
"disabled": 0,
"prom_ql": "1 - (\n (\n # write too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n ) +\n (\n # read too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n ) +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d)\n",
"prom_eval_interval": 180,
"append_tags": [
"verb=all"
]
},
{
"cluster": "",
"name": "apiserver_request:availability30d",
"note": "",
"disabled": 0,
"prom_ql": "1 - (\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"LIST|GET\"})\n -\n (\n # too slow\n (\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=~\"resource|\",le=\"1\"})\n or\n vector(0)\n )\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"namespace\",le=\"5\"})\n +\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"LIST|GET\",scope=\"cluster\",le=\"30\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})\n",
"prom_eval_interval": 180,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:availability30d",
"note": "",
"disabled": 0,
"prom_ql": "1 - (\n (\n # too slow\n sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})\n -\n sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~\"POST|PUT|PATCH|DELETE\",le=\"1\"})\n )\n +\n # errors\n sum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\",code=~\"5..\"} or vector(0))\n)\n/\nsum by (cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})\n",
"prom_eval_interval": 180,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate1d",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1d]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate1d",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1d]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate1h",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[1h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[1h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[1h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate1h",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[1h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[1h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[1h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate2h",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[2h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[2h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[2h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate2h",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[2h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[2h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[2h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate30m",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[30m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[30m]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[30m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate30m",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[30m]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[30m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[30m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate3d",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[3d]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[3d]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[3d]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate3d",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[3d]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[3d]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[3d]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate5m",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[5m]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[5m]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[5m]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate5m",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[5m]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[5m]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate6h",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n (\n (\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=~\"resource|\",le=\"1\"}[6h]))\n or\n vector(0)\n )\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"namespace\",le=\"5\"}[6h]))\n +\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\",scope=\"cluster\",le=\"30\"}[6h]))\n )\n )\n +\n # errors\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=read"
]
},
{
"cluster": "",
"name": "apiserver_request:burnrate6h",
"note": "",
"disabled": 0,
"prom_ql": "(\n (\n # too slow\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[6h]))\n -\n sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\",le=\"1\"}[6h]))\n )\n +\n sum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",code=~\"5..\"}[6h]))\n)\n/\nsum by (cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))\n",
"prom_eval_interval": 15,
"append_tags": [
"verb=write"
]
},
{
"cluster": "",
"name": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits",
"note": "",
"disabled": 0,
"prom_ql": "kube_pod_container_resource_limits{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n )\n",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests",
"note": "",
"disabled": 0,
"prom_ql": "kube_pod_container_resource_requests{resource=\"cpu\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "cluster:namespace:pod_memory:active:kube_pod_container_resource_limits",
"note": "",
"disabled": 0,
"prom_ql": "kube_pod_container_resource_limits{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "cluster:namespace:pod_memory:active:kube_pod_container_resource_requests",
"note": "",
"disabled": 0,
"prom_ql": "kube_pod_container_resource_requests{resource=\"memory\",job=\"kube-state-metrics\"} * on (namespace, pod, cluster)\ngroup_left() max by (namespace, pod, cluster) (\n (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)\n)\n",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "cluster:node_cpu:ratio_rate5m",
"note": "",
"disabled": 0,
"prom_ql": "avg by (cluster) (\n node:node_cpu_utilization:ratio_rate5m\n)\n",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))) > 0\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.99",
"verb=read"
]
},
{
"cluster": "",
"name": "cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\",subresource!~\"proxy|attach|log|exec|portforward\"}[5m]))) > 0\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.99",
"verb=write"
]
},
{
"cluster": "",
"name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.99"
]
},
{
"cluster": "",
"name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.9"
]
},
{
"cluster": "",
"name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.5"
]
},
{
"cluster": "",
"name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.99"
]
},
{
"cluster": "",
"name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile",
"note": "",
"disabled": 0,
"prom_ql": "histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n",
"prom_eval_interval": 15,
"append_tags": [
"quantile=0.9"
]
}
]
================================================
FILE: integrations/Kubernetes/record-rules/node-exporter.json
================================================
[
{
"cluster": "",
"name": "cluster:node_cpu:ratio",
"note": "",
"disabled": 0,
"prom_ql": "cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "cluster:node_cpu:sum_rate5m",
"note": "",
"disabled": 0,
"prom_ql": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "count:up0",
"note": "",
"disabled": 0,
"prom_ql": "count without(instance, pod, node) (up == 0)",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "count:up1",
"note": "",
"disabled": 0,
"prom_ql": "count without(instance, pod, node) (up == 1)",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "instance:node_cpu:rate:sum",
"note": "",
"disabled": 0,
"prom_ql": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m])) BY (instance)",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "instance:node_cpu:ratio",
"note": "",
"disabled": 0,
"prom_ql": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "instance:node_network_receive_bytes:rate:sum",
"note": "",
"disabled": 0,
"prom_ql": "sum(rate(node_network_receive_bytes_total[3m])) BY (instance)",
"prom_eval_interval": 15,
"append_tags": []
},
{
"cluster": "",
"name": "instance:node_network_transmit_bytes:rate:sum",
"note": "",
"disabled": 0,
"prom_ql": "sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)",
"prom_eval_interval": 15,
"append_tags": []
}
]
================================================
FILE: integrations/Ldap/collect/ldap/ldap.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Server to monitor
## The scheme determines the mode to use for connection with
## ldap://... -- unencrypted (non-TLS) connection
## ldaps://... -- TLS connection
## starttls://... -- StartTLS connection
## If no port is given, the default ports, 389 for ldap and starttls and
## 636 for ldaps, are used.
#server = "ldap://localhost"
## Server dialect, can be "openldap" or "389ds"
# dialect = "openldap"
## DN and password to bind with
## If bind_dn is empty an anonymous bind is performed.
bind_dn = ""
bind_password = ""
## Reverse the field names constructed from the monitoring DN
# reverse_field_names = false
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: integrations/Ldap/markdown/README.md
================================================
# LDAP Input Plugin
This plugin gathers metrics from LDAP servers' monitoring (`cn=Monitor`)
backend. Currently this plugin supports [OpenLDAP](https://www.openldap.org/)
and [389ds](https://www.port389.org/) servers.
To use this plugin you must enable the monitoring backend/plugin of your LDAP
server. See
[OpenLDAP](https://www.openldap.org/devel/admin/monitoringslapd.html) or 389ds
documentation for details.
## Metrics
Depending on the server dialect, different metrics are produced. The metrics
are usually named according to the selected dialect.
### Tags
- server -- Server name or IP
- port -- Port used for connecting
## Example Output
Using the `openldap` dialect
```text
openldap_modify_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_referrals_statistics agent_hostname=zy-fat port=389 server=localhost 0
openldap_unbind_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_delete_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_extended_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_pdu_statistics agent_hostname=zy-fat port=389 server=localhost 42
openldap_starting_threads agent_hostname=zy-fat port=389 server=localhost 0
openldap_active_threads agent_hostname=zy-fat port=389 server=localhost 1
openldap_uptime_time agent_hostname=zy-fat port=389 server=localhost 102
openldap_bytes_statistics agent_hostname=zy-fat port=389 server=localhost 3176
openldap_compare_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_bind_operations_completed agent_hostname=zy-fat port=389 server=localhost 1
openldap_total_connections agent_hostname=zy-fat port=389 server=localhost 1002
openldap_search_operations_completed agent_hostname=zy-fat port=389 server=localhost 1
openldap_abandon_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_add_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_open_threads agent_hostname=zy-fat port=389 server=localhost 1
openldap_add_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_operations_initiated agent_hostname=zy-fat port=389 server=localhost 3
openldap_write_waiters agent_hostname=zy-fat port=389 server=localhost 0
openldap_entries_statistics agent_hostname=zy-fat port=389 server=localhost 41
openldap_modrdn_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_pending_threads agent_hostname=zy-fat port=389 server=localhost 0
openldap_max_pending_threads agent_hostname=zy-fat port=389 server=localhost 0
openldap_bind_operations_initiated agent_hostname=zy-fat port=389 server=localhost 1
openldap_max_file_descriptors_connections agent_hostname=zy-fat port=389 server=localhost 1024
openldap_compare_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_search_operations_initiated agent_hostname=zy-fat port=389 server=localhost 2
openldap_modrdn_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_read_waiters agent_hostname=zy-fat port=389 server=localhost 1
openldap_backload_threads agent_hostname=zy-fat port=389 server=localhost 1
openldap_current_connections agent_hostname=zy-fat port=389 server=localhost 1
openldap_unbind_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_delete_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_extended_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_modify_operations_initiated agent_hostname=zy-fat port=389 server=localhost 0
openldap_max_threads agent_hostname=zy-fat port=389 server=localhost 16
openldap_abandon_operations_completed agent_hostname=zy-fat port=389 server=localhost 0
openldap_operations_completed agent_hostname=zy-fat port=389 server=localhost 2
openldap_database_2_databases agent_hostname=zy-fat port=389 server=localhost 0
```
Using the `389ds` dialect
```text
389ds_current_connections_at_max_threads agent_hostname=zy-fat port=389 server=localhost 0
389ds_connections_max_threads agent_hostname=zy-fat port=389 server=localhost 0
389ds_add_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_dtablesize agent_hostname=zy-fat port=389 server=localhost 63936
389ds_strongauth_binds agent_hostname=zy-fat port=389 server=localhost 13
389ds_modrdn_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_maxthreads_per_conn_hits agent_hostname=zy-fat port=389 server=localhost 0
389ds_current_connections agent_hostname=zy-fat port=389 server=localhost 2
389ds_security_errors agent_hostname=zy-fat port=389 server=localhost 0
389ds_entries_sent agent_hostname=zy-fat port=389 server=localhost 13
389ds_cache_entries agent_hostname=zy-fat port=389 server=localhost 0
389ds_backends agent_hostname=zy-fat port=389 server=localhost 0
389ds_threads agent_hostname=zy-fat port=389 server=localhost 17
389ds_connections agent_hostname=zy-fat port=389 server=localhost 2
389ds_read_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_entries_returned agent_hostname=zy-fat port=389 server=localhost 13
389ds_unauth_binds agent_hostname=zy-fat port=389 server=localhost 0
389ds_search_operations agent_hostname=zy-fat port=389 server=localhost 14
389ds_simpleauth_binds agent_hostname=zy-fat port=389 server=localhost 0
389ds_operations_completed agent_hostname=zy-fat port=389 server=localhost 51
389ds_connections_in_max_threads agent_hostname=zy-fat port=389 server=localhost 0
389ds_modify_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_wholesubtree_search_operations agent_hostname=zy-fat port=389 server=localhost 1
389ds_read_waiters agent_hostname=zy-fat port=389 server=localhost 0
389ds_compare_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_errors agent_hostname=zy-fat port=389 server=localhost 13
389ds_in_operations agent_hostname=zy-fat port=389 server=localhost 52
389ds_total_connections agent_hostname=zy-fat port=389 server=localhost 15
389ds_cache_hits agent_hostname=zy-fat port=389 server=localhost 0
389ds_list_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_referrals_returned agent_hostname=zy-fat port=389 server=localhost 0
389ds_copy_entries agent_hostname=zy-fat port=389 server=localhost 0
389ds_operations_initiated agent_hostname=zy-fat port=389 server=localhost 52
389ds_chainings agent_hostname=zy-fat port=389 server=localhost 0
389ds_bind_security_errors agent_hostname=zy-fat port=389 server=localhost 0
389ds_onelevel_search_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_bytes_sent agent_hostname=zy-fat port=389 server=localhost 1702
389ds_bytes_received agent_hostname=zy-fat port=389 server=localhost 0
389ds_referrals agent_hostname=zy-fat port=389 server=localhost 0
389ds_delete_operations agent_hostname=zy-fat port=389 server=localhost 0
389ds_anonymous_binds agent_hostname=zy-fat port=389 server=localhost 0
```
================================================
FILE: integrations/Linux/alerts/CommonAlertingRules-Categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - expected to be written full in 4 hours - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "predict_linear(disk_free[1h], 4*3600) \u003c 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "predict_linear(disk_free[1h], 4*3600) \u003c 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327696047000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - IO is a bit busy - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "rate(diskio_io_time[1m])/10 \u003e 99",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(diskio_io_time[1m])/10 \u003e 99",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327696483000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Lost connection with monitoring target - categraf",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
0
],
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327697002000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Machine load - high memory, please pay attention - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "mem_available_percent \u003c 25",
"severity": 3
},
{
"prom_ql": "mem_available_percent \u003c 15",
"severity": 2
},
{
"prom_ql": "mem_available_percent \u003c 5",
"severity": 1
}
]
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327697454000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "number of TIME_WAIT exceeds 20,000 - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "netstat_tcp_time_wait \u003e 20000",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "netstat_tcp_time_wait \u003e 20000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327697935000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the inbound direction - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "increase(net_drop_in[1m]) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327698403000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the outbound direction - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "increase(net_drop_out[1m]) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327698824000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "A disk larger than 200G is running out of space",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 20 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 3
},
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 10 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 2
},
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 2 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327699274000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "A disk smaller than 200G is running out of space",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_used_percent \u003e 90 and disk_total/1024/1024/1024 \u003c 200",
"severity": 3
},
{
"prom_ql": "disk_used_percent \u003e 95 and disk_total/1024/1024/1024 \u003c 200",
"severity": 2
},
{
"prom_ql": "disk_used_percent \u003e 99 and disk_total/1024/1024/1024 \u003c 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "",
"enable_stimes": [
"00:00"
],
"enable_etime": "",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": null,
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327699689000
}
]
================================================
FILE: integrations/Linux/alerts/linux_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - expected to be written full in 4 hours - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "predict_linear(disk_free[1h], 4*3600) \u003c 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327701151000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - IO is a bit busy - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(diskio_io_time[1m])/10 \u003e 99",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327701630000
},
{
"id": 0,
"group_id": 0,
"cate": "host",
"datasource_ids": null,
"cluster": "",
"name": "Lost connection with monitoring target - categraf",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {},
"notify_aggregation": {}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327702101000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Machine load - high memory, please pay attention - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "mem_available_percent \u003c 25",
"severity": 3
},
{
"prom_ql": "mem_available_percent \u003c 15",
"severity": 2
},
{
"prom_ql": "mem_available_percent \u003c 5",
"severity": 1
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327702614000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "number of TIME_WAIT exceeds 20,000 - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "netstat_tcp_time_wait \u003e 20000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327703157000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the inbound direction - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327703673000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the outbound direction - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327704127000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "大于200G的盘,空间不足了",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 20 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 3
},
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 10 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 2
},
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 2 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327704616000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "小于200G的盘,空间不足了",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
1,
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_used_percent \u003e 90 and disk_total/1024/1024/1024 \u003c 200",
"severity": 3
},
{
"prom_ql": "disk_used_percent \u003e 95 and disk_total/1024/1024/1024 \u003c 200",
"severity": 2
},
{
"prom_ql": "disk_used_percent \u003e 99 and disk_total/1024/1024/1024 \u003c 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327705264000
}
]
================================================
FILE: integrations/Linux/alerts/linux_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "A certain disk is unable to read/write normally",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_filesystem_device_error{mountpoint!~\"/var/lib/.*\",mountpoint!~\"/run.*\"}) \u003e 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327706838000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Disk needs to be cleaned - utilization has reached 92% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes) ) \u003e 92 ",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327707545000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Insufficient computational resources - average load per core of the machine is greater than 10 - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg (node_load1) by (instance)/count(count(node_cpu_seconds_total) by (cpu,instance)) by (instance) \u003e10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327708152000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Insufficient file handles - usage exceeds 90% - node exporter",
"note": "You can increase the file handle limit or expand the capacity",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_filefd_allocated/node_filefd_maximum*100) \u003e 90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327708746000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Insufficient inode resources - usage exceeds 90% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(100 - ((node_filesystem_files_free * 100) / node_filesystem_files))\u003e90",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327709386000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Insufficient memory resources - utilization is greater than 75% - node exporter",
"note": "Expansion or upgrading of configuration is required",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes))/node_memory_MemTotal_bytes*100 \u003e 75",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327709948000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Insufficient memory resources - utilization is greater than 95% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(node_memory_MemTotal_bytes - node_memory_MemFree_bytes - (node_memory_Cached_bytes + node_memory_Buffers_bytes))/node_memory_MemTotal_bytes*100 \u003e 95",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327710525000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Packet loss in the inbound direction of the network card - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(node_network_receive_drop_total{device=~\"e.*\"}[1m]) \u003e 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327711029000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Packet loss in the outbound direction of the network card - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(node_network_transmit_drop_total{device=~\"e.*\"}[1m]) \u003e 3",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327711548000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "System conntrack needs to be adjusted - usage exceeds 80% - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit*100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327712073000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "System experiences OOM - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(node_vmstat_oom_kill[1m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327712633000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Too many running processes - exceeding 3000 - node exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "node_procs_running \u003e 3000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327713288000
}
]
================================================
FILE: integrations/Linux/alerts/linux_by_telegraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Address is unreachable by PING, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ping_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327714548000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - expected to be written full in 4 hours - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "predict_linear(disk_free[1h], 4*3600) \u003c 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327715253000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - IO is very busy - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(diskio_io_time[1m])/10 \u003e 99",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327715994000
},
{
"id": 0,
"group_id": 0,
"cate": "host",
"datasource_ids": null,
"cluster": "",
"name": "Lost connection with monitoring target - telegraf",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": {
"escalation": {},
"notify_aggregation": {}
},
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327717532000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Machine load - high CPU, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "cpu_usage_idle{cpu=\"cpu-total\"} \u003c 25",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327718353000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Machine load - high memory, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mem_available_percent \u003c 25",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327719248000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Network connection - number of TIME_WAIT exceeds 20,000 - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "netstat_tcp_time_wait \u003e 20000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327720659000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the inbound direction - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327721904000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the outbound direction - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327722565000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Port detection failed, please pay attention - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327723152000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "process handle limit is too small - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft \u003c 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327723625000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Process monitoring - lookup failure - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_result_code != 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327724170000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "there are processes with 0 count, a certain process may have crashed - telegraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327724727000
}
]
================================================
FILE: integrations/Linux/alerts/常用中文告警规则-采集器Categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Conntrack表的使用率超过80%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "conntrack_ip_conntrack_count / ip_conntrack_max \u003e 0.8",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327726493000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "CPU利用率较高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "cpu_usage_active\u003e75",
"severity": 3
},
{
"prom_ql": "cpu_usage_active\u003e85",
"severity": 2
},
{
"prom_ql": "cpu_usage_active\u003e95",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327726998000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "HTTP地址探测失败",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "http_response_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327727426000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "IO比较繁忙",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "rate(diskio_io_time[1m])/10 \u003e 99",
"severity": 2
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327727909000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "NTP时间偏移太大",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ntp_offset_ms \u003e 1000 or ntp_offset_ms \u003c -1000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327728333000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PING地址探测失败",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ping_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327728713000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "TIME_WAIT状态的连接超过2万,可能需要关注",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "netstat_tcp_time_wait \u003e 20000",
"severity": 2
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327729188000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "内存利用率比较高",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "mem_available_percent \u003c 25",
"severity": 3
},
{
"prom_ql": "mem_available_percent \u003c 15",
"severity": 2
},
{
"prom_ql": "mem_available_percent \u003c 5",
"severity": 1
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327729533000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "文件句柄使用率超过90%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "linux_sysctl_fs_file_nr/linux_sysctl_fs_file_max\u003e0.9",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327729938000
},
{
"id": 0,
"group_id": 0,
"cate": "host",
"datasource_ids": null,
"cluster": "",
"name": "机器监控 - agent失联超过60秒",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327730336000
},
{
"id": 0,
"group_id": 0,
"cate": "host",
"datasource_ids": null,
"cluster": "",
"name": "机器监控 - 时间偏移超过3秒,请注意",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
}
],
"triggers": [
{
"duration": 3000,
"severity": 2,
"type": "offset"
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327730730000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "每个CPU Core的平均负载任务超过3",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "system_load_norm_1\u003e3",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327731150000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "硬盘监控 - inode使用率超过90%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "disk_inodes_used/disk_inodes_total \u003e0.9",
"severity": 2
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327731534000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "硬盘监控 - 大于200G的盘,空间不足了",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 20 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 3
},
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 10 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 2
},
{
"prom_ql": "disk_free/1024/1024/1024 \u003c 2 and disk_total/1024/1024/1024 \u003e= 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327731912000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "硬盘监控 - 小于200G的盘,空间不足了",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_used_percent \u003e 90 and disk_total/1024/1024/1024 \u003c 200",
"severity": 3
},
{
"prom_ql": "disk_used_percent \u003e 95 and disk_total/1024/1024/1024 \u003c 200",
"severity": 2
},
{
"prom_ql": "disk_used_percent \u003e 99 and disk_total/1024/1024/1024 \u003c 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327732287000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "硬盘监控 - 设备有报错",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "disk_device_error\u003e0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327732665000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "硬盘监控 - 预测硬盘会在未来4小时内写满",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "predict_linear(disk_free[1h], 4*3600) \u003c 0",
"severity": 1
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327733242000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "系统有OOM产生",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase(kernel_vmstat_oom_kill[2m]) > 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327733610000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "网卡监控 - 入方向有丢包",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327734000000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "网卡监控 - 入方向有错包",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "increase(net_err_in[1m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327734421000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "网卡监控 - 出方向有丢包",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327734847000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "网卡监控 - 出方向有错包",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "increase(net_err_out[1m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327735362000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "网络地址探活失败",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327735768000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "进程监控-有进程数为0,某进程可能挂了",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_count == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327736228000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "进程监控-进程句柄限制过小",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft \u003c 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327736656000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "进程监控-进程总量超过600",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "processes_total \u003e 600",
"severity": 2
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556327737117000
}
]
================================================
FILE: integrations/Linux/collect/arp_packet/arp_packet.toml
================================================
# # collect interval
# interval = 15
[[instances]]
#eth_device="ens192"
================================================
FILE: integrations/Linux/collect/kernel_vmstat/kernel_vmstat.toml
================================================
# # collect interval
# interval = 15
# file: /proc/vmstat
[white_list]
oom_kill = 1
nr_free_pages = 0
nr_alloc_batch = 0
nr_inactive_anon = 0
nr_active_anon = 0
nr_inactive_file = 0
nr_active_file = 0
nr_unevictable = 0
nr_mlock = 0
nr_anon_pages = 0
nr_mapped = 0
nr_file_pages = 0
nr_dirty = 0
nr_writeback = 0
nr_slab_reclaimable = 0
nr_slab_unreclaimable = 0
nr_page_table_pages = 0
nr_kernel_stack = 0
nr_unstable = 0
nr_bounce = 0
nr_vmscan_write = 0
nr_vmscan_immediate_reclaim = 0
nr_writeback_temp = 0
nr_isolated_anon = 0
nr_isolated_file = 0
nr_shmem = 0
nr_dirtied = 0
nr_written = 0
numa_hit = 0
numa_miss = 0
numa_foreign = 0
numa_interleave = 0
numa_local = 0
numa_other = 0
workingset_refault = 0
workingset_activate = 0
workingset_nodereclaim = 0
nr_anon_transparent_hugepages = 0
nr_free_cma = 0
nr_dirty_threshold = 0
nr_dirty_background_threshold = 0
pgpgin = 0
pgpgout = 0
pswpin = 0
pswpout = 0
pgalloc_dma = 0
pgalloc_dma32 = 0
pgalloc_normal = 0
pgalloc_movable = 0
pgfree = 0
pgactivate = 0
pgdeactivate = 0
pgfault = 0
pgmajfault = 0
pglazyfreed = 0
pgrefill_dma = 0
pgrefill_dma32 = 0
pgrefill_normal = 0
pgrefill_movable = 0
pgsteal_kswapd_dma = 0
pgsteal_kswapd_dma32 = 0
pgsteal_kswapd_normal = 0
pgsteal_kswapd_movable = 0
pgsteal_direct_dma = 0
pgsteal_direct_dma32 = 0
pgsteal_direct_normal = 0
pgsteal_direct_movable = 0
pgscan_kswapd_dma = 0
pgscan_kswapd_dma32 = 0
pgscan_kswapd_normal = 0
pgscan_kswapd_movable = 0
pgscan_direct_dma = 0
pgscan_direct_dma32 = 0
pgscan_direct_normal = 0
pgscan_direct_movable = 0
pgscan_direct_throttle = 0
zone_reclaim_failed = 0
pginodesteal = 0
slabs_scanned = 0
kswapd_inodesteal = 0
kswapd_low_wmark_hit_quickly = 0
kswapd_high_wmark_hit_quickly = 0
pageoutrun = 0
allocstall = 0
pgrotated = 0
drop_pagecache = 0
drop_slab = 0
numa_pte_updates = 0
numa_huge_pte_updates = 0
numa_hint_faults = 0
numa_hint_faults_local = 0
numa_pages_migrated = 0
pgmigrate_success = 0
pgmigrate_fail = 0
compact_migrate_scanned = 0
compact_free_scanned = 0
compact_isolated = 0
compact_stall = 0
compact_fail = 0
compact_success = 0
htlb_buddy_alloc_success = 0
htlb_buddy_alloc_fail = 0
unevictable_pgs_culled = 0
unevictable_pgs_scanned = 0
unevictable_pgs_rescued = 0
unevictable_pgs_mlocked = 0
unevictable_pgs_munlocked = 0
unevictable_pgs_cleared = 0
unevictable_pgs_stranded = 0
thp_fault_alloc = 0
thp_fault_fallback = 0
thp_collapse_alloc = 0
thp_collapse_alloc_failed = 0
thp_split = 0
thp_zero_page_alloc = 0
thp_zero_page_alloc_failed = 0
balloon_inflate = 0
balloon_deflate = 0
balloon_migrate = 0
================================================
FILE: integrations/Linux/collect/netstat/netstat.toml
================================================
# # collect interval
# interval = 15
disable_summary_stats = false
## If the machine has many network connections, this plugin may exhaust CPU resources; disable connection stats to avoid this.
disable_connection_stats = true
tcp_ext = false
ip_ext = false
================================================
FILE: integrations/Linux/collect/ntp/ntp.toml
================================================
# # collect interval
# interval = 15
# # ntp servers
# ntp_servers = ["ntp.aliyun.com"]
# # response timeout in seconds
# timeout = 5
================================================
FILE: integrations/Linux/collect/processes/processes.toml
================================================
# # collect interval
# interval = 15
# # force the use of the ps command to gather
# force_ps = false
# # force the use of /proc to gather
# force_proc = false
================================================
FILE: integrations/Linux/dashboards/categraf-detail.json
================================================
{
"name": "机器常用指标(如果只想看当前业务组内的机器修改大盘变量 ident 的变量类型为机器标识即可)",
"tags": "Categraf",
"ident": "",
"uuid": 1737103014612000,
"configs": {
"links": [
{
"targetBlank": true,
"title": "n9e",
"url": "https://n9e.github.io/"
},
{
"targetBlank": true,
"title": "author",
"url": "http://flashcat.cloud/"
}
],
"panels": [
{
"type": "stat",
"id": "c75ae525-ad39-458a-a7ce-2673b75fb95c",
"layout": {
"h": 5,
"w": 5,
"x": 0,
"y": 0,
"i": "507b6468-d429-42c7-b86d-d43ce65d6679",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "count(last_over_time(system_uptime{ident=~\"$ident\"}[$__rate_interval]))",
"maxDataPoints": 480,
"refId": "A",
"step": 15,
"instant": false
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "存活机器数量",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "area",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"value": null
},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [],
"standardOptions": {
"util": "none",
"decimals": 1
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "7a7bd5db-d12e-49f0-92a8-15958e99ee54",
"layout": {
"h": 5,
"w": 5,
"x": 5,
"y": 0,
"i": "7a7bd5db-d12e-49f0-92a8-15958e99ee54",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "system_uptime{ident=~\"$ident\"}",
"maxDataPoints": 240,
"refId": "A",
"instant": true,
"legend": "{{ident}}"
}
],
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"ident": "机器",
"value": "启动时长"
}
}
}
],
"name": "启动时长",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "seconds",
"decimals": 2
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "hexbin",
"id": "32298d9a-27ad-4af1-b388-72d454f0371f",
"layout": {
"h": 5,
"w": 7,
"x": 10,
"y": 0,
"i": "df545ab2-58d2-4793-8e8d-a0ca7b79a6e9",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(25, disk_used_percent{ident=~\"$ident\"})",
"legend": "[{{ident}}] {{path}}",
"maxDataPoints": 240,
"refId": "A",
"step": 15,
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "硬盘使用率(最大的25个)",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colorRange": [
"thresholds"
]
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(243, 9, 18, 1)",
"value": 99,
"type": ""
},
{
"color": "rgba(255, 130, 134, 1)",
"value": 95,
"type": ""
},
{
"color": "rgba(255, 174, 57, 1)",
"value": 80,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 1
}
}
},
{
"type": "hexbin",
"id": "3f57e062-d9df-4eb6-b01d-abd9d7eca4dd",
"layout": {
"h": 5,
"w": 7,
"x": 17,
"y": 0,
"i": "b4e16ac1-86f6-4151-884f-f2c996a62c6b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(25, disk_inodes_used{ident=~\"$ident\"}/disk_inodes_total{ident=~\"$ident\"}*100)",
"legend": "[{{ident}}] {{path}}",
"maxDataPoints": 240,
"refId": "A",
"step": 15,
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "inode 使用率(最大的25个)",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colorRange": [
"thresholds"
]
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(243, 9, 18, 1)",
"value": 99,
"type": ""
},
{
"color": "rgba(255, 130, 134, 1)",
"value": 95,
"type": ""
},
{
"color": "rgba(255, 174, 57, 1)",
"value": 80,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 1
}
}
},
{
"type": "timeseries",
"id": "93b03928-5530-4cf6-b877-793964edd889",
"layout": {
"h": 5,
"w": 10,
"x": 0,
"y": 5,
"i": "ae05d971-5884-4d5e-9a0c-1f069358d8c4",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "100-cpu_usage_idle{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU使用率历史趋势",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"columns": [
"max",
"min",
"avg",
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"min": null,
"max": null,
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(255, 101, 107, 1)",
"value": 80,
"type": ""
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "e1a7a7a1-a8dc-4408-8ed1-e15354b830c1",
"layout": {
"h": 5,
"w": 14,
"x": 10,
"y": 5,
"i": "906aeb9e-bcb7-48da-913e-a4d143403304",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(diskio_io_time{ident=~\"$ident\"}[$__rate_interval])/10",
"legend": "{{ident}} {{name}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "IO使用率历史趋势",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"columns": [
"max",
"min",
"avg",
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"min": null,
"max": null,
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(255, 101, 107, 1)",
"value": 90,
"type": ""
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b712ac92-1a59-42c0-90f2-270da6c38522",
"layout": {
"h": 5,
"w": 10,
"x": 0,
"y": 10,
"i": "e997491a-d932-473e-b739-49d21da30fa1",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "mem_used_percent{ident=~\"$ident\"}",
"legend": "{{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "内存使用率历史趋势",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"columns": [
"max",
"min",
"avg",
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"min": null,
"max": null,
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(255, 101, 107, 1)",
"value": 80,
"type": ""
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "6fe0d316-55ef-4222-9fef-6905a161b889",
"layout": {
"h": 5,
"w": 7,
"x": 10,
"y": 10,
"i": "41804f33-8263-43a2-84a9-18d9c06a47f5",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "(1 - mem_swap_free{ident=~\"$ident\"} / mem_swap_total{ident=~\"$ident\"})*100 and mem_swap_total{ident=~\"$ident\"} > 0",
"legend": "{{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "swap 使用率历史趋势",
"description": "如果没有启用 swap,这个图表是 No Data,是符合预期的",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"columns": [
"max",
"min",
"avg",
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"min": null,
"max": null,
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(255, 101, 107, 1)",
"value": 80,
"type": ""
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "6ff8ac2f-de61-4806-b166-bde46469bc14",
"layout": {
"h": 5,
"w": 7,
"x": 17,
"y": 10,
"i": "d83d8821-85c1-4ce7-9f65-2709f27c17b0",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(kernel_vmstat_oom_kill{ident=~\"$ident\"}[$__rate_interval])",
"legend": "{{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "每秒OOM次数",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"columns": [
"max",
"min",
"avg",
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "none",
"min": null,
"max": null,
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "307152d2-708c-4736-98cf-08b886cbf7f2",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 15,
"i": "307152d2-708c-4736-98cf-08b886cbf7f2",
"isResizable": false
},
"name": "网络详情",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "f2ee5d32-737c-4095-b6b7-b15b778ffdb9",
"layout": {
"h": 5,
"w": 12,
"x": 0,
"y": 16,
"i": "f2ee5d32-737c-4095-b6b7-b15b778ffdb9",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(net_bytes_recv{ident=~\"$ident\"}[$__rate_interval])*8",
"legend": "{{ident}} {{interface}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络流量-入向",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bitsSecSI",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "noraml",
"standardOptions": {
"util": "bitsIEC"
}
}
}
]
},
{
"type": "timeseries",
"id": "1ab67f48-ff24-42e5-b532-a1496ae9b2b6",
"layout": {
"h": 5,
"w": 12,
"x": 12,
"y": 16,
"i": "5cab201a-9ea9-420b-90aa-48f8e00dccca",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(net_bytes_sent{ident=~\"$ident\"}[$__rate_interval])*8",
"legend": "{{ident}} {{interface}}",
"maxDataPoints": 480,
"refId": "B",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络流量-出向",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bitsSecSI",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "noraml",
"standardOptions": {
"util": "bitsIEC"
}
}
}
]
},
{
"type": "timeseries",
"id": "cfb80689-de7b-47fb-9155-052b796dd7f5",
"layout": {
"h": 5,
"w": 12,
"x": 0,
"y": 21,
"i": "cfb80689-de7b-47fb-9155-052b796dd7f5",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "netstat_tcp_tw{ident=~\"$ident\"}",
"maxDataPoints": 480,
"refId": "B",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Time Wait 状态的连接数",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "9634c41c-e124-4d7f-9406-0f86753e8d70",
"layout": {
"h": 5,
"w": 6,
"x": 12,
"y": 21,
"i": "9634c41c-e124-4d7f-9406-0f86753e8d70",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(net_err_in{ident=~\"$ident\"}[$__rate_interval])",
"legend": "{{ident}}-{{interface}}-in",
"maxDataPoints": 480,
"refId": "A",
"step": 15
},
{
"expr": "rate(net_err_out{ident=~\"$ident\"}[$__rate_interval])",
"legend": "{{ident}}-{{interface}}-out",
"maxDataPoints": 480,
"refId": "B",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络错包",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 6
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.06,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "4123f4c1-bf8e-400e-b267-8d7f6a92691a",
"layout": {
"h": 5,
"w": 6,
"x": 18,
"y": 21,
"i": "4123f4c1-bf8e-400e-b267-8d7f6a92691a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(net_drop_in{ident=~\"$ident\"}[$__rate_interval])",
"legend": "{{ident}}-{{interface}}-in",
"maxDataPoints": 480,
"refId": "A",
"step": 15
},
{
"expr": "rate(net_drop_out{ident=~\"$ident\"}[$__rate_interval])",
"legend": "{{ident}}-{{interface}}-out",
"maxDataPoints": 480,
"refId": "B",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "网络丢包",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 6
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.06,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "aabb8263-1a9b-43fb-bee1-6c532f5012a3",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 26,
"i": "aabb8263-1a9b-43fb-bee1-6c532f5012a3",
"isResizable": false
},
"name": "其他指标",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "1b4da538-29d4-4c58-b3f4-773fabb8616c",
"layout": {
"h": 5,
"w": 12,
"x": 0,
"y": 27,
"i": "1b4da538-29d4-4c58-b3f4-773fabb8616c",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "disk_device_error{ident=~\"$ident\"}",
"legend": "{{ident}} {{path}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "硬盘出错,0是正常,非0是异常",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "0f713e5f-49cf-405a-9922-3b84cded33d4",
"layout": {
"h": 5,
"w": 12,
"x": 12,
"y": 27,
"i": "dbfdda49-de73-48ac-a224-b68a1ee61752",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "100 * conntrack_ip_conntrack_count{ident=~\"$ident\"} / conntrack_ip_conntrack_max{ident=~\"$ident\"}",
"legend": "ip_conntrack {{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
},
{
"__mode__": "__query__",
"expr": "100 * conntrack_nf_conntrack_count{ident=~\"$ident\"} / conntrack_nf_conntrack_max{ident=~\"$ident\"}",
"legend": "nf_conntrack {{ident}}",
"maxDataPoints": 480,
"refId": "B",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Conntrack使用率",
"description": "`dmesg -T` 有时看到 conntrack table full 的报错,大概率就是 conntrack 限制太小了,需要调整内核参数",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 6
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "7c90380f-5ab6-4aa5-9070-f604985a0389",
"layout": {
"h": 5,
"w": 12,
"x": 0,
"y": 32,
"i": "e7117d7c-b946-49fa-bc49-2afb0d2b3a44",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "processes_total{ident=~\"$ident\"}",
"legend": "",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Process 总量",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "3334c222-dd92-49eb-9744-4ce0f59031e4",
"layout": {
"h": 5,
"w": 12,
"x": 12,
"y": 32,
"i": "0ecb9f26-4c4d-40d7-9934-5116e3ffa51a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "procstat_rlimit_num_fds_hard{ident=~\"$ident\"}",
"legend": "",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "进程句柄数限制(低于4096要注意)",
"description": "以现在的硬件配置,通常句柄的 ulimit 应该比较大,如果低于 4096,大概率是忘记修改配置了,需要注意。这个数据是 Categraf 的 procstat 插件采集的。",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "c3ee640f-e654-4fc7-aa2a-0dd8e9de67cb",
"layout": {
"h": 5,
"w": 12,
"x": 0,
"y": 37,
"i": "423adbbf-8c23-45ab-b7d5-9a81b72291f1",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "ntp_offset_ms{ident=~\"$ident\"}",
"legend": "",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "NTP时间偏移",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "milliseconds",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "9bb8d5ef-dc4e-419f-8e95-6dbb97b2afb6",
"layout": {
"h": 5,
"w": 12,
"x": 12,
"y": 37,
"i": "e97f1934-26e8-4bf3-be21-95307443f146",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "linux_sysctl_fs_file_nr{ident=~\"$ident\"}/linux_sysctl_fs_file_max{ident=~\"$ident\"} * 100",
"legend": "",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "操作系统文件句柄使用率",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 0
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(44, 157, 61, 1)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"definition": "prometheus",
"label": "数据源",
"name": "prom",
"type": "datasource"
},
{
"name": "ident",
"label": "所有机器",
"type": "query",
"hide": false,
"multi": true,
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(system_uptime, ident)"
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Linux/dashboards/categraf-overview.json
================================================
{
"name": "机器台账表格视图(使用 Categraf 作为采集器)",
"tags": "Categraf",
"ident": "",
"uuid": 1717556327742611000,
"configs": {
"links": [
{
"targetBlank": true,
"title": "n9e",
"url": "https://n9e.github.io/"
},
{
"targetBlank": true,
"title": "author",
"url": "http://flashcat.cloud/"
}
],
"panels": [
{
"type": "hexbin",
"id": "21b8b3ab-26aa-47cb-b814-f310f2d143aa",
"layout": {
"h": 5,
"i": "21b8b3ab-26aa-47cb-b814-f310f2d143aa",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(100, cpu_usage_active{cpu=\"cpu-total\", ident=~\"$ident\"})",
"instant": true,
"legend": "{{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU利用率",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"valueField": "Value",
"colorRange": [
"thresholds"
],
"detailUrl": "/components/dashboard/detail?__uuid__=1737103014612000&ident=${__field.labels.ident}"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#ef3c3c",
"value": 95,
"type": ""
},
{
"color": "#ff656b",
"type": "",
"value": 85
},
{
"color": "#ffae39",
"type": "",
"value": 75
},
{
"color": "#2c9d3d",
"type": "base",
"value": null
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 2
}
}
},
{
"type": "hexbin",
"id": "86d4a502-21f7-4981-9b38-ed8e696b6f49",
"layout": {
"h": 5,
"i": "872b2040-c5b0-43fe-92c7-e37cb77edffc",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(100, mem_used_percent{ident=~\"$ident\"})",
"instant": true,
"legend": "{{ident}}",
"maxDataPoints": 480,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "内存利用率",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"calc": "lastNotNull",
"valueField": "Value",
"colorRange": [
"thresholds"
],
"detailUrl": "/components/dashboard/detail?__uuid__=1737103014612000&ident=${__field.labels.ident}"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#ef3c3c",
"value": 95,
"type": ""
},
{
"color": "#ff656b",
"type": "",
"value": 85
},
{
"color": "#ffae39",
"type": "",
"value": 75
},
{
"color": "#2c9d3d",
"type": "base",
"value": null
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 2
}
}
},
{
"type": "table",
"id": "77bf513a-8504-4d33-9efe-75aaf9abc9e4",
"layout": {
"h": 11,
"i": "77bf513a-8504-4d33-9efe-75aaf9abc9e4",
"isResizable": true,
"w": 24,
"x": 0,
"y": 5
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "avg(cpu_usage_active{cpu=\"cpu-total\", ident=~\"$ident\"}) by (ident)",
"legend": "CPU使用率",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "avg(mem_used_percent{ident=~\"$ident\"}) by (ident)",
"legend": "内存使用率",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "avg(mem_total{ident=~\"$ident\"}) by (ident)",
"legend": "总内存",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "avg(disk_used_percent{ident=~\"$ident\",path=\"/\"}) by (ident)",
"legend": "根分区使用率",
"maxDataPoints": 240,
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"ident": "机器"
}
}
}
],
"name": "机器列表",
"maxPerRow": 4,
"custom": {
"showHeader": true,
"colorMode": "background",
"nowrap": false,
"tableLayout": "fixed",
"calc": "lastNotNull",
"displayMode": "labelValuesToRows",
"aggrDimension": "ident",
"sortColumn": "ident",
"sortOrder": "ascend",
"pageLimit": 500,
"linkMode": "appendLinkColumn",
"links": [
{
"targetBlank": true,
"title": "详情",
"url": "/components/dashboard/detail?__uuid__=1737103014612000&ident=${__field.labels.ident}"
}
]
},
"options": {
"standardOptions": {
"decimals": 2
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "A"
},
"properties": {
"standardOptions": {
"util": "percent",
"decimals": 2
},
"valueMappings": [
{
"match": {
"to": 65
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"to": 90
},
"result": {
"color": "#ff656b"
},
"type": "range"
},
{
"match": {
"from": 90
},
"result": {
"color": "#f50505"
},
"type": "range"
}
]
}
},
{
"matcher": {
"id": "byFrameRefID",
"value": "B"
},
"properties": {
"standardOptions": {
"util": "percent",
"decimals": 2
},
"valueMappings": [
{
"match": {
"to": 65
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"to": 90
},
"result": {
"color": "#ff656b"
},
"type": "range"
},
{
"match": {
"from": 90
},
"result": {
"color": "#fa0a0a"
},
"type": "range"
}
]
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "C"
},
"properties": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"valueMappings": []
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "D"
},
"properties": {
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 90
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 90
},
"result": {
"color": "#ff656b"
},
"type": "range"
}
]
},
"type": "special"
}
]
}
],
"var": [
{
"name": "prom",
"label": "数据源",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "ident",
"label": "机器",
"type": "query",
"hide": false,
"multi": true,
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(system_load1,ident)"
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Linux/dashboards/categraf-processes.json
================================================
{
"name": "机器进程数量统计(使用 Categraf 作为采集器)",
"tags": "Categraf",
"ident": "",
"uuid": 1717556327738575000,
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"id": "adc3f1d3-6d0d-4c1e-80ca-5b6d8103bac5",
"layout": {
"h": 8,
"i": "adc3f1d3-6d0d-4c1e-80ca-5b6d8103bac5",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"name": "Running Processes",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#f10808"
},
"type": "range"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#9470FF",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "processes_running{ident=~\"$ident\"}",
"instant": true,
"legend": "{{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"id": "659f5f75-24ca-493c-97cb-3d99abd52172",
"layout": {
"h": 8,
"i": "df457bf0-17c8-4d05-a527-cfaf0f2b844c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"name": "Total Processes",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 600
},
"result": {
"color": "#f10808"
},
"type": "range"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#9470FF",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "processes_total{ident=~\"$ident\"}",
"instant": true,
"legend": "{{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"id": "5e849509-1c41-44c7-85ee-d8c0adf7c623",
"layout": {
"h": 8,
"i": "62291285-be84-470a-9ccc-53be7a8733fd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"name": "Total Threads",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 2000
},
"result": {
"color": "#ff8286"
},
"type": "range"
},
{
"match": {
"from": 4000
},
"result": {
"color": "#f30909"
},
"type": "range"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#9470FF",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "processes_total_threads{ident=~\"$ident\"}",
"instant": true,
"legend": "{{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true,
"sortColumn": "value",
"sortOrder": "descend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"id": "b2850506-6cdd-48cc-9223-70acff9212b0",
"layout": {
"h": 8,
"i": "b2850506-6cdd-48cc-9223-70acff9212b0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"name": "SUM by Process state",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "sum({__name__=~\"processes_sleeping|processes_dead|processes_paging|processes_total_threads|processes_total|processes_idle|processes_running|processes_zombies|processes_stopped|processes_unknown|processes_blocked\", ident=~\"$ident\"}) by (__name__)",
"instant": true,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"name": "Datasource",
"label": "数据源",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "ident",
"label": "机器",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${Datasource}"
},
"definition": "label_values(processes_running, ident)",
"multi": true,
"allOption": true
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Linux/dashboards/categraf-table-ng.json
================================================
{
"name": "Host Table NG",
"tags": "Categraf",
"ident": "",
"uuid": 1756720567064000,
"configs": {
"var": [
{
"name": "prom",
"label": "PROM",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "ident",
"label": "机器",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(mem_free, ident)",
"multi": true,
"allOption": true
}
],
"panels": [
{
"type": "tableNG",
"id": "306cab0d-f643-4d86-94d0-248fc05fd8a8",
"layout": {
"h": 10,
"w": 24,
"x": 0,
"y": 0,
"i": "306cab0d-f643-4d86-94d0-248fc05fd8a8",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"refId": "A",
"expr": "cpu_usage_active{ident=~\"$ident\"}",
"instant": true
},
{
"expr": "100 - mem_available_percent{ident=~\"$ident\"}",
"__mode__": "__query__",
"refId": "B",
"instant": true
},
{
"expr": "disk_used_percent{path=\"/\", ident=~\"$ident\"}",
"__mode__": "__query__",
"refId": "C",
"instant": true
},
{
"expr": "categraf_info{ident=~\"$ident\"}",
"__mode__": "__query__",
"refId": "D",
"instant": true
}
],
"transformationsNG": [
{
"id": "joinByField",
"options": {
"mode": "outer",
"byField": "ident"
}
},
{
"id": "organize",
"options": {
"fields": [
"ident",
"__time_0",
"__name___0",
"cpu",
"__value_#A",
"__time_1",
"__value_#B",
"__time_2",
"__name___2",
"device",
"fstype",
"mode",
"path",
"__value_#C",
"__time_3",
"__name___3",
"version",
"__value_#D"
],
"renameByName": {
"ident": "机器",
"__value_#A": "CPU利用率%",
"__value_#B": "内存利用率%",
"__value_#C": "根分区利用率%",
"version": "Categraf Version"
},
"excludeByName": {
"__time_0": true,
"__name__": true,
"agent_isp": true,
"agent_region": true,
"cpu": true,
"env": true,
"myenv": true,
"__time_1": true,
"__time_2": true,
"__name___2": true,
"device": true,
"fstype": true,
"mode": true,
"path": true,
"__name___0": true,
"__value_#D": true,
"__time_3": true,
"__name___3": true
},
"indexByName": {
"ident": 0,
"version": 1,
"__time_0": 2,
"__name___0": 3,
"agent_isp": 4,
"agent_region": 5,
"cpu": 6,
"env": 7,
"myenv": 8,
"__value_#A": 9,
"__time_1": 10,
"__value_#B": 11,
"__time_2": 12,
"__name___2": 13,
"device": 14,
"fstype": 15,
"mode": 16,
"path": 17,
"__value_#C": 18,
"__time_3": 19,
"__name___3": 20,
"__value_#D": 21
}
}
}
],
"name": "机器表格样例",
"maxPerRow": 4,
"custom": {
"showHeader": true,
"filterable": true,
"cellOptions": {
"type": "none",
"wrapText": false
}
},
"options": {
"links": [
{
"title": "详情",
"url": "/components/dashboard/detail?__uuid__=1737103014612000&ident=${ident}&prom=${prom}",
"targetBlank": true
}
],
"standardOptions": {
"decimals": 2
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"value": "CPU利用率%"
},
"properties": {
"cellOptions": {
"type": "color-background",
"mode": "lcd",
"valueDisplayMode": "text"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 85,
"type": ""
},
{
"color": "rgba(236, 210, 69, 1)",
"value": 70,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [],
"standardOptions": {
"util": "percent",
"decimals": 2,
"min": 0,
"max": 100
}
}
},
{
"matcher": {
"id": "byName",
"value": "内存利用率%"
},
"properties": {
"cellOptions": {
"type": "gauge",
"mode": "lcd",
"valueDisplayMode": "text"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 80,
"type": ""
},
{
"color": "rgba(236, 210, 69, 1)",
"value": 60,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 2,
"min": 0,
"max": 100
}
}
},
{
"matcher": {
"id": "byName",
"value": "根分区利用率%"
},
"properties": {
"cellOptions": {
"type": "gauge",
"mode": "basic",
"valueDisplayMode": "text"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 90,
"type": ""
},
{
"color": "rgba(236, 210, 69, 1)",
"value": 60,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 2,
"min": 0,
"max": 100
}
}
}
]
}
],
"version": "3.1.0"
}
}
================================================
FILE: integrations/Linux/dashboards/exporter-detail.json
================================================
{
"name": "机器常用指标(使用 NodeExporter 作为采集器)",
"tags": "NodeExporter",
"ident": "",
"uuid": 1717556327748611000,
"configs": {
"links": [
{
"targetBlank": true,
"title": "n9e",
"url": "https://n9e.github.io/"
},
{
"targetBlank": true,
"title": "author",
"url": "http://flashcat.cloud/"
}
],
"panels": [
{
"collapsed": true,
"id": "396bf5e2-f204-4349-8e00-fb9d25ed7e79",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "396bf5e2-f204-4349-8e00-fb9d25ed7e79",
"isResizable": false
},
"name": "单机概况",
"type": "row"
},
{
"type": "stat",
"id": "534ca690-87e5-4c53-9c8a-d3afe0276bf5",
"layout": {
"h": 5,
"w": 6,
"x": 0,
"y": 1,
"i": "534ca690-87e5-4c53-9c8a-d3afe0276bf5",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_time_seconds{instance=~\"$node\"} - node_boot_time_seconds{instance=~\"$node\"}",
"maxDataPoints": 480,
"instant": true,
"step": 15,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "启动时长",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"title": null,
"value": null
},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(148, 112, 255, 1)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [],
"standardOptions": {
"util": "seconds",
"decimals": 1
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "69c96540-965b-4e87-9eb7-c24a0c974474",
"layout": {
"h": 5,
"w": 6,
"x": 6,
"y": 1,
"i": "69c96540-965b-4e87-9eb7-c24a0c974474",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_memory_SwapTotal_bytes{instance=~\"$node\"} - node_memory_SwapFree_bytes{instance=~\"$node\"}",
"maxDataPoints": 480,
"instant": true,
"step": 15,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "SWAP内存使用",
"description": "swap使用过高,会影响系统io性能,如果内存够用但swap使用很高,可以调小swappiness的值",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"value": null
},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(148, 112, 255, 1)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [],
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "stat",
"id": "84b04d6b-1f97-47b8-86ff-77e6b1af4f1d",
"layout": {
"h": 5,
"w": 6,
"x": 12,
"y": 1,
"i": "84b04d6b-1f97-47b8-86ff-77e6b1af4f1d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "sum(node_filesystem_device_error{instance=~\"$node\",mountpoint!~\"/var/lib/.*\",mountpoint!~\"/run.*\"}) by (instance)",
"legend": "{{instance}}",
"maxDataPoints": 480,
"step": 15,
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "写文件错误数总和",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"value": null
},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"from": 0,
"to": 0
},
"result": {
"color": "#369903"
},
"type": "range"
},
{
"match": {
"from": 1,
"to": null
},
"result": {
"color": "#f0310f"
},
"type": "range"
}
],
"standardOptions": {
"decimals": 1
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "timeseries",
"id": "39715d51-4d18-4185-8584-68a4d44adf2b",
"layout": {
"h": 5,
"w": 6,
"x": 18,
"y": 1,
"i": "39715d51-4d18-4185-8584-68a4d44adf2b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_vmstat_oom_kill{instance=~\"$node\"}[$__rate_interval])",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "每秒OOM次数",
"description": "大于0,说明有进程内存不够用了,需要考虑扩容或升级配置了",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
},
{
"color": "#f90101",
"value": 1
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "hexbin",
"id": "981e2271-2c6c-4410-b3fb-73c35049c11a",
"layout": {
"h": 5,
"w": 8,
"x": 0,
"y": 6,
"i": "981e2271-2c6c-4410-b3fb-73c35049c11a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(25, max(100 - ((node_filesystem_avail_bytes{instance=~\"$node\"} * 100) / node_filesystem_size_bytes{instance=~\"$node\"})) by (instance, mountpoint))",
"legend": "{{instance}} {{mountpoint}}",
"maxDataPoints": 480,
"step": 15,
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "磁盘分区使用率",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"fontBackground": true,
"calc": "lastNotNull",
"valueField": "Value",
"colorRange": [
"thresholds"
]
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 95,
"type": ""
},
{
"color": "rgba(230, 198, 39, 1)",
"value": 75,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 1
}
}
},
{
"type": "hexbin",
"id": "82310aef-8db6-46bf-96a0-fcad68ae7d9e",
"layout": {
"h": 5,
"w": 8,
"x": 8,
"y": 6,
"i": "82310aef-8db6-46bf-96a0-fcad68ae7d9e",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(25, max(100 - ((node_filesystem_files_free{instance=~\"$node\",mountpoint!~\"/var/lib/.*\",mountpoint!~\"/run/user.*\"} * 100) / node_filesystem_files{instance=~\"$node\",mountpoint!~\"/var/lib/.*\",mountpoint!~\"/run/user.*\"})) by (instance, mountpoint))",
"legend": "{{instance}} {{mountpoint}}",
"maxDataPoints": 480,
"instant": true,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "inode分区使用率",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"fontBackground": true,
"calc": "lastNotNull",
"valueField": "Value",
"colorRange": [
"thresholds"
]
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 75,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "percent",
"decimals": 1
}
}
},
{
"type": "timeseries",
"id": "14caedd8-a1fd-412b-8c50-e35d3df57a2b",
"layout": {
"h": 5,
"w": 8,
"x": 16,
"y": 6,
"i": "14caedd8-a1fd-412b-8c50-e35d3df57a2b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_filefd_allocated{instance=~\"$node\"}/node_filefd_maximum{instance=~\"$node\"}*100",
"maxDataPoints": 480,
"instant": false,
"step": 15,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "FD使用率",
"description": "如果超过80%,建议把文件描述符的最大个数调大,或者扩容",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.01,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "639f4668-fb33-427d-8ec8-4f11127a1bf3",
"layout": {
"h": 6,
"w": 8,
"x": 0,
"y": 11,
"i": "639f4668-fb33-427d-8ec8-4f11127a1bf3",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "avg without (mode,cpu) ( 1 - rate(node_cpu_seconds_total{mode=\"idle\", instance=~\"$node\"}[$__rate_interval]) ) * 100",
"maxDataPoints": 480,
"step": 15,
"instant": false,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU使用率",
"description": "如果cpu使用率超过50%,可以通过top命令查看机器上是否有异常进程,如果没有异常进程,则说明服务需要扩容或者机器需要升级配置了",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.01,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "56dc011d-fc1c-4682-a903-1b778cbff9e8",
"layout": {
"h": 6,
"w": 8,
"x": 8,
"y": 11,
"i": "56dc011d-fc1c-4682-a903-1b778cbff9e8",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / node_memory_MemTotal_bytes{instance=~\"$node\"})) * 100",
"maxDataPoints": 480,
"step": 15,
"instant": false,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "内存使用率",
"description": "如果内存使用率超过50%,则需要扩容或者升级配置了",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.01,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "4a9ea87d-d650-43ff-bf1e-70f44afabace",
"layout": {
"h": 6,
"w": 8,
"x": 16,
"y": 11,
"i": "4a9ea87d-d650-43ff-bf1e-70f44afabace",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_disk_io_time_seconds_total{instance=~\"$node\"}[$__rate_interval]) * 100",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "IO Util",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 75,
"type": ""
},
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "22df4dfe-6f93-4f44-b7ea-254a690922a5",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 17,
"i": "22df4dfe-6f93-4f44-b7ea-254a690922a5",
"isResizable": false
},
"name": "系统指标",
"type": "row"
},
{
"type": "timeseries",
"id": "7c4fede9-18b6-4a45-9278-76b6f724716e",
"layout": {
"h": 6,
"w": 6,
"x": 0,
"y": 18,
"i": "7c4fede9-18b6-4a45-9278-76b6f724716e",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_procs_running{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "进程数",
"description": "进程数超过2000,可以考虑扩容了",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
},
{
"color": "#ff0000",
"value": 2000
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "28f96d86-79ed-4564-84c9-8e8c58d66985",
"layout": {
"h": 6,
"w": 6,
"x": 6,
"y": 18,
"i": "28f96d86-79ed-4564-84c9-8e8c58d66985",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_intr_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} interrupts",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "irate(node_context_switches_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} context switches",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "上下文切换/中断",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "1a16c5ad-1a71-4771-9f50-f3dcc4524f71",
"layout": {
"h": 6,
"w": 6,
"x": 12,
"y": 18,
"i": "1a16c5ad-1a71-4771-9f50-f3dcc4524f71",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_entropy_available_bits{instance=~\"$node\"}",
"legend": "{{instance}}",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "熵池大小",
"description": "熵池太小,程序使用随机函数会阻塞,可以安装 rng-tools 工具增加熵池大小,可参考\nhttps://codeantenna.com/a/Ab6aMd3NSA",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": null
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
},
{
"color": "#f70202",
"value": 100
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "077b1181-00e2-44ce-9295-7528a8b829d5",
"layout": {
"h": 6,
"w": 6,
"x": 18,
"y": 18,
"i": "077b1181-00e2-44ce-9295-7528a8b829d5",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_timex_offset_seconds{instance=~\"$node\"}",
"legend": "{{instance}}",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "NTP偏移",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "seconds",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "406a3fd8-52fb-4935-9971-d7a8f37437df",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 24,
"i": "406a3fd8-52fb-4935-9971-d7a8f37437df",
"isResizable": false
},
"name": "CPU详情",
"type": "row"
},
{
"type": "timeseries",
"id": "28f582ca-dd5c-41a8-8cc8-bcc88f755253",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 25,
"i": "28f582ca-dd5c-41a8-8cc8-bcc88f755253",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": " (avg without (cpu, mode)(rate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[$__rate_interval])))*100",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU空闲率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
},
{
"color": "#f90101",
"value": 10
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "bb2b0b33-2f78-428d-8847-d900d0bbdf25",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 25,
"i": "bb2b0b33-2f78-428d-8847-d900d0bbdf25",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": " (avg without (cpu)(rate(node_cpu_seconds_total{instance=~\"$node\",mode!=\"idle\"}[$__rate_interval])))*100",
"legend": "{{instance}} {{mode}}",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU使用率详情",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "51e95b78-940e-4e5e-9cf5-7c8515209de0",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 25,
"i": "51e95b78-940e-4e5e-9cf5-7c8515209de0",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_load1{instance=~\"$node\"}",
"legend": "{{instance}} load1",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_load5{instance=~\"$node\"}",
"legend": "{{instance}} load5",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_load15{instance=~\"$node\"}",
"legend": "{{instance}} load15",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU负载",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "f3ab98b2-318b-451b-868e-d967555b7925",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 32,
"i": "f3ab98b2-318b-451b-868e-d967555b7925",
"isResizable": false
},
"name": "内存详情",
"type": "row"
},
{
"type": "timeseries",
"id": "6b084867-c1a4-4e7f-a0d7-5dd24524f82d",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 33,
"i": "6b084867-c1a4-4e7f-a0d7-5dd24524f82d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_memory_HugePages_Total{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Hugepagesize_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_HugePages_Surp{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_HugePages_Free{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_HugePages_Rsvd{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_AnonHugePages_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Inactive_file_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Inactive_anon_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Active_file_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Active_anon_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Unevictable_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_AnonPages_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Shmem_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Mapped_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Cached_bytes{instance=~\"$node\"} ",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_SwapCached_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Mlocked_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Buffers_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "用户态内存使用",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"columns": [
"last"
],
"behaviour": "showItem",
"selectMode": "single",
"heightInPercentage": null
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "10b1dde7-7cad-4992-be3f-7e8c8aac8c03",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 33,
"i": "10b1dde7-7cad-4992-be3f-7e8c8aac8c03",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_memory_Slab_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_SReclaimable_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_SUnreclaim_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_VmallocUsed_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_VmallocChunk_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_KernelStack_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Bounce_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "内核态内存使用",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"columns": [
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "ea5480e7-ec20-41e5-a007-74c846fae91a",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 40,
"i": "ea5480e7-ec20-41e5-a007-74c846fae91a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_memory_DirectMap1G_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_DirectMap2M_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_DirectMap4k_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "TLB效率",
"description": "/proc/meminfo中的DirectMap所统计的不是关于内存的使用,而是一个反映TLB效率的指标。TLB(Translation Lookaside Buffer)是位于CPU上的缓存,用于将内存的虚拟地址翻译成物理地址,由于TLB的大小有限,不能缓存的地址就需要访问内存里的page table来进行翻译,速度慢很多。为了尽可能地将地址放进TLB缓存,新的CPU硬件支持比4k更大的页面从而达到减少地址数量的目的, 比如2MB,4MB,甚至1GB的内存页,视不同的硬件而定。“DirectMap4k”表示映射为4kB的内存数量, “DirectMap2M”表示映射为2MB的内存数量,以此类推。所以DirectMap其实是一个反映TLB效率的指标",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"columns": [
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "8d31c2aa-38c9-434e-8c93-4cbae4e1bd8d",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 40,
"i": "8d31c2aa-38c9-434e-8c93-4cbae4e1bd8d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_memory_NFS_Unstable_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Writeback_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_memory_Dirty_bytes{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "dirty page",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"columns": [
"last"
],
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "88aeb766-214b-43a9-85f3-9ec6368f0da0",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 47,
"i": "88aeb766-214b-43a9-85f3-9ec6368f0da0",
"isResizable": false
},
"name": "磁盘详情",
"type": "row"
},
{
"type": "timeseries",
"id": "c06c49fa-60ce-4982-bfe5-1d5012dc4af1",
"layout": {
"h": 6,
"w": 6,
"x": 0,
"y": 48,
"i": "c06c49fa-60ce-4982-bfe5-1d5012dc4af1",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_disk_read_time_seconds_total{instance=~\"$node\"}[$__rate_interval]) / rate(node_disk_reads_completed_total{instance=~\"$node\"}[$__rate_interval])\n+\nrate(node_disk_write_time_seconds_total{instance=~\"$node\"}[$__rate_interval]) / rate(node_disk_writes_completed_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "io await",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "seconds",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b516483d-ea9b-4f4c-9fdc-5ac70fe159d2",
"layout": {
"h": 6,
"w": 6,
"x": 6,
"y": 48,
"i": "b516483d-ea9b-4f4c-9fdc-5ac70fe159d2",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_disk_read_bytes_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} {{device}}-Read",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_disk_written_bytes_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} {{device}}-Write",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "每秒读写数据大小",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesSecIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "b118aac2-217d-4fb3-881a-7926f0e8078e",
"layout": {
"h": 6,
"w": 6,
"x": 12,
"y": 48,
"i": "b118aac2-217d-4fb3-881a-7926f0e8078e",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_disk_reads_completed_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - Reads",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_disk_writes_completed_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - Writes",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_disk_reads_merged_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - Read merged",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_disk_writes_merged_total{instance=~\"$node\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - Write merged",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "IO/Merged次数",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "iops",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "362bfd2d-2ef2-49a0-b64e-14264495672c",
"layout": {
"h": 6,
"w": 6,
"x": 18,
"y": 48,
"i": "362bfd2d-2ef2-49a0-b64e-14264495672c",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_filesystem_readonly{instance=~\"$node\",device!~'rootfs',device!~\"tmpfs\",mountpoint!~\"/var/lib.*\"}",
"legend": "{{instance}} {{mountpoint}} - ReadOnly",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "硬盘 ReadOnly",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "1c19b6ae-95fb-4358-bd01-bb0beec8d619",
"layout": {
"h": 6,
"w": 12,
"x": 0,
"y": 54,
"i": "1c19b6ae-95fb-4358-bd01-bb0beec8d619",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "(rate(node_disk_read_bytes_total{instance=~\"$node\"}[$__rate_interval]) + rate(node_disk_written_bytes_total{instance=~\"$node\"}[$__rate_interval]))\n/\n(rate(node_disk_reads_completed_total{instance=~\"$node\"}[$__rate_interval]) + rate(node_disk_writes_completed_total{instance=~\"$node\"}[$__rate_interval]))",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "avgrq-sz",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "8da1e948-283d-4e68-990f-8e6c9af22d05",
"layout": {
"h": 6,
"w": 6,
"x": 12,
"y": 54,
"i": "8da1e948-283d-4e68-990f-8e6c9af22d05",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_disk_io_time_weighted_seconds_total{instance=~\"$node\"}[$__rate_interval])\n",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "avgqu-sz",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "925fc17e-565d-4c16-a8fb-9a6d07d446c6",
"layout": {
"h": 6,
"w": 6,
"x": 18,
"y": 54,
"i": "925fc17e-565d-4c16-a8fb-9a6d07d446c6",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_filesystem_device_error{instance=~\"$node\",device!~'rootfs',device!~\"tmpfs\",mountpoint!~\"/var/lib.*\"}",
"legend": "{{instance}} {{mountpoint}} - Device error",
"maxDataPoints": 480
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "硬盘设备错误",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "8fb70dca-4296-45a6-8dd3-770fc898ee65",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 60,
"i": "8fb70dca-4296-45a6-8dd3-770fc898ee65",
"isResizable": false
},
"name": "网络详情",
"type": "row"
},
{
"type": "timeseries",
"id": "ae5d7236-c89f-4fb3-ae5c-e70cb03cb168",
"layout": {
"h": 6,
"w": 6,
"x": 0,
"y": 61,
"i": "ae5d7236-c89f-4fb3-ae5c-e70cb03cb168",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])*8",
"legend": "{{instance}} {{device}} - in",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_network_transmit_bytes_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])*8",
"legend": "{{instance}} {{device}} - out",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "出入流量大小",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bitsSecSI",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "9cc72d79-c0bd-440d-849a-bf8af74a2b6c",
"layout": {
"h": 6,
"w": 6,
"x": 6,
"y": 61,
"i": "9cc72d79-c0bd-440d-849a-bf8af74a2b6c",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_network_receive_packets_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - in",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_network_transmit_packets_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - out",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "packets",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "packetsSec",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "8b54d503-daa0-4bea-bb00-d9c723123efd",
"layout": {
"h": 6,
"w": 6,
"x": 12,
"y": 61,
"i": "8b54d503-daa0-4bea-bb00-d9c723123efd",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_network_receive_errs_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - in",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_network_transmit_errs_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - out",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "error",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "packetsSec",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "293fc82f-1e25-412b-93cc-f8daceb61bd9",
"layout": {
"h": 6,
"w": 6,
"x": 18,
"y": 61,
"i": "293fc82f-1e25-412b-93cc-f8daceb61bd9",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(node_network_receive_drop_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - in",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "rate(node_network_transmit_drop_total{instance=~\"$node\",device=~\"e.*\"}[$__rate_interval])",
"legend": "{{instance}} {{device}} - out",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "drop",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "packetsSec",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "8fe595ad-f393-4d1c-a523-888071ca41b9",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 67,
"i": "8fe595ad-f393-4d1c-a523-888071ca41b9",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "100 * node_nf_conntrack_entries{instance=~\"$node\"} / node_nf_conntrack_entries_limit{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "nf_conntrack 使用率",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "64b815bc-b041-40f3-aaf2-21861836ab1d",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 67,
"i": "64b815bc-b041-40f3-aaf2-21861836ab1d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_sockstat_TCP_alloc{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_sockstat_TCP_inuse{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_sockstat_TCP_orphan{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_sockstat_TCP_tw{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
},
{
"expr": "node_netstat_Tcp_CurrEstab{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "tcp socket stat",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.04,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "d089d8a2-dbbf-4dbf-89a8-b463522b1c1d",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 67,
"i": "d089d8a2-dbbf-4dbf-89a8-b463522b1c1d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "node_sockstat_sockets_used{instance=~\"$node\"}",
"legend": "",
"maxDataPoints": 480,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "socket used",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"name": "prom",
"label": "时序库",
"type": "datasource",
"hide": false,
"definition": "prometheus"
},
{
"name": "node",
"label": "机器",
"type": "query",
"hide": false,
"multi": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(node_uname_info, instance)",
"allOption": true,
"allValue": ".*"
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Linux/markdown/README.md
================================================
# Linux
Linux 类别下,包含多个内置插件,比如 cpu、mem、net、netstat、kernel_vmstat 等,这些插件大都是默认开启的,无需额外配置,可能有额外配置需求的插件如下。
## cpu
统计 CPU 使用率,默认只采集整机的情况,不采集每个 CPU Core 的情况,如果想采集每个 CPU Core 的情况,可以配置如下。
```ini
collect_per_cpu = true
```
## netstat
统计网络连接数,默认配置如下,可根据实际情况调整。
```ini
# 默认开启了 summary 统计,类似 ss -s 命令的输出
disable_summary_stats = false
# 默认关闭了所有连接的详细统计,在连接数较多的机器上统计此数据会影响性能
disable_connection_stats = true
# 读取 /proc/net/netstat 的内容,默认关闭了,可以开启,这部分不影响性能
tcp_ext = false
ip_ext = false
```
## disk
统计磁盘使用率,默认配置如下,可根据实际情况调整。
```ini
# 严格指定要采集的挂载点,如果指定了,就只采集指定的挂载点
# mount_points = ["/"]
# 有些 fstype 没必要采集,可以忽略
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs", "nsfs", "CDFS", "fuse.juicefs"]
# 有些挂载点没必要采集,可以忽略,这里可以配置前缀,符合前缀的挂载点都会被忽略
ignore_mount_points = ["/boot", "/var/lib/kubelet/pods"]
```
## kernel_vmstat
统计的信息来自 `/proc/vmstat`,只有高版本内核才支持,这个文件的内容较多,默认配置只采集了 oom_kill 次数,其他指标均未采集,如果你想打开其他采集开关,可以修改 white_list 部分的配置。下面是截取了一部分内容,供参考:
```toml
[white_list]
oom_kill = 1
nr_free_pages = 0
nr_alloc_batch = 0
...
```
## arp_package
统计 ARP 包的数量,该插件依赖 cgo,如需使用该插件,需要下载 `with-cgo` 的 categraf 发布包。
## ntp
监控机器时间偏移量,只需要给出 ntp 服务端地址,Categraf 就会周期性去请求,对比本机时间,得到偏移量,监控指标是 ntp_offset_ms,顾名思义,单位是毫秒,一般这个值不能超过 1000。
================================================
FILE: integrations/Linux/metrics/categraf-base.json
================================================
[
{
"id": 0,
"uuid": 1717556327758798000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU Guest 时间占比",
"unit": "percent",
"note": "CPU 为 Guest OS 运行虚拟 CPU 的时间占比。\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_guest",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU Guest 时间占比",
"note": "CPU 为 Guest OS 运行虚拟 CPU 的时间占比。\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "CPU Guest time ratio",
"note": "The proportion of time the CPU spends running virtual CPUs for guest operating systems.\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327761258000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU iowait 时间占比",
"unit": "percent",
"note": "表示 CPU 花费在等待 I/O 操作(输入/输出)完成的时间比例\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的 iowait 占用率\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的 iowait 占用率",
"lang": "zh_CN",
"expression": "cpu_usage_iowait",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU iowait 时间占比",
"note": "表示 CPU 花费在等待 I/O 操作(输入/输出)完成的时间比例\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的 iowait 占用率\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的 iowait 占用率"
},
{
"lang": "en_US",
"name": "CPU iowait time ratio",
"note": "The proportion of time the CPU spends waiting for I/O (input/output) operations to complete.\n\n- Label `cpu=\"cpu-total\"`: the iowait ratio of the whole machine\n- Label `cpu=\"0\"`: the iowait ratio of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327764051000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU Steal 时间占比",
"unit": "percent",
"note": "表示虚拟机(Guest OS)CPU 时间被虚拟化管理器(Hypervisor)用于运行其他虚拟机的时间比例。物理机中这个指标值是 0,没有用处;虚拟机中这个指标值长时间大于 10 表示争抢严重。[参考资料](https://mp.weixin.qq.com/s/zlrfMNrDHJVE5lkTVow5zA)\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_steal",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU Steal 时间占比",
"note": "表示虚拟机(Guest OS)CPU 时间被虚拟化管理器(Hypervisor)用于运行其他虚拟机的时间比例。物理机中这个指标值是 0,没有用处;虚拟机中这个指标值长时间大于 10 表示争抢严重。[参考资料](https://mp.weixin.qq.com/s/zlrfMNrDHJVE5lkTVow5zA)\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "CPU Steal time ratio",
"note": "The proportion of time that the CPU time of this virtual machine (Guest OS) is used by the virtualization manager (Hypervisor) to run other virtual machines. On a physical machine this value is 0 and is not useful; in a virtual machine, a value greater than 10 for a long time indicates serious contention. [Reference](https://mp.weixin.qq.com/s/zlrfMNrDHJVE5lkTVow5zA)\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327766578000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 低优先级进程的时间占比",
"unit": "percent",
"note": "在Linux系统中,每个进程都有一个“nice”值,这个值的范围通常是从-20到19。默认的 nice 值为0。负的 nice 值表示高优先级(更快的 CPU 访问),而正的 nice 值表示低优先级(更慢的 CPU 访问)。该指标统计 nice 值为 1-19 的进程的时间占比。\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_nice",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 低优先级进程的时间占比",
"note": "在Linux系统中,每个进程都有一个“nice”值,这个值的范围通常是从-20到19。默认的 nice 值为0。负的 nice 值表示高优先级(更快的 CPU 访问),而正的 nice 值表示低优先级(更慢的 CPU 访问)。该指标统计 nice 值为 1-19 的进程的时间占比。\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "Time proportion of CPU low-priority processes",
"note": "In Linux systems, each process has a \"nice\" value, which usually ranges from -20 to 19. The default nice value is 0. A negative nice value indicates high priority (faster CPU access), while a positive nice value indicates low priority (slower CPU access). This metric counts the time proportion of processes with nice values of 1-19.\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327768675000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 内核态时间占比",
"unit": "percent",
"note": "表示 CPU 内核态的时间占比\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_system",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 内核态时间占比",
"note": "表示 CPU 内核态的时间占比\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "CPU kernel mode time ratio",
"note": "The proportion of time the CPU spends in kernel mode.\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327770969000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 利用率",
"unit": "percent",
"note": "- 标签 `cpu=\"cpu-total\"`,表示整机的 CPU 利用率\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的单核利用率",
"lang": "zh_CN",
"expression": "cpu_usage_active",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 利用率",
"note": "- 标签 `cpu=\"cpu-total\"`,表示整机的 CPU 利用率\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的单核利用率"
},
{
"lang": "en_US",
"name": "CPU utilization",
"note": "- Label `cpu=\"cpu-total\"`: the CPU utilization of the whole machine\n- Label `cpu=\"0\"`: the single-core utilization of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327773481000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 用户态时间占比",
"unit": "percent",
"note": "表示 CPU 用户态的时间占比\n\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_user",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 用户态时间占比",
"note": "表示 CPU 用户态的时间占比\n\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "CPU user mode time ratio",
"note": "The proportion of time the CPU spends in user mode.\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327775725000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 硬中断时间占比",
"unit": "percent",
"note": "表示 CPU 处理硬中断的时间占比\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_irq",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 硬中断时间占比",
"note": "表示 CPU 处理硬中断的时间占比\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "CPU hard interrupt time ratio",
"note": "The proportion of time the CPU spends handling hard interrupts.\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327777851000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 空闲率",
"unit": "percent",
"note": "- 标签 `cpu=\"cpu-total\"`,表示整机的 CPU 空闲率\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的单核空闲率",
"lang": "zh_CN",
"expression": "cpu_usage_idle",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 空闲率",
"note": "- 标签 `cpu=\"cpu-total\"`,表示整机的 CPU 空闲率\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的单核空闲率"
},
{
"lang": "en_US",
"name": "CPU idle rate",
"note": "- Label `cpu=\"cpu-total\"`: the CPU idle rate of the whole machine\n- Label `cpu=\"0\"`: the single-core idle rate of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327780202000,
"collector": "Categraf",
"typ": "Linux",
"name": "CPU 软中断时间占比",
"unit": "percent",
"note": "表示 CPU 处理软中断的时间占比\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值",
"lang": "zh_CN",
"expression": "cpu_usage_softirq",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 软中断时间占比",
"note": "表示 CPU 处理软中断的时间占比\n\n- 标签 `cpu=\"cpu-total\"`,表示整机的统计值\n- 标签 `cpu=\"0\"`,表示 0 号 CPU 的统计值"
},
{
"lang": "en_US",
"name": "CPU soft interrupt time ratio",
"note": "The proportion of time the CPU spends handling soft interrupts.\n\n- Label `cpu=\"cpu-total\"`: the statistical value of the whole machine\n- Label `cpu=\"0\"`: the statistical value of CPU 0"
}
]
},
{
"id": 0,
"uuid": 1717556327782194000,
"collector": "Categraf",
"typ": "Linux",
"name": "ESTABLISHED 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_established",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "ESTABLISHED 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links with ESTABLISHED status",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327784357000,
"collector": "Categraf",
"typ": "Linux",
"name": "FIN_WAIT1 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_fin_wait1",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "FIN_WAIT1 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links in FIN_WAIT1 state",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327786336000,
"collector": "Categraf",
"typ": "Linux",
"name": "FIN_WAIT2 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_fin_wait2",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "FIN_WAIT2 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links in FIN_WAIT2 state",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327788305000,
"collector": "Categraf",
"typ": "Linux",
"name": "IP conntrack 使用率",
"unit": "percent",
"note": "如果用满了,通常会在系统日志中看到:conntrack table full 这样的报错,可以调整系统最大值解决",
"lang": "zh_CN",
"expression": "100 * conntrack_ip_conntrack_count / conntrack_ip_conntrack_max",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "IP conntrack 使用率",
"note": "如果用满了,通常会在系统日志中看到:conntrack table full 这样的报错,可以调整系统最大值解决"
},
{
"lang": "en_US",
"name": "IP conntrack usage",
"note": "If it is full, you will usually see errors such as: conntrack table full in the system log. You can adjust the maximum value of the system to solve the problem"
}
]
},
{
"id": 0,
"uuid": 1717556327790390000,
"collector": "Categraf",
"typ": "Linux",
"name": "LAST_ACK 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_last_ack",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "LAST_ACK 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links in LAST_ACK state",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327792341000,
"collector": "Categraf",
"typ": "Linux",
"name": "LISTEN 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_listen",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "LISTEN 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links in LISTEN status",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327794144000,
"collector": "Categraf",
"typ": "Linux",
"name": "NF conntrack 使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "100 * conntrack_nf_conntrack_count / conntrack_nf_conntrack_max",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "NF conntrack 使用率",
"note": ""
},
{
"lang": "en_US",
"name": "NF conntrack usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327796195000,
"collector": "Categraf",
"typ": "Linux",
"name": "1分钟内 OOM 次数统计",
"unit": "none",
"note": "取自 `/proc/vmstat`,需要较高版本的内核,没记错的话应该是 4.13 以上版本",
"lang": "zh_CN",
"expression": "increase(kernel_vmstat_oom_kill[1m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "1分钟内 OOM 次数统计",
"note": "取自 `/proc/vmstat`,需要较高版本的内核,没记错的话应该是 4.13 以上版本"
},
{
"lang": "en_US",
"name": "Number of OOM kills within 1 minute",
"note": "Taken from `/proc/vmstat`. Requires a relatively recent kernel; if I remember correctly, it should be 4.13 or above."
}
]
},
{
"id": 0,
"uuid": 1717556327798039000,
"collector": "Categraf",
"typ": "Linux",
"name": "SYN_RECV 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_syn_recv",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "SYN_RECV 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links in SYN_RECV state",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327800360000,
"collector": "Categraf",
"typ": "Linux",
"name": "SYN_SENT 状态的网络链接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_tcp_syn_sent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "SYN_SENT 状态的网络链接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of network links in SYN_SENT state",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327803053000,
"collector": "Categraf",
"typ": "Linux",
"name": "TIME_WAIT 状态的网络链接数",
"unit": "none",
"note": "categraf 配置文件 `conf/input.netstat/netstat.toml` 中 `disable_connection_stats` 默认配置为 true,是因为在网络连接比较多的机器上,获取各个状态的连接数会耗费较多 CPU,所以默认 disable 掉了。",
"lang": "zh_CN",
"expression": "netstat_tcp_time_wait",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "TIME_WAIT 状态的网络链接数",
"note": "categraf 配置文件 `conf/input.netstat/netstat.toml` 中 `disable_connection_stats` 默认配置为 true,是因为在网络连接比较多的机器上,获取各个状态的连接数会耗费较多 CPU,所以默认 disable 掉了。"
},
{
"lang": "en_US",
"name": "Number of network links in TIME_WAIT state",
"note": "`disable_connection_stats` in the categraf configuration file `conf/input.netstat/netstat.toml` defaults to true because, on machines with many network connections, obtaining the number of connections in each state consumes a lot of CPU, so it is disabled by default."
}
]
},
{
"id": 0,
"uuid": 1717556327804777000,
"collector": "Categraf",
"typ": "Linux",
"name": "TIME_WAIT 状态的网络链接数(推荐)",
"unit": "none",
"note": "读取自 `/proc/net/sockstat`。",
"lang": "zh_CN",
"expression": "netstat_tcp_tw",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "TIME_WAIT 状态的网络链接数(推荐)",
"note": "读取自 `/proc/net/sockstat`。"
},
{
"lang": "en_US",
"name": "Number of network links in TIME_WAIT state (recommended)",
"note": "Read from `/proc/net/sockstat`."
}
]
},
{
"id": 0,
"uuid": 1717556327806758000,
"collector": "Categraf",
"typ": "Linux",
"name": "交换空间使用率",
"unit": "percent",
"note": "交换空间使用率。计算原子取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "(mem_swap_total - mem_swap_free)/mem_swap_total * 100 and mem_swap_total > 0",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间使用率",
"note": "交换空间使用率。计算原子取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Swap space usage",
"note": "Swap space usage percentage. The underlying values are taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327808420000,
"collector": "Categraf",
"typ": "Linux",
"name": "交换空间使用量",
"unit": "bytesIEC",
"note": "交换空间使用量。计算原子取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_swap_total - mem_swap_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间使用量",
"note": "交换空间使用量。计算原子取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Swap space used",
"note": "Amount of swap space in use. The underlying values are taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327810258000,
"collector": "Categraf",
"typ": "Linux",
"name": "交换空间总量",
"unit": "bytesIEC",
"note": "交换空间总量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_swap_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间总量",
"note": "交换空间总量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Total swap space",
"note": "Total amount of swap space. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327812354000,
"collector": "Categraf",
"typ": "Linux",
"name": "交换空间空闲量",
"unit": "bytesIEC",
"note": "交换空间空闲量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_swap_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间空闲量",
"note": "交换空间空闲量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Free swap space",
"note": "Amount of free swap space. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327814499000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存 Buffered 量",
"unit": "bytesIEC",
"note": "用作缓冲区的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_buffered",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存 Buffered 量",
"note": "用作缓冲区的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Memory Buffered amount",
"note": "The amount of memory used as a buffer. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327817214000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存 Cached 量",
"unit": "bytesIEC",
"note": "用作文件缓存的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_cached",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存 Cached 量",
"note": "用作文件缓存的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Memory Cached amount",
"note": "The amount of memory used as a file cache. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327820178000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存使用率",
"unit": "percent",
"note": "内存使用率",
"lang": "zh_CN",
"expression": "mem_used_percent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存使用率",
"note": "内存使用率"
},
{
"lang": "en_US",
"name": "Memory usage",
"note": "Memory usage"
}
]
},
{
"id": 0,
"uuid": 1717556327823042000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存使用率(基于MemAvailable)",
"unit": "percent",
"note": "内存使用率。基于 MemAvailable 计算更准确,但是老版本的 Linux 不支持。",
"lang": "zh_CN",
"expression": "100 - mem_available_percent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存使用率(基于MemAvailable)",
"note": "内存使用率。基于 MemAvailable 计算更准确,但是老版本的 Linux 不支持。"
},
{
"lang": "en_US",
"name": "Memory usage (based on MemAvailable)",
"note": "Memory usage. Calculation based on MemAvailable is more accurate, but older versions of Linux do not support it."
}
]
},
{
"id": 0,
"uuid": 1717556327827697000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存使用量",
"unit": "bytesIEC",
"note": "内存使用量 = Total - Free - Buffered - Cached",
"lang": "zh_CN",
"expression": "mem_used",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存使用量",
"note": "内存使用量 = Total - Free - Buffered - Cached"
},
{
"lang": "en_US",
"name": "Memory used",
"note": "Memory used = Total - Free - Buffered - Cached"
}
]
},
{
"id": 0,
"uuid": 1717556327830218000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存可用率",
"unit": "percent",
"note": "可以立即分配给进程的可用内存量除以内存总量。分子分母的值均取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_available_percent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存可用率",
"note": "可以立即分配给进程的可用内存量除以内存总量。分子分母的值均取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Memory availability",
"note": "The amount of available memory that can be immediately allocated to processes, divided by the total amount of memory. Both the numerator and the denominator are taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327832329000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存可用量",
"unit": "bytesIEC",
"note": "可以立即分配给进程的可用内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_available",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存可用量",
"note": "可以立即分配给进程的可用内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Available memory",
"note": "The amount of available memory that can be immediately allocated to a process. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327834388000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存总量",
"unit": "bytesIEC",
"note": "内存总量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存总量",
"note": "内存总量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Total memory",
"note": "Total amount of memory. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327836679000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存活跃量",
"unit": "bytesIEC",
"note": "这个字段表示当前活跃使用的内存总量,即系统最近访问过并且预计很快会再次使用的内存页面。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_active",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存活跃量",
"note": "这个字段表示当前活跃使用的内存总量,即系统最近访问过并且预计很快会再次使用的内存页面。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Active memory",
"note": "The total amount of memory currently in active use, that is, memory pages that the system has recently accessed and expects to use again soon. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327838968000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存空闲量",
"unit": "bytesIEC",
"note": "未使用的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存空闲量",
"note": "未使用的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Free memory amount",
"note": "Amount of unused memory. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327840782000,
"collector": "Categraf",
"typ": "Linux",
"name": "内存非活跃量",
"unit": "bytesIEC",
"note": "在上一采样周期未以某种方式使用的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "mem_inactive",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存非活跃量",
"note": "在上一采样周期未以某种方式使用的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Inactive memory",
"note": "The amount of memory that was not used in some way in the previous sampling period. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327842643000,
"collector": "Categraf",
"typ": "Linux",
"name": "当前 UDP 连接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "netstat_udp_socket",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "当前 UDP 连接数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of current UDP connections",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327844493000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 inode 使用率",
"unit": "percent",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。",
"lang": "zh_CN",
"expression": "disk_inodes_used / disk_inodes_total * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 inode 使用率",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。"
},
{
"lang": "en_US",
"name": "Hard disk inode usage",
"note": "If you store a lot of small files, you need to pay attention to inode usage."
}
]
},
{
"id": 0,
"uuid": 1717556327846749000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 inode 使用量",
"unit": "none",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。",
"lang": "zh_CN",
"expression": "disk_inodes_used",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 inode 使用量",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。"
},
{
"lang": "en_US",
"name": "Hard disk inodes used",
"note": "If you store a lot of small files, you need to pay attention to inode usage."
}
]
},
{
"id": 0,
"uuid": 1717556327849317000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 inode 剩余量",
"unit": "none",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。",
"lang": "zh_CN",
"expression": "disk_inodes_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 inode 剩余量",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。"
},
{
"lang": "en_US",
"name": "Remaining amount of hard disk inode",
"note": "If you store a lot of small files, you need to pay attention to inode usage."
}
]
},
{
"id": 0,
"uuid": 1717556327851474000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 inode 总量",
"unit": "none",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。",
"lang": "zh_CN",
"expression": "disk_inodes_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 inode 总量",
"note": "如果存储了很多小文件,需要注意 inode 使用情况。"
},
{
"lang": "en_US",
"name": "Total amount of hard disk inode",
"note": "If you store a lot of small files, you need to pay attention to inode usage."
}
]
},
{
"id": 0,
"uuid": 1717556327853833000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 写请求平均耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(diskio_write_time[2m])/irate(diskio_writes[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 写请求平均耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Hard disk IO - average write request time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327856133000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 写请求等待处理的时间",
"unit": "milliseconds",
"note": "写入请求在磁盘上等待的时间。同时等待的多个写入请求会增加该数字。例如,如果 8 个请求均平均等待 1000 毫秒,则报告 8000。",
"lang": "zh_CN",
"expression": "diskio_write_time",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 写请求等待处理的时间",
"note": "写入请求在磁盘上等待的时间。同时等待的多个写入请求会增加该数字。例如,如果 8 个请求均平均等待 1000 毫秒,则报告 8000。"
},
{
"lang": "en_US",
"name": "Hard disk IO - time write requests wait to be processed",
"note": "The time write requests wait on the disk. Multiple write requests waiting simultaneously increase this number. For example, if 8 requests each wait an average of 1000 milliseconds, 8000 is reported."
}
]
},
{
"id": 0,
"uuid": 1717556327858277000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 加权 IO 处理时间",
"unit": "milliseconds",
"note": "只要设备在处理 IO,不管同时处理了几个,diskio_io_time 就会递增;同时处理的 IO 数量乘以时间就得到加权的值:diskio_weighted_io_time。",
"lang": "zh_CN",
"expression": "diskio_weighted_io_time",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 加权 IO 处理时间",
"note": "只要设备在处理 IO,不管同时处理了几个,diskio_io_time 就会递增;同时处理的 IO 数量乘以时间就得到加权的值:diskio_weighted_io_time。"
},
{
"lang": "en_US",
"name": "Hard disk IO - weighted IO processing time",
"note": "As long as the device is processing IO, no matter how many requests are processed at the same time, diskio_io_time is incremented; the number of simultaneously processed IOs multiplied by time gives the weighted value: diskio_weighted_io_time."
}
]
},
{
"id": 0,
"uuid": 1717556327860467000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 时间维度 Utilization",
"unit": "percent",
"note": "在时间维度统计硬盘 IO 时间占比,比如该值是 50%,表示有 50% 的时间是在处理 IO,该值 100%,表示一直在处理 IO,但是注意,现代磁盘设备具备并行处理多个 I/O 请求的能力,所以即便该值是 100%,可能硬盘还是可以接收新的处理请求。\n\n比如某人有两只手,最近 1 分钟一直在用单手劳动,从时间维度来看,利用率是 100%,但即便是 100%,再给他更多的活,他也能干,因为他还有一只手可用。",
"lang": "zh_CN",
"expression": "irate(diskio_io_time[2m])/1000*100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 时间维度 Utilization",
"note": "在时间维度统计硬盘 IO 时间占比,比如该值是 50%,表示有 50% 的时间是在处理 IO,该值 100%,表示一直在处理 IO,但是注意,现代磁盘设备具备并行处理多个 I/O 请求的能力,所以即便该值是 100%,可能硬盘还是可以接收新的处理请求。\n\n比如某人有两只手,最近 1 分钟一直在用单手劳动,从时间维度来看,利用率是 100%,但即便是 100%,再给他更多的活,他也能干,因为他还有一只手可用。"
},
{
"lang": "en_US",
"name": "Hard Disk IO-Time Dimension Utilization",
"note": "Measures the proportion of time the hard disk spends on IO. For example, a value of 50% means that 50% of the time was spent processing IO, and a value of 100% means the disk was processing IO the whole time. However, note that modern disk devices can process multiple I/O requests in parallel, so even if the value is 100%, the hard disk may still be able to accept new requests.\n\nFor example, someone has two hands and has been working with one hand for the last minute. From the time dimension, the utilization rate is 100%, but even at 100% he can take on more work, because he still has one hand available."
}
]
},
{
"id": 0,
"uuid": 1717556327862401000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 每秒写入字节数量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(diskio_write_bytes[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒写入字节数量",
"note": ""
},
{
"lang": "en_US",
"name": "Hard disk IO-bytes written per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327864213000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 每秒写次数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(diskio_writes{}[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒写次数",
"note": ""
},
{
"lang": "en_US",
"name": "Hard drive IO-writes per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327866432000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 每秒合并写请求次数",
"unit": "none",
"note": "相邻的读取和写入可能会为了效率而被合并。因此,在最终交给磁盘之前,两个4K的读取可能会变成一个8K的读取,因此它将被计数(和排队)为仅有一个I/O。这些字段让你知道这种情况发生的频率。",
"lang": "zh_CN",
"expression": "irate(diskio_merged_writes[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒合并写请求次数",
"note": "相邻的读取和写入可能会为了效率而被合并。因此,在最终交给磁盘之前,两个4K的读取可能会变成一个8K的读取,因此它将被计数(和排队)为仅有一个I/O。这些字段让你知道这种情况发生的频率。"
},
{
"lang": "en_US",
"name": "Hard Disk IO-Number of merged write requests per second",
"note": "Adjacent reads and writes may be merged for efficiency. So two 4K reads may turn into an 8K read before finally handing it over to disk, so it will be counted (and queued) to have only one I/O. These fields let you know how often this happens."
}
]
},
{
"id": 0,
"uuid": 1717556327868237000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 每秒合并读请求次数",
"unit": "none",
"note": "相邻的读取和写入可能会为了效率而被合并。因此,在最终交给磁盘之前,两个4K的读取可能会变成一个8K的读取,因此它将被计数(和排队)为仅有一个I/O。这些字段让你知道这种情况发生的频率。",
"lang": "zh_CN",
"expression": "irate(diskio_merged_reads[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒合并读请求次数",
"note": "相邻的读取和写入可能会为了效率而被合并。因此,在最终交给磁盘之前,两个4K的读取可能会变成一个8K的读取,因此它将被计数(和排队)为仅有一个I/O。这些字段让你知道这种情况发生的频率。"
},
{
"lang": "en_US",
"name": "Hard disk IO-merge read requests per second",
"note": "Adjacent reads and writes may be merged for efficiency. So two 4K reads may turn into an 8K read before finally handing it over to disk, so it will be counted (and queued) to have only one I/O. These fields let you know how often this happens."
}
]
},
{
"id": 0,
"uuid": 1717556327870357000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 每秒读取字节数量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(diskio_read_bytes[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒读取字节数量",
"note": ""
},
{
"lang": "en_US",
"name": "Hard Drive IO-bytes read per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327872238000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 每秒读次数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(diskio_reads{}[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒读次数",
"note": ""
},
{
"lang": "en_US",
"name": "Hard drive IO-Reads per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327874220000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 硬盘在处理 IO 的时间",
"unit": "milliseconds",
"note": "硬盘在处理 IO 的时间,单位是毫秒。假设最近 1 秒内有 500 毫秒是在处理 IO,这个值就要加 500,这是一个单调递增的 Counter 值,从操作系统启动之后就会一直涨。实际使用的时候,一般使用该指标计算硬盘 IO UTIL(时间维度的百分比)。",
"lang": "zh_CN",
"expression": "diskio_io_time",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 硬盘在处理 IO 的时间",
"note": "硬盘在处理 IO 的时间,单位是毫秒。假设最近 1 秒内有 500 毫秒是在处理 IO,这个值就要加 500,这是一个单调递增的 Counter 值,从操作系统启动之后就会一直涨。实际使用的时候,一般使用该指标计算硬盘 IO UTIL(时间维度的百分比)。"
},
{
"lang": "en_US",
"name": "Hard Disk IO-Time the hard disk takes to process IO",
"note": "The time it takes for the hard disk to process IO, in milliseconds. Assuming that 500 milliseconds are processing IO in the last 1 second, this value will be increased by 500. This is a monotonically increasing Counter value, which will continue to rise after the operating system is started. In actual use, this indicator is generally used to calculate the hard disk IO UTIL (percentage of time dimension)."
}
]
},
{
"id": 0,
"uuid": 1717556327876524000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 设备驱动程序正在处理的 IO 数量",
"unit": "none",
"note": "当前已经分配给设备驱动程序并且尚未完成的请求数量,Gauge 类型。如果 I/O 请求已经进入队列,但是尚未分配给设备驱动程序,则不统计在内。",
"lang": "zh_CN",
"expression": "diskio_iops_in_progress",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 设备驱动程序正在处理的 IO 数量",
"note": "当前已经分配给设备驱动程序并且尚未完成的请求数量,Gauge 类型。如果 I/O 请求已经进入队列,但是尚未分配给设备驱动程序,则不统计在内。"
},
{
"lang": "en_US",
"name": "Hard Disk IO-Number of IO being processed by the device driver",
"note": "The number of requests that have been currently assigned to the device driver and have not been completed, Gauge type. If I/O requests have entered the queue, but have not been assigned to the device driver, they are not counted."
}
]
},
{
"id": 0,
"uuid": 1717556327878372000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 读请求平均耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "irate(diskio_read_time[2m])/irate(diskio_reads[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 读请求平均耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Hard disk IO-Read request average time",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327880512000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 读请求等待处理的时间",
"unit": "milliseconds",
"note": "读取请求在磁盘上等待的时间。同时等待的多个读取请求会增加该数字。例如,如果 5 个请求均平均等待 100 毫秒,则报告 500。",
"lang": "zh_CN",
"expression": "diskio_read_time",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 读请求等待处理的时间",
"note": "读取请求在磁盘上等待的时间。同时等待的多个读取请求会增加该数字。例如,如果 5 个请求均平均等待 100 毫秒,则报告 500。"
},
{
"lang": "en_US",
"name": "Hard Drive IO-Time for read requests to be processed",
"note": "The time the read request waits on disk. Multiple read requests waiting simultaneously increase this number. For example, if 5 requests all wait an average of 100 milliseconds, 500 is reported."
}
]
},
{
"id": 0,
"uuid": 1717556327882423000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘 IO - 队列深度",
"unit": "none",
"note": "假设过去 1000 毫秒内有 5000 毫秒的加权等待时间,相当于平均有 5 个 IO 请求在等待,即队列深度为 5。",
"lang": "zh_CN",
"expression": "irate(diskio_weighted_io_time[1m])/1000",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 队列深度",
"note": "假设过去 1000 毫秒内有 5000 毫秒的加权等待时间,相当于平均有 5 个 IO 请求在等待,即队列深度为 5。"
},
{
"lang": "en_US",
"name": "Hard Disk IO-Queue Depth",
"note": "Assuming that there is a weighted waiting time of 5000 ms in the past 1000 ms, it is equivalent to an average of 5 IO requests waiting, that is, the queue depth is 5."
}
]
},
{
"id": 0,
"uuid": 1717556327884315000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘使用率",
"unit": "percent",
"note": "硬盘空间使用率。",
"lang": "zh_CN",
"expression": "disk_used_percent",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘使用率",
"note": "硬盘空间使用率。"
},
{
"lang": "en_US",
"name": "Hard Drive Usage",
"note": "Hard disk space usage."
}
]
},
{
"id": 0,
"uuid": 1717556327886520000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘使用量",
"unit": "bytesSI",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。",
"lang": "zh_CN",
"expression": "disk_used",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘使用量",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。"
},
{
"lang": "en_US",
"name": "Hard drive usage",
"note": "Use the SI standard to render data, consistent with the df command."
}
]
},
{
"id": 0,
"uuid": 1717556327888582000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘信息读取是否出错",
"unit": "none",
"note": "0 表示正常,没有出错;非 0 表示获取硬盘信息出错了。",
"lang": "zh_CN",
"expression": "disk_device_error",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘信息读取是否出错",
"note": "0 表示正常,没有出错;非 0 表示获取硬盘信息出错了。"
},
{
"lang": "en_US",
"name": "Is there an error in reading hard disk information",
"note": "0 means normal, no error; a non-zero value indicates that an error occurred while obtaining hard disk information."
}
]
},
{
"id": 0,
"uuid": 1717556327890542000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘剩余量",
"unit": "bytesSI",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。",
"lang": "zh_CN",
"expression": "disk_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘剩余量",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。"
},
{
"lang": "en_US",
"name": "Remaining hard disk",
"note": "Use the SI standard to render data, consistent with the df command."
}
]
},
{
"id": 0,
"uuid": 1717556327892398000,
"collector": "Categraf",
"typ": "Linux",
"name": "硬盘总量",
"unit": "bytesSI",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。",
"lang": "zh_CN",
"expression": "disk_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘总量",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。"
},
{
"lang": "en_US",
"name": "Total hard disk",
"note": "Use the SI standard to render data, consistent with the df command."
}
]
},
{
"id": 0,
"uuid": 1717556327894251000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统 CPU 核数",
"unit": "none",
"note": "CPU 逻辑核的数量。",
"lang": "zh_CN",
"expression": "system_n_cpus",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统 CPU 核数",
"note": "CPU 逻辑核的数量。"
},
{
"lang": "en_US",
"name": "Number of CPU cores",
"note": "Number of CPU logical cores."
}
]
},
{
"id": 0,
"uuid": 1717556327896436000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统启动时长",
"unit": "seconds",
"note": "操作系统启动了多久,单位:秒。",
"lang": "zh_CN",
"expression": "system_uptime",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统启动时长",
"note": "操作系统启动了多久,单位:秒。"
},
{
"lang": "en_US",
"name": "System startup time",
"note": "How long the operating system has been running, in seconds."
}
]
},
{
"id": 0,
"uuid": 1717556327898386000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统平均负载 - 最近 1 分钟",
"unit": "none",
"note": "取自 `/proc/loadavg`。",
"lang": "zh_CN",
"expression": "system_load1",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载 - 最近 1 分钟",
"note": "取自 `/proc/loadavg`。"
},
{
"lang": "en_US",
"name": "System load average-last 1 minute",
"note": "Taken from `/proc/loadavg`."
}
]
},
{
"id": 0,
"uuid": 1717556327900441000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统平均负载 - 最近 15 分钟",
"unit": "none",
"note": "取自 `/proc/loadavg`。",
"lang": "zh_CN",
"expression": "system_load15",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载 - 最近 15 分钟",
"note": "取自 `/proc/loadavg`。"
},
{
"lang": "en_US",
"name": "System load average-last 15 minutes",
"note": "Taken from `/proc/loadavg`."
}
]
},
{
"id": 0,
"uuid": 1717556327902331000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统平均负载 - 最近 5 分钟",
"unit": "none",
"note": "取自 `/proc/loadavg`。",
"lang": "zh_CN",
"expression": "system_load5",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载 - 最近 5 分钟",
"note": "取自 `/proc/loadavg`。"
},
{
"lang": "en_US",
"name": "System load average-last 5 minutes",
"note": "Taken from `/proc/loadavg`."
}
]
},
{
"id": 0,
"uuid": 1717556327904274000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统平均负载(单核) - 最近 1 分钟",
"unit": "none",
"note": "相当于 `system_load1 / system_n_cpus`",
"lang": "zh_CN",
"expression": "system_load_norm_1",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载(单核) - 最近 1 分钟",
"note": "相当于 `system_load1 / system_n_cpus`"
},
{
"lang": "en_US",
"name": "System Load Average (Single Core)-Last 1 Minute",
"note": "Equivalent to `system_load1 / system_n_cpus`"
}
]
},
{
"id": 0,
"uuid": 1717556327906176000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统平均负载(单核) - 最近 15 分钟",
"unit": "none",
"note": "相当于 `system_load15 / system_n_cpus`",
"lang": "zh_CN",
"expression": "system_load_norm_15",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载(单核) - 最近 15 分钟",
"note": "相当于 `system_load15 / system_n_cpus`"
},
{
"lang": "en_US",
"name": "System Load Average (Single Core)-Last 15 Minutes",
"note": "Equivalent to `system_load15 / system_n_cpus`"
}
]
},
{
"id": 0,
"uuid": 1717556327907960000,
"collector": "Categraf",
"typ": "Linux",
"name": "系统平均负载(单核) - 最近 5 分钟",
"unit": "none",
"note": "相当于 `system_load5 / system_n_cpus`",
"lang": "zh_CN",
"expression": "system_load_norm_5",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载(单核) - 最近 5 分钟",
"note": "相当于 `system_load5 / system_n_cpus`"
},
{
"lang": "en_US",
"name": "System Load Average (Single Core)-Last 5 Minutes",
"note": "Equivalent to `system_load5 / system_n_cpus`"
}
]
},
{
"id": 0,
"uuid": 1717556327909750000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡入方向(接收)每秒丢弃的数据包个数",
"unit": "none",
"note": "原始指标 net_drop_in 表示操作系统启动之后各个网卡入方向(接收)丢弃的数据包总数。",
"lang": "zh_CN",
"expression": "irate(net_drop_in[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡入方向(接收)每秒丢弃的数据包个数",
"note": "原始指标 net_drop_in 表示操作系统启动之后各个网卡入方向(接收)丢弃的数据包总数。"
},
{
"lang": "en_US",
"name": "Number of packets dropped per second in the incoming direction (receiving) of the network card",
"note": "The raw metric net_drop_in is the total number of packets dropped in the incoming (receive) direction by each network card since the operating system started."
}
]
},
{
"id": 0,
"uuid": 1717556327911515000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡入方向(接收)每秒数据包数",
"unit": "none",
"note": "原始指标 net_packets_recv 表示操作系统启动之后各个网卡入方向(接收)数据包总数。",
"lang": "zh_CN",
"expression": "irate(net_packets_recv[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡入方向(接收)每秒数据包数",
"note": "原始指标 net_packets_recv 表示操作系统启动之后各个网卡入方向(接收)数据包总数。"
},
{
"lang": "en_US",
"name": "NIC incoming (receiving) packets per second",
"note": "The raw metric net_packets_recv is the total number of packets in the incoming (receive) direction of each network card since the operating system started."
}
]
},
{
"id": 0,
"uuid": 1717556327913386000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡入方向(接收)每秒错包数",
"unit": "none",
"note": "原始指标 net_err_in 表示操作系统启动之后各个网卡入方向(接收)错包总数。",
"lang": "zh_CN",
"expression": "irate(net_err_in[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡入方向(接收)每秒错包数",
"note": "原始指标 net_err_in 表示操作系统启动之后各个网卡入方向(接收)错包总数。"
},
{
"lang": "en_US",
"name": "Number of wrong packets per second in the incoming direction (receiving) of the network card",
"note": "The raw metric net_err_in is the total number of error packets in the incoming (receive) direction of each network card since the operating system started."
}
]
},
{
"id": 0,
"uuid": 1717556327915259000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡出方向(发送)每秒丢弃的数据包个数",
"unit": "none",
"note": "原始指标 net_drop_out 表示操作系统启动之后各个网卡出方向(发送)丢弃的数据包总数。",
"lang": "zh_CN",
"expression": "irate(net_drop_out[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡出方向(发送)每秒丢弃的数据包个数",
"note": "原始指标 net_drop_out 表示操作系统启动之后各个网卡出方向(发送)丢弃的数据包总数。"
},
{
"lang": "en_US",
"name": "Number of packets discarded per second in the outbound direction (sending) of the network card",
"note": "The raw metric net_drop_out is the total number of packets dropped in the outgoing (send) direction by each network card since the operating system started."
}
]
},
{
"id": 0,
"uuid": 1717556327917341000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡出方向(发送)每秒数据包数",
"unit": "none",
"note": "原始指标 net_packets_sent 表示操作系统启动之后各个网卡出方向(发送)数据包总数。",
"lang": "zh_CN",
"expression": "irate(net_packets_sent[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡出方向(发送)每秒数据包数",
"note": "原始指标 net_packets_sent 表示操作系统启动之后各个网卡出方向(发送)数据包总数。"
},
{
"lang": "en_US",
"name": "Number of packets per second in the outgoing direction (sent) of the network card",
"note": "The raw metric net_packets_sent is the total number of packets in the outgoing (send) direction of each network card since the operating system started."
}
]
},
{
"id": 0,
"uuid": 1717556327919292000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡出方向(发送)每秒错包数",
"unit": "none",
"note": "原始指标 net_err_out 表示操作系统启动之后各个网卡出方向(发送)错包总数。",
"lang": "zh_CN",
"expression": "irate(net_err_out[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡出方向(发送)每秒错包数",
"note": "原始指标 net_err_out 表示操作系统启动之后各个网卡出方向(发送)错包总数。"
},
{
"lang": "en_US",
"name": "Number of wrong packets per second in the outgoing direction (sending) of the network card",
"note": "The raw metric net_err_out is the total number of error packets in the outgoing (send) direction of each network card since the operating system started."
}
]
},
{
"id": 0,
"uuid": 1717556327921364000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡每秒发送的 bit 量",
"unit": "bitsSecIEC",
"note": "原始指标 net_bytes_sent 表示操作系统启动之后发送的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。",
"lang": "zh_CN",
"expression": "irate(net_bytes_sent[2m]) * 8",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡每秒发送的 bit 量",
"note": "原始指标 net_bytes_sent 表示操作系统启动之后发送的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。"
},
{
"lang": "en_US",
"name": "The amount of bits sent per second by the network card",
"note": "The raw metric net_bytes_sent is the total number of bytes sent since the operating system started. Because network traffic is conventionally measured in bits, the expression converts bytes to bits."
}
]
},
{
"id": 0,
"uuid": 1717556327923797000,
"collector": "Categraf",
"typ": "Linux",
"name": "网卡每秒接收的 bit 量",
"unit": "bitsSecIEC",
"note": "原始指标 net_bytes_recv 表示操作系统启动之后接收的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。",
"lang": "zh_CN",
"expression": "irate(net_bytes_recv[2m]) * 8",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡每秒接收的 bit 量",
"note": "原始指标 net_bytes_recv 表示操作系统启动之后接收的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。"
},
{
"lang": "en_US",
"name": "The number of bits received by the network card per second",
"note": "The raw metric net_bytes_recv is the total number of bytes received since the operating system started. Because network traffic is conventionally measured in bits, the expression converts bytes to bits."
}
]
},
{
"id": 0,
"uuid": 1717556327925641000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - “僵死”状态的进程数",
"unit": "none",
"note": "处于“僵死”状态的进程数,它是由 Linux 上的 X 状态代码表示的。",
"lang": "zh_CN",
"expression": "processes_dead",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - “僵死”状态的进程数",
"note": "处于“僵死”状态的进程数,它是由 Linux 上的 X 状态代码表示的。"
},
{
"lang": "en_US",
"name": "Number of processes-Number of processes in the \"dead\" state",
"note": "The number of processes in the \"dead\" state, which is represented by the X status code on Linux."
}
]
},
{
"id": 0,
"uuid": 1717556327928108000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - “分页”状态的进程数",
"unit": "none",
"note": "处于“分页”状态的进程数,它是由 Linux 上的 W 状态代码表示的。",
"lang": "zh_CN",
"expression": "processes_paging",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - “分页”状态的进程数",
"note": "处于“分页”状态的进程数,它是由 Linux 上的 W 状态代码表示的。"
},
{
"lang": "en_US",
"name": "Number of Processes-Number of processes in the \"Paging\" state",
"note": "The number of processes in the \"paging\" state, which is represented by the W status code on Linux."
}
]
},
{
"id": 0,
"uuid": 1717556327930025000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 停止的进程数",
"unit": "none",
"note": "停止的进程数,由 T 状态代码表示。",
"lang": "zh_CN",
"expression": "processes_stopped",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 停止的进程数",
"note": "停止的进程数,由 T 状态代码表示。"
},
{
"lang": "en_US",
"name": "Number of Processes-Number of stopped processes",
"note": "The number of processes stopped, represented by the T status code."
}
]
},
{
"id": 0,
"uuid": 1717556327931736000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 僵尸进程数",
"unit": "none",
"note": "僵尸进程数,由 Z 状态代码表示。",
"lang": "zh_CN",
"expression": "processes_zombies",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 僵尸进程数",
"note": "僵尸进程数,由 Z 状态代码表示。"
},
{
"lang": "en_US",
"name": "Number of processes-Number of zombie processes",
"note": "Number of zombie processes, represented by the Z status code."
}
]
},
{
"id": 0,
"uuid": 1717556327933660000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 总数",
"unit": "none",
"note": "实例上的总进程数。",
"lang": "zh_CN",
"expression": "processes_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 总数",
"note": "实例上的总进程数。"
},
{
"lang": "en_US",
"name": "Number of processes-total",
"note": "The total number of processes on the instance."
}
]
},
{
"id": 0,
"uuid": 1717556327935470000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 未知状态的进程数",
"unit": "none",
"note": "未知状态的进程数。",
"lang": "zh_CN",
"expression": "processes_unknown",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 未知状态的进程数",
"note": "未知状态的进程数。"
},
{
"lang": "en_US",
"name": "Number of Processes-Number of processes with unknown status",
"note": "Number of processes with unknown status."
}
]
},
{
"id": 0,
"uuid": 1717556327937676000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 正在睡眠的进程数",
"unit": "none",
"note": "睡眠的进程数,由 S 状态代码表示。",
"lang": "zh_CN",
"expression": "processes_sleeping",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 正在睡眠的进程数",
"note": "睡眠的进程数,由 S 状态代码表示。"
},
{
"lang": "en_US",
"name": "Number of processes-Number of processes that are sleeping",
"note": "Number of sleeping processes, represented by the S status code."
}
]
},
{
"id": 0,
"uuid": 1717556327939548000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 正在运行的进程数",
"unit": "none",
"note": "正在运行的进程数,由 R 状态代码表示。",
"lang": "zh_CN",
"expression": "processes_running",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 正在运行的进程数",
"note": "正在运行的进程数,由 R 状态代码表示。"
},
{
"lang": "en_US",
"name": "Number of Processes-Number of processes running",
"note": "The number of processes running, represented by the R status code."
}
]
},
{
"id": 0,
"uuid": 1717556327941412000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 空闲的数量",
"unit": "none",
"note": "处于空闲状态 (睡眠超过 20 秒) 的进程数。",
"lang": "zh_CN",
"expression": "processes_idle",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 空闲的数量",
"note": "处于空闲状态 (睡眠超过 20 秒) 的进程数。"
},
{
"lang": "en_US",
"name": "Number of processes-Number of idle",
"note": "Number of processes that are idle (sleeping for more than 20 seconds)."
}
]
},
{
"id": 0,
"uuid": 1717556327943375000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 线程总数",
"unit": "none",
"note": "组成进程的总线程数。该指标仅适用于 Linux 实例。",
"lang": "zh_CN",
"expression": "processes_total_threads",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 线程总数",
"note": "组成进程的总线程数。该指标仅适用于 Linux 实例。"
},
{
"lang": "en_US",
"name": "Number of Processes-Total Threads",
"note": "The total number of threads that make up the processes. This metric applies only to Linux instances."
}
]
},
{
"id": 0,
"uuid": 1717556327945242000,
"collector": "Categraf",
"typ": "Linux",
"name": "进程数 - 被阻止的进程数",
"unit": "none",
"note": "被阻止的进程数量。",
"lang": "zh_CN",
"expression": "processes_blocked",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数 - 被阻止的进程数",
"note": "被阻止的进程数量。"
},
{
"lang": "en_US",
"name": "Number of processes-Number of blocked processes",
"note": "Number of blocked processes."
}
]
}
]
================================================
FILE: integrations/Linux/metrics/exporter-base.json
================================================
[
{
"id": 0,
"uuid": 1717556327947680000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU Steal 时间占比(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"steal\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU Steal 时间占比(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "CPU Steal time ratio (average of the whole machine)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327949505000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU 内核态时间占比(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"system\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 内核态时间占比(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "CPU kernel mode time ratio (average of the whole machine)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327951300000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU 利用率(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( 1 - irate(node_cpu_seconds_total{mode=\"idle\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 利用率(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "CPU utilization (machine average)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327954250000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU 用户态时间占比(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"user\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 用户态时间占比(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "CPU user mode time ratio (average of the whole machine)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327957318000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU 硬中断时间占比(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"irq\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 硬中断时间占比(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "Proportion of CPU hard interrupt time (average of the whole machine)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327959882000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU 空闲率(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"idle\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 空闲率(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "CPU idle rate (overall machine average)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327962946000,
"collector": "Exporter",
"typ": "Linux",
"name": "CPU 软中断时间占比(整机平均)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"softirq\"}[2m]) ) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "CPU 软中断时间占比(整机平均)",
"note": ""
},
{
"lang": "en_US",
"name": "Proportion of CPU soft interrupt time (average of the whole machine)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327965694000,
"collector": "Exporter",
"typ": "Linux",
"name": "交换空间使用率",
"unit": "percent",
"note": "交换空间使用率。计算原子取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes)/node_memory_SwapTotal_bytes * 100 and node_memory_SwapTotal_bytes > 0",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间使用率",
"note": "交换空间使用率。计算原子取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Swap space usage",
"note": "Swap space usage. The underlying values are taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327968158000,
"collector": "Exporter",
"typ": "Linux",
"name": "交换空间总量",
"unit": "bytesIEC",
"note": "交换空间总量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_SwapTotal_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间总量",
"note": "交换空间总量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Total swap space",
"note": "Total amount of swap space. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327970655000,
"collector": "Exporter",
"typ": "Linux",
"name": "交换空间空闲量",
"unit": "bytesIEC",
"note": "交换空间空闲量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_SwapFree_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "交换空间空闲量",
"note": "交换空间空闲量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Swap space free amount",
"note": "Amount of free swap space. Taken from `/proc/meminfo`."
}
]
},
{
"id": 0,
"uuid": 1717556327973281000,
"collector": "Exporter",
"typ": "Linux",
"name": "内存 Buffered 量",
"unit": "bytesIEC",
"note": "用作缓冲区的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_Buffers_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存 Buffered 量",
"note": "用作缓冲区的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Memory Buffered amount",
"note": "The amount of memory used as a buffer. Taken from `/proc/meminfo `."
}
]
},
{
"id": 0,
"uuid": 1717556327975341000,
"collector": "Exporter",
"typ": "Linux",
"name": "内存 Cached 量",
"unit": "bytesIEC",
"note": "用作文件缓存的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_Cached_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存 Cached 量",
"note": "用作文件缓存的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Memory Cached amount",
"note": "The amount of memory used as file cache. Taken from `/proc/meminfo `."
}
]
},
{
"id": 0,
"uuid": 1717556327977268000,
"collector": "Exporter",
"typ": "Linux",
"name": "内存使用率(基于MemAvailable)",
"unit": "percent",
"note": "内存使用率。基于 MemAvailable 计算更准确,但是老版本的 Linux 不支持。",
"lang": "zh_CN",
"expression": "(1 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{})) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存使用率(基于MemAvailable)",
"note": "内存使用率。基于 MemAvailable 计算更准确,但是老版本的 Linux 不支持。"
},
{
"lang": "en_US",
"name": "Memory usage (based on MemAvailable)",
"note": "Memory usage. Calculation based on MemAvailable is more accurate, but older versions of Linux do not support it."
}
]
},
{
"id": 0,
"uuid": 1717556327979319000,
"collector": "Exporter",
"typ": "Linux",
"name": "内存可用量",
"unit": "bytesIEC",
"note": "可以立即分配给进程的可用内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_MemAvailable_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存可用量",
"note": "可以立即分配给进程的可用内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Memory Availability",
"note": "The amount of available memory that can be immediately allocated to a process. Taken from `/proc/meminfo `."
}
]
},
{
"id": 0,
"uuid": 1717556327981865000,
"collector": "Exporter",
"typ": "Linux",
"name": "内存总量",
"unit": "bytesIEC",
"note": "内存总量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_MemTotal_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存总量",
"note": "内存总量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Total memory",
"note": "Total amount of memory. Taken from `/proc/meminfo `."
}
]
},
{
"id": 0,
"uuid": 1717556327984422000,
"collector": "Exporter",
"typ": "Linux",
"name": "内存空闲量",
"unit": "bytesIEC",
"note": "未使用的内存量。取自 `/proc/meminfo`。",
"lang": "zh_CN",
"expression": "node_memory_MemFree_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "内存空闲量",
"note": "未使用的内存量。取自 `/proc/meminfo`。"
},
{
"lang": "en_US",
"name": "Free memory amount",
"note": "Amount of unused memory. Taken from `/proc/meminfo `."
}
]
},
{
"id": 0,
"uuid": 1717556327986746000,
"collector": "Exporter",
"typ": "Linux",
"name": "文件句柄 - 已分配占比",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "node_filefd_allocated / node_filefd_maximum * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "文件句柄 - 已分配占比",
"note": ""
},
{
"lang": "en_US",
"name": "File handle-allocated proportion",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327988887000,
"collector": "Exporter",
"typ": "Linux",
"name": "文件句柄 - 已分配量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "node_filefd_allocated",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "文件句柄 - 已分配量",
"note": ""
},
{
"lang": "en_US",
"name": "File Handle-Amount Allocated",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327991118000,
"collector": "Exporter",
"typ": "Linux",
"name": "文件句柄 - 总可分配量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "node_filefd_maximum",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "文件句柄 - 总可分配量",
"note": ""
},
{
"lang": "en_US",
"name": "File handle-total allocable quantity",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327993245000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘 IO - 时间维度 Utilization",
"unit": "percent",
"note": "在时间维度统计硬盘 IO 时间占比,比如该值是 50%,表示有 50% 的时间是在处理 IO,该值 100%,表示一直在处理 IO,但是注意,现代磁盘设备具备并行处理多个 I/O 请求的能力,所以即便该值是 100%,可能硬盘还是可以接收新的处理请求。\n\n比如某人有两只手,最近 1 分钟一直在用单手劳动,从时间维度来看,利用率是 100%,但即便是 100%,再给他更多的活,他也能干,因为他还有一只手可用。",
"lang": "zh_CN",
"expression": "irate(node_disk_io_time_seconds_total[2m]) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 时间维度 Utilization",
"note": "在时间维度统计硬盘 IO 时间占比,比如该值是 50%,表示有 50% 的时间是在处理 IO,该值 100%,表示一直在处理 IO,但是注意,现代磁盘设备具备并行处理多个 I/O 请求的能力,所以即便该值是 100%,可能硬盘还是可以接收新的处理请求。\n\n比如某人有两只手,最近 1 分钟一直在用单手劳动,从时间维度来看,利用率是 100%,但即便是 100%,再给他更多的活,他也能干,因为他还有一只手可用。"
},
{
"lang": "en_US",
"name": "Hard Disk IO-Time Dimension Utilization",
"note": "Count the proportion of hard disk IO time in the time dimension. For example, if the value is 50%, it means that 50% of the time is processing IO, and if the value is 100%, it means that IO has been processing all the time. However, note that modern disk devices have the ability to process multiple I/O requests in parallel, so even if the value is 100%, the hard disk may still be able to receive new processing requests. \n \nFor example, someone has two hands and has been working with one hand in the last minute. From the time dimension, the utilization rate is 100%, but even if it is 100%, he can do it if he is given more work, because he still has one hand available."
}
]
},
{
"id": 0,
"uuid": 1717556327995342000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘 IO - 每秒写入字节数量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(node_disk_written_bytes_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒写入字节数量",
"note": ""
},
{
"lang": "en_US",
"name": "Hard disk IO-bytes written per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556327997897000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘 IO - 每秒写次数",
"unit": "none",
"note": "每秒写次数",
"lang": "zh_CN",
"expression": "irate(node_disk_writes_completed_total{}[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒写次数",
"note": "每秒写次数"
},
{
"lang": "en_US",
"name": "Hard drive IO-writes per second",
"note": "Writes per second"
}
]
},
{
"id": 0,
"uuid": 1717556328000186000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘 IO - 每秒读取字节数量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(node_disk_read_bytes_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒读取字节数量",
"note": ""
},
{
"lang": "en_US",
"name": "Hard Drive IO-bytes read per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328002202000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘 IO - 每秒读次数",
"unit": "none",
"note": "每秒读次数",
"lang": "zh_CN",
"expression": "irate(node_disk_reads_completed_total{}[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘 IO - 每秒读次数",
"note": "每秒读次数"
},
{
"lang": "en_US",
"name": "Hard drive IO-Reads per second",
"note": "Reads per second"
}
]
},
{
"id": 0,
"uuid": 1717556328004428000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘使用率",
"unit": "percent",
"note": "硬盘空间使用率。",
"lang": "zh_CN",
"expression": "100 - ((node_filesystem_avail_bytes{} * 100) / node_filesystem_size_bytes{})",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘使用率",
"note": "硬盘空间使用率。"
},
{
"lang": "en_US",
"name": "Hard Drive Usage",
"note": "Hard disk space usage."
}
]
},
{
"id": 0,
"uuid": 1717556328006453000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘剩余量",
"unit": "bytesSI",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。",
"lang": "zh_CN",
"expression": "node_filesystem_free_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘剩余量",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。"
},
{
"lang": "en_US",
"name": "Remaining hard disk",
"note": "Use the SI standard to render data, consistent with the df command."
}
]
},
{
"id": 0,
"uuid": 1717556328008355000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘可用量",
"unit": "bytesSI",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。",
"lang": "zh_CN",
"expression": "node_filesystem_avail_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘可用量",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。"
},
{
"lang": "en_US",
"name": "Hard Drive Availability",
"note": "Use the SI standard to render data, consistent with the df command."
}
]
},
{
"id": 0,
"uuid": 1717556328010578000,
"collector": "Exporter",
"typ": "Linux",
"name": "硬盘总量",
"unit": "bytesSI",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。",
"lang": "zh_CN",
"expression": "node_filesystem_size_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "硬盘总量",
"note": "使用 SI 标准渲染数据,和 df 命令保持一致。"
},
{
"lang": "en_US",
"name": "Total hard disk",
"note": "Use the SI standard to render data, consistent with the df command."
}
]
},
{
"id": 0,
"uuid": 1717556328012604000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统 CPU 核数",
"unit": "none",
"note": "CPU 逻辑核的数量。",
"lang": "zh_CN",
"expression": "count(node_cpu_seconds_total{mode=\"idle\"}) without (cpu, mode)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统 CPU 核数",
"note": "CPU 逻辑核的数量。"
},
{
"lang": "en_US",
"name": "Number of CPU cores",
"note": "Number of CPU logical cores."
}
]
},
{
"id": 0,
"uuid": 1717556328014325000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统平均负载 - 最近 1 分钟",
"unit": "none",
"note": "取自 `/proc/loadavg`。",
"lang": "zh_CN",
"expression": "node_load1",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载 - 最近 1 分钟",
"note": "取自 `/proc/loadavg`。"
},
{
"lang": "en_US",
"name": "System load average-last 1 minute",
"note": "Taken from `/proc/loadavg `."
}
]
},
{
"id": 0,
"uuid": 1717556328016440000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统平均负载 - 最近 15 分钟",
"unit": "none",
"note": "取自 `/proc/loadavg`。",
"lang": "zh_CN",
"expression": "node_load15",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载 - 最近 15 分钟",
"note": "取自 `/proc/loadavg`。"
},
{
"lang": "en_US",
"name": "System load average-last 15 minutes",
"note": "Taken from `/proc/loadavg `."
}
]
},
{
"id": 0,
"uuid": 1717556328018361000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统平均负载 - 最近 5 分钟",
"unit": "none",
"note": "取自 `/proc/loadavg`。",
"lang": "zh_CN",
"expression": "node_load5",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载 - 最近 5 分钟",
"note": "取自 `/proc/loadavg`。"
},
{
"lang": "en_US",
"name": "System load average-last 5 minutes",
"note": "Taken from `/proc/loadavg `."
}
]
},
{
"id": 0,
"uuid": 1717556328020507000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统平均负载(单核) - 最近 1 分钟",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "node_load1 / count(node_cpu_seconds_total{mode=\"idle\"}) without (cpu, mode)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载(单核) - 最近 1 分钟",
"note": ""
},
{
"lang": "en_US",
"name": "System Load Average (Single Core)-Last 1 Minute",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328022263000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统平均负载(单核) - 最近 15 分钟",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "node_load15 / count(node_cpu_seconds_total{mode=\"idle\"}) without (cpu, mode)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载(单核) - 最近 15 分钟",
"note": ""
},
{
"lang": "en_US",
"name": "System Load Average (Single Core)-Last 15 Minutes",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328024125000,
"collector": "Exporter",
"typ": "Linux",
"name": "系统平均负载(单核) - 最近 5 分钟",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "node_load5 / count(node_cpu_seconds_total{mode=\"idle\"}) without (cpu, mode)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "系统平均负载(单核) - 最近 5 分钟",
"note": ""
},
{
"lang": "en_US",
"name": "System Load Average (Single Core)-Last 5 Minutes",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328026028000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡入方向(接收)每秒丢弃的数据包个数",
"unit": "none",
"note": "原始指标 node_network_receive_drop_total 表示操作系统启动之后各个网卡入方向(接收)丢弃的数据包总数。",
"lang": "zh_CN",
"expression": "irate(node_network_receive_drop_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡入方向(接收)每秒丢弃的数据包个数",
"note": "原始指标 node_network_receive_drop_total 表示操作系统启动之后各个网卡入方向(接收)丢弃的数据包总数。"
},
{
"lang": "en_US",
"name": "Number of packets dropped per second in the incoming direction (receiving) of the network card",
"note": "The original indicator node _ network _ receive _ drop _ total indicates the total number of packets dropped (received) by each network card after the operating system starts."
}
]
},
{
"id": 0,
"uuid": 1717556328027875000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡入方向(接收)每秒数据包数",
"unit": "none",
"note": "原始指标 node_network_receive_packets_total 表示操作系统启动之后各个网卡入方向(接收)数据包总数。",
"lang": "zh_CN",
"expression": "irate(node_network_receive_packets_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡入方向(接收)每秒数据包数",
"note": "原始指标 node_network_receive_packets_total 表示操作系统启动之后各个网卡入方向(接收)数据包总数。"
},
{
"lang": "en_US",
"name": "NIC incoming (receiving) packets per second",
"note": "The original indicator node _ network _ receive _ packets _ total indicates the total number of data packets incoming (received) by each network card after the operating system is booted."
}
]
},
{
"id": 0,
"uuid": 1717556328029862000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡入方向(接收)每秒错包数",
"unit": "none",
"note": "原始指标 node_network_receive_errs_total 表示操作系统启动之后各个网卡入方向(接收)错包总数。",
"lang": "zh_CN",
"expression": "irate(node_network_receive_errs_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡入方向(接收)每秒错包数",
"note": "原始指标 node_network_receive_errs_total 表示操作系统启动之后各个网卡入方向(接收)错包总数。"
},
{
"lang": "en_US",
"name": "Number of wrong packets per second in the incoming direction (receiving) of the network card",
"note": "The original indicator node _ network _ receive _ errs _ total indicates the total number of error packets incoming (received) by each network card after the operating system is started."
}
]
},
{
"id": 0,
"uuid": 1717556328032074000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡出方向(发送)每秒丢弃的数据包个数",
"unit": "none",
"note": "原始指标 node_network_transmit_drop_total 表示操作系统启动之后各个网卡出方向(发送)丢弃的数据包总数。",
"lang": "zh_CN",
"expression": "irate(node_network_transmit_drop_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡出方向(发送)每秒丢弃的数据包个数",
"note": "原始指标 node_network_transmit_drop_total 表示操作系统启动之后各个网卡出方向(发送)丢弃的数据包总数。"
},
{
"lang": "en_US",
"name": "Number of packets discarded per second in the outbound direction (sending) of the network card",
"note": "The original indicator node _ network _ transmit _ drop _ total indicates the total number of packets discarded (sent) by each network card after the operating system starts."
}
]
},
{
"id": 0,
"uuid": 1717556328033909000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡出方向(发送)每秒数据包数",
"unit": "none",
"note": "原始指标 node_network_transmit_packets_total 表示操作系统启动之后各个网卡出方向(发送)数据包总数。",
"lang": "zh_CN",
"expression": "irate(node_network_transmit_packets_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡出方向(发送)每秒数据包数",
"note": "原始指标 node_network_transmit_packets_total 表示操作系统启动之后各个网卡出方向(发送)数据包总数。"
},
{
"lang": "en_US",
"name": "Number of packets per second in the outgoing direction (sent) of the network card",
"note": "The original indicator node _ network _ transmit _ packets _ total indicates the total number of outbound (sent) data packets from each network card after the operating system is started."
}
]
},
{
"id": 0,
"uuid": 1717556328035721000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡出方向(发送)每秒错包数",
"unit": "none",
"note": "原始指标 node_network_transmit_errs_total 表示操作系统启动之后各个网卡出方向(发送)错包总数。",
"lang": "zh_CN",
"expression": "irate(node_network_transmit_errs_total[2m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡出方向(发送)每秒错包数",
"note": "原始指标 node_network_transmit_errs_total 表示操作系统启动之后各个网卡出方向(发送)错包总数。"
},
{
"lang": "en_US",
"name": "Number of wrong packets per second in the outgoing direction (sending) of the network card",
"note": "The original indicator node _ network _ transmit _ errs _ total indicates the total number of error packets sent out (sent) by each network card after the operating system is started."
}
]
},
{
"id": 0,
"uuid": 1717556328037688000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡每秒发送的 bit 量",
"unit": "bitsSecIEC",
"note": "原始指标 node_network_transmit_bytes_total 表示操作系统启动之后发送的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。",
"lang": "zh_CN",
"expression": "irate(node_network_transmit_bytes_total[2m]) * 8",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡每秒发送的 bit 量",
"note": "原始指标 node_network_transmit_bytes_total 表示操作系统启动之后发送的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。"
},
{
"lang": "en_US",
"name": "The amount of bits sent per second by the network card",
"note": "The original indicator node _ network _ transmit _ bytes _ total represents the total number of bytes sent after the operating system starts. Because the network card traffic is used to using bits as a unit, it is converted in the expression."
}
]
},
{
"id": 0,
"uuid": 1717556328039789000,
"collector": "Exporter",
"typ": "Linux",
"name": "网卡每秒接收的 bit 量",
"unit": "bitsSecIEC",
"note": "原始指标 node_network_receive_bytes_total 表示操作系统启动之后接收的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。",
"lang": "zh_CN",
"expression": "irate(node_network_receive_bytes_total[2m]) * 8",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "网卡每秒接收的 bit 量",
"note": "原始指标 node_network_receive_bytes_total 表示操作系统启动之后接收的 byte 总量,因为网卡流量习惯使用 bit 作为单位,所以在表达式中做了换算。"
},
{
"lang": "en_US",
"name": "The number of bits received by the network card per second",
"note": "The original indicator node _ network _ received _ bytes _ total represents the total number of bytes received after the operating system is started. Because the network card traffic is used to using bits as a unit, it is converted in the expression."
}
]
}
]
================================================
FILE: integrations/Logstash/collect/logstash/logstash.toml
================================================
# # collect interval
# interval = 15
# Read metrics exposed by Logstash
[[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# append labels
# labels = { instance="x" }
## The URL of the exposed Logstash API endpoint.
# url = "http://127.0.0.1:9600"
url = ""
## Use Logstash 5 single pipeline API, set to true when monitoring
## Logstash 5.
# single_pipeline = false
## Enable optional collection components. Can contain
## "pipelines", "process", and "jvm".
# collect = ["pipelines", "process", "jvm"]
## Timeout for HTTP requests.
# timeout = "5s"
## Optional HTTP Basic Auth credentials.
# username = "username"
# password = "pa$$word"
## Optional HTTP headers.
# [inputs.logstash.headers]
# "X-Special-Header" = "Special-Value"
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/Logstash/dashboards/logstash-dash.json
================================================
{
"id": 0,
"group_id": 0,
"name": "logstash",
"ident": "",
"tags": "logging Prometheus",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "9c88279f-7f15-4527-aa96-7df23cff9562",
"layout": {
"h": 1,
"i": "9c88279f-7f15-4527-aa96-7df23cff9562",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Events",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a8c3db4a-708d-405f-b788-150537e2c2e4",
"layout": {
"h": 4,
"i": "a8c3db4a-708d-405f-b788-150537e2c2e4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"name": "Event Received Rate(/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(logstash_events_in[1m])) by (pipeline)",
"legend": "{{pipeline}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "34fbd572-c71a-475b-91a9-168df988cec3",
"layout": {
"h": 4,
"i": "1e465bd5-c3ed-4f93-8df7-a980299805b4",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"name": "Event emitted Rate(/s)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(logstash_events_out[1m])) by (pipeline)",
"legend": "output",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1bb9f546-bd57-48fd-8221-b8b43687a12a",
"layout": {
"h": 4,
"i": "3c350ca9-5fcd-4605-9fee-c9834d7bb3b5",
"isResizable": true,
"w": 12,
"x": 0,
"y": 5
},
"name": "Event queue(ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(rate(logstash_events_queue_push_duration_in_millis[1m])) by (pipeline))/(count(sum(logstash_events_queue_push_duration_in_millis[1m]) by(ident)))",
"legend": "{{pipeline}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "99f557e3-d368-4de1-b065-44e2006f1d21",
"layout": {
"h": 4,
"i": "1b808724-fbac-4b9b-b7ce-a5725e30e4f8",
"isResizable": true,
"w": 12,
"x": 12,
"y": 5
},
"name": "Event esp(ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(logstash_events_duration_in_millis[1m] )/rate(logstash_events_out[1m])) by(pipeline)",
"legend": "{{pipeline}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "5f12a28c-9ca4-4f10-aa0c-116dff2193cf",
"layout": {
"h": 1,
"i": "5f12a28c-9ca4-4f10-aa0c-116dff2193cf",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "memory",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "57867483-a372-45c9-9cd0-53aa160b57fb",
"layout": {
"h": 4,
"i": "f170fe2a-fd1e-49f0-aa7b-0c3ac42894e6",
"isResizable": true,
"w": 12,
"x": 0,
"y": 10
},
"name": "jvm gc count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(logstash_jvm_gc_collectors_old_collection_count)",
"legend": "old gc - {{ident}}",
"refId": "A"
},
{
"expr": "increase(logstash_jvm_gc_collectors_young_collection_count)",
"legend": "young gc - {{ident}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "e61be63c-81d6-4aab-a4de-d23ea444d802",
"layout": {
"h": 4,
"i": "9ccb2af0-fa05-4a66-814d-53b77e320303",
"isResizable": true,
"w": 12,
"x": 12,
"y": 10
},
"name": "jvm gc time(ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(logstash_jvm_gc_collectors_old_collection_time_in_millis)",
"legend": "old gc - {{ident}}",
"refId": "A"
},
{
"expr": "increase(logstash_jvm_gc_collectors_young_collection_time_in_millis)",
"legend": "yougn gc - {{ident}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.29,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "86329740-9696-4435-bd28-120107914b40",
"layout": {
"h": 4,
"i": "ab5fd3bd-81c8-4f3f-a403-6111dd6fce43",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"name": "jvm heap ",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "logstash_jvm_mem_heap_max_in_bytes",
"legend": " max - {{ident}}",
"refId": "A"
},
{
"expr": "logstash_jvm_mem_heap_used_in_bytes",
"legend": "used - {{ident}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.29,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8d82f269-d625-4da9-8dfd-33a0888c396d",
"layout": {
"h": 4,
"i": "ced26104-67f8-4537-b564-038d824f3be1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"name": "jvm no heap ",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "logstash_jvm_mem_non_heap_committed_in_bytes",
"legend": "committed - {{ident}}",
"refId": "A"
},
{
"expr": "logstash_jvm_mem_non_heap_used_in_bytes",
"legend": "used - {{ident}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": false,
"id": "9ca74da3-b31f-4ac9-ba76-397027da677d",
"layout": {
"h": 1,
"i": "9ca74da3-b31f-4ac9-ba76-397027da677d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 18
},
"name": "cpu",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.29,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5d507c1a-8043-4abf-9c7a-94e3c9189cc1",
"layout": {
"h": 4,
"i": "5fffbff5-cd86-4447-9c54-1f5abdeffa5c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"name": "cpu Utilization",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "logstash_process_cpu_percent",
"legend": "{{ident}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.29,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "87f9c763-e082-416a-994d-a1ed12d58e05",
"layout": {
"h": 4,
"i": "8ad83037-8df6-4f3a-936a-db9c26bc5071",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"name": "system load",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "logstash_process_cpu_load_average_1m",
"legend": "1m - {{ident}}",
"refId": "A"
},
{
"expr": "logstash_process_cpu_load_average_5m",
"legend": "5m - {{ident}}",
"refId": "B"
},
{
"expr": "logstash_process_cpu_load_average_15m",
"legend": "15m - {{ident}}",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"type": "row"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(logstash_events_in, instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328044690000
}
================================================
FILE: integrations/Logstash/markdown/README.md
================================================
# logstash
logstash 监控采集插件,由telegraf改造而来。
## Configuration
请参考配置[示例](https://github.com/flashcatcloud/categraf/blob/main/conf/input.logstash/logstash.toml)
================================================
FILE: integrations/MinIO/alerts/minio_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "DisksOffline",
"note": "Disks down in MinIO deployment",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "avg_over_time(minio_cluster_disk_offline_total{}[5m]) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(minio_cluster_disk_offline_total{}[5m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328048390000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "NodesOffline",
"note": "Node down in MinIO deployment",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "avg_over_time(minio_cluster_nodes_offline_total{}[5m]) \u003e 0",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg_over_time(minio_cluster_nodes_offline_total{}[5m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328048820000
}
]
================================================
FILE: integrations/MinIO/dashboards/minio_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "MinIO Dashboard",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "134b5e6d-63aa-4a43-ae26-13177c3d5184",
"layout": {
"h": 6,
"i": "134b5e6d-63aa-4a43-ae26-13177c3d5184",
"isResizable": true,
"w": 3,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"decimals": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#3FC453",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "time() - max(minio_node_process_starttime_seconds{job=\"$scrape_jobs\"})",
"legend": "{{job}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "ca68e795-0a81-4c24-8e55-e754e0283b70",
"layout": {
"h": 3,
"i": "ca68e795-0a81-4c24-8e55-e754e0283b70",
"isResizable": true,
"w": 3,
"x": 3,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Total S3 Traffic Inbound",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum by (instance) (minio_s3_traffic_received_bytes{job=\"$scrape_jobs\"})",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"detailName": "详情",
"legengPosition": "hidden"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "d356fc74-4fce-4764-aff9-bc5590c3753d",
"layout": {
"h": 6,
"i": "d356fc74-4fce-4764-aff9-bc5590c3753d",
"isResizable": true,
"w": 5,
"x": 6,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Capacity",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "topk(1, sum(minio_cluster_capacity_usable_total_bytes{job=\"$scrape_jobs\"}) by (instance)) - topk(1, sum(minio_cluster_capacity_usable_free_bytes{job=\"$scrape_jobs\"}) by (instance))",
"legend": "Used",
"refId": "A"
},
{
"expr": "topk(1, sum(minio_cluster_capacity_usable_free_bytes{job=\"$scrape_jobs\"}) by (instance)) ",
"legend": "Free",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "pie",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3b618c2c-1394-46f7-b3b2-578201444e30",
"layout": {
"h": 6,
"i": "3b618c2c-1394-46f7-b3b2-578201444e30",
"isResizable": true,
"w": 5,
"x": 11,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Data Usage Growth",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(sum(minio_bucket_usage_total_bytes{job=\"$scrape_jobs\"}) by (instance,server))",
"legend": "Usage",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "817686c6-499c-434b-8b9e-ffc7c3a21308",
"layout": {
"h": 6,
"i": "817686c6-499c-434b-8b9e-ffc7c3a21308",
"isResizable": true,
"w": 5,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Object size distribution",
"options": {
"standardOptions": {
"util": "seconds"
}
},
"targets": [
{
"expr": "max by (range) (sum (minio_bucket_objects_size_distribution{job=\"$scrape_jobs\"}) by (range))",
"legend": "{{range}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "855e0055-e418-464f-bc3e-04a8a5a1b1b3",
"layout": {
"h": 3,
"i": "855e0055-e418-464f-bc3e-04a8a5a1b1b3",
"isResizable": true,
"w": 3,
"x": 21,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Total Open FDs",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 2000
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum (minio_node_file_descriptor_open_total{job=\"$scrape_jobs\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "2cc7d5ac-1508-4067-91a5-1b08c4339f7f",
"layout": {
"h": 3,
"i": "2cc7d5ac-1508-4067-91a5-1b08c4339f7f",
"isResizable": true,
"w": 3,
"x": 3,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Total S3 Traffic Outbound",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum by (instance) (minio_s3_traffic_sent_bytes{job=\"$scrape_jobs\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "b7b7d005-b1b3-403e-8cfa-91c561972ef5",
"layout": {
"h": 3,
"i": "b7b7d005-b1b3-403e-8cfa-91c561972ef5",
"isResizable": true,
"w": 3,
"x": 21,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Total Goroutines",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 2000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum without (server,instance) (minio_node_go_routine_total{job=\"$scrape_jobs\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "0e22c724-2ae0-4700-a7e5-3f1243a3d896",
"layout": {
"h": 2,
"i": "0e22c724-2ae0-4700-a7e5-3f1243a3d896",
"isResizable": true,
"w": 3,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Total Online Servers",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "minio_cluster_nodes_online_total{job=\"$scrape_jobs\"}",
"legend": "{{job}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "776ecb58-5daf-4d5c-9c5b-774f476de70b",
"layout": {
"h": 2,
"i": "776ecb58-5daf-4d5c-9c5b-774f476de70b",
"isResizable": true,
"w": 3,
"x": 3,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Total Online Disks",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "minio_cluster_disk_online_total{job=\"$scrape_jobs\"}",
"legend": "Total online disks in MinIO Cluster",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "e9225532-5991-4394-9e4e-8c09879447ba",
"layout": {
"h": 3,
"i": "e9225532-5991-4394-9e4e-8c09879447ba",
"isResizable": true,
"w": 3,
"x": 6,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Number of Buckets",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#E0B400",
"value": 75000000
},
{
"color": "#C4162A",
"value": 100000000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(count by (bucket) (minio_bucket_usage_total_bytes{job=\"$scrape_jobs\"}))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2ebcc855-1fac-4a1e-baec-6ed398920c77",
"layout": {
"h": 6,
"i": "2ebcc855-1fac-4a1e-baec-6ed398920c77",
"isResizable": true,
"w": 7,
"x": 9,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Data Received Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server) (rate(minio_s3_traffic_received_bytes{job=\"$scrape_jobs\"}[5m]))",
"legend": "Data Received [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "6edb8e54-e7ed-4cbe-af77-0bcd985dda95",
"layout": {
"h": 6,
"i": "6edb8e54-e7ed-4cbe-af77-0bcd985dda95",
"isResizable": true,
"w": 8,
"x": 16,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Data Sent Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server) (rate(minio_s3_traffic_sent_bytes{job=\"$scrape_jobs\"}[5m]))",
"legend": "Data Sent [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "26be803c-f2b5-40b2-99ce-39240830292e",
"layout": {
"h": 2,
"i": "26be803c-f2b5-40b2-99ce-39240830292e",
"isResizable": true,
"w": 3,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Total Offline Servers",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "minio_cluster_nodes_offline_total{job=\"$scrape_jobs\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "136b9b07-6979-4402-89da-3a79ab8dc732",
"layout": {
"h": 2,
"i": "136b9b07-6979-4402-89da-3a79ab8dc732",
"isResizable": true,
"w": 3,
"x": 3,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Total Offline Disks",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "minio_cluster_disk_offline_total{job=\"$scrape_jobs\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "4072f862-84d1-4563-b4cc-7e2219a1da8c",
"layout": {
"h": 3,
"i": "4072f862-84d1-4563-b4cc-7e2219a1da8c",
"isResizable": true,
"w": 3,
"x": 6,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Number of Objects",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#E0B400",
"value": 75000000
},
{
"color": "#C4162A",
"value": 100000000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "topk(1, sum(minio_bucket_usage_object_total{job=\"$scrape_jobs\"}) by (instance))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "9151721f-21e1-49ab-9a95-896fb364d9d2",
"layout": {
"h": 2,
"i": "9151721f-21e1-49ab-9a95-896fb364d9d2",
"isResizable": true,
"w": 3,
"x": 0,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Time Since Last Heal Activity",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "minio_heal_time_last_activity_nano_seconds{job=\"$scrape_jobs\"}",
"legend": "{{server}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "fc40c036-86fc-42c7-9cc6-384c64c0814c",
"layout": {
"h": 2,
"i": "fc40c036-86fc-42c7-9cc6-384c64c0814c",
"isResizable": true,
"w": 3,
"x": 3,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Time Since Last Scan Activity",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "minio_usage_last_activity_nano_seconds{job=\"$scrape_jobs\"}",
"legend": "{{server}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "b5fe1dc4-833e-4e53-a2cb-98e7ecb8ea7b",
"layout": {
"h": 10,
"i": "b5fe1dc4-833e-4e53-a2cb-98e7ecb8ea7b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Request Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server,api) (increase(minio_s3_requests_total{job=\"$scrape_jobs\"}[5m]))",
"legend": "{{server}} {{api}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"lineInterpolation": "smooth",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total s3 bytes received per bucket",
"id": "761f532d-a867-4c56-8231-6d2ea2f82522",
"layout": {
"h": 10,
"i": "761f532d-a867-4c56-8231-6d2ea2f82522",
"isResizable": true,
"w": 12,
"x": 12,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Bucket Traffic Received",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(bucket) (minio_bucket_traffic_received_bytes{job=\"$scrape_jobs\"})",
"legend": "__auto",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"lineInterpolation": "smooth",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total s3 bytes sent per bucket",
"id": "842bc922-c7c3-4c57-a126-4328d53ee703",
"layout": {
"h": 10,
"i": "842bc922-c7c3-4c57-a126-4328d53ee703",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Bucket Traffic Sent",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(bucket) (minio_bucket_traffic_sent_bytes{job=\"$scrape_jobs\"})",
"legend": "__auto",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "59fedaf1-cfab-49a5-80ce-c789fc05cfc0",
"layout": {
"h": 10,
"i": "59fedaf1-cfab-49a5-80ce-c789fc05cfc0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Request 4xx Error Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server,api) (increase(minio_s3_requests_4xx_errors_total{job=\"$scrape_jobs\"}[5m]))",
"legend": "{{server}} {{api}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7be6161e-0202-48a5-8c44-ea6d92b6046e",
"layout": {
"h": 10,
"i": "7be6161e-0202-48a5-8c44-ea6d92b6046e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Request 5xx Error Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server,api) (increase(minio_s3_requests_5xx_errors_total{job=\"$scrape_jobs\"}[5m]))",
"legend": "{{server}} {{api}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "26ccd336-ccbd-4f84-a79e-aac8fc645295",
"layout": {
"h": 10,
"i": "26ccd336-ccbd-4f84-a79e-aac8fc645295",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"links": [],
"maxPerRow": 4,
"name": "Healing",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (instance) (minio_heal_objects_heal_total{job=\"$scrape_jobs\"})",
"legend": "Objects healed in current self heal run",
"refId": "A"
},
{
"expr": "sum by (instance) (minio_heal_objects_error_total{job=\"$scrape_jobs\"})",
"legend": "Heal errors in current self heal run",
"refId": "B"
},
{
"expr": "sum by (instance) (minio_heal_objects_total{job=\"$scrape_jobs\"}) ",
"legend": "Objects scanned in current self heal run",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total number of bytes received and sent among all MinIO server instances",
"id": "4ded094e-a71a-4104-80c6-eeb8770abf21",
"layout": {
"h": 9,
"i": "4ded094e-a71a-4104-80c6-eeb8770abf21",
"isResizable": true,
"w": 12,
"x": 0,
"y": 42
},
"links": [],
"maxPerRow": 4,
"name": "Internode Data Transfer",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_inter_node_traffic_sent_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Internode Bytes Sent [{{server}}]",
"refId": "A"
},
{
"expr": "rate(minio_inter_node_traffic_received_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Internode Bytes Received [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "e0994406-de33-4de1-9722-dcee647c2f68",
"layout": {
"h": 9,
"i": "e0994406-de33-4de1-9722-dcee647c2f68",
"isResizable": true,
"w": 12,
"x": 12,
"y": 42
},
"links": [],
"maxPerRow": 4,
"name": "Node Memory Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "minio_node_process_resident_memory_bytes{job=\"$scrape_jobs\"}",
"legend": "Memory Used [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fc925e24-3be5-453a-85a0-f281ad60994e",
"layout": {
"h": 9,
"i": "fc925e24-3be5-453a-85a0-f281ad60994e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 51
},
"links": [],
"maxPerRow": 4,
"name": "Node CPU Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_node_process_cpu_total_seconds{job=\"$scrape_jobs\"}[5m])",
"legend": "CPU Usage Rate [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "54c9c962-e93e-4429-a376-eda862b2777a",
"layout": {
"h": 9,
"i": "54c9c962-e93e-4429-a376-eda862b2777a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 51
},
"links": [],
"maxPerRow": 4,
"name": "Drives Free Inodes",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "minio_node_disk_free_inodes{job=\"$scrape_jobs\"}",
"legend": "Free Inodes [{{server}}:{{disk}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "252d4c7b-c38a-4e08-8888-83408e31082c",
"layout": {
"h": 8,
"i": "252d4c7b-c38a-4e08-8888-83408e31082c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 60
},
"links": [],
"maxPerRow": 4,
"name": "Drive Used Capacity",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "minio_node_disk_used_bytes{job=\"$scrape_jobs\"}",
"legend": "Used Capacity [{{server}}:{{disk}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "867a88e9-5317-4a97-9e06-efba95eda8fb",
"layout": {
"h": 9,
"i": "867a88e9-5317-4a97-9e06-efba95eda8fb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 60
},
"links": [],
"maxPerRow": 4,
"name": "Node File Descriptors",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "minio_node_file_descriptor_open_total{job=\"$scrape_jobs\"}",
"legend": "Open FDs [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Rate of read and write syscalls per MinIO server",
"id": "fc600570-ff7a-46b9-8cc8-1dc78fd81fa0",
"layout": {
"h": 9,
"i": "fc600570-ff7a-46b9-8cc8-1dc78fd81fa0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 68
},
"links": [],
"maxPerRow": 4,
"name": "Node Syscalls",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_node_syscall_read_total{job=\"$scrape_jobs\"}[5m])",
"legend": "Read Syscalls [{{server}}]",
"refId": "A"
},
{
"expr": "rate(minio_node_syscall_write_total{job=\"$scrape_jobs\"}[5m])",
"legend": "Write Syscalls [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "83dcd281-30f8-4e36-805f-0f594a7816c2",
"layout": {
"h": 8,
"i": "83dcd281-30f8-4e36-805f-0f594a7816c2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 69
},
"links": [],
"maxPerRow": 4,
"name": "Node IO",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_node_io_rchar_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Node RChar [{{server}}]",
"refId": "A"
},
{
"expr": "rate(minio_node_io_wchar_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Node WChar [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(minio_node_process_starttime_seconds,job)",
"multi": true,
"name": "scrape_jobs",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328049983000
}
================================================
FILE: integrations/MinIO/dashboards/new-version.json
================================================
{
"name": "MinIO Dashboard New Version",
"tags": "",
"ident": "",
"uuid": 1755762763066000,
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "134b5e6d-63aa-4a43-ae26-13177c3d5184",
"layout": {
"h": 6,
"i": "134b5e6d-63aa-4a43-ae26-13177c3d5184",
"isResizable": true,
"w": 3,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"decimals": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#3FC453",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "time() - max(minio_node_process_starttime_seconds{job=\"$scrape_jobs\"})",
"legend": "{{job}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "ca68e795-0a81-4c24-8e55-e754e0283b70",
"layout": {
"h": 3,
"i": "ca68e795-0a81-4c24-8e55-e754e0283b70",
"isResizable": true,
"w": 3,
"x": 3,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Total S3 Traffic Inbound",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum by (instance) (minio_s3_traffic_received_bytes{job=\"$scrape_jobs\"})",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"detailName": "详情",
"legengPosition": "hidden"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "d356fc74-4fce-4764-aff9-bc5590c3753d",
"layout": {
"h": 6,
"i": "d356fc74-4fce-4764-aff9-bc5590c3753d",
"isResizable": true,
"w": 5,
"x": 6,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Capacity",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "topk(1, sum(minio_cluster_capacity_usable_total_bytes{job=\"$scrape_jobs\"}) by (instance)) - topk(1, sum(minio_cluster_capacity_usable_free_bytes{job=\"$scrape_jobs\"}) by (instance))",
"legend": "Used",
"refId": "A"
},
{
"expr": "topk(1, sum(minio_cluster_capacity_usable_free_bytes{job=\"$scrape_jobs\"}) by (instance)) ",
"legend": "Free",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "pie",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "3b618c2c-1394-46f7-b3b2-578201444e30",
"layout": {
"h": 6,
"i": "3b618c2c-1394-46f7-b3b2-578201444e30",
"isResizable": true,
"w": 5,
"x": 11,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Data Usage Growth",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(sum(minio_bucket_usage_total_bytes{job=\"$scrape_jobs\"}) by (instance,server))",
"legend": "Usage",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "817686c6-499c-434b-8b9e-ffc7c3a21308",
"layout": {
"h": 6,
"i": "817686c6-499c-434b-8b9e-ffc7c3a21308",
"isResizable": true,
"w": 5,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Object size distribution",
"options": {
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#9470FF",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "max by (range) (sum (minio_bucket_objects_size_distribution{job=\"$scrape_jobs\"}) by (range))",
"legend": "{{range}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "855e0055-e418-464f-bc3e-04a8a5a1b1b3",
"layout": {
"h": 3,
"i": "855e0055-e418-464f-bc3e-04a8a5a1b1b3",
"isResizable": true,
"w": 3,
"x": 21,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Total Open FDs",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 2000
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum (minio_node_file_descriptor_open_total{job=\"$scrape_jobs\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "2cc7d5ac-1508-4067-91a5-1b08c4339f7f",
"layout": {
"h": 3,
"i": "2cc7d5ac-1508-4067-91a5-1b08c4339f7f",
"isResizable": true,
"w": 3,
"x": 3,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Total S3 Traffic Outbound",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum by (instance) (minio_s3_traffic_sent_bytes{job=\"$scrape_jobs\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "b7b7d005-b1b3-403e-8cfa-91c561972ef5",
"layout": {
"h": 3,
"i": "b7b7d005-b1b3-403e-8cfa-91c561972ef5",
"isResizable": true,
"w": 3,
"x": 21,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Total Goroutines",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 2000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "sum without (server,instance) (minio_node_go_routine_total{job=\"$scrape_jobs\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "0e22c724-2ae0-4700-a7e5-3f1243a3d896",
"layout": {
"h": 2,
"i": "0e22c724-2ae0-4700-a7e5-3f1243a3d896",
"isResizable": true,
"w": 3,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Total Online Servers",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "minio_cluster_nodes_online_total{job=\"$scrape_jobs\"}",
"legend": "{{job}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"type": "stat",
"id": "776ecb58-5daf-4d5c-9c5b-774f476de70b",
"layout": {
"h": 2,
"i": "776ecb58-5daf-4d5c-9c5b-774f476de70b",
"isResizable": true,
"w": 3,
"x": 3,
"y": 6
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"expr": "minio_cluster_drive_online_total{job=\"$scrape_jobs\"}",
"legend": "Total online drives in MinIO Cluster",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Total Online Drives",
"links": [],
"description": "",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "avg",
"valueField": "Value",
"colSpan": 1,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"standardOptions": {
"util": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "e9225532-5991-4394-9e4e-8c09879447ba",
"layout": {
"h": 3,
"i": "e9225532-5991-4394-9e4e-8c09879447ba",
"isResizable": true,
"w": 3,
"x": 6,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Number of Buckets",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#E0B400",
"value": 75000000
},
{
"color": "#C4162A",
"value": 100000000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(count by (bucket) (minio_bucket_usage_total_bytes{job=\"$scrape_jobs\"}))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "2ebcc855-1fac-4a1e-baec-6ed398920c77",
"layout": {
"h": 6,
"i": "2ebcc855-1fac-4a1e-baec-6ed398920c77",
"isResizable": true,
"w": 7,
"x": 9,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Data Received Rate ",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server) (rate(minio_s3_traffic_received_bytes{job=\"$scrape_jobs\"}[5m]))",
"legend": "Data Received [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "6edb8e54-e7ed-4cbe-af77-0bcd985dda95",
"layout": {
"h": 6,
"i": "6edb8e54-e7ed-4cbe-af77-0bcd985dda95",
"isResizable": true,
"w": 8,
"x": 16,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Data Sent Rate ",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server) (rate(minio_s3_traffic_sent_bytes{job=\"$scrape_jobs\"}[5m]))",
"legend": "Data Sent [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "avg",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "26be803c-f2b5-40b2-99ce-39240830292e",
"layout": {
"h": 2,
"i": "26be803c-f2b5-40b2-99ce-39240830292e",
"isResizable": true,
"w": 3,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "Total Offline Servers",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "minio_cluster_nodes_offline_total{job=\"$scrape_jobs\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"type": "stat",
"id": "136b9b07-6979-4402-89da-3a79ab8dc732",
"layout": {
"h": 2,
"i": "136b9b07-6979-4402-89da-3a79ab8dc732",
"isResizable": true,
"w": 3,
"x": 3,
"y": 8
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"expr": "minio_cluster_drive_offline_total{job=\"$scrape_jobs\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Total Offline Drives",
"links": [],
"description": "",
"maxPerRow": 4,
"custom": {
"textMode": "value",
"graphMode": "none",
"colorMode": "value",
"calc": "avg",
"valueField": "Value",
"colSpan": 1,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#73BF69",
"value": null,
"type": "base"
},
{
"color": "#F2495C",
"value": 80
}
]
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"standardOptions": {
"util": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "4072f862-84d1-4563-b4cc-7e2219a1da8c",
"layout": {
"h": 3,
"i": "4072f862-84d1-4563-b4cc-7e2219a1da8c",
"isResizable": true,
"w": 3,
"x": 6,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Number of Objects",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#E0B400",
"value": 75000000
},
{
"color": "#C4162A",
"value": 100000000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
]
},
"targets": [
{
"expr": "topk(1, sum(minio_bucket_usage_object_total{job=\"$scrape_jobs\"}) by (instance))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "9151721f-21e1-49ab-9a95-896fb364d9d2",
"layout": {
"h": 2,
"i": "9151721f-21e1-49ab-9a95-896fb364d9d2",
"isResizable": true,
"w": 3,
"x": 0,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Time Since Last Heal Activity",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "minio_heal_time_last_activity_nano_seconds{job=\"$scrape_jobs\"}",
"legend": "{{server}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "fc40c036-86fc-42c7-9cc6-384c64c0814c",
"layout": {
"h": 2,
"i": "fc40c036-86fc-42c7-9cc6-384c64c0814c",
"isResizable": true,
"w": 3,
"x": 3,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Time Since Last Scan Activity",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "minio_usage_last_activity_nano_seconds{job=\"$scrape_jobs\"}",
"legend": "{{server}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "b5fe1dc4-833e-4e53-a2cb-98e7ecb8ea7b",
"layout": {
"h": 10,
"i": "b5fe1dc4-833e-4e53-a2cb-98e7ecb8ea7b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Request Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server,api) (increase(minio_s3_requests_total{job=\"$scrape_jobs\"}[5m]))",
"legend": "{{api}} [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"lineInterpolation": "smooth",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total s3 bytes received per bucket",
"id": "761f532d-a867-4c56-8231-6d2ea2f82522",
"layout": {
"h": 10,
"i": "761f532d-a867-4c56-8231-6d2ea2f82522",
"isResizable": true,
"w": 12,
"x": 12,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Bucket Traffic Received",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(bucket) (minio_bucket_traffic_received_bytes{job=\"$scrape_jobs\"})",
"legend": "__auto",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"lineInterpolation": "smooth",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total s3 bytes sent per bucket",
"id": "842bc922-c7c3-4c57-a126-4328d53ee703",
"layout": {
"h": 10,
"i": "842bc922-c7c3-4c57-a126-4328d53ee703",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Bucket Traffic Sent",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum by(bucket) (minio_bucket_traffic_sent_bytes{job=\"$scrape_jobs\"})",
"legend": "__auto",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "59fedaf1-cfab-49a5-80ce-c789fc05cfc0",
"layout": {
"h": 10,
"i": "59fedaf1-cfab-49a5-80ce-c789fc05cfc0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Request 4xx Error Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server,api) (increase(minio_s3_requests_4xx_errors_total{job=\"$scrape_jobs\"}[5m]))",
"legend": "{{api}} [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "7be6161e-0202-48a5-8c44-ea6d92b6046e",
"layout": {
"h": 10,
"i": "7be6161e-0202-48a5-8c44-ea6d92b6046e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"links": [],
"maxPerRow": 4,
"name": "S3 API Request 5xx Error Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (server,api) (increase(minio_s3_requests_5xx_errors_total{job=\"$scrape_jobs\"}[5m]))",
"legend": "{{api}} [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "26ccd336-ccbd-4f84-a79e-aac8fc645295",
"layout": {
"h": 10,
"i": "26ccd336-ccbd-4f84-a79e-aac8fc645295",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"links": [],
"maxPerRow": 4,
"name": "Healing",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum by (instance) (minio_heal_objects_heal_total{job=\"$scrape_jobs\"})",
"legend": "Objects healed in current self heal run",
"refId": "A"
},
{
"expr": "sum by (instance) (minio_heal_objects_error_total{job=\"$scrape_jobs\"})",
"legend": "Heal errors in current self heal run",
"refId": "B"
},
{
"expr": "sum by (instance) (minio_heal_objects_total{job=\"$scrape_jobs\"}) ",
"legend": "Objects scanned in current self heal run",
"refId": "C"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total number of bytes received and sent among all MinIO server instances",
"id": "4ded094e-a71a-4104-80c6-eeb8770abf21",
"layout": {
"h": 9,
"i": "4ded094e-a71a-4104-80c6-eeb8770abf21",
"isResizable": true,
"w": 12,
"x": 0,
"y": 42
},
"links": [],
"maxPerRow": 4,
"name": "Internode Data Transfer",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_inter_node_traffic_sent_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Internode Bytes Sent [{{server}}]",
"refId": "A"
},
{
"expr": "rate(minio_inter_node_traffic_received_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Internode Bytes Received [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "e0994406-de33-4de1-9722-dcee647c2f68",
"layout": {
"h": 9,
"i": "e0994406-de33-4de1-9722-dcee647c2f68",
"isResizable": true,
"w": 12,
"x": 12,
"y": 42
},
"links": [],
"maxPerRow": 4,
"name": "Node Memory Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "minio_node_process_resident_memory_bytes{job=\"$scrape_jobs\"}",
"legend": "Memory Used [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "fc925e24-3be5-453a-85a0-f281ad60994e",
"layout": {
"h": 9,
"i": "fc925e24-3be5-453a-85a0-f281ad60994e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 51
},
"links": [],
"maxPerRow": 4,
"name": "Node CPU Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_node_process_cpu_total_seconds{job=\"$scrape_jobs\"}[5m])",
"legend": "CPU Usage Rate [{{server}}]",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"type": "timeseries",
"id": "54c9c962-e93e-4429-a376-eda862b2777a",
"layout": {
"h": 9,
"i": "54c9c962-e93e-4429-a376-eda862b2777a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 51
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"expr": "minio_node_drive_free_inodes{job=\"$scrape_jobs\"}",
"legend": "Free Inodes [{{server}}:{{drive}}]",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Drives Free Inodes",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "252d4c7b-c38a-4e08-8888-83408e31082c",
"layout": {
"h": 8,
"i": "252d4c7b-c38a-4e08-8888-83408e31082c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 60
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"targets": [
{
"expr": "minio_node_drive_used_bytes{job=\"$scrape_jobs\"}",
"legend": "Used Capacity [{{server}}:{{drive}}]",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Drive Used Capacity",
"links": [],
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "linear",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "",
"id": "867a88e9-5317-4a97-9e06-efba95eda8fb",
"layout": {
"h": 9,
"i": "867a88e9-5317-4a97-9e06-efba95eda8fb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 60
},
"links": [],
"maxPerRow": 4,
"name": "Node File Descriptors",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "minio_node_file_descriptor_open_total{job=\"$scrape_jobs\"}",
"legend": "Open FDs [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Rate of read and write syscalls per MinIO Server",
"id": "fc600570-ff7a-46b9-8cc8-1dc78fd81fa0",
"layout": {
"h": 9,
"i": "fc600570-ff7a-46b9-8cc8-1dc78fd81fa0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 68
},
"links": [],
"maxPerRow": 4,
"name": "Node Syscalls",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_node_syscall_read_total{job=\"$scrape_jobs\"}[5m])",
"legend": "Read Syscalls [{{server}}]",
"refId": "A"
},
{
"expr": "rate(minio_node_syscall_write_total{job=\"$scrape_jobs\"}[5m])",
"legend": "Write Syscalls [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"id": "83dcd281-30f8-4e36-805f-0f594a7816c2",
"layout": {
"h": 8,
"i": "83dcd281-30f8-4e36-805f-0f594a7816c2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 69
},
"links": [],
"maxPerRow": 4,
"name": "Node IO",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(minio_node_io_rchar_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Node RChar [{{server}}]",
"refId": "A"
},
{
"expr": "rate(minio_node_io_wchar_bytes{job=\"$scrape_jobs\"}[5m])",
"legend": "Node WChar [{{server}}]",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"name": "scrape_jobs",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"reg": "",
"definition": "label_values(minio_node_process_starttime_seconds,job)",
"multi": false
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/MinIO/markdown/README.md
================================================
# MinIO
参考 [使用 Prometheus 采集 MinIO 指标](https://min.io/docs/minio/linux/operations/monitoring/collect-minio-metrics-using-prometheus.html?ref=docs-redirect#minio-metrics-collect-using-prometheus)
开启 MinIO Prometheus 访问:
```bash
# 启动 MinIO 服务的时候加入下面的变量:
MINIO_PROMETHEUS_AUTH_TYPE=public
```
## 采集配置
categraf 的 `conf/input.prometheus/prometheus.toml`
```toml
[[instances]]
urls = [
"http://192.168.1.188:9000/minio/v2/metrics/cluster"
]
labels = {job="minio-cluster"}
```
================================================
FILE: integrations/MongoDB/alerts/mongo_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Average MongoDB operation time exceeds 250 ms - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_mongod_op_latencies_latency_total[5m]) / rate(mongodb_mongod_op_latencies_ops_total[5m]) \u003e 250000",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoOperationHighLatency"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328060297000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo connection number has exceeded 80% - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mongodb_connections{state=\"current\"}) / avg by (instance) (mongodb_connections{state=\"available\"}) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoTooManyConnections(\u003e80%)"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328060946000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo has encountered an Assert error - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_asserts_total{type=~\"regular|message\"}[5m]) \u003e 0",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoAssertsDetected"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328061548000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo has encountered cursor timeout - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_mongod_metrics_cursor_timed_out_total[5m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoRecurrentCursorTimeout"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328062022000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo has encountered page fault interrupt - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 1800,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mongodb_extra_info_page_faults_total[5m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoRecurrentMemoryPageFaults"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328062518000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo has just restarted, please pay attention - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mongodb_instance_uptime_seconds \u003c 60",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoRestarted"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328063040000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo instance has crashed - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mongodb_up \u003c 1",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoServerDown"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328063494000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mongo replica set master-slave delay exceeds 30 seconds - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mongodb_mongod_replset_member_replication_lag \u003e 30",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MongoSlaveReplicationLag(\u003e30s)"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328063925000
}
]
================================================
FILE: integrations/MongoDB/collect/mongodb/mongodb.toml
================================================
[[instances]]
# log level, enum: panic, fatal, error, warn, warning, info, debug, trace, defaults to info.
log_level = "info"
# append some const labels to metrics
# NOTICE! the instance label is required for dashboards
labels = { instance="mongo-cluster-01" }
# mongodb dsn, see https://www.mongodb.com/docs/manual/reference/connection-string/
# mongodb_uri = "mongodb://127.0.0.1:27017"
mongodb_uri = ""
# if you don't specify the username or password in the mongodb_uri, you can set here.
# This will overwrite the dsn; it is helpful when special characters exist in the username or password and you don't want to encode them.
# NOTICE! this user must be granted enough rights to query needed stats, see ../inputs/mongodb/README.md
username = "username@Bj"
password = "password@Bj"
# if set to true, use the direct connection way
# direct_connect = true
# collect_all means collecting all the metrics; if set, all the enable_xxx flags below in this section will be ignored
collect_all = true
# if set to true, collect databases metrics
# enable_db_stats = true
# if set to true, collect getDiagnosticData metrics
# enable_diagnostic_data = true
# if set to true, collect replSetGetStatus metrics
# enable_replicaset_status = true
# if set to true, collect top metrics by admin command
# enable_top_metrics = true
# if set to true, collect index metrics. You should specify one of the coll_stats_namespaces and the discovering_mode flags.
# enable_index_stats = true
# if set to true, collect collections metrics. You should specify one of the coll_stats_namespaces and the discovering_mode flags.
# enable_coll_stats = true
# Only get stats for the collections matching this list of namespaces. if none set, discovering_mode will be enabled.
# Example: db1.col1,db.col1
# coll_stats_namespaces = []
# Only get stats for index with the collections matching this list of namespaces.
# Example: db1.col1,db.col1
# index_stats_collections = []
# if set to true, replace -1 to DESC for label key_name of the descending_index metrics
# enable_override_descending_index = true
# if set to true, expose metrics with 0.1x compatible metric names, which simplifies migration from the old version to the current version.
# compatible_mode = true
# [[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# log_level = "error"
# append some labels to metrics
# labels = { instance="mongo-cluster-02" }
# mongodb_uri = "mongodb://username:password@127.0.0.1:27017"
# collect_all = true
# compatible_mode = true
================================================
FILE: integrations/MongoDB/dashboards/mongo_by_exporter.json
================================================
{
"name": "MongoDB Overview by exporter",
"tags": "Prometheus MongoDB",
"ident": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "939298f2-b21f-4e2f-9142-c10946cc4032",
"layout": {
"h": 1,
"i": "939298f2-b21f-4e2f-9142-c10946cc4032",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "instance count",
"id": "91970d24-3f04-4424-a1ed-73e7d28f5706",
"layout": {
"h": 7,
"i": "91970d24-3f04-4424-a1ed-73e7d28f5706",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "Up",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 1,
"special": 1
},
"result": {
"color": "#53b503",
"text": "UP"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "mongodb_up{instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"title": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Uptime",
"id": "c7b52e8e-b417-4c61-a15e-e2f186fccd67",
"layout": {
"h": 7,
"i": "c7b52e8e-b417-4c61-a15e-e2f186fccd67",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "#53b503"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "mongodb_ss_uptime{instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"type": "timeseries",
"id": "8446dded-9e11-4ee9-bdad-769b193ddf3e",
"layout": {
"h": 7,
"i": "8446dded-9e11-4ee9-bdad-769b193ddf3e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "mongodb_ss_mem_resident{instance='$instance'} * 1024 * 1024",
"legend": "{{type}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Memory",
"description": "Memory usage (MiB)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Page faults indicate that requests are processed from disk either because an index is missing or there is not enough memory for the data set. Consider increasing memory or sharding out.",
"id": "3eda28e7-2480-4ddc-b346-89ced1c33034",
"layout": {
"h": 7,
"i": "3eda28e7-2480-4ddc-b346-89ced1c33034",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "Page Faults",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": null,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mongodb_ss_extra_info_page_faults{instance=\"$instance\"}[5m])",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Network traffic (bytes)",
"id": "528d0485-f947-470d-95f3-59eae157ebb6",
"layout": {
"h": 7,
"i": "528d0485-f947-470d-95f3-59eae157ebb6",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"name": "Network I/O",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mongodb_ss_network_bytesOut{instance='$instance'}[5m])",
"legend": "bytesOut",
"refId": "A"
},
{
"expr": "rate(mongodb_ss_network_bytesIn{instance='$instance'}[5m])",
"legend": "bytesIn",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Number of connections Keep in mind the hard limit on the maximum number of connections set by your distribution.",
"id": "067e97c3-4e57-447f-a9dc-a49627b6ce18",
"layout": {
"h": 7,
"i": "067e97c3-4e57-447f-a9dc-a49627b6ce18",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"name": "Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "mongodb_ss_connections{instance=\"$instance\", conn_type=\"current\"}",
"legend": "Connections",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Number of assertion errors, Asserts are not important by themselves, but you can correlate spikes with other graphs.",
"id": "9e9b7356-cf0e-4e5f-95f5-00258c576bf4",
"layout": {
"h": 7,
"i": "9e9b7356-cf0e-4e5f-95f5-00258c576bf4",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"name": "Assert Events",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mongodb_ss_asserts{instance=\"$instance\"}[5m])",
"legend": "{{assert_type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Number of operations waiting to acquire locks, Any number of queued operations for long periods of time is an indication of possible issues. Find the cause and fix it before requests get stuck in the queue.",
"id": "2698f0f8-a76a-499b-99cf-30504f0f4db6",
"layout": {
"h": 7,
"i": "2698f0f8-a76a-499b-99cf-30504f0f4db6",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"name": "Lock Queue",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "mongodb_ss_globalLock_currentQueue{instance=\"$instance\"}",
"legend": "{{count_type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2bdb8cc9-92f4-449e-8f70-a4c470a21604",
"layout": {
"h": 1,
"i": "2bdb8cc9-92f4-449e-8f70-a4c470a21604",
"isResizable": false,
"w": 24,
"x": 0,
"y": 15
},
"name": "Operation Info",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Number of requests received Shows how many times a command is executed per second on average during the selected interval.",
"id": "c2819508-95e7-4c63-aeae-ce19f92469cd",
"layout": {
"h": 7,
"i": "c2819508-95e7-4c63-aeae-ce19f92469cd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 16
},
"name": "Command Operations",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mongodb_ss_opcounters{instance=\"$instance\", legacy_op_type!=\"command\"}[5m])",
"legend": "{{legacy_op_type}}",
"refId": "A"
},
{
"expr": "rate(mongodb_ss_opcountersRepl{instance=\"$instance\", legacy_op_type!~\"(command|query|getmore)\"}[5m]) or \nrate(mongodb_ss_opcountersRepl{instance=\"$instance\", legacy_op_type!~\"(command|query|getmore)\"}[5m])",
"legend": "repl_{{legacy_op_type}}",
"refId": "B"
},
{
"expr": "rate(mongodb_ss_metrics_ttl_deletedDocuments{instance=\"$instance\"}[5m]) or \nrate(mongodb_ss_metrics_ttl_deletedDocuments{instance=\"$instance\"}[5m])",
"legend": "ttl_delete",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"type": "timeseries",
"id": "7030d97a-d69f-4916-a415-ec57503ab1ed",
"layout": {
"h": 7,
"i": "7030d97a-d69f-4916-a415-ec57503ab1ed",
"isResizable": true,
"w": 12,
"x": 12,
"y": 16
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(mongodb_ss_metrics_document{instance=\"$instance\"}[5m])",
"legend": "",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Document Operations",
"description": "Number of document operations When used in combination with 'Command Operations', this graph can help identify write amplification. For example, when one insert or update command actually inserts or updates hundreds, thousands, or even millions of documents.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "1c3b73d5-c25c-449f-995d-26acc9c621e1",
"layout": {
"h": 7,
"i": "1c3b73d5-c25c-449f-995d-26acc9c621e1",
"isResizable": true,
"w": 8,
"x": 0,
"y": 23
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(mongodb_ss_opLatencies_latency{instance='$instance'}[5m]) / rate(mongodb_ss_opLatencies_ops{instance='$instance'}[5m]) / 1000",
"legend": "{{op_type}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Response Time",
"description": "Operation detail processing time (milliseconds)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "",
"id": "e642183c-8ba2-4f60-abc6-c65de49e7577",
"layout": {
"h": 7,
"i": "e642183c-8ba2-4f60-abc6-c65de49e7577",
"isResizable": true,
"w": 8,
"x": 8,
"y": 23
},
"name": "Query Efficiency",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(increase(mongodb_ss_metrics_queryExecutor_scannedObjects{instance=\"$instance\"}[5m])) / sum(increase(mongodb_ss_metrics_document{instance=\"$instance\", doc_op_type=\"returned\"}[5m]))",
"legend": "Document",
"refId": "A"
},
{
"expr": "sum(increase(mongodb_ss_metrics_queryExecutor_scanned{instance=\"$instance\"}[5m])) / sum(increase(mongodb_ss_metrics_document{instance=\"$instance\", doc_op_type=\"returned\"}[5m]))",
"legend": "Index",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "number of cursors Helps identify why connections are increasing. Shows active cursors compared to cursors being automatically killed after 10 minutes due to an application not closing the connection.",
"id": "8b5a4f44-3291-4822-ab73-f56be6c62674",
"layout": {
"h": 7,
"i": "8b5a4f44-3291-4822-ab73-f56be6c62674",
"isResizable": true,
"w": 8,
"x": 16,
"y": 23
},
"name": "Cursors",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "mongodb_ss_metrics_cursor_open{instance=\"$instance\"}",
"legend": "{{csr_type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "06946b19-94b4-4f72-bd87-70f87989257d",
"layout": {
"h": 1,
"i": "06946b19-94b4-4f72-bd87-70f87989257d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 30
},
"name": "Cache Info",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "bb0ae571-43a1-430b-8f63-256f6f1ebee6",
"layout": {
"h": 7,
"i": "bb0ae571-43a1-430b-8f63-256f6f1ebee6",
"isResizable": true,
"w": 6,
"x": 0,
"y": 31
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "mongodb_ss_wt_cache_bytes_currently_in_the_cache{instance='$instance'}",
"legend": "total",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "mongodb_ss_wt_cache_tracked_dirty_bytes_in_the_cache{instance='$instance'}",
"legend": "dirty",
"refId": "B",
"maxDataPoints": 240
},
{
"expr": "mongodb_ss_wt_cache_tracked_bytes_belonging_to_internal_pages_in_the_cache{instance='$instance'}",
"legend": "internal_pages",
"refId": "C",
"maxDataPoints": 240
},
{
"expr": "mongodb_ss_wt_cache_tracked_bytes_belonging_to_leaf_pages_in_the_cache{instance='$instance'}",
"legend": "leaf_pages",
"refId": "D",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Cache Size",
"description": "cache size (byte)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "f1ffd169-2a1a-42bc-9647-0e6621be0fef",
"layout": {
"h": 7,
"i": "f1ffd169-2a1a-42bc-9647-0e6621be0fef",
"isResizable": true,
"w": 6,
"x": 6,
"y": 31
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(mongodb_ss_wt_cache_bytes_read_into_cache{instance='$instance'}[5m])",
"legend": "read",
"refId": "A",
"maxDataPoints": 240
},
{
"expr": "rate(mongodb_ss_wt_cache_bytes_written_from_cache{instance='$instance'}[5m])",
"legend": "written",
"refId": "B",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Cache I/O",
"description": "size of cached data written or read (in bytes)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "43ee140d-ae6d-474a-9892-fa4743d7f97e",
"layout": {
"h": 7,
"i": "43ee140d-ae6d-474a-9892-fa4743d7f97e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 31
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "100 * sum(mongodb_ss_wt_cache_tracked_dirty_pages_in_the_cache{instance='$instance'}) / sum(mongodb_ss_wt_cache_pages_currently_held_in_the_cache{instance='$instance'})",
"legend": "dirty rate",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Cache Dirty Pages Rate",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "1a22c31a-859a-400c-af2a-ae83c308d0f2",
"layout": {
"h": 7,
"i": "1a22c31a-859a-400c-af2a-ae83c308d0f2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 31
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(mongodb_mongod_wiredtiger_cache_evicted_total{instance='$instance'}[5m])",
"legend": "evicted pages",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Cache Evicted Pages",
"description": "",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "b0016f4a-c565-4276-a08d-bacdf94b6b5a",
"layout": {
"h": 1,
"i": "b0016f4a-c565-4276-a08d-bacdf94b6b5a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 45
},
"name": "ReplSet Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "",
"id": "6187ceee-7c25-43f2-be1b-c44ad612ab52",
"layout": {
"h": 7,
"i": "6187ceee-7c25-43f2-be1b-c44ad612ab52",
"isResizable": true,
"w": 12,
"x": 0,
"y": 46
},
"name": "Replset Election",
"options": {
"standardOptions": {
"decimals": 1,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#f24526"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "#53b503"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "time() - mongodb_mongod_replset_member_election_date",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "replica set member master-slave synchronization delay",
"id": "f73fd0cd-ecbe-41f0-a2dc-4e02f7eaef1c",
"layout": {
"h": 7,
"i": "f73fd0cd-ecbe-41f0-a2dc-4e02f7eaef1c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 46
},
"name": "Replset Lag Seconds",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "mongodb_mongod_replset_member_replication_lag{instance=\"$instance\"}",
"legend": "lag",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(mongodb_ss_uptime,instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1717556328065329000
}
================================================
FILE: integrations/MongoDB/markdown/README.md
================================================
# mongodb
mongodb 监控采集插件,由 [mongodb-exporter](https://github.com/percona/mongodb_exporter)封装而来。
## Configuration
配置文件示例:
```toml
[[instances]]
# log level, enum: panic, fatal, error, warn, warning, info, debug, trace, defaults to info.
log_level = "info"
# append some const labels to metrics
# NOTICE! the instance label is required for dashboards
labels = { instance="mongo-cluster-01" }
# mongodb dsn, see https://www.mongodb.com/docs/manual/reference/connection-string/
# mongodb_uri = "mongodb://127.0.0.1:27017"
mongodb_uri = ""
# if you don't specify the username or password in the mongodb_uri, you can set here.
# This will overwrite the dsn. It is helpful when special characters exist in the username or password and you don't want to encode them.
# NOTICE! this user must be granted enough rights to query needed stats, see ../inputs/mongodb/README.md
username = "username@Bj"
password = "password@Bj"
# if set to true, use the direct connection way
# direct_connect = true
# collect all means you collect all the metrics, if set, all below enable_xxx flags in this section will be ignored
collect_all = true
# if set to true, collect databases metrics
# enable_db_stats = true
# if set to true, collect getDiagnosticData metrics
# enable_diagnostic_data = true
# if set to true, collect replSetGetStatus metrics
# enable_replicaset_status = true
# if set to true, collect top metrics by admin command
# enable_top_metrics = true
# if set to true, collect index metrics. You should specify one of the coll_stats_namespaces and the discovering_mode flags.
# enable_index_stats = true
# if set to true, collect collections metrics. You should specify one of the coll_stats_namespaces and the discovering_mode flags.
# enable_coll_stats = true
# Only get stats for the collections matching this list of namespaces. if none set, discovering_mode will be enabled.
# Example: db1.col1,db.col1
# coll_stats_namespaces = []
# Only get stats for index with the collections matching this list of namespaces.
# Example: db1.col1,db.col1
# index_stats_collections = []
# if set to true, replace -1 to DESC for label key_name of the descending_index metrics
# enable_override_descending_index = true
# if set to true, expose metrics with 0.1x-compatible metric names, which simplifies migration from the old version to the current version.
# compatible_mode = true
# [[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# log_level = "error"
# append some labels to metrics
# labels = { instance="mongo-cluster-02" }
# mongodb_uri = "mongodb://username:password@127.0.0.1:27017"
# collect_all = true
# compatible_mode = true
```
categraf 作为一个 client 连接 MongoDB,需要有足够的权限来收集指标,具体的权限配置请参考[官方文档](https://www.mongodb.com/docs/manual/reference/built-in-roles/#mongodb-authrole-clusterMonitor)。至少具有以下权限才可以:
```json
{
"role":"clusterMonitor",
"db":"admin"
},
{
"role":"read",
"db":"local"
}
```
授权操作样例:
```shell
mongo -h xxx -u xxx -p xxx --authenticationDatabase admin
> use admin
> db.createUser({user:"categraf",pwd:"categraf",roles: [{role:"read",db:"local"},{"role":"clusterMonitor","db":"admin"}]})
```
================================================
FILE: integrations/Mtail/collect/mtail/mtail.toml
================================================
[[instances]]
# progs = "/path/to/prog1" # prog dir1
# logs = ["/path/to/a.log", "path/to/b.log"]
# override_timezone = "Asia/Shanghai"
# emit_metric_timestamp = "true" #string type
# [[instances]]
# progs = "/path/to/prog2" # prog dir2
# logs = ["/path/to/logdir/"]
# override_timezone = "Asia/Shanghai"
# emit_metric_timestamp = "true" # string type
================================================
FILE: integrations/Mtail/markdown/README.md
================================================
# mtail插件
## 简介
功能:提取日志内容,转换为监控metrics
+ 输入: 日志
+ 输出: metrics 按照mtail语法输出, 仅支持counter、gauge、histogram
+ 处理: 本质是golang的正则提取+表达式计算
## 启动
编辑mtail.toml文件, 一般每个instance需要指定不同的progs参数(不同的progs文件或者目录),否则指标会相互干扰。
**注意**: 如果不同instance使用相同progs, 可以通过给每个instance增加labels做区分,
```
labels = { k1=v1 }
```
或
```
[instances.labels]
k1=v1
```
1. conf/inputs.mtail/mtail.toml中指定instance
```toml
[[instances]]
## 指定mtail prog的目录
progs = "/path/to/prog1"
## 指定mtail要读取的日志
logs = ["/path/to/a.log", "path/to/b.log"]
## 指定时区
# override_timezone = "Asia/Shanghai"
## metrics是否带时间戳,注意,这里是"true"
# emit_metric_timestamp = "true"
```
2. 在/path/to/prog1 目录下编写规则文件
```
gauge xxx_errors
/ERROR.*/ {
xxx_errors++
}
```
3. 一个tab中执行 `categraf --test --inputs mtail`,用于测试
4. 另一个tab中,"/path/to/a.log" 或者 "path/to/b.log" 追加一行 ERROR,看看categraf的输出
5. 测试通过后,启动categraf
### 输入
logs参数指定要处理的日志源, 支持模糊匹配, 支持多个log文件。
### 处理规则
`progs`指定具体的规则文件目录(或文件)
## 处理规则与语法
### 处理流程
```python
for line in lines:
for regex in regexes:
if match:
do something
```
### 语法
``` golang
exported variable
pattern {
action statements
}
def decorator {
pattern and action statements
}
```
#### 定义指标名称
前面也提过,指标仅支持 counter gauge histogram 三种类型。
一个🌰
```mtail
counter lines
/INFO.*/ {
lines++
}
```
注意,定义的名称只支持 C类型的命名方式(字母/数字/下划线),**如果想使用"-" 要使用"as"导出别名**。例如,
```mtail
counter lines_total as "line-count"
```
这样获取到的就是line-count这个指标名称了
#### 匹配与计算(pattern/action)
```mtail
PATTERN {
ACTION
}
```
例子
```mtail
/foo/ {
ACTION1
}
variable > 0 {
ACTION2
}
/foo/ && variable > 0 {
ACTION3
}
```
支持RE2正则匹配
```mtail
const PREFIX /^\w+\W+\d+ /
PREFIX {
ACTION1
}
PREFIX + /foo/ {
ACTION2
}
```
这样,ACTION1 匹配以 单词字符(`\w`)+非单词字符(`\W`)+数字+空格 开头的行,ACTION2 匹配以 单词字符+非单词字符+数字+空格+foo 开头的行。
#### 关系运算符
+ `<` 小于 `<=` 小于等于
+ `>` 大于 `>=` 大于等于
+ `==` 相等 `!=` 不等
+ `=~` 匹配(模糊) `!~` 不匹配(模糊)
+ `||` 逻辑或 `&&` 逻辑与 `!` 逻辑非
#### 数学运算符
+ `|` 按位或
+ `&` 按位与
+ `^` 按位异或
+ `+ - * /` 四则运算
+ `<<` 按位左移
+ `>>` 按位右移
+ `**` 指数运算
+ `=` 赋值
+ `++` 自增运算
+ `--` 自减运算
+ `+=` 加且赋值
#### 支持else与otherwise
```mtail
/foo/ {
ACTION1
} else {
ACTION2
}
```
支持嵌套
```mtail
/foo/ {
/foo1/ {
ACTION1
}
/foo2/ {
ACTION2
}
otherwise {
ACTION3
}
}
```
支持命名与非命名提取
```mtail
/(?P<operation>\S+) (\S+) \[\S+\] (\S+) \(\S*\) \S+ (?P<bytes>\d+)/ {
bytes_total[$operation][$3] += $bytes
}
```
增加常量label
```mtail
# test.mtail
# 定义常量label env
hidden text env
# 给label 赋值 这样定义是global范围;
# 局部添加,则在对应的condition中添加
env="production"
counter line_total by logfile,env
/^(?P<date>\w+\s+\d+\s+\d+:\d+:\d+)/ {
line_total[getfilename()][env]++
}
```
获取到的metrics中会添加上`env=production`的label 如下:
```mtail
# metrics
line_total{env="production",logfile="/path/to/xxxx.log",prog="test.mtail"} 4 1661165941788
```
如果要给metrics增加变量label,必须要使用命名提取。例如
```python
# 日志内容
192.168.0.1 GET /foo
192.168.0.2 GET /bar
192.168.0.1 POST /bar
```
``` mtail
# test.mtail
counter my_http_requests_total by log_file, verb
/^/ +
/(?P<hostname>[0-9A-Za-z\.:-]+) / +
/(?P<verb>[A-Z]+) / +
/(?P<URI>\S+).*/ +
/$/ {
my_http_requests_total[getfilename()][$verb]++
}
```
```python
# metrics
my_http_requests_total{logfile="xxx.log",verb="GET",prog="test.mtail"} 4242
my_http_requests_total{logfile="xxx.log",verb="POST",prog="test.mtail"} 42
```
命名提取的变量可以在条件中使用
```mtail
/(?P<x>\d+)/ && $x > 1 {
nonzero_positives++
}
```
#### 时间处理
不显式处理,则默认使用系统时间
默认emit_metric_timestamp="false" (注意是字符串)
```
http_latency_bucket{prog="histo.mtail",le="1"} 0
http_latency_bucket{prog="histo.mtail",le="2"} 0
http_latency_bucket{prog="histo.mtail",le="4"} 0
http_latency_bucket{prog="histo.mtail",le="8"} 0
http_latency_bucket{prog="histo.mtail",le="+Inf"} 0
http_latency_sum{prog="histo.mtail"} 0
http_latency_count{prog="histo.mtail"} 0
```
参数 emit_metric_timestamp="true" (注意是字符串)
```
http_latency_bucket{prog="histo.mtail",le="1"} 1 1661152917471
http_latency_bucket{prog="histo.mtail",le="2"} 2 1661152917471
http_latency_bucket{prog="histo.mtail",le="4"} 2 1661152917471
http_latency_bucket{prog="histo.mtail",le="8"} 2 1661152917471
http_latency_bucket{prog="histo.mtail",le="+Inf"} 2 1661152917471
http_latency_sum{prog="histo.mtail"} 3 1661152917471
http_latency_count{prog="histo.mtail"} 4 1661152917471
```
使用日志的时间
```
Aug 22 15:28:32 GET /api/v1/pods latency=2s code=200
Aug 22 15:28:32 GET /api/v1/pods latency=1s code=200
Aug 22 15:28:32 GET /api/v1/pods latency=0s code=200
```
```
histogram http_latency buckets 1, 2, 4, 8
/^(?P<date>\w+\s+\d+\s+\d+:\d+:\d+)/ {
strptime($date, "Jan 02 15:04:05")
/latency=(?P<latency>\d+)/ {
http_latency=$latency
}
}
```
日志提取的时间,一定要注意时区问题,有一个参数 `override_timezone` 可以控制时区选择,否则默认使用UTC转换。
比如我启动时指定 `override_timezone=Asia/Shanghai`, 这个时候日志提取的时间会当做东八区时间 转换为timestamp, 然后再从timestamp转换为各时区时间时 就没有问题了,如图。

如果不带 `override_timezone=Asia/Shanghai`, 则默认将`Aug 22 15:34:32` 当做UTC时间,转换为timestamp。 这样再转换为本地时间时,会多了8个小时, 如图。

================================================
FILE: integrations/MySQL/alerts/mysql_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "A slow query has occurred in Mysql within the last minute - categraf",
"note": "MySQL server mysql has some new slow query",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(mysql_global_status_slow_queries[1m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlowQueries"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328072748000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "More than 60% of the connections in Mysql are in a running state - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_running) / avg by (instance) (mysql_global_variables_max_connections) * 100 \u003e 60",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighThreadsRunning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328073274000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mysql has just restarted. Please be advised - categraf",
"note": "MySQL has just been restarted, less than one minute ago",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_global_status_uptime \u003c 60",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlRestarted"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328073904000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mysql has opened a large number of file handles. Please be aware - categraf",
"note": "More than 80% of MySQL files open",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_open_files) / avg by (instance)(mysql_global_variables_open_files_limit) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighOpenFiles"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328074410000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mysql instance has crashed - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_up == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlDown"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328075687000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlInnodbLogWaits - categraf",
"note": "MySQL innodb log writes stalling",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mysql_global_status_innodb_log_waits[15m]) \u003e 10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlInnodbLogWaits"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328076573000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlSlaveIoThreadNotRunning - categraf",
"note": "MySQL Slave IO thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id \u003e 0 and ON (instance) mysql_slave_status_slave_io_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveIoThreadNotRunning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328077065000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlSlaveReplicationLag - categraf",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id \u003e 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) \u003e 30",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveReplicationLag"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328077529000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlSlaveSqlThreadNotRunning - categraf",
"note": "MySQL Slave SQL thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id \u003e 0 and ON (instance) mysql_slave_status_slave_sql_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveSqlThreadNotRunning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328078038000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "The number of connections in Mysql has exceeded 80% - categraf",
"note": "More than 80% of MySQL connections are in use",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_connected) / avg by (instance) (mysql_global_variables_max_connections) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlTooManyConnections"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328078545000
}
]
================================================
FILE: integrations/MySQL/alerts/mysql_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "More than 60% of the connections in MySQL are in a running state",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_running) / avg by (instance) (mysql_global_variables_max_connections) * 100 \u003e 60",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighThreadsRunning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328079469000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "More than 80% of MySQL files open",
"note": "More than 80% of MySQL files open",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_innodb_num_open_files) / avg by (instance)(mysql_global_variables_open_files_limit) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlHighOpenFiles"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328079961000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MySQL connection count has exceeded 80%",
"note": "More than 80% of MySQL connections are in use",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "avg by (instance) (mysql_global_status_threads_connected) / avg by (instance) (mysql_global_variables_max_connections) * 100 \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlTooManyConnections"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328080440000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Mysql has just restarted. Please be advised - exporter",
"note": "MySQL has just been restarted, less than one minute ago",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_global_status_uptime \u003c 60",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlRestarted"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328080928000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MySQL server mysql has some new slow query",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(mysql_global_status_slow_queries[1m]) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlowQueries"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328081375000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlInnodbLogWaits - exporter",
"note": "MySQL innodb log writes stalling",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(mysql_global_status_innodb_log_waits[15m]) \u003e 10",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlInnodbLogWaits"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328081825000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlSlaveIoThreadNotRunning - exporter",
"note": "MySQL Slave IO thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id \u003e 0 and ON (instance) mysql_slave_status_slave_io_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveIoThreadNotRunning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328082227000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlSlaveReplicationLag - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id \u003e 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) \u003e 30",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveReplicationLag"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328082623000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "MysqlSlaveSqlThreadNotRunning - exporter",
"note": "MySQL Slave SQL thread not running",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_slave_status_master_server_id \u003e 0 and ON (instance) mysql_slave_status_slave_sql_running == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlSlaveSqlThreadNotRunning"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328083030000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "The MySQL instance is down",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "mysql_up == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=MysqlDown"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328083447000
}
]
================================================
FILE: integrations/MySQL/collect/mysql/mysql.toml
================================================
# # collect interval
# interval = 15
# [[queries]]
# measurement = "users"
# metric_fields = [ "total" ]
# label_fields = [ "service" ]
# timeout = "3s"
# request = '''
# select 'n9e' as service, count(*) as total from n9e_v5.users
# '''
[[instances]]
# address = "127.0.0.1:3306"
# username = "root"
# password = "1234"
# # set tls=custom to enable tls
# parameters = "tls=false"
# extra_status_metrics = true
# extra_innodb_metrics = false
# gather_processlist_processes_by_state = false
# gather_processlist_processes_by_user = false
# gather_schema_size = true
# gather_table_size = false
# gather_system_table_size = false
# gather_slave_status = true
# # timeout
# timeout_seconds = 3
# # interval = global.interval * interval_times
# interval_times = 1
# important! use a globally unique string to identify the instance
# labels = { instance="n9e-10.2.3.4:3306" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
#[[instances.queries]]
# measurement = "lock_wait"
# metric_fields = [ "total" ]
# timeout = "3s"
# request = '''
#SELECT count(*) as total FROM information_schema.innodb_trx WHERE trx_state='LOCK WAIT'
#'''
# [[instances.queries]]
# measurement = "users"
# metric_fields = [ "total" ]
# label_fields = [ "service" ]
# # field_to_append = ""
# timeout = "3s"
# request = '''
# select 'n9e' as service, count(*) as total from n9e_v5.users
# '''
================================================
FILE: integrations/MySQL/dashboards/MySQL-by-address.json
================================================
{
"name": "MySQL 仪表盘(使用 address 筛选,用于中心端 Categraf 采集远端多个 mysql 实例的场景)",
"tags": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"layout": {
"h": 1,
"i": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"panels": [],
"type": "row"
},
{
"custom": {
"alignItems": "center",
"bgColor": "rgba(0, 0, 0, 0)",
"content": " ",
"justifyContent": "center",
"textColor": "#000000",
"textDarkColor": "#FFFFFF",
"textSize": 12
},
"id": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"layout": {
"h": 6,
"i": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"isResizable": true,
"w": 5,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "",
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "98364700-8949-4e5d-a6ac-34becb52edf2",
"layout": {
"h": 3,
"i": "75363e2e-deba-421a-bb28-dedf0d7a1a6f",
"isResizable": true,
"w": 7,
"x": 5,
"y": 1
},
"maxPerRow": 4,
"name": "MySQL Uptime Days",
"options": {
"standardOptions": {
"decimals": 1,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "mysql_global_status_uptime{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "mysql_global_status_queries",
"id": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"layout": {
"h": 3,
"i": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "Current QPS",
"options": {
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"from": 100,
"to": 1000
},
"result": {
"color": "rgba(255, 153, 25, 1)"
},
"type": "range"
},
{
"match": {
"from": 1000
},
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "rate(mysql_global_status_queries{address=~\"$address\"}[5m])",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"id": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"layout": {
"h": 3,
"i": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"isResizable": true,
"w": 7,
"x": 5,
"y": 4
},
"maxPerRow": 4,
"name": "InnoDB Buffer Pool",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "rgba(83, 170, 177, 1)",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "mysql_global_variables_innodb_buffer_pool_size{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"id": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"layout": {
"h": 3,
"i": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"isResizable": true,
"w": 6,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "Table Locks Waited(5min)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "increase(mysql_global_status_table_locks_waited{address=~\"$address\"}[5m])",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "70ee692b-24d9-4807-81b4-81582b5526c2",
"layout": {
"h": 3,
"i": "70ee692b-24d9-4807-81b4-81582b5526c2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 4
},
"maxPerRow": 4,
"name": "Slave Replication Lag",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_slave_status_seconds_behind_master{address=~\"$address\"} - mysql_slave_status_sql_delay{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"layout": {
"h": 1,
"i": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "Connections",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"layout": {
"h": 5,
"i": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "MySQL Connections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_threads_connected{address=~\"$address\"}",
"legend": "{{address}} Connections",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "ebf01aad-c07b-4541-9891-bb3d5a7175a6",
"layout": {
"h": 5,
"i": "13bf0230-db47-4338-9b32-8e15af8915e4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"maxPerRow": 4,
"name": "MySQL Connections Used Percent",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_threads_connected{address=~\"$address\"}/mysql_global_variables_max_connections{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"id": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"layout": {
"h": 5,
"i": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "MySQL Client Thread Running",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_threads_running{address=~\"$address\"}",
"legend": "{{address}} Threads Running",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "86251111-3a14-4c52-b1f2-a5cbe009bc0f",
"layout": {
"h": 5,
"i": "34bd296e-bea3-4638-9a35-f97121e804b2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"maxPerRow": 4,
"name": "Max Used and Aborted Connections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_max_used_connections{address=~\"$address\"}",
"legend": "{{address}} Max Used Connections",
"maxDataPoints": 240
},
{
"expr": "rate(mysql_global_status_aborted_connects{address=~\"$address\"}[5m])",
"legend": "{{address}} Aborted Connections",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"layout": {
"h": 1,
"i": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "Query Performance",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"layout": {
"h": 4,
"i": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_created_tmp_tables{address=~\"$address\"}[5m])",
"legend": "{{address}} Created Tmp Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_disk_tables{address=~\"$address\"}[5m])",
"legend": "{{address}} Created Tmp Disk Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_files{address=~\"$address\"}[5m])",
"legend": "{{address}} Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "7333267f-e76e-495a-b3d8-08b100ab1330",
"layout": {
"h": 4,
"i": "7333267f-e76e-495a-b3d8-08b100ab1330",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_select_full_join{ address=~\"$address\"}[5m])",
"legend": "{{address}} Select Full Join"
},
{
"expr": "rate(mysql_global_status_select_full_range_join{ address=~\"$address\"}[5m])",
"legend": "{{address}} Select Full Range Join"
},
{
"expr": "rate(mysql_global_status_select_range{ address=~\"$address\"}[5m])",
"legend": "{{address}} Select Range"
},
{
"expr": "rate(mysql_global_status_select_range_check{ address=~\"$address\"}[5m])",
"legend": "{{address}} Select Range Check"
},
{
"expr": "rate(mysql_global_status_select_scan{ address=~\"$address\"}[5m])",
"legend": "{{address}} Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"layout": {
"h": 4,
"i": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"isResizable": true,
"w": 12,
"x": 0,
"y": 18
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_sort_rows{address=~\"$address\"}[5m])",
"legend": "{{address}} Sort Rows"
},
{
"expr": "rate(mysql_global_status_sort_range{address=~\"$address\"}[5m])",
"legend": "{{address}} Sort Range"
},
{
"expr": "rate(mysql_global_status_sort_merge_passes{address=~\"$address\"}[5m])",
"legend": "{{address}} Sort Merge Passes"
},
{
"expr": "rate(mysql_global_status_sort_scan{address=~\"$address\"}[5m])",
"legend": "{{address}} Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"layout": {
"h": 4,
"i": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 18
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_slow_queries{address=~\"$address\"}[5m])",
"legend": "{{address}} Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"layout": {
"h": 1,
"i": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 22
},
"name": "Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "6d50c653-a256-461d-80f1-69e3db613dbc",
"layout": {
"h": 4,
"i": "6d50c653-a256-461d-80f1-69e3db613dbc",
"isResizable": true,
"w": 24,
"x": 0,
"y": 23
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_bytes_received{address=~\"$address\"}[5m])",
"legend": "{{address}} Inbound"
},
{
"expr": "rate(mysql_global_status_bytes_sent{address=~\"$address\"}[5m])",
"legend": "{{address}} Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"layout": {
"h": 1,
"i": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"isResizable": false,
"w": 24,
"x": 0,
"y": 27
},
"name": "Commands, Handlers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"layout": {
"h": 4,
"i": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"isResizable": true,
"w": 6,
"x": 0,
"y": 28
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{address=~\"$address\"}[5m])>0)",
"legend": "{{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "49a40cdf-4715-4d5c-90f9-944479296d8b",
"layout": {
"h": 4,
"i": "ad5d900a-3e60-436a-b8a6-eccc9ba117d4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 28
},
"name": "Select per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{address=~\"$address\", command=\"select\"}[1m])",
"legend": "{{address}} "
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "01970b88-417a-4c75-9bd0-33eb017a7264",
"layout": {
"h": 4,
"i": "10a3834f-5074-4a0a-9013-03c42a78e2c5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 28
},
"name": "Write(insert|update|delete) per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{address=~\"$address\", command=~\"insert|update|delete\"}[1m])",
"legend": "{{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "958eae25-8c2a-4886-962f-eb12d57bd594",
"layout": {
"h": 4,
"i": "64603263-1433-4041-9078-65ca95e09932",
"isResizable": true,
"w": 6,
"x": 18,
"y": 28
},
"name": "TPS(commit|rollback)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{address=~\"$address\", command=~\"commit|rollback\"}[10m])",
"legend": "{{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"layout": {
"h": 4,
"i": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{address=~\"$address\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{address}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"layout": {
"h": 4,
"i": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{address=~\"$address\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{address}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "4595a676-3d0e-4746-a881-260505002f64",
"layout": {
"h": 1,
"i": "4595a676-3d0e-4746-a881-260505002f64",
"isResizable": false,
"w": 24,
"x": 0,
"y": 36
},
"name": "Open Files",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"layout": {
"h": 4,
"i": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"isResizable": true,
"w": 24,
"x": 0,
"y": 37
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{address=~\"$address\"}",
"legend": "{{address}} Open Files Limit"
},
{
"expr": "mysql_global_status_open_files{address=~\"$address\"}",
"legend": "{{address}} Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"layout": {
"h": 1,
"i": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"isResizable": false,
"w": 24,
"x": 0,
"y": 41
},
"name": "Table Openings",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"layout": {
"h": 4,
"i": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"isResizable": true,
"w": 12,
"x": 0,
"y": 42
},
"name": "Table Open Cache Hit Ratio MySQL 5.6.6+",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{address=~\"$address\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{address=~\"$address\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{address=~\"$address\"}[5m])\n)",
"legend": "{{address}} Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"layout": {
"h": 4,
"i": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"isResizable": true,
"w": 12,
"x": 12,
"y": 42
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{address=~\"$address\"}",
"legend": "{{address}} Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{address=~\"$address\"}",
"legend": "{{address}} Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d70df2e3-bd10-4072-a027-0cc83235e972",
"layout": {
"h": 1,
"i": "d70df2e3-bd10-4072-a027-0cc83235e972",
"isResizable": false,
"w": 24,
"x": 0,
"y": 46
},
"name": "InnoDB",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"layout": {
"h": 4,
"i": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"isResizable": true,
"w": 6,
"x": 0,
"y": 47
},
"name": "Read requests / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_read_requests{address=~\"$address\"}[1m])",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6bded8a5-383e-49ad-b61b-1b0c72a8a911",
"layout": {
"h": 4,
"i": "ab9b8335-2e25-40f6-9402-cd673dc7ae4e",
"isResizable": true,
"w": 6,
"x": 6,
"y": 47
},
"name": "Reads from disk / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_reads{address=~\"$address\"}[1m])",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "08e60f4e-f7fd-4513-bf08-f9514371fa94",
"layout": {
"h": 4,
"i": "763c8183-4315-474c-991e-f3ec78699b4e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 47
},
"name": "Reads from memory percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - increase(mysql_global_status_innodb_buffer_pool_reads{address=~\"$address\"}[5m])/increase(mysql_global_status_innodb_buffer_pool_read_requests{address=~\"$address\"}[5m]) * 100",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "768306ee-2092-42f6-8b92-7edaf09fdab0",
"layout": {
"h": 4,
"i": "25e1fc62-9e94-4a39-9fc3-2a174777f93b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 51
},
"name": "Row lock waits / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_waits{address=~\"$address\"}[1m])",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "722ff93d-630f-4921-a1f4-8240af974fd3",
"layout": {
"h": 4,
"i": "37679c80-588d-45e3-b2ac-3e0dad4be32a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 51
},
"name": "Row lock time / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_time{address=~\"$address\"}[1m])",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3f8ba45a-a9bc-4420-980d-382c2638cda0",
"layout": {
"h": 4,
"i": "33c8d0e1-03a3-4a3f-8b20-7b5b6373bdb0",
"isResizable": true,
"w": 6,
"x": 12,
"y": 51
},
"name": "Log fsyncs / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_os_log_fsyncs{address=~\"$address\"}[1m])",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c98ff938-5076-4217-bb0f-e082f34cc6bb",
"layout": {
"h": 4,
"i": "c5b892ee-bc0d-4fe9-b57d-7132c329752d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 51
},
"name": "Buffer Pool Pages Utilization %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_buffer_pool_pages_utilization{address=~\"$address\"}",
"legend": "{{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(mysql_global_status_uptime, address)",
"hide": false,
"multi": true,
"name": "address",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1731986330320000
}
================================================
FILE: integrations/MySQL/dashboards/MySQL仪表盘-远端.json
================================================
{
"name": "MySQL 仪表盘(使用 instance 筛选,需要采集时自行打上 instance 标签)",
"tags": "",
"ident": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"layout": {
"h": 1,
"i": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"panels": [],
"type": "row"
},
{
"custom": {
"alignItems": "center",
"bgColor": "rgba(0, 0, 0, 0)",
"content": " ",
"justifyContent": "center",
"textColor": "#000000",
"textDarkColor": "#FFFFFF",
"textSize": 12
},
"id": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"layout": {
"h": 6,
"i": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"isResizable": true,
"w": 5,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "",
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "98364700-8949-4e5d-a6ac-34becb52edf2",
"layout": {
"h": 3,
"i": "75363e2e-deba-421a-bb28-dedf0d7a1a6f",
"isResizable": true,
"w": 7,
"x": 5,
"y": 1
},
"maxPerRow": 4,
"name": "MySQL Uptime Days",
"options": {
"standardOptions": {
"decimals": 1,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "mysql_global_status_uptime{instance=~\"$instance\"}",
"legend": "{{instance}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "mysql_global_status_queries",
"id": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"layout": {
"h": 3,
"i": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "Current QPS",
"options": {
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"from": 100,
"to": 1000
},
"result": {
"color": "rgba(255, 153, 25, 1)"
},
"type": "range"
},
{
"match": {
"from": 1000
},
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "rate(mysql_global_status_queries{instance=~\"$instance\"}[5m])",
"legend": "{{instance}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"id": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"layout": {
"h": 3,
"i": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"isResizable": true,
"w": 7,
"x": 5,
"y": 4
},
"maxPerRow": 4,
"name": "InnoDB Buffer Pool",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "rgba(83, 170, 177, 1)",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "mysql_global_variables_innodb_buffer_pool_size{instance=~\"$instance\"}",
"legend": "{{instance}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {
"title": null,
"value": null
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"id": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"layout": {
"h": 3,
"i": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"isResizable": true,
"w": 6,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "Table Locks Waited(5min)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "increase(mysql_global_status_table_locks_waited{instance=~\"$instance\"}[5m])",
"legend": "{{instance}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "70ee692b-24d9-4807-81b4-81582b5526c2",
"layout": {
"h": 3,
"i": "70ee692b-24d9-4807-81b4-81582b5526c2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 4
},
"maxPerRow": 4,
"name": "Slave Replication Lag",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_slave_status_seconds_behind_master{instance=~\"$instance\"} - mysql_slave_status_sql_delay{instance=~\"$instance\"}",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"layout": {
"h": 1,
"i": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "Connections",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"layout": {
"h": 5,
"i": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "MySQL Connections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_threads_connected{instance=~\"$instance\"}",
"legend": "{{instance}} Connections",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "ebf01aad-c07b-4541-9891-bb3d5a7175a6",
"layout": {
"h": 5,
"i": "13bf0230-db47-4338-9b32-8e15af8915e4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"maxPerRow": 4,
"name": "MySQL Connections Used Percent",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_threads_connected{instance=~\"$instance\"}/mysql_global_variables_max_connections{instance=~\"$instance\"}",
"legend": "{{instance}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"id": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"layout": {
"h": 5,
"i": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "MySQL Client Thread Running",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_threads_running{instance=~\"$instance\"}",
"legend": "{{instance}} Threads Running",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.03,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "86251111-3a14-4c52-b1f2-a5cbe009bc0f",
"layout": {
"h": 5,
"i": "34bd296e-bea3-4638-9a35-f97121e804b2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"maxPerRow": 4,
"name": "Max Used and Aborted Connections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "mysql_global_status_max_used_connections{instance=~\"$instance\"}",
"legend": "{{instance}} Max Used Connections",
"maxDataPoints": 240
},
{
"expr": "rate(mysql_global_status_aborted_connects{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Aborted Connections",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"layout": {
"h": 1,
"i": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "Query Performance",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"layout": {
"h": 4,
"i": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_created_tmp_tables{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Created Tmp Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_disk_tables{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Created Tmp Disk Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_files{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "7333267f-e76e-495a-b3d8-08b100ab1330",
"layout": {
"h": 4,
"i": "7333267f-e76e-495a-b3d8-08b100ab1330",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_select_full_join{ instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Select Full Join"
},
{
"expr": "rate(mysql_global_status_select_full_range_join{ instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Select Full Range Join"
},
{
"expr": "rate(mysql_global_status_select_range{ instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Select Range"
},
{
"expr": "rate(mysql_global_status_select_range_check{ instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Select Range Check"
},
{
"expr": "rate(mysql_global_status_select_scan{ instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"layout": {
"h": 4,
"i": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"isResizable": true,
"w": 12,
"x": 0,
"y": 18
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_sort_rows{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Sort Rows"
},
{
"expr": "rate(mysql_global_status_sort_range{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Sort Range"
},
{
"expr": "rate(mysql_global_status_sort_merge_passes{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Sort Merge Passes"
},
{
"expr": "rate(mysql_global_status_sort_scan{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"layout": {
"h": 4,
"i": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 18
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_slow_queries{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"layout": {
"h": 1,
"i": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 22
},
"name": "Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "6d50c653-a256-461d-80f1-69e3db613dbc",
"layout": {
"h": 4,
"i": "6d50c653-a256-461d-80f1-69e3db613dbc",
"isResizable": true,
"w": 24,
"x": 0,
"y": 23
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_bytes_received{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Inbound"
},
{
"expr": "rate(mysql_global_status_bytes_sent{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"layout": {
"h": 1,
"i": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"isResizable": false,
"w": 24,
"x": 0,
"y": 27
},
"name": "Commands, Handlers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"layout": {
"h": 4,
"i": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"isResizable": true,
"w": 6,
"x": 0,
"y": 28
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{instance=~\"$instance\"}[5m])>0)",
"legend": "{{instance}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "49a40cdf-4715-4d5c-90f9-944479296d8b",
"layout": {
"h": 4,
"i": "ad5d900a-3e60-436a-b8a6-eccc9ba117d4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 28
},
"name": "Select per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{instance=~\"$instance\", command=\"select\"}[1m])",
"legend": "{{instance}} "
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "01970b88-417a-4c75-9bd0-33eb017a7264",
"layout": {
"h": 4,
"i": "10a3834f-5074-4a0a-9013-03c42a78e2c5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 28
},
"name": "Write(insert|update|delete) per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{instance=~\"$instance\", command=~\"insert|update|delete\"}[1m])",
"legend": "{{instance}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "958eae25-8c2a-4886-962f-eb12d57bd594",
"layout": {
"h": 4,
"i": "64603263-1433-4041-9078-65ca95e09932",
"isResizable": true,
"w": 6,
"x": 18,
"y": 28
},
"name": "TPS(commit|rollback)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{instance=~\"$instance\", command=~\"commit|rollback\"}[10m])",
"legend": "{{instance}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"layout": {
"h": 4,
"i": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{instance}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"layout": {
"h": 4,
"i": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{instance}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "4595a676-3d0e-4746-a881-260505002f64",
"layout": {
"h": 1,
"i": "4595a676-3d0e-4746-a881-260505002f64",
"isResizable": false,
"w": 24,
"x": 0,
"y": 36
},
"name": "Open Files",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"layout": {
"h": 4,
"i": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"isResizable": true,
"w": 24,
"x": 0,
"y": 37
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{instance=~\"$instance\"}",
"legend": "{{instance}} Open Files Limit"
},
{
"expr": "mysql_global_status_open_files{instance=~\"$instance\"}",
"legend": "{{instance}} Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"layout": {
"h": 1,
"i": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"isResizable": false,
"w": 24,
"x": 0,
"y": 41
},
"name": "Table Openings",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"layout": {
"h": 4,
"i": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"isResizable": true,
"w": 12,
"x": 0,
"y": 42
},
"name": "Table Open Cache Hit Ratio Mysql 5.6.6+",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{instance=~\"$instance\"}[5m])\n)",
"legend": "{{instance}} Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"layout": {
"h": 4,
"i": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"isResizable": true,
"w": 12,
"x": 12,
"y": 42
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{instance=~\"$instance\"}",
"legend": "{{instance}} Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{instance=~\"$instance\"}",
"legend": "{{instance}} Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d70df2e3-bd10-4072-a027-0cc83235e972",
"layout": {
"h": 1,
"i": "d70df2e3-bd10-4072-a027-0cc83235e972",
"isResizable": false,
"w": 24,
"x": 0,
"y": 46
},
"name": "InnoDB",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"layout": {
"h": 4,
"i": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"isResizable": true,
"w": 6,
"x": 0,
"y": 47
},
"name": "Read requests / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_read_requests{instance=~\"$instance\"}[1m])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6bded8a5-383e-49ad-b61b-1b0c72a8a911",
"layout": {
"h": 4,
"i": "ab9b8335-2e25-40f6-9402-cd673dc7ae4e",
"isResizable": true,
"w": 6,
"x": 6,
"y": 47
},
"name": "Reads from disk / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_reads{instance=~\"$instance\"}[1m])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "08e60f4e-f7fd-4513-bf08-f9514371fa94",
"layout": {
"h": 4,
"i": "763c8183-4315-474c-991e-f3ec78699b4e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 47
},
"name": "Reads from memory percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - increase(mysql_global_status_innodb_buffer_pool_reads{instance=~\"$instance\"}[5m])/increase(mysql_global_status_innodb_buffer_pool_read_requests{instance=~\"$instance\"}[5m]) * 100",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "768306ee-2092-42f6-8b92-7edaf09fdab0",
"layout": {
"h": 4,
"i": "25e1fc62-9e94-4a39-9fc3-2a174777f93b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 51
},
"name": "Row lock waits / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_waits{instance=~\"$instance\"}[1m])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "722ff93d-630f-4921-a1f4-8240af974fd3",
"layout": {
"h": 4,
"i": "37679c80-588d-45e3-b2ac-3e0dad4be32a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 51
},
"name": "Row lock time / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_time{instance=~\"$instance\"}[1m])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3f8ba45a-a9bc-4420-980d-382c2638cda0",
"layout": {
"h": 4,
"i": "33c8d0e1-03a3-4a3f-8b20-7b5b6373bdb0",
"isResizable": true,
"w": 6,
"x": 12,
"y": 51
},
"name": "Log fsyncs / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_os_log_fsyncs{instance=~\"$instance\"}[1m])",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c98ff938-5076-4217-bb0f-e082f34cc6bb",
"layout": {
"h": 4,
"i": "c5b892ee-bc0d-4fe9-b57d-7132c329752d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 51
},
"name": "Buffer Pool Pages Utilization %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_buffer_pool_pages_utilization{instance=~\"$instance\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"name": "instance",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(mysql_global_status_uptime, instance)",
"multi": true,
"allOption": true,
"allValue": ".*"
}
],
"version": "3.0.0"
},
"uuid": 1717556328087995000
}
================================================
FILE: integrations/MySQL/dashboards/MySQL仪表盘.json
================================================
{
"name": "MySQL 仪表盘,适用于 Categraf 采集本机 MySQL 的场景",
"tags": "",
"ident": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"isResizable": false
},
"name": "Basic Info",
"panels": [],
"type": "row"
},
{
"type": "text",
"id": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"layout": {
"h": 6,
"w": 5,
"x": 0,
"y": 1,
"i": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"isResizable": true
},
"version": "3.0.0",
"name": "",
"maxPerRow": 4,
"custom": {
"textColor": "#000000",
"textDarkColor": "#FFFFFF",
"bgColor": "rgba(0, 0, 0, 0)",
"textSize": 12,
"justifyContent": "center",
"alignItems": "center",
"content": " "
}
},
{
"type": "stat",
"id": "98364700-8949-4e5d-a6ac-34becb52edf2",
"layout": {
"h": 3,
"w": 7,
"x": 5,
"y": 1,
"i": "75363e2e-deba-421a-bb28-dedf0d7a1a6f",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_global_status_uptime{ident=~\"$ident\"}",
"legend": "{{ident}} > {{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "MySQL Uptime Days",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"title": null,
"value": null
},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
],
"standardOptions": {
"util": "seconds",
"decimals": 1
}
}
},
{
"type": "stat",
"id": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"layout": {
"h": 3,
"w": 12,
"x": 12,
"y": 1,
"i": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "rate(mysql_global_status_queries{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} > {{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Current QPS",
"description": "mysql_global_status_queries",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"title": null,
"value": null
},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"from": 100,
"to": 1000
},
"result": {
"color": "rgba(255, 153, 25, 1)"
},
"type": "range"
},
{
"type": "range",
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"match": {
"from": 1000
}
}
],
"standardOptions": {
"decimals": 2
}
}
},
{
"type": "stat",
"id": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"layout": {
"h": 3,
"w": 7,
"x": 5,
"y": 4,
"i": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_global_variables_innodb_buffer_pool_size{ident=~\"$ident\"}",
"legend": "{{ident}} > {{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "InnoDB Buffer Pool",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"title": null,
"value": null
},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(83, 170, 177, 1)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "bytesIEC"
}
}
},
{
"type": "stat",
"id": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"layout": {
"h": 3,
"w": 6,
"x": 12,
"y": 4,
"i": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "increase(mysql_global_status_table_locks_waited{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} > {{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Table Locks Waited(5min)",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks Waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {
"title": null,
"value": null
},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
],
"standardOptions": {}
}
},
{
"type": "timeseries",
"id": "70ee692b-24d9-4807-81b4-81582b5526c2",
"layout": {
"h": 3,
"w": 6,
"x": 18,
"y": 4,
"i": "70ee692b-24d9-4807-81b4-81582b5526c2",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_slave_status_seconds_behind_master{ident=~\"$ident\"} - mysql_slave_status_sql_delay{ident=~\"$ident\"}",
"legend": "{{address}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Slave Replication Lag",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 7,
"i": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"isResizable": false
},
"name": "Connections",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"layout": {
"h": 5,
"w": 6,
"x": 0,
"y": 8,
"i": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_global_status_threads_connected{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Connections",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "MySQL Connections",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "ebf01aad-c07b-4541-9891-bb3d5a7175a6",
"layout": {
"h": 5,
"w": 6,
"x": 6,
"y": 8,
"i": "13bf0230-db47-4338-9b32-8e15af8915e4",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_global_status_threads_connected{ident=~\"$ident\"}/mysql_global_variables_max_connections{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "MySQL Connections Used Percent",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"layout": {
"h": 5,
"w": 6,
"x": 12,
"y": 8,
"i": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_global_status_threads_running{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Threads Running",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "MySQL Client Thread Running",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "86251111-3a14-4c52-b1f2-a5cbe009bc0f",
"layout": {
"h": 5,
"w": 6,
"x": 18,
"y": 8,
"i": "34bd296e-bea3-4638-9a35-f97121e804b2",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"targets": [
{
"expr": "mysql_global_status_max_used_connections{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Max Used Connections",
"maxDataPoints": 240
},
{
"expr": "rate(mysql_global_status_aborted_connects{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Aborted Connections",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Max Used and Aborted Connections",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.03,
"gradientMode": "none",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 13,
"i": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"isResizable": false
},
"name": "Query Performance",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 14,
"i": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"isResizable": true
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_created_tmp_tables{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Created Tmp Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_disk_tables{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Created Tmp Disk Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_files{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "7333267f-e76e-495a-b3d8-08b100ab1330",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 14,
"i": "7333267f-e76e-495a-b3d8-08b100ab1330",
"isResizable": true
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_select_full_join{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Full Join"
},
{
"expr": "rate(mysql_global_status_select_full_range_join{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Full Range Join"
},
{
"expr": "rate(mysql_global_status_select_range{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Range"
},
{
"expr": "rate(mysql_global_status_select_range_check{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Range Check"
},
{
"expr": "rate(mysql_global_status_select_scan{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 18,
"i": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"isResizable": true
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_sort_rows{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Rows"
},
{
"expr": "rate(mysql_global_status_sort_range{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Range"
},
{
"expr": "rate(mysql_global_status_sort_merge_passes{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Merge Passes"
},
{
"expr": "rate(mysql_global_status_sort_scan{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 18,
"i": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"isResizable": true
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_slow_queries{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 22,
"i": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"isResizable": false
},
"name": "Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "6d50c653-a256-461d-80f1-69e3db613dbc",
"layout": {
"h": 4,
"w": 24,
"x": 0,
"y": 23,
"i": "6d50c653-a256-461d-80f1-69e3db613dbc",
"isResizable": true
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_bytes_received{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Inbound"
},
{
"expr": "rate(mysql_global_status_bytes_sent{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 27,
"i": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"isResizable": false
},
"name": "Commands, Handlers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"layout": {
"h": 4,
"w": 6,
"x": 0,
"y": 28,
"i": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"isResizable": true
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{ident=~\"$ident\"}[5m])>0)",
"legend": "{{ident}} {{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "49a40cdf-4715-4d5c-90f9-944479296d8b",
"layout": {
"h": 4,
"w": 6,
"x": 6,
"y": 28,
"i": "ad5d900a-3e60-436a-b8a6-eccc9ba117d4",
"isResizable": true
},
"name": "Select per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{ident=~\"$ident\", command=\"select\"}[1m])",
"legend": "{{ident}} {{address}} "
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "01970b88-417a-4c75-9bd0-33eb017a7264",
"layout": {
"h": 4,
"w": 6,
"x": 12,
"y": 28,
"i": "10a3834f-5074-4a0a-9013-03c42a78e2c5",
"isResizable": true
},
"name": "Write(insert|update|delete) per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{ident=~\"$ident\", command=~\"insert|update|delete\"}[1m])",
"legend": "{{ident}} {{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "958eae25-8c2a-4886-962f-eb12d57bd594",
"layout": {
"h": 4,
"w": 6,
"x": 18,
"y": 28,
"i": "64603263-1433-4041-9078-65ca95e09932",
"isResizable": true
},
"name": "TPS(commit|rollback)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{ident=~\"$ident\", command=~\"commit|rollback\"}[10m])",
"legend": "{{ident}} {{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 32,
"i": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"isResizable": true
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{ident=~\"$ident\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{ident}} {{address}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 32,
"i": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"isResizable": true
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{ident=~\"$ident\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{ident}} {{address}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "4595a676-3d0e-4746-a881-260505002f64",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 36,
"i": "4595a676-3d0e-4746-a881-260505002f64",
"isResizable": false
},
"name": "Open Files",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"layout": {
"h": 4,
"w": 24,
"x": 0,
"y": 37,
"i": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"isResizable": true
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Open Files Limit"
},
{
"expr": "mysql_global_status_open_files{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 41,
"i": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"isResizable": false
},
"name": "Table Openings",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 42,
"i": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"isResizable": true
},
"name": "Table Open Cache Hit Ratio Mysql 5.6.6+",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{ident=~\"$ident\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{ident=~\"$ident\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{ident=~\"$ident\"}[5m])\n)",
"legend": "{{ident}} {{address}} Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 42,
"i": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"isResizable": true
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d70df2e3-bd10-4072-a027-0cc83235e972",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 46,
"i": "d70df2e3-bd10-4072-a027-0cc83235e972",
"isResizable": false
},
"name": "InnoDB",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"layout": {
"h": 4,
"w": 6,
"x": 0,
"y": 47,
"i": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"isResizable": true
},
"name": "Read requests / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_read_requests{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6bded8a5-383e-49ad-b61b-1b0c72a8a911",
"layout": {
"h": 4,
"w": 6,
"x": 6,
"y": 47,
"i": "ab9b8335-2e25-40f6-9402-cd673dc7ae4e",
"isResizable": true
},
"name": "Reads from disk / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_reads{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "08e60f4e-f7fd-4513-bf08-f9514371fa94",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 47,
"i": "763c8183-4315-474c-991e-f3ec78699b4e",
"isResizable": true
},
"name": "Reads from memory percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - increase(mysql_global_status_innodb_buffer_pool_reads{ident=~\"$ident\"}[5m])/increase(mysql_global_status_innodb_buffer_pool_read_requests{ident=~\"$ident\"}[5m]) * 100",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "768306ee-2092-42f6-8b92-7edaf09fdab0",
"layout": {
"h": 4,
"w": 6,
"x": 0,
"y": 51,
"i": "25e1fc62-9e94-4a39-9fc3-2a174777f93b",
"isResizable": true
},
"name": "Row lock waits / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_waits{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "722ff93d-630f-4921-a1f4-8240af974fd3",
"layout": {
"h": 4,
"w": 6,
"x": 6,
"y": 51,
"i": "37679c80-588d-45e3-b2ac-3e0dad4be32a",
"isResizable": true
},
"name": "Row lock time / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_time{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3f8ba45a-a9bc-4420-980d-382c2638cda0",
"layout": {
"h": 4,
"w": 6,
"x": 12,
"y": 51,
"i": "33c8d0e1-03a3-4a3f-8b20-7b5b6373bdb0",
"isResizable": true
},
"name": "Log fsyncs / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_os_log_fsyncs{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c98ff938-5076-4217-bb0f-e082f34cc6bb",
"layout": {
"h": 4,
"w": 6,
"x": 18,
"y": 51,
"i": "c5b892ee-bc0d-4fe9-b57d-7132c329752d",
"isResizable": true
},
"name": "Buffer Pool Pages Utilization %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_buffer_pool_pages_utilization{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(mysql_global_status_uptime, ident)",
"name": "ident",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1717556328087994000
}
================================================
FILE: integrations/MySQL/dashboards/mysql_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "MySQL Overview by categraf",
"ident": "",
"tags": "Prometheus MySQL",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "fe0e2a5d-4e82-4eaf-b13a-6d98aa6b6860",
"layout": {
"h": 1,
"i": "fe0e2a5d-4e82-4eaf-b13a-6d98aa6b6860",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "80079949-dbff-48fe-a1eb-54b646c30135",
"layout": {
"h": 3,
"i": "80079949-dbff-48fe-a1eb-54b646c30135",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "MySQL Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "#369603"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "min(mysql_global_status_uptime{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "mysql_global_status_queries",
"id": "9fd6dd09-d131-4c0e-88ea-ed62c72baf97",
"layout": {
"h": 3,
"i": "9fd6dd09-d131-4c0e-88ea-ed62c72baf97",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "Current QPS",
"options": {
"standardOptions": {
"decimals": 2
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "#05a31f"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#ea3939"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "rate(mysql_global_status_queries{instance=~\"$instance\"}[5m])"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"id": "24913190-b86d-44b7-a8db-555351d9d3c2",
"layout": {
"h": 3,
"i": "24913190-b86d-44b7-a8db-555351d9d3c2",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "InnoDB Buffer Pool",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "avg(mysql_global_variables_innodb_buffer_pool_size{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"id": "94a1e97e-2241-4e05-a9e9-a9b1e69d1070",
"layout": {
"h": 3,
"i": "94a1e97e-2241-4e05-a9e9-a9b1e69d1070",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "Table Locks Waited(5min)",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#53b503"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(increase(mysql_global_status_table_locks_waited{instance=~\"$instance\"}[5m]))"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ca82d30f-8e0d-4caa-8a00-2ed9efe4ad85",
"layout": {
"h": 1,
"i": "ca82d30f-8e0d-4caa-8a00-2ed9efe4ad85",
"isResizable": false,
"w": 24,
"x": 0,
"y": 4
},
"name": "Connections",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "e2c85e72-0286-49bc-8ddb-5fba5f449b53",
"layout": {
"h": 7,
"i": "e2c85e72-0286-49bc-8ddb-5fba5f449b53",
"isResizable": true,
"w": 12,
"x": 0,
"y": 5
},
"name": "MySQL Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})",
"legend": "Connections"
},
{
"expr": "sum(mysql_global_status_max_used_connections{instance=~\"$instance\"})",
"legend": "Max Used Connections"
},
{
"expr": "sum(mysql_global_variables_max_connections{instance=~\"$instance\"})",
"legend": "Max Connections"
},
{
"expr": "sum(rate(mysql_global_status_aborted_connects{instance=~\"$instance\"}[5m]))",
"legend": "Aborted Connections"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"id": "fbd43ac2-159d-4e55-8bc6-800d1bbfbd59",
"layout": {
"h": 7,
"i": "fbd43ac2-159d-4e55-8bc6-800d1bbfbd59",
"isResizable": true,
"w": 12,
"x": 12,
"y": 5
},
"name": "MySQL Client Thread Activity",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})",
"legend": "Threads Connected"
},
{
"expr": "sum(mysql_global_status_threads_running{instance=~\"$instance\"})",
"legend": "Threads Running"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "cb81def4-ac63-4d42-b66e-440f9061794b",
"layout": {
"h": 1,
"i": "cb81def4-ac63-4d42-b66e-440f9061794b",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "Query Performance",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5fa65a30-a49b-457f-b46a-11d2029188bd",
"layout": {
"h": 7,
"i": "5fa65a30-a49b-457f-b46a-11d2029188bd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_created_tmp_tables{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Tables"
},
{
"expr": "sum(rate(mysql_global_status_created_tmp_disk_tables{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Disk Tables"
},
{
"expr": "sum(rate(mysql_global_status_created_tmp_files{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "20efd251-6207-4cec-aa3b-4351e8e9b125",
"layout": {
"h": 7,
"i": "20efd251-6207-4cec-aa3b-4351e8e9b125",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_select_full_join{ instance=~\"$instance\"}[5m]))",
"legend": "Select Full Join"
},
{
"expr": "sum(rate(mysql_global_status_select_full_range_join{ instance=~\"$instance\"}[5m]))",
"legend": "Select Full Range Join"
},
{
"expr": "sum(rate(mysql_global_status_select_range{ instance=~\"$instance\"}[5m]))",
"legend": "Select Range"
},
{
"expr": "sum(rate(mysql_global_status_select_range_check{ instance=~\"$instance\"}[5m]))",
"legend": "Select Range Check"
},
{
"expr": "sum(rate(mysql_global_status_select_scan{ instance=~\"$instance\"}[5m]))",
"legend": "Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "a4d0c5fb-04e0-4627-8722-ae996d70e2aa",
"layout": {
"h": 7,
"i": "a4d0c5fb-04e0-4627-8722-ae996d70e2aa",
"isResizable": true,
"w": 12,
"x": 0,
"y": 20
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_sort_rows{instance=~\"$instance\"}[5m]))",
"legend": "Sort Rows"
},
{
"expr": "sum(rate(mysql_global_status_sort_range{instance=~\"$instance\"}[5m]))",
"legend": "Sort Range"
},
{
"expr": "sum(rate(mysql_global_status_sort_merge_passes{instance=~\"$instance\"}[5m]))",
"legend": "Sort Merge Passes"
},
{
"expr": "sum(rate(mysql_global_status_sort_scan{instance=~\"$instance\"}[5m]))",
"legend": "Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "2e13ada4-1128-440d-9360-028f16c3779b",
"layout": {
"h": 7,
"i": "2e13ada4-1128-440d-9360-028f16c3779b",
"isResizable": true,
"w": 12,
"x": 12,
"y": 20
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_slow_queries{instance=~\"$instance\"}[5m]))",
"legend": "Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c9df805c-8ae7-41d7-b28b-575f478fd9ce",
"layout": {
"h": 1,
"i": "c9df805c-8ae7-41d7-b28b-575f478fd9ce",
"isResizable": false,
"w": 24,
"x": 0,
"y": 27
},
"name": "Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "6107714f-bedd-437c-b6e4-d6eb74db6d30",
"layout": {
"h": 7,
"i": "6107714f-bedd-437c-b6e4-d6eb74db6d30",
"isResizable": true,
"w": 24,
"x": 0,
"y": 28
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesSI"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_bytes_received{instance=~\"$instance\"}[5m]))",
"legend": "Inbound"
},
{
"expr": "sum(rate(mysql_global_status_bytes_sent{instance=~\"$instance\"}[5m]))",
"legend": "Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "00fd2b70-a133-4ad7-bd56-69a3c91ecf0c",
"layout": {
"h": 1,
"i": "00fd2b70-a133-4ad7-bd56-69a3c91ecf0c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 35
},
"name": "Commands, Handlers",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "f90ca2bc-0809-45f6-88b6-e258805def04",
"layout": {
"h": 7,
"i": "f90ca2bc-0809-45f6-88b6-e258805def04",
"isResizable": true,
"w": 24,
"x": 0,
"y": 36
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{instance=~\"$instance\"}[5m])\u003e0)",
"legend": "Com_{{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "74e1844d-a918-48fa-a29f-6535dc087dac",
"layout": {
"h": 7,
"i": "74e1844d-a918-48fa-a29f-6535dc087dac",
"isResizable": true,
"w": 12,
"x": 0,
"y": 43
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b2c3a13d-898f-407b-b6a9-db852072b12f",
"layout": {
"h": 7,
"i": "b2c3a13d-898f-407b-b6a9-db852072b12f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 43
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c32a02da-6c61-4b9e-9365-c0b56088fabc",
"layout": {
"h": 1,
"i": "c32a02da-6c61-4b9e-9365-c0b56088fabc",
"isResizable": false,
"w": 24,
"x": 0,
"y": 50
},
"name": "Open Files",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "fc13eadb-890d-4184-ac16-943d54188db8",
"layout": {
"h": 7,
"i": "fc13eadb-890d-4184-ac16-943d54188db8",
"isResizable": true,
"w": 24,
"x": 0,
"y": 51
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{instance=~\"$instance\"}",
"legend": "Open Files Limit"
},
{
"expr": "mysql_global_status_open_files{instance=~\"$instance\"}",
"legend": "Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6f596e65-3e4b-4d9a-aad7-a32c8c7b8239",
"layout": {
"h": 1,
"i": "6f596e65-3e4b-4d9a-aad7-a32c8c7b8239",
"isResizable": false,
"w": 24,
"x": 0,
"y": 58
},
"name": "Table Openings",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).",
"id": "0b78fbb5-a0b4-4a1b-98b1-af15bc91779d",
"layout": {
"h": 7,
"i": "0b78fbb5-a0b4-4a1b-98b1-af15bc91779d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 59
},
"name": "Table Open Cache Hit Ratio Mysql 5.6.6+",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{instance=~\"$instance\"}[5m])\n)",
"legend": "Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).",
"id": "948ad10b-8b22-4d42-9e94-99ef09e12927",
"layout": {
"h": 7,
"i": "948ad10b-8b22-4d42-9e94-99ef09e12927",
"isResizable": true,
"w": 12,
"x": 12,
"y": 59
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{instance=~\"$instance\"}",
"legend": "Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{instance=~\"$instance\"}",
"legend": "Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(mysql_global_status_uptime, instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328084794000
}
================================================
FILE: integrations/MySQL/dashboards/mysql_by_categraf_ident.json
================================================
{
"id": 0,
"group_id": 0,
"name": "MySQL Overview by categraf, group by ident",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"layout": {
"h": 1,
"i": "dfd77e6d-4e88-4bd9-8c19-74f566920f6c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 12,
"value": 24
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"layout": {
"h": 3,
"i": "74a5cd8c-f870-442d-bda6-48b5ce4e87ea",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"name": "MySQL Uptime Days",
"options": {
"standardOptions": {
"decimals": 1,
"util": "none"
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "#369603"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "mysql_global_status_uptime{ident=~\"$ident\"}/3600/24",
"legend": "{{ident}} {{address}}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 12,
"value": 24
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "mysql_global_status_queries",
"id": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"layout": {
"h": 3,
"i": "1763bcc6-d058-4a2b-a099-3d590debd01a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"name": "Current QPS",
"options": {
"standardOptions": {
"decimals": 2
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "#05a31f"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#ea3939"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "rate(mysql_global_status_queries{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 12,
"value": 24
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"id": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"layout": {
"h": 3,
"i": "28d16171-9e36-4f5d-87be-95bcb2aeb643",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"name": "InnoDB Buffer Pool",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "mysql_global_variables_innodb_buffer_pool_size{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 12,
"value": 24
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"id": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"layout": {
"h": 3,
"i": "5fe39015-bf33-4f02-b79e-a8977e56d7ca",
"isResizable": true,
"w": 6,
"x": 12,
"y": 4
},
"name": "Table Locks Waited(5min)",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#53b503"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "increase(mysql_global_status_table_locks_waited{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "70ee692b-24d9-4807-81b4-81582b5526c2",
"layout": {
"h": 3,
"i": "70ee692b-24d9-4807-81b4-81582b5526c2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 4
},
"name": "Slave Replication Lag",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_slave_status_seconds_behind_master{ident=~\"$ident\"} - mysql_slave_status_sql_delay{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"layout": {
"h": 1,
"i": "c6da1a55-04d2-4e3e-a22f-e5790182da4a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "Connections",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"layout": {
"h": 5,
"i": "458753cc-a6d0-4afc-bf5e-54585dc5990c",
"isResizable": true,
"w": 6,
"x": 0,
"y": 8
},
"name": "MySQL Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_threads_connected{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Connections"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "ebf01aad-c07b-4541-9891-bb3d5a7175a6",
"layout": {
"h": 5,
"i": "13bf0230-db47-4338-9b32-8e15af8915e4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 8
},
"name": "MySQL Connections Used Percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_threads_connected{ident=~\"$ident\"}/mysql_global_variables_max_connections{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"id": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"layout": {
"h": 5,
"i": "f18e13bf-5495-492f-95c5-4a590e38c58e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 8
},
"name": "MySQL Client Thread Running",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_threads_running{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Threads Running"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.05,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "86251111-3a14-4c52-b1f2-a5cbe009bc0f",
"layout": {
"h": 5,
"i": "34bd296e-bea3-4638-9a35-f97121e804b2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 8
},
"name": "Max Used and Aborted Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_max_used_connections{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Max Used Connections"
},
{
"expr": "rate(mysql_global_status_aborted_connects{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Aborted Connections"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"layout": {
"h": 1,
"i": "462559f7-06d3-4585-9ad3-a0906e7c362d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "Query Performance",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"layout": {
"h": 4,
"i": "0428fde5-3fbf-45dd-b1a9-1a498d6c2de4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_created_tmp_tables{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Created Tmp Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_disk_tables{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Created Tmp Disk Tables"
},
{
"expr": "rate(mysql_global_status_created_tmp_files{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "7333267f-e76e-495a-b3d8-08b100ab1330",
"layout": {
"h": 4,
"i": "7333267f-e76e-495a-b3d8-08b100ab1330",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_select_full_join{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Full Join"
},
{
"expr": "rate(mysql_global_status_select_full_range_join{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Full Range Join"
},
{
"expr": "rate(mysql_global_status_select_range{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Range"
},
{
"expr": "rate(mysql_global_status_select_range_check{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Range Check"
},
{
"expr": "rate(mysql_global_status_select_scan{ ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"layout": {
"h": 4,
"i": "033652d8-8918-4eee-80bd-625cb0cf8d05",
"isResizable": true,
"w": 12,
"x": 0,
"y": 18
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_sort_rows{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Rows"
},
{
"expr": "rate(mysql_global_status_sort_range{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Range"
},
{
"expr": "rate(mysql_global_status_sort_merge_passes{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Merge Passes"
},
{
"expr": "rate(mysql_global_status_sort_scan{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"layout": {
"h": 4,
"i": "08c7c660-5dbb-4fce-9037-3680b9e807d6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 18
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_slow_queries{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"layout": {
"h": 1,
"i": "6f36134c-8dd7-4cfb-8a55-7b18ecce2cd6",
"isResizable": false,
"w": 24,
"x": 0,
"y": 22
},
"name": "Network",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "6d50c653-a256-461d-80f1-69e3db613dbc",
"layout": {
"h": 4,
"i": "6d50c653-a256-461d-80f1-69e3db613dbc",
"isResizable": true,
"w": 24,
"x": 0,
"y": 23
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_bytes_received{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Inbound"
},
{
"expr": "rate(mysql_global_status_bytes_sent{ident=~\"$ident\"}[5m])",
"legend": "{{ident}} {{address}} Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"layout": {
"h": 1,
"i": "73cbe32a-36cd-488e-a818-23bb1857d6e7",
"isResizable": false,
"w": 24,
"x": 0,
"y": 27
},
"name": "Commands, Handlers",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"layout": {
"h": 4,
"i": "ffa708e1-2132-4dca-9cda-2dd73fad16da",
"isResizable": true,
"w": 6,
"x": 0,
"y": 28
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{ident=~\"$ident\"}[5m])\u003e0)",
"legend": "{{ident}} {{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "49a40cdf-4715-4d5c-90f9-944479296d8b",
"layout": {
"h": 4,
"i": "ad5d900a-3e60-436a-b8a6-eccc9ba117d4",
"isResizable": true,
"w": 6,
"x": 6,
"y": 28
},
"name": "Select per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{ident=~\"$ident\", command=\"select\"}[1m])",
"legend": "{{ident}} {{address}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "01970b88-417a-4c75-9bd0-33eb017a7264",
"layout": {
"h": 4,
"i": "10a3834f-5074-4a0a-9013-03c42a78e2c5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 28
},
"name": "Write(insert|update|delete) per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{ident=~\"$ident\", command=~\"insert|update|delete\"}[1m])",
"legend": "{{ident}} {{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "958eae25-8c2a-4886-962f-eb12d57bd594",
"layout": {
"h": 4,
"i": "64603263-1433-4041-9078-65ca95e09932",
"isResizable": true,
"w": 6,
"x": 18,
"y": 28
},
"name": "TPS(commit|rollback)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_commands_total{ident=~\"$ident\", command=~\"commit|rollback\"}[10m])",
"legend": "{{ident}} {{address}} {{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"layout": {
"h": 4,
"i": "d9623f6a-64f4-4520-b7b5-01abfc76144d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 32
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{ident=~\"$ident\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{ident}} {{address}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"layout": {
"h": 4,
"i": "3a5ad3a4-5877-46e6-bb3d-bd71174c693e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 32
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{ident=~\"$ident\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{ident}} {{address}} {{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "4595a676-3d0e-4746-a881-260505002f64",
"layout": {
"h": 1,
"i": "4595a676-3d0e-4746-a881-260505002f64",
"isResizable": false,
"w": 24,
"x": 0,
"y": 36
},
"name": "Open Files",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"layout": {
"h": 4,
"i": "ac66ac2b-e48b-4ba7-95e5-4846d616449a",
"isResizable": true,
"w": 24,
"x": 0,
"y": 37
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Open Files Limit"
},
{
"expr": "mysql_global_status_open_files{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"layout": {
"h": 1,
"i": "ddf0e641-3ef6-4be2-a90c-d013eb8a6c30",
"isResizable": false,
"w": 24,
"x": 0,
"y": 41
},
"name": "Table Openings",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"layout": {
"h": 4,
"i": "c215348c-ecdf-4480-8371-bc6a8d72da10",
"isResizable": true,
"w": 12,
"x": 0,
"y": 42
},
"name": "Table Open Cache Hit Ratio (MySQL 5.6.6+)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{ident=~\"$ident\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{ident=~\"$ident\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{ident=~\"$ident\"}[5m])\n)",
"legend": "{{ident}} {{address}} Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized in MySQL 5.6 and above (i.e., do not set them to any value).",
"id": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"layout": {
"h": 4,
"i": "a8fde020-a904-4eaf-84e3-7dbc9f4febf5",
"isResizable": true,
"w": 12,
"x": 12,
"y": 42
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}} Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d70df2e3-bd10-4072-a027-0cc83235e972",
"layout": {
"h": 1,
"i": "d70df2e3-bd10-4072-a027-0cc83235e972",
"isResizable": false,
"w": 24,
"x": 0,
"y": 46
},
"name": "InnoDB",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"layout": {
"h": 4,
"i": "2bce3a5c-1ec3-4789-9ce5-897a3e40de30",
"isResizable": true,
"w": 6,
"x": 0,
"y": 47
},
"name": "Read requests / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_read_requests{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6bded8a5-383e-49ad-b61b-1b0c72a8a911",
"layout": {
"h": 4,
"i": "ab9b8335-2e25-40f6-9402-cd673dc7ae4e",
"isResizable": true,
"w": 6,
"x": 6,
"y": 47
},
"name": "Reads from disk / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_buffer_pool_reads{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "08e60f4e-f7fd-4513-bf08-f9514371fa94",
"layout": {
"h": 4,
"i": "763c8183-4315-474c-991e-f3ec78699b4e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 47
},
"name": "Reads from memory percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - increase(mysql_global_status_innodb_buffer_pool_reads{ident=~\"$ident\"}[5m])/increase(mysql_global_status_innodb_buffer_pool_read_requests{ident=~\"$ident\"}[5m]) * 100",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "768306ee-2092-42f6-8b92-7edaf09fdab0",
"layout": {
"h": 4,
"i": "25e1fc62-9e94-4a39-9fc3-2a174777f93b",
"isResizable": true,
"w": 6,
"x": 0,
"y": 51
},
"name": "Row lock waits / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_waits{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "722ff93d-630f-4921-a1f4-8240af974fd3",
"layout": {
"h": 4,
"i": "37679c80-588d-45e3-b2ac-3e0dad4be32a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 51
},
"name": "Row lock time / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_row_lock_time{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3f8ba45a-a9bc-4420-980d-382c2638cda0",
"layout": {
"h": 4,
"i": "33c8d0e1-03a3-4a3f-8b20-7b5b6373bdb0",
"isResizable": true,
"w": 6,
"x": 12,
"y": 51
},
"name": "Log fsyncs / second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_innodb_os_log_fsyncs{ident=~\"$ident\"}[1m])",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c98ff938-5076-4217-bb0f-e082f34cc6bb",
"layout": {
"h": 4,
"i": "c5b892ee-bc0d-4fe9-b57d-7132c329752d",
"isResizable": true,
"w": 6,
"x": 18,
"y": 51
},
"name": "Buffer Pool Pages Utilization %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_buffer_pool_pages_utilization{ident=~\"$ident\"}",
"legend": "{{ident}} {{address}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(mysql_global_status_uptime, ident)",
"name": "ident",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328087990000
}
================================================
FILE: integrations/MySQL/dashboards/mysql_by_categraf_instance.json
================================================
{
"id": 0,
"group_id": 0,
"name": "MySQL Overview by categraf, group by instance",
"ident": "",
"tags": "Prometheus MySQL",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "fe0e2a5d-4e82-4eaf-b13a-6d98aa6b6860",
"layout": {
"h": 1,
"i": "fe0e2a5d-4e82-4eaf-b13a-6d98aa6b6860",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "80079949-dbff-48fe-a1eb-54b646c30135",
"layout": {
"h": 3,
"i": "80079949-dbff-48fe-a1eb-54b646c30135",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "MySQL Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "#369603"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "min(mysql_global_status_uptime{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "mysql_global_status_queries",
"id": "9fd6dd09-d131-4c0e-88ea-ed62c72baf97",
"layout": {
"h": 3,
"i": "9fd6dd09-d131-4c0e-88ea-ed62c72baf97",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "Current QPS",
"options": {
"standardOptions": {
"decimals": 2
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "#05a31f"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#ea3939"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "rate(mysql_global_status_queries{instance=~\"$instance\"}[5m])"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"id": "24913190-b86d-44b7-a8db-555351d9d3c2",
"layout": {
"h": 3,
"i": "24913190-b86d-44b7-a8db-555351d9d3c2",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "InnoDB Buffer Pool",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "avg(mysql_global_variables_innodb_buffer_pool_size{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks Waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"id": "94a1e97e-2241-4e05-a9e9-a9b1e69d1070",
"layout": {
"h": 3,
"i": "94a1e97e-2241-4e05-a9e9-a9b1e69d1070",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "Table Locks Waited(5min)",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#53b503"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(increase(mysql_global_status_table_locks_waited{instance=~\"$instance\"}[5m]))"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "ca82d30f-8e0d-4caa-8a00-2ed9efe4ad85",
"layout": {
"h": 1,
"i": "ca82d30f-8e0d-4caa-8a00-2ed9efe4ad85",
"isResizable": false,
"w": 24,
"x": 0,
"y": 4
},
"name": "Connections",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "e2c85e72-0286-49bc-8ddb-5fba5f449b53",
"layout": {
"h": 7,
"i": "e2c85e72-0286-49bc-8ddb-5fba5f449b53",
"isResizable": true,
"w": 12,
"x": 0,
"y": 5
},
"name": "MySQL Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})",
"legend": "Connections"
},
{
"expr": "sum(mysql_global_status_max_used_connections{instance=~\"$instance\"})",
"legend": "Max Used Connections"
},
{
"expr": "sum(mysql_global_variables_max_connections{instance=~\"$instance\"})",
"legend": "Max Connections"
},
{
"expr": "sum(rate(mysql_global_status_aborted_connects{instance=~\"$instance\"}[5m]))",
"legend": "Aborted Connections"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"id": "fbd43ac2-159d-4e55-8bc6-800d1bbfbd59",
"layout": {
"h": 7,
"i": "fbd43ac2-159d-4e55-8bc6-800d1bbfbd59",
"isResizable": true,
"w": 12,
"x": 12,
"y": 5
},
"name": "MySQL Client Thread Activity",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})",
"legend": "Threads Connected"
},
{
"expr": "sum(mysql_global_status_threads_running{instance=~\"$instance\"})",
"legend": "Threads Running"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "cb81def4-ac63-4d42-b66e-440f9061794b",
"layout": {
"h": 1,
"i": "cb81def4-ac63-4d42-b66e-440f9061794b",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "Query Performance",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5fa65a30-a49b-457f-b46a-11d2029188bd",
"layout": {
"h": 7,
"i": "5fa65a30-a49b-457f-b46a-11d2029188bd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_created_tmp_tables{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Tables"
},
{
"expr": "sum(rate(mysql_global_status_created_tmp_disk_tables{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Disk Tables"
},
{
"expr": "sum(rate(mysql_global_status_created_tmp_files{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "20efd251-6207-4cec-aa3b-4351e8e9b125",
"layout": {
"h": 7,
"i": "20efd251-6207-4cec-aa3b-4351e8e9b125",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_select_full_join{ instance=~\"$instance\"}[5m]))",
"legend": "Select Full Join"
},
{
"expr": "sum(rate(mysql_global_status_select_full_range_join{ instance=~\"$instance\"}[5m]))",
"legend": "Select Full Range Join"
},
{
"expr": "sum(rate(mysql_global_status_select_range{ instance=~\"$instance\"}[5m]))",
"legend": "Select Range"
},
{
"expr": "sum(rate(mysql_global_status_select_range_check{ instance=~\"$instance\"}[5m]))",
"legend": "Select Range Check"
},
{
"expr": "sum(rate(mysql_global_status_select_scan{ instance=~\"$instance\"}[5m]))",
"legend": "Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "a4d0c5fb-04e0-4627-8722-ae996d70e2aa",
"layout": {
"h": 7,
"i": "a4d0c5fb-04e0-4627-8722-ae996d70e2aa",
"isResizable": true,
"w": 12,
"x": 0,
"y": 20
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_sort_rows{instance=~\"$instance\"}[5m]))",
"legend": "Sort Rows"
},
{
"expr": "sum(rate(mysql_global_status_sort_range{instance=~\"$instance\"}[5m]))",
"legend": "Sort Range"
},
{
"expr": "sum(rate(mysql_global_status_sort_merge_passes{instance=~\"$instance\"}[5m]))",
"legend": "Sort Merge Passes"
},
{
"expr": "sum(rate(mysql_global_status_sort_scan{instance=~\"$instance\"}[5m]))",
"legend": "Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "2e13ada4-1128-440d-9360-028f16c3779b",
"layout": {
"h": 7,
"i": "2e13ada4-1128-440d-9360-028f16c3779b",
"isResizable": true,
"w": 12,
"x": 12,
"y": 20
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_slow_queries{instance=~\"$instance\"}[5m]))",
"legend": "Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c9df805c-8ae7-41d7-b28b-575f478fd9ce",
"layout": {
"h": 1,
"i": "c9df805c-8ae7-41d7-b28b-575f478fd9ce",
"isResizable": false,
"w": 24,
"x": 0,
"y": 27
},
"name": "Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "6107714f-bedd-437c-b6e4-d6eb74db6d30",
"layout": {
"h": 7,
"i": "6107714f-bedd-437c-b6e4-d6eb74db6d30",
"isResizable": true,
"w": 24,
"x": 0,
"y": 28
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesSI"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_bytes_received{instance=~\"$instance\"}[5m]))",
"legend": "Inbound"
},
{
"expr": "sum(rate(mysql_global_status_bytes_sent{instance=~\"$instance\"}[5m]))",
"legend": "Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "00fd2b70-a133-4ad7-bd56-69a3c91ecf0c",
"layout": {
"h": 1,
"i": "00fd2b70-a133-4ad7-bd56-69a3c91ecf0c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 35
},
"name": "Commands, Handlers",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "f90ca2bc-0809-45f6-88b6-e258805def04",
"layout": {
"h": 7,
"i": "f90ca2bc-0809-45f6-88b6-e258805def04",
"isResizable": true,
"w": 24,
"x": 0,
"y": 36
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{instance=~\"$instance\"}[5m])\u003e0)",
"legend": "Com_{{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "74e1844d-a918-48fa-a29f-6535dc087dac",
"layout": {
"h": 7,
"i": "74e1844d-a918-48fa-a29f-6535dc087dac",
"isResizable": true,
"w": 12,
"x": 0,
"y": 43
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b2c3a13d-898f-407b-b6a9-db852072b12f",
"layout": {
"h": 7,
"i": "b2c3a13d-898f-407b-b6a9-db852072b12f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 43
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c32a02da-6c61-4b9e-9365-c0b56088fabc",
"layout": {
"h": 1,
"i": "c32a02da-6c61-4b9e-9365-c0b56088fabc",
"isResizable": false,
"w": 24,
"x": 0,
"y": 50
},
"name": "Open Files",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "fc13eadb-890d-4184-ac16-943d54188db8",
"layout": {
"h": 7,
"i": "fc13eadb-890d-4184-ac16-943d54188db8",
"isResizable": true,
"w": 24,
"x": 0,
"y": 51
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{instance=~\"$instance\"}",
"legend": "Open Files Limit"
},
{
"expr": "mysql_global_status_open_files{instance=~\"$instance\"}",
"legend": "Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6f596e65-3e4b-4d9a-aad7-a32c8c7b8239",
"layout": {
"h": 1,
"i": "6f596e65-3e4b-4d9a-aad7-a32c8c7b8239",
"isResizable": false,
"w": 24,
"x": 0,
"y": 58
},
"name": "Table Openings",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).",
"id": "0b78fbb5-a0b4-4a1b-98b1-af15bc91779d",
"layout": {
"h": 7,
"i": "0b78fbb5-a0b4-4a1b-98b1-af15bc91779d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 59
},
"name": "Table Open Cache Hit Ratio Mysql 5.6.6+",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{instance=~\"$instance\"}[5m])\n)",
"legend": "Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).",
"id": "948ad10b-8b22-4d42-9e94-99ef09e12927",
"layout": {
"h": 7,
"i": "948ad10b-8b22-4d42-9e94-99ef09e12927",
"isResizable": true,
"w": 12,
"x": 12,
"y": 59
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{instance=~\"$instance\"}",
"legend": "Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{instance=~\"$instance\"}",
"legend": "Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(mysql_global_status_uptime, instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328097010000
}
================================================
FILE: integrations/MySQL/dashboards/mysql_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "MySQL Overview by exporter",
"ident": "",
"tags": "Prometheus MySQL",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "a94506f9-879c-41d4-bf0a-0ce479352742",
"layout": {
"h": 1,
"i": "a94506f9-879c-41d4-bf0a-0ce479352742",
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c1ed017a-86d8-4ba5-8e75-ce3be943eef9",
"layout": {
"h": 3,
"i": "c1ed017a-86d8-4ba5-8e75-ce3be943eef9",
"w": 6,
"x": 0,
"y": 1
},
"name": "MySQL Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"valueMappings": [
{
"match": {
"to": 1800
},
"result": {
"color": "#ec7718"
},
"type": "range"
},
{
"match": {
"from": 1800
},
"result": {
"color": "#369603"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "min(mysql_global_status_uptime{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "mysql_global_status_queries",
"id": "05b0a593-7328-4298-9b5c-af6bd6a34e52",
"layout": {
"h": 3,
"i": "05b0a593-7328-4298-9b5c-af6bd6a34e52",
"w": 6,
"x": 6,
"y": 1
},
"name": "Current QPS",
"options": {
"standardOptions": {
"decimals": 2
},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "#05a31f"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#ea3939"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "rate(mysql_global_status_queries{instance=~\"$instance\"}[5m])"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.",
"id": "e5388f85-8970-4f64-83e1-e77d4025f1dd",
"layout": {
"h": 3,
"i": "e5388f85-8970-4f64-83e1-e77d4025f1dd",
"w": 6,
"x": 12,
"y": 1
},
"name": "InnoDB Buffer Pool",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "avg(mysql_global_variables_innodb_buffer_pool_size{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. Otherwise, Locks Immediate rising and falling is normal activity.",
"id": "ab8a768e-98f3-4215-bfbf-ea838a12b45c",
"layout": {
"h": 3,
"i": "ab8a768e-98f3-4215-bfbf-ea838a12b45c",
"w": 6,
"x": 18,
"y": 1
},
"name": "Table Locks Waited(5min)",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e70d0d"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#53b503"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(increase(mysql_global_status_table_locks_waited{instance=~\"$instance\"}[5m]))"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "24a1be60-6b90-483a-af6f-48cc79830da1",
"layout": {
"h": 1,
"i": "24a1be60-6b90-483a-af6f-48cc79830da1",
"w": 24,
"x": 0,
"y": 4
},
"name": "Connections",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.",
"id": "bb31cf5e-1a80-478c-b300-ee9975d14963",
"layout": {
"h": 7,
"i": "bb31cf5e-1a80-478c-b300-ee9975d14963",
"w": 12,
"x": 0,
"y": 5
},
"name": "MySQL Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})",
"legend": "Connections"
},
{
"expr": "sum(mysql_global_status_max_used_connections{instance=~\"$instance\"})",
"legend": "Max Used Connections"
},
{
"expr": "sum(mysql_global_variables_max_connections{instance=~\"$instance\"})",
"legend": "Max Connections"
},
{
"expr": "sum(rate(mysql_global_status_aborted_connects{instance=~\"$instance\"}[5m]))",
"legend": "Aborted Connections"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Threads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.",
"id": "c1083f59-1e46-442e-a7c3-f5d1fbb78751",
"layout": {
"h": 7,
"i": "c1083f59-1e46-442e-a7c3-f5d1fbb78751",
"w": 12,
"x": 12,
"y": 5
},
"name": "MySQL Client Thread Activity",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(mysql_global_status_threads_connected{instance=~\"$instance\"})",
"legend": "Threads Connected"
},
{
"expr": "sum(mysql_global_status_threads_running{instance=~\"$instance\"})",
"legend": "Threads Running"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e126f7dd-df38-4a43-846a-ea6188718de9",
"layout": {
"h": 1,
"i": "e126f7dd-df38-4a43-846a-ea6188718de9",
"w": 24,
"x": 0,
"y": 12
},
"name": "Query Performance",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "80f94d89-babe-4e38-a220-2490af80e091",
"layout": {
"h": 7,
"i": "80f94d89-babe-4e38-a220-2490af80e091",
"w": 12,
"x": 0,
"y": 13
},
"name": "MySQL Temporary Objects",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_created_tmp_tables{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Tables"
},
{
"expr": "sum(rate(mysql_global_status_created_tmp_disk_tables{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Disk Tables"
},
{
"expr": "sum(rate(mysql_global_status_created_tmp_files{instance=~\"$instance\"}[5m]))",
"legend": "Created Tmp Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.",
"id": "a03b6272-cd60-430c-8128-6bfc8da2938f",
"layout": {
"h": 7,
"i": "a03b6272-cd60-430c-8128-6bfc8da2938f",
"w": 12,
"x": 12,
"y": 13
},
"name": "MySQL Select Types",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_select_full_join{ instance=~\"$instance\"}[5m]))",
"legend": "Select Full Join"
},
{
"expr": "sum(rate(mysql_global_status_select_full_range_join{ instance=~\"$instance\"}[5m]))",
"legend": "Select Full Range Join"
},
{
"expr": "sum(rate(mysql_global_status_select_range{ instance=~\"$instance\"}[5m]))",
"legend": "Select Range"
},
{
"expr": "sum(rate(mysql_global_status_select_range_check{ instance=~\"$instance\"}[5m]))",
"legend": "Select Range Check"
},
{
"expr": "sum(rate(mysql_global_status_select_scan{ instance=~\"$instance\"}[5m]))",
"legend": "Select Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.",
"id": "d5fbfe0e-fc90-4f2a-b016-7a24a19c73d7",
"layout": {
"h": 7,
"i": "d5fbfe0e-fc90-4f2a-b016-7a24a19c73d7",
"w": 12,
"x": 0,
"y": 15
},
"name": "MySQL Sorts",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_sort_rows{instance=~\"$instance\"}[5m]))",
"legend": "Sort Rows"
},
{
"expr": "sum(rate(mysql_global_status_sort_range{instance=~\"$instance\"}[5m]))",
"legend": "Sort Range"
},
{
"expr": "sum(rate(mysql_global_status_sort_merge_passes{instance=~\"$instance\"}[5m]))",
"legend": "Sort Merge Passes"
},
{
"expr": "sum(rate(mysql_global_status_sort_scan{instance=~\"$instance\"}[5m]))",
"legend": "Sort Scan"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.",
"id": "51306ae6-e11a-4c08-a55c-3678676d5d8e",
"layout": {
"h": 7,
"i": "51306ae6-e11a-4c08-a55c-3678676d5d8e",
"w": 12,
"x": 12,
"y": 15
},
"name": "MySQL Slow Queries",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_slow_queries{instance=~\"$instance\"}[5m]))",
"legend": "Slow Queries"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "867ae6c9-b4a4-4349-8e68-56ef9cebf8b4",
"layout": {
"h": 1,
"i": "867ae6c9-b4a4-4349-8e68-56ef9cebf8b4",
"w": 24,
"x": 0,
"y": 22
},
"name": "Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.",
"id": "392c15b2-d413-4201-9692-5277f7863c05",
"layout": {
"h": 7,
"i": "392c15b2-d413-4201-9692-5277f7863c05",
"w": 24,
"x": 0,
"y": 23
},
"name": "MySQL Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesSI"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(mysql_global_status_bytes_received{instance=~\"$instance\"}[5m]))",
"legend": "Inbound"
},
{
"expr": "sum(rate(mysql_global_status_bytes_sent{instance=~\"$instance\"}[5m]))",
"legend": "Outbound"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e58cb79a-75f2-452f-bc55-b36ff93a60c4",
"layout": {
"h": 1,
"i": "e58cb79a-75f2-452f-bc55-b36ff93a60c4",
"w": 24,
"x": 0,
"y": 30
},
"name": "Commands, Handlers",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.2,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.",
"id": "df2f62e6-5a75-4cea-9268-3077348a6558",
"layout": {
"h": 7,
"i": "df2f62e6-5a75-4cea-9268-3077348a6558",
"w": 24,
"x": 0,
"y": 31
},
"name": "Top Command Counters",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(10, rate(mysql_global_status_commands_total{instance=~\"$instance\"}[5m])\u003e0)",
"legend": "Com_{{command}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. A high value means a lot of index scans are being done.",
"id": "34ba0da1-d6f0-4c35-8418-56a7506035c5",
"layout": {
"h": 7,
"i": "34ba0da1-d6f0-4c35-8418-56a7506035c5",
"w": 12,
"x": 0,
"y": 33
},
"name": "MySQL Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 3
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9e37aa84-a6b6-4730-9fa7-0dab9e596e36",
"layout": {
"h": 7,
"i": "9e37aa84-a6b6-4730-9fa7-0dab9e596e36",
"w": 12,
"x": 12,
"y": 33
},
"name": "MySQL Transaction Handlers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_handlers_total{instance=~\"$instance\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])",
"legend": "{{handler}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "779fdf9a-fcf8-4454-91a4-608950d3fba1",
"layout": {
"h": 1,
"i": "779fdf9a-fcf8-4454-91a4-608950d3fba1",
"w": 24,
"x": 0,
"y": 40
},
"name": "Open Files",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ac797cf1-56f6-4cf7-a472-8a2facd84588",
"layout": {
"h": 7,
"i": "ac797cf1-56f6-4cf7-a472-8a2facd84588",
"w": 24,
"x": 0,
"y": 41
},
"name": "MySQL Open Files",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_variables_open_files_limit{instance=~\"$instance\"}",
"legend": "Open Files Limit"
},
{
"expr": "mysql_global_status_innodb_num_open_files{instance=~\"$instance\"}",
"legend": "InnoDB Open Files"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "292f69d6-1a6c-463e-8aaf-14715b447c1f",
"layout": {
"h": 1,
"i": "292f69d6-1a6c-463e-8aaf-14715b447c1f",
"w": 24,
"x": 0,
"y": 48
},
"name": "Table Openings",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).",
"id": "0139a750-1a56-45ee-9004-7a8ef15d34dd",
"layout": {
"h": 7,
"i": "0139a750-1a56-45ee-9004-7a8ef15d34dd",
"w": 12,
"x": 0,
"y": 49
},
"name": "Table Open Cache Hit Ratio",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n/\n(\nrate(mysql_global_status_table_open_cache_hits{instance=~\"$instance\"}[5m])\n+\nrate(mysql_global_status_table_open_cache_misses{instance=~\"$instance\"}[5m])\n)",
"legend": "Table Open Cache Hit Ratio"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "**MySQL Open Tables**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).",
"id": "fba77c7e-9e40-4829-89b6-ed8bb2a7add7",
"layout": {
"h": 7,
"i": "fba77c7e-9e40-4829-89b6-ed8bb2a7add7",
"w": 12,
"x": 12,
"y": 49
},
"name": "MySQL Open Tables",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "mysql_global_status_open_tables{instance=~\"$instance\"}",
"legend": "Open Tables"
},
{
"expr": "mysql_global_variables_table_open_cache{instance=~\"$instance\"}",
"legend": "Table Open Cache"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(mysql_global_status_uptime, instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328099241000
}
================================================
FILE: integrations/MySQL/markdown/README.md
================================================
# mysql
mysql 监控采集插件,核心原理就是连到 mysql 实例,执行一些 sql,解析输出内容,整理为监控数据上报。
## Configuration
categraf 的 `conf/input.mysql/mysql.toml`
```toml
[[instances]]
# 要监控 MySQL,首先要给出要监控的MySQL的连接地址、用户名、密码
address = "127.0.0.1:3306"
username = "root"
password = "1234"
# # set tls=custom to enable tls
# parameters = "tls=false"
# 通过 show global status监控mysql,默认抓取一些基础指标,
# 如果想抓取更多global status的指标,把下面的配置设置为true
extra_status_metrics = true
# 通过show global variables监控mysql的全局变量,默认抓取一些常规的
# 常规的基本够用了,扩展的部分,默认不采集,下面的配置设置为false
extra_innodb_metrics = false
# 监控processlist,关注较少,默认不采集
gather_processlist_processes_by_state = false
gather_processlist_processes_by_user = false
# 监控各个数据库的磁盘占用大小
gather_schema_size = false
# 监控所有的table的磁盘占用大小
gather_table_size = false
# 是否采集系统表的大小,通常不用,所以默认设置为false
gather_system_table_size = false
# 通过 show slave status监控slave的情况,比较关键,所以默认采集
gather_slave_status = true
# # timeout
# timeout_seconds = 3
# # interval = global.interval * interval_times
# interval_times = 1
# 为mysql实例附一个instance的标签,因为通过address=127.0.0.1:3306不好区分
# important! use global unique string to specify instance
# labels = { instance="n9e-10.2.3.4:3306" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
# 自定义SQL,指定SQL、返回的各个列哪些是作为metric,哪些是作为label
# [[instances.queries]]
# measurement = "users"
# metric_fields = [ "total" ]
# label_fields = [ "service" ]
# # field_to_append = ""
# timeout = "3s"
# request = '''
# select 'n9e' as service, count(*) as total from n9e_v5.users
# '''
```
## 监控多个实例
大家最常问的问题是如何监控多个mysql实例,实际大家对toml配置学习一下就了解了,`[[instances]]` 部分表示数组,是可以出现多个的,举例:
```toml
[[instances]]
address = "10.2.3.6:3306"
username = "root"
password = "1234"
labels = { instance="n9e-10.2.3.6:3306" }
[[instances]]
address = "10.2.6.9:3306"
username = "root"
password = "1234"
labels = { instance="zbx-10.2.6.9:3306" }
[[instances]]
address = "/tmp/mysql.sock"
username = "root"
password = "1234"
labels = { instance="zbx-localhost:3306" }
```
================================================
FILE: integrations/MySQL/markdown/mysql.md
================================================
# mysql
mysql 监控采集插件,核心原理就是连到 mysql 实例,执行一些 sql,解析输出内容,整理为监控数据上报。
## Configuration
```toml
# # collect interval
# interval = 15
# 要监控 MySQL,首先要给出要监控的MySQL的连接地址、用户名、密码
[[instances]]
address = "127.0.0.1:3306"
username = "root"
password = "1234"
# # set tls=custom to enable tls
# parameters = "tls=false"
# 通过 show global status监控mysql,默认抓取一些基础指标,
# 如果想抓取更多global status的指标,把下面的配置设置为true
extra_status_metrics = true
# 通过show global variables监控mysql的全局变量,默认抓取一些常规的
# 常规的基本够用了,扩展的部分,默认不采集,下面的配置设置为false
extra_innodb_metrics = false
# 监控processlist,关注较少,默认不采集
gather_processlist_processes_by_state = false
gather_processlist_processes_by_user = false
# 监控各个数据库的磁盘占用大小
gather_schema_size = false
# 监控所有的table的磁盘占用大小
gather_table_size = false
# 是否采集系统表的大小,通常不用,所以默认设置为false
gather_system_table_size = false
# 通过 show slave status监控slave的情况,比较关键,所以默认采集
gather_slave_status = true
# # timeout
# timeout_seconds = 3
# # interval = global.interval * interval_times
# interval_times = 1
# 为mysql实例附一个instance的标签,因为通过address=127.0.0.1:3306不好区分
# important! use global unique string to specify instance
# labels = { instance="n9e-10.2.3.4:3306" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
# 自定义SQL,指定SQL、返回的各个列哪些是作为metric,哪些是作为label
# [[instances.queries]]
# measurement = "users"
# metric_fields = [ "total" ]
# label_fields = [ "service" ]
# # field_to_append = ""
# timeout = "3s"
# request = '''
# select 'n9e' as service, count(*) as total from n9e_v5.users
# '''
```
## 监控多个实例
当主机填写为localhost时mysql会采用 unix domain socket连接
当主机填写为127.0.0.1时mysql会采用tcp方式连接
大家最常问的问题是如何监控多个mysql实例,实际大家对toml配置学习一下就了解了,`[[instances]]` 部分表示数组,是可以出现多个的,address参数也支持通过unix socket路径连接,举例:
```toml
[[instances]]
address = "10.2.3.6:3306"
username = "root"
password = "1234"
labels = { instance="n9e-10.2.3.6:3306" }
[[instances]]
address = "10.2.6.9:3306"
username = "root"
password = "1234"
labels = { instance="zbx-10.2.6.9:3306" }
[[instances]]
address = "/tmp/mysql.sock"
username = "root"
password = "1234"
labels = { instance="zbx-localhost:3306" }
```
================================================
FILE: integrations/MySQL/metrics/categraf-base.json
================================================
[
{
"id": 0,
"uuid": 1717556328100998000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status InnoDB 缓冲池 data 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_buffer_pool_bytes_data",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status InnoDB 缓冲池 data 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status InnoDB buffer pool data size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328103397000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status InnoDB 缓冲池 dirty 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_buffer_pool_bytes_dirty",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status InnoDB 缓冲池 dirty 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status InnoDB buffer pool dirty size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328105885000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status InnoDB 缓冲池 free 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_buffer_pool_bytes_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status InnoDB 缓冲池 free 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status InnoDB buffer pool free size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328107738000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status InnoDB 缓冲池 page 使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_buffer_pool_pages_utilization",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status InnoDB 缓冲池 page 使用率",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status InnoDB buffer pool page usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328109861000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status InnoDB 缓冲池 used 大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_buffer_pool_bytes_used",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status InnoDB 缓冲池 used 大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status InnoDB Buffer Pool used Size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328111723000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status InnoDB 缓冲池总大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_buffer_pool_bytes_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status InnoDB 缓冲池总大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Total InnoDB Buffer Pool Size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328113574000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 启动时长",
"unit": "seconds",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_uptime",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 启动时长",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Uptime",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328115548000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 当前 running 的 threads 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_threads_running",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 当前 running 的 threads 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status The number of threads currently running",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328117539000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 当前打开的文件句柄数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_open_files",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 当前打开的文件句柄数",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Number of file handles currently open",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328119415000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 当前连接数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_status_threads_connected",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 当前连接数",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Number of current connections",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328121269000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 最大曾用连接数",
"unit": "none",
"note": "曾经达到过的最大连接数",
"lang": "zh_CN",
"expression": "mysql_global_status_max_used_connections",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 最大曾用连接数",
"note": "曾经达到过的最大连接数"
},
{
"lang": "en_US",
"name": "Global Status Maximum number of connections used",
"note": "Maximum number of connections ever reached"
}
]
},
{
"id": 0,
"uuid": 1717556328123032000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒 Command 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(mysql_global_status_commands_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒 Command 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Number of Commands per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328125132000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒 query 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(mysql_global_status_queries[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒 query 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status queries per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328126899000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒 question 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(mysql_global_status_questions[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒 question 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Questions per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328128900000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒 slow query 数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(mysql_global_status_slow_queries[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒 slow query 数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status slow queries per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328130785000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒事务操作数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(mysql_global_status_commands_total{command=~\"commit|rollback\"}[3m])) without (command)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒事务操作数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Number of transactions per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328133386000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒写操作数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(mysql_global_status_commands_total{command=~\"insert|update|delete|replace\"}[3m])) without (command)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒写操作数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Number of writes per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328135341000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒发送流量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(mysql_global_status_bytes_sent[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒发送流量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status traffic sent per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328137304000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒接收流量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "irate(mysql_global_status_bytes_received[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒接收流量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status traffic received per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328139154000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 每秒读操作数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(mysql_global_status_commands_total{command=\"select\"}[3m])) without (command)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 每秒读操作数量",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status Read operations per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328140928000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 近 3 分钟 abort 的客户端",
"unit": "none",
"note": "原始指标 mysql_global_status_aborted_clients 表示由于客户端未正确关闭连接而终止的连接数,Counter 类型,单调递增。",
"lang": "zh_CN",
"expression": "increase(mysql_global_status_aborted_clients[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 近 3 分钟 abort 的客户端",
"note": "原始指标 mysql_global_status_aborted_clients 表示由于客户端未正确关闭连接而终止的连接数,Counter 类型,单调递增。"
},
{
"lang": "en_US",
"name": "Global Status clients aborted in the last 3 minutes",
"note": "The raw metric mysql_global_status_aborted_clients represents the number of connections terminated because the client did not properly close the connection. Counter type, monotonically increasing."
}
]
},
{
"id": 0,
"uuid": 1717556328143016000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 近 3 分钟 abort 的连接数",
"unit": "none",
"note": "原始指标 mysql_global_status_aborted_connects 表示尝试连接到 MySQL 服务器失败的次数,Counter 类型,单调递增。",
"lang": "zh_CN",
"expression": "increase(mysql_global_status_aborted_connects[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 近 3 分钟 abort 的连接数",
"note": "原始指标 mysql_global_status_aborted_connects 表示尝试连接到 MySQL 服务器失败的次数,Counter 类型,单调递增。"
},
{
"lang": "en_US",
"name": "Global Status connections aborted in the last 3 minutes",
"note": "The raw metric mysql_global_status_aborted_connects represents the number of failed attempts to connect to the MySQL server. Counter type, monotonically increasing."
}
]
},
{
"id": 0,
"uuid": 1717556328144854000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Status 近 3 分钟 table lock 等待次数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "increase(mysql_global_status_table_locks_waited[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Status 近 3 分钟 table lock 等待次数",
"note": ""
},
{
"lang": "en_US",
"name": "Global Status table lock waits in the last 3 minutes",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328146686000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Variables InnoDB 缓冲池配置大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_variables_innodb_buffer_pool_size",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Variables InnoDB 缓冲池配置大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Variables InnoDB buffer pool configuration size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328148603000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Variables read_only 开关值",
"unit": "none",
"note": "0 就是 OFF,1 是 ON",
"lang": "zh_CN",
"expression": "mysql_global_variables_read_only",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Variables read_only 开关值",
"note": "0 就是 OFF,1 是 ON"
},
{
"lang": "en_US",
"name": "Global Variables read_only switch value",
"note": "0 is OFF, 1 is ON"
}
]
},
{
"id": 0,
"uuid": 1717556328150445000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Variables 允许打开的文件句柄数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_variables_open_files_limit",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Variables 允许打开的文件句柄数",
"note": ""
},
{
"lang": "en_US",
"name": "Global Variables Number of file handles allowed to be open",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328152634000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Variables 最大连接数限制",
"unit": "none",
"note": "允许的最大连接数,默认值是 151,过小了。\n\n- 通过 `SHOW VARIABLES LIKE 'max_connections'` 命令查看当前设置\n- 通过 `SET GLOBAL max_connections = 2048` 重新设置最大连接数\n- 通过修改 MySQL 配置文件,在 `[mysqld]` 下面添加 `max_connections = 2048` 使其重启依旧生效",
"lang": "zh_CN",
"expression": "mysql_global_variables_max_connections",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Variables 最大连接数限制",
"note": "允许的最大连接数,默认值是 151,过小了。\n\n- 通过 `SHOW VARIABLES LIKE 'max_connections'` 命令查看当前设置\n- 通过 `SET GLOBAL max_connections = 2048` 重新设置最大连接数\n- 通过修改 MySQL 配置文件,在 `[mysqld]` 下面添加 `max_connections = 2048` 使其重启依旧生效"
},
{
"lang": "en_US",
"name": "Global Variables Maximum Connection Limit",
"note": "The maximum number of connections allowed. The default value of 151 is too small.\n\n- View the current setting with the `SHOW VARIABLES LIKE 'max_connections'` command\n- Reset the maximum number of connections with `SET GLOBAL max_connections = 2048`\n- To make the change survive a restart, add `max_connections = 2048` under `[mysqld]` in the MySQL configuration file"
}
]
},
{
"id": 0,
"uuid": 1717556328154471000,
"collector": "Categraf",
"typ": "MySQL",
"name": "Global Variables 查询缓存大小",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "mysql_global_variables_query_cache_size",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Global Variables 查询缓存大小",
"note": ""
},
{
"lang": "en_US",
"name": "Global Variables Query Cache Size",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328156411000,
"collector": "Categraf",
"typ": "MySQL",
"name": "MySQL 实例是否 UP",
"unit": "none",
"note": "1 表示 UP,说明能正常连到 MySQL 采集数据;0 表示无法连通 MySQL 实例,可能是网络问题、认证问题,或者 MySQL 本身就是挂了",
"lang": "zh_CN",
"expression": "mysql_up",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MySQL 实例是否 UP",
"note": "1 表示 UP,说明能正常连到 MySQL 采集数据;0 表示无法连通 MySQL 实例,可能是网络问题、认证问题,或者 MySQL 本身就是挂了"
},
{
"lang": "en_US",
"name": "Whether MySQL instance is UP",
"note": "1 means UP, indicating that it can normally connect to MySQL to collect data; 0 means that the MySQL instance cannot be connected. It may be a network problem, authentication problem, or MySQL itself is down"
}
]
},
{
"id": 0,
"uuid": 1717556328158254000,
"collector": "Categraf",
"typ": "MySQL",
"name": "MySQL 指标抓取耗时",
"unit": "seconds",
"note": "",
"lang": "zh_CN",
"expression": "mysql_scrape_use_seconds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MySQL 指标抓取耗时",
"note": ""
},
{
"lang": "en_US",
"name": "MySQL metric scrape duration",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328160126000,
"collector": "Categraf",
"typ": "MySQL",
"name": "MySQL 版本信息",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "mysql_version_info",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "MySQL 版本信息",
"note": ""
},
{
"lang": "en_US",
"name": "MySQL version information",
"note": ""
}
]
}
]
================================================
FILE: integrations/N9E/dashboards/n9e_server.json
================================================
{
"id": 0,
"group_id": 0,
"name": "nightingale",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"layout": {
"h": 4,
"i": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"name": "number of data points received per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(n9e_server_samples_received_total[1m])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f70dcb8b-b58b-4ef9-9e48-f230d9e17140",
"layout": {
"h": 4,
"i": "47fc6252-9cc8-4b53-8e27-0c5c59a47269",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"name": "number of alarm events generated per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(n9e_server_alerts_total[10m])",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "caf23e58-d907-42b0-9ed6-722c8c6f3c5f",
"layout": {
"h": 4,
"i": "ad1af16c-de0c-45f4-8875-cea4e85d51d0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"name": "queue length of alarm events",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "n9e_server_alert_queue_size",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "6b8d2db1-efca-4b9e-b429-57a9d2272bc5",
"layout": {
"h": 4,
"i": "64c3abc2-404c-4462-a82f-c109a21dac91",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"name": "The average response time of the data receiving interface (unit: seconds).",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "n9e_server_http_request_duration_seconds_sum/n9e_server_http_request_duration_seconds_count",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "bd41677d-40d3-482e-bb6e-fbd25df46d87",
"layout": {
"h": 4,
"i": "1c7da942-58c2-40dc-b42f-983e4a35b89b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"name": "length of the in-memory data queue",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "n9e_server_sample_queue_size",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c8642e72-f384-46a5-8410-1e6be2953c3c",
"layout": {
"h": 4,
"i": "eed94a0b-954f-48ac-82e5-a2eada1c8a3d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"name": "average time it takes to send data to TSDB (unit: seconds)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "avg(n9e_server_forward_duration_seconds_sum/n9e_server_forward_duration_seconds_count)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328164766000
}
================================================
FILE: integrations/N9E/dashboards/n9e_v6.json
================================================
{
"id": 0,
"group_id": 0,
"name": "nightingale v6",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"layout": {
"h": 4,
"i": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "number of data points received per second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(n9e_pushgw_samples_received_total[1m])",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "caf23e58-d907-42b0-9ed6-722c8c6f3c5f",
"layout": {
"h": 4,
"i": "ad1af16c-de0c-45f4-8875-cea4e85d51d0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 0
},
"maxPerRow": 4,
"name": "queue length of alarm events",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "n9e_alert_alert_queue_size",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "d7c253bd-05f3-4b43-a7bd-34ffacd16fbd",
"layout": {
"h": 4,
"i": "d7c253bd-05f3-4b43-a7bd-34ffacd16fbd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "last sync records count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "n9e_cron_sync_number",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "7c725076-a986-4414-abd4-ce06605665b5",
"layout": {
"h": 4,
"i": "7c725076-a986-4414-abd4-ce06605665b5",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "last sync time duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "n9e_cron_duration",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328165924000
}
================================================
FILE: integrations/N9E/dashboards/n9e_v8.json
================================================
{
"name": "nightingale v8",
"tags": "",
"ident": "",
"uuid": 1742467018844000,
"configs": {
"panels": [
{
"collapsed": true,
"id": "5bfd69d8-f7c1-4821-b663-515b70744c0f",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "5bfd69d8-f7c1-4821-b663-515b70744c0f",
"isResizable": false
},
"name": "Time Series In/Out",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"layout": {
"h": 5,
"w": 5,
"x": 0,
"y": 1,
"i": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4",
"isResizable": true
},
"maxPerRow": 4,
"name": "Samples received / Second",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "single"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(n9e_pushgw_samples_received_total[$__rate_interval])",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8830b5cc-2b1a-4513-9cc1-bbd6c96dfffb",
"layout": {
"h": 5,
"w": 5,
"x": 5,
"y": 1,
"i": "21a12e4d-f029-4dd0-98f4-970ae28612a3",
"isResizable": true
},
"maxPerRow": 4,
"name": "Samples written to TSDB / Second",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "single"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(n9e_pushgw_write_total[$__rate_interval])",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"type": "timeseries",
"id": "6fa42981-424c-4309-bc22-3c8191e5cf3c",
"layout": {
"h": 5,
"w": 5,
"x": 10,
"y": 1,
"i": "e3e4d46e-b6d6-4cdd-b72d-56f3e9fb1e4a",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(n9e_pushgw_write_error_total[$__rate_interval]) or vector(0)",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Write Samples Occur Error / Second",
"description": "每秒有多少数据点写失败了",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "每秒 Drop 掉多少个数据点。只有配置了 Pushgw.DropSample 才会主动 Drop 指标",
"id": "e2cc1263-1eda-4d31-8772-7e63b3c24971",
"layout": {
"h": 5,
"w": 5,
"x": 15,
"y": 1,
"i": "27ec2eb1-390d-42bb-904e-a8e28e3331ae",
"isResizable": true
},
"maxPerRow": 4,
"name": "Drop Samples / Second",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "single"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(n9e_pushgw_drop_sample_total[$__rate_interval])",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"type": "barGauge",
"id": "83beec43-1471-4507-a7f9-d58c0c1f81df",
"layout": {
"h": 10,
"w": 4,
"x": 20,
"y": 1,
"i": "1084e65c-0d67-4589-9c59-47a784dba0d6",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(20, rate(n9e_pushgw_sample_received_by_ident[$__rate_interval]))",
"instant": true,
"legend": "{{host_ident}}",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Top hosts",
"description": "上报监控数据最多的机器,数值表示每秒上报多少个数据点",
"maxPerRow": 4,
"custom": {
"calc": "lastNotNull",
"valueField": "Value",
"sortOrder": "desc",
"displayMode": "basic",
"valueMode": "color",
"otherPosition": "none"
},
"options": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(44, 157, 61)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"decimals": 0
}
}
},
{
"type": "timeseries",
"id": "9e32d6c9-e9a5-4890-8722-00d4197d9e5e",
"layout": {
"h": 5,
"w": 5,
"x": 0,
"y": 6,
"i": "326b6619-1665-4d0c-adf8-75fb54aeb75b",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "n9e_pushgw_sample_queue_size",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Write Queue Size",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 10000,
"type": ""
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "39c3cfc0-0860-4f9f-aaf9-98cda7622556",
"layout": {
"h": 5,
"w": 5,
"x": 5,
"y": 6,
"i": "dc941959-2255-4591-9fd4-c21c54815b92",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(n9e_pushgw_push_queue_over_limit_error_total[$__rate_interval])",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Queue Over WaterMark, Reject Requests / Second",
"description": "队列积压太严重,达到水位线,开始拒绝请求。表示每秒拒绝多少次请求",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgb(255, 101, 107)",
"value": 10000,
"type": ""
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "afe97f46-7752-44aa-8944-a60be597cf5e",
"layout": {
"h": 5,
"w": 5,
"x": 10,
"y": 6,
"i": "321a6deb-8027-4af9-bd0c-0adf9941b286",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(n9e_pushgw_forward_duration_seconds_count[$__rate_interval])",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "HTTP Request / Second(Write TSDB)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "single"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4efab293-c374-4515-985a-1dea80aa6da8",
"layout": {
"h": 5,
"w": 5,
"x": 15,
"y": 6,
"i": "1ae8127f-d90b-4dfc-873c-a2f7b8936bc8",
"isResizable": true
},
"maxPerRow": 4,
"name": "P95 Latency(Write TSDB)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"placement": "bottom",
"selectMode": "single"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"thresholdsStyle": {
"mode": "dashed"
},
"tooltip": {
"mode": "single"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "histogram_quantile(0.95, rate(n9e_pushgw_forward_duration_seconds_bucket[$__rate_interval]))",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.1.0"
},
{
"collapsed": true,
"id": "5bf92893-f3fb-4a8a-bee3-c4c857add367",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 11,
"i": "5bf92893-f3fb-4a8a-bee3-c4c857add367",
"isResizable": false
},
"name": "Sync and Alerting",
"panels": [],
"type": "row"
},
{
"type": "timeseries",
"id": "d7c253bd-05f3-4b43-a7bd-34ffacd16fbd",
"layout": {
"h": 5,
"w": 6,
"x": 0,
"y": 12,
"i": "d7c253bd-05f3-4b43-a7bd-34ffacd16fbd",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "n9e_cron_sync_number",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Last sync records count",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "7c725076-a986-4414-abd4-ce06605665b5",
"layout": {
"h": 5,
"w": 6,
"x": 6,
"y": 12,
"i": "7c725076-a986-4414-abd4-ce06605665b5",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "n9e_cron_duration",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Last sync time duration",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "af584a0a-0595-4bd1-a999-4a9799fa40bf",
"layout": {
"h": 5,
"w": 6,
"x": 12,
"y": 12,
"i": "9a91af6a-c4b2-4220-b3cb-7e52e431870d",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "n9e_alert_alert_queue_size",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Queue length of alarm events",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "caf23e58-d907-42b0-9ed6-722c8c6f3c5f",
"layout": {
"h": 5,
"w": 6,
"x": 18,
"y": 12,
"i": "ad1af16c-de0c-45f4-8875-cea4e85d51d0",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "(rate(n9e_alert_rule_eval_error_total[$__rate_interval]) > 0) or vector(0)",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Alerting rule eval errors / Second",
"description": "告警规则运行的时候,每秒失败了多少次",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "row",
"id": "88508bde-2957-4afb-b10f-1d84c6461eec",
"name": "Redis Read and Write",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 17,
"i": "88508bde-2957-4afb-b10f-1d84c6461eec",
"isResizable": false
}
},
{
"type": "timeseries",
"id": "aa551bb1-c218-4fe2-8099-059e66905617",
"layout": {
"h": 5,
"w": 5,
"x": 0,
"y": 18,
"i": "79b57988-0b58-4d2d-bd8f-41f5998496ce",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "histogram_quantile(0.99, rate({__name__=~\"n9e_pushgw_redis_operation_latency_seconds_bucket|n9e_center_redis_operation_latency_seconds_bucket\"}[$__rate_interval]))",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "P99 Latency(Redis Operation)",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "seconds",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "964bd084-9a2e-453c-bc3a-cf64550fe358",
"layout": {
"h": 5,
"w": 5,
"x": 5,
"y": 18,
"i": "153a8a0f-b9d7-4d7d-8cc4-aa346ecb73c5",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate({__name__=~\"n9e_pushgw_redis_operation_latency_seconds_count|n9e_center_redis_operation_latency_seconds_count\"}[$__rate_interval])",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Redis Operation / Second",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "ops",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "5f92a0f4-cc4d-4c86-a5b1-cb12b9fd5304",
"layout": {
"h": 5,
"w": 5,
"x": 10,
"y": 18,
"i": "7abbce6f-efea-40a8-a7be-85c3380a0794",
"isResizable": true
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "(\nrate({__name__=~\"n9e_pushgw_redis_operation_latency_seconds_bucket|n9e_center_redis_operation_latency_seconds_bucket\", le=\"+Inf\"}[$__rate_interval]) - rate({__name__=~\"n9e_pushgw_redis_operation_latency_seconds_bucket|n9e_center_redis_operation_latency_seconds_bucket\", le=\"1\"}[$__rate_interval])\n) or vector(0)",
"maxDataPoints": 720,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Slow Operation / Second",
"description": "操作 Redis 时耗时超过 1s 就算慢",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "ops",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"name": "prom",
"label": "数据源",
"type": "datasource",
"hide": false,
"definition": "prometheus"
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/N9E/markdown/README.md
================================================
# N9E
夜莺V5版本分两个组件,n9e-webapi 和 n9e-server,都通过 `/metrics` 接口暴露了 Prometheus 协议的监控数据。夜莺V6版本默认只有一个组件,就是 n9e,也通过 `/metrics` 接口暴露了 Prometheus 协议的监控数据。如果使用边缘机房部署方案,会用到 n9e-edge,n9e-edge 也通过 `/metrics` 接口暴露了 Prometheus 协议的监控数据。
所以,通过 categraf 的 prometheus 插件即可采集夜莺的监控数据。
## 采集配置
categraf 的 `conf/input.prometheus/prometheus.toml`
```toml
[[instances]]
urls = [
"http://IP:17000/metrics"
]
labels = {job="n9e"}
```
================================================
FILE: integrations/NFSClient/collect/nfsclient/nfsclient.toml
================================================
# # collect interval
# interval = 15
## Read more low-level metrics (optional, defaults to false)
fullstat = false
## List of mounts to explicitly include or exclude (optional)
## The pattern (Go regexp) is matched against the mount point (not the
## device being mounted). If include_mounts is set, all mounts are ignored
## unless present in the list. If a mount is listed in both include_mounts
## and exclude_mounts, it is excluded. Go regexp patterns can be used.
# include_mounts = []
# exclude_mounts = []
## List of operations to include or exclude from collecting. This applies
## only when fullstat=true. Semantics are similar to {include,exclude}_mounts:
## the default is to collect everything; when include_operations is set, only
## those OPs are collected; when exclude_operations is set, all are collected
## except those listed. If include and exclude are set, the OP is excluded.
## See /proc/self/mountstats for a list of valid operations; note that
## NFSv3 and NFSv4 have different lists. While it is not possible to
## have different include/exclude lists for NFSv3/4, unused elements
## in the list should be okay. It is possible to have different lists
## for different mountpoints: use multiple [[input.nfsclient]] stanzas,
## with their own lists. See "include_mounts" above, and be careful of
## duplicate metrics.
# include_operations = ['READ','WRITE','ACCESS','GETATTR','READDIR','LOOKUP']
# exclude_operations = []
================================================
FILE: integrations/NFSClient/markdown/README.md
================================================
# NFS Client
forked from telegraf/inputs.nfsclient
## 停用该插件
- 方法一:把 `input.nfsclient` 目录改个别的名字,不用 `input.` 打头
- 方法二:nfsclient.toml 中的配置留空
================================================
FILE: integrations/NSQ/collect/nsq/nsq.toml
================================================
# # collect interval
# interval = 15
# [[instances]]
## The Nsq API URI used to collect statistical information.
# targets = ["http://localhost:4151"]
# headers={Authorization="", X-Forwarded-For="", Host=""}
# timeout="5s"
# # basic auth
# username=""
# password=""
## append some labels for series
# labels = { product="nsq" }
## interval = global.interval * interval_times
# interval_times = 1
================================================
FILE: integrations/NSQ/markdown/README.md
================================================
# nsq
forked from [telegraf/nsq](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nsq/nsq.go)
## Configuration
- 配置文件,[参考示例](https://github.com/flashcatcloud/categraf/blob/main/conf/input.nsq/nsq.toml)
## 指标列表
### nsq_client类
ready_count 可消费消息数
inflight_count 正在处理消息数
message_count 消息总数
finish_count 完成统计
requeue_count 重新排队消息数
### nsq_channel类
depth 当前的积压量
backend_depth 消息缓冲队列积压量
inflight_count 正在处理消息数
deferred_count 延迟消息数
message_count 消息总数
requeue_count 重新排队消息数
timeout_count 超时消息数
client_count 客户端数量
### nsq_topic类
depth 消息队列积压量
backend_depth 消息缓冲队列积压量
message_count 消息总数
channel_count 消费者总数
## metrics
此配置可克隆到 nightingale 的 metrics.yaml 文件中,作为中文指标解释
# [nsq]
nsq_server_server_count: "nsq 服务端总计"
nsq_server_topic_count: "nsq topic总数"
nsq_topic_depth: 消息队列积压量
nsq_topic_backend_depth: 消息缓冲队列积压量
nsq_topic_message_count: 消息总数
nsq_topic_channel_count: 消费者总数
nsq_channel_depth: "当前消息数,内存和硬盘转存的消息数,即当前的积压量"
nsq_channel_backend_depth: 消息缓冲队列积压量
nsq_channel_inflight_count: "当前未完成的消息数,包括发送但未返回FIN/重新入队列REQ/超时TIMEOUT 三种消息数之和,代表已经投递还未消费掉的消息"
nsq_channel_deferred_count: "重新入队的延迟消息数,指还未发布的重入队消息数量,即未消费的定时(延时)消息数"
nsq_channel_message_count: 节点启动后的所有新消息总数,真正的消息次数
nsq_channel_requeue_count: 重新入队的消息数,即返回REQ的消息数量
nsq_channel_timeout_count: 已重入队列但按配置的超时时间内还未收到响应的消息数
nsq_channel_client_count: 客户端连接数
nsq_client_ready_count: 客户端可消费消息数
nsq_client_inflight_count: 客户端正在处理消息数
nsq_client_message_count: 客户端消息总数
nsq_client_finish_count: 客户端完成的消息数,即返回FIN的消息数
nsq_client_requeue_count: 客户端重新入队的消息数,即返回REQ的消息数量
================================================
FILE: integrations/NVIDIA/collect/nvidia_smi/nvidia_smi.toml
================================================
# # collect interval
# interval = 15
# exec local command
# e.g. nvidia_smi_command = "nvidia-smi"
nvidia_smi_command = ""
# exec remote command
# nvidia_smi_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null SSH_USER@SSH_HOST nvidia-smi"
# Comma-separated list of the query fields.
# You can find out possible fields by running `nvidia-smi --help-query-gpu`.
# The value `AUTO` will automatically detect the fields to query.
query_field_names = "AUTO"
================================================
FILE: integrations/NVIDIA/dashboards/nvidia-gpu-metrics-by-categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Nvidia GPU Metrics by nvidia-smi",
"ident": "nvidia-gpu-metrics-by-nvidia-smi",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"graphTooltip": "default",
"graphZoom": "default",
"links": [],
"panels": [
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The official product name of the GPU. This is an alphanumeric string. For all products.",
"id": "dc97df4f-3983-4c94-af14-ceaace1e64f9",
"layout": {
"h": 3,
"i": "dc97df4f-3983-4c94-af14-ceaace1e64f9",
"w": 4,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Name",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).",
"id": "6d1fe3b0-4873-438c-9135-32fae6ad03ad",
"layout": {
"h": 3,
"i": "6d1fe3b0-4873-438c-9135-32fae6ad03ad",
"w": 2,
"x": 4,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "P-State",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"": {
"text": ""
}
},
"type": "value"
}
]
},
"targets": [
{
"expr": "nvidia_smi_pstate{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"id": "de5719b9-2288-4b14-9ffe-3103cfaddf96",
"layout": {
"h": 5,
"i": "de5719b9-2288-4b14-9ffe-3103cfaddf96",
"w": 3,
"x": 6,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "GPU Utilization %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts / The software power limit in watts.",
"id": "591615c2-2dde-47d5-a905-c83ae5140556",
"layout": {
"h": 5,
"i": "591615c2-2dde-47d5-a905-c83ae5140556",
"w": 3,
"x": 9,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Power Draw %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n",
"id": "72a73e7e-7d78-4966-a32f-448ab24f74bc",
"layout": {
"h": 5,
"i": "72a73e7e-7d78-4966-a32f-448ab24f74bc",
"w": 3,
"x": 12,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Fan Speed %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Core GPU temperature, in degrees C.",
"id": "4fdd82a2-c08f-490c-af6f-c6305971fb17",
"layout": {
"h": 5,
"i": "4fdd82a2-c08f-490c-af6f-c6305971fb17",
"w": 3,
"x": 15,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Temperature",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 100,
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"id": "9e47375c-7db9-4ec3-8912-5f2870362405",
"layout": {
"h": 5,
"i": "9e47375c-7db9-4ec3-8912-5f2870362405",
"w": 6,
"x": 18,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Memory Utilization %",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The version of the installed NVIDIA display driver. This is an alphanumeric string.",
"id": "a4d93297-1437-479d-b4a2-8957b4fd8107",
"layout": {
"h": 2,
"i": "a4d93297-1437-479d-b4a2-8957b4fd8107",
"w": 3,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Driver Version",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}",
"legend": "{{driver_version}}",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The BIOS of the GPU board.",
"id": "f1541d28-c8ee-40c8-b625-06ddf9ffdca3",
"layout": {
"h": 2,
"i": "f1541d28-c8ee-40c8-b625-06ddf9ffdca3",
"w": 3,
"x": 3,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Vbios Version",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}",
"legend": "{{vbios_version}}",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Information about factors that are reducing the frequency of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.",
"id": "412ec2b8-253f-4906-bc47-31c0344a6c55",
"layout": {
"h": 5,
"i": "412ec2b8-253f-4906-bc47-31c0344a6c55",
"w": 6,
"x": 0,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Throttle Reasons",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": [
{
"options": {
"0": {
"text": "Not Active"
},
"1": {
"text": "Active"
}
},
"type": "value"
}
]
},
"targets": [
{
"expr": "nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}",
"legend": "Idle",
"refId": "A"
},
{
"expr": "nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}",
"legend": "HW Thermal Slowdown",
"refId": "B"
},
{
"expr": "nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}",
"legend": "SW Power Cap",
"refId": "C"
},
{
"expr": "nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}",
"legend": "App Clocks Setting",
"refId": "D"
},
{
"expr": "nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}",
"legend": "HW Power Brake",
"refId": "E"
},
{
"expr": "nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}",
"legend": "SW Thermal Slowdown",
"refId": "F"
},
{
"expr": "nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}",
"legend": "Sync Boost",
"refId": "G"
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n",
"id": "bc00b64d-de08-4dcb-824a-40798ed5e382",
"layout": {
"h": 5,
"i": "bc00b64d-de08-4dcb-824a-40798ed5e382",
"w": 3,
"x": 6,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "GPU Clock Speed %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Current frequency of memory clock / Maximum frequency of memory clock",
"id": "3d11c7ed-a7aa-4b4e-8fab-7e75fa024fce",
"layout": {
"h": 5,
"i": "3d11c7ed-a7aa-4b4e-8fab-7e75fa024fce",
"w": 3,
"x": 9,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Memory Clock Speed %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total memory allocated by active contexts / Total installed GPU memory.",
"id": "5531b3b0-6fcb-4c0e-bfbe-97a27f69d694",
"layout": {
"h": 5,
"i": "5531b3b0-6fcb-4c0e-bfbe-97a27f69d694",
"w": 3,
"x": 12,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Memory Allocation %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"id": "0f7051b8-f94a-427a-b4e3-5edcf50cd1ac",
"layout": {
"h": 5,
"i": "0f7051b8-f94a-427a-b4e3-5edcf50cd1ac",
"w": 3,
"x": 15,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Memory Utilization %",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "#F2495C",
"value": 90
}
],
"style": "line"
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.",
"id": "c15d98fa-619b-4a6d-a50d-321bcf5dd993",
"layout": {
"h": 5,
"i": "c15d98fa-619b-4a6d-a50d-321bcf5dd993",
"w": 6,
"x": 18,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "GPU Utilization %",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Total memory allocated by active contexts.",
"id": "04ba1b8b-9319-4f4a-9432-c773b8adee78",
"layout": {
"h": 5,
"i": "04ba1b8b-9319-4f4a-9432-c773b8adee78",
"w": 6,
"x": 0,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Memory Allocation",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Core GPU temperature, in degrees C.",
"id": "57305284-7479-4b51-bc4c-7c3128a64278",
"layout": {
"h": 5,
"i": "57305284-7479-4b51-bc4c-7c3128a64278",
"w": 6,
"x": 6,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Temperature",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts",
"id": "a183857f-cd22-4065-9f76-dc8e5c7c10df",
"layout": {
"h": 5,
"i": "a183857f-cd22-4065-9f76-dc8e5c7c10df",
"w": 6,
"x": 12,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Power Draw",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n",
"id": "e83d4003-c6d4-4bb3-bf21-2572c5c32f1c",
"layout": {
"h": 5,
"i": "e83d4003-c6d4-4bb3-bf21-2572c5c32f1c",
"w": 6,
"x": 18,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Fan Speed %",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Current frequency of graphics (shader) clock.",
"id": "7ba96896-b5e8-4168-868a-80ca82550337",
"layout": {
"h": 5,
"i": "7ba96896-b5e8-4168-868a-80ca82550337",
"w": 6,
"x": 0,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Graphics Clock Speed",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Current frequency of video encoder/decoder clock.",
"id": "cc1786fc-2c94-410b-a1cc-88d0418a9d99",
"layout": {
"h": 5,
"i": "cc1786fc-2c94-410b-a1cc-88d0418a9d99",
"w": 6,
"x": 6,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Video Clock Speed",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Current frequency of SM (Streaming Multiprocessor) clock.",
"id": "6eaf7e94-5e61-4e79-9f2a-f2204c0382e8",
"layout": {
"h": 5,
"i": "6eaf7e94-5e61-4e79-9f2a-f2204c0382e8",
"w": 6,
"x": 12,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "SM Clock Speed",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${DS_PROMETHEUS}",
"description": "Current frequency of memory clock.",
"id": "b14d69c3-a130-4e96-9d9d-934350acb34e",
"layout": {
"h": 5,
"i": "b14d69c3-a130-4e96-9d9d-934350acb34e",
"w": 6,
"x": 18,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Memory Clock Speed",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}",
"legend": "{{uuid}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "DS_PROMETHEUS",
"type": "datasource"
},
{
"allOption": false,
"allValue": null,
"datasource": {
"cate": "prometheus",
"value": "${DS_PROMETHEUS}"
},
"definition": "label_values(nvidia_smi_index, uuid)",
"hide": false,
"multi": false,
"name": "gpu",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328176016000
}
================================================
FILE: integrations/NVIDIA/markdown/README.md
================================================
# nvidia_smi
该采集插件的原理,就是读取 nvidia-smi 命令的内容输出,转换为Prometheus格式的监控数据上报给Nightingale夜莺。
是对 [nvidia_gpu_exporter](https://github.com/utkuozdemir/nvidia_gpu_exporter) 代码的集成。
## Configuration
配置文件在 `conf/input.nvidia_smi/nvidia_smi.toml`
```toml
# # collect interval
# interval = 15
# 下面这个配置是最重要的配置,如果要采集 nvidia-smi 的信息,就打开下面的配置,
# 给出 nvidia-smi 命令的路径,最好是给绝对路径
# 相当于让 Categraf 执行本机的 nvidia-smi 命令,获取本机 GPU 的状态信息
# exec local command
# nvidia_smi_command = "nvidia-smi"
# 如果想远程方式采集远端机器的 GPU 状态信息,可以使用 ssh 命令,登录远端机器
# 在远端机器执行 nvidia-smi 的命令输出,通常 Categraf 是部署在每个物理机上的
# 所以,ssh 这种方式,理论上用不到
# exec remote command
# nvidia_smi_command = "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null SSH_USER@SSH_HOST nvidia-smi"
# Comma-separated list of the query fields.
# You can find out possible fields by running `nvidia-smi --help-query-gpu`.
# The value `AUTO` will automatically detect the fields to query.
query_field_names = "AUTO"
```
================================================
FILE: integrations/Net_Response/alerts/net_response_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "",
"datasource_ids": null,
"cluster": "",
"name": "Network address probe failed",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": null,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "net_response_result_code != 0",
"rule_config": null,
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": null,
"enable_etime": "23:59",
"enable_etimes": null,
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": null,
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328182186000
}
]
================================================
FILE: integrations/Net_Response/collect/net_response/net_response.toml
================================================
# # collect interval
# interval = 15
[mappings]
# "127.0.0.1:22"= {region="local",ssh="test"}
# "127.0.0.1:22"= {region="local",ssh="redis"}
[[instances]]
targets = [
# "127.0.0.1:22",
# "localhost:6379",
# ":9090"
]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Protocol, must be "tcp" or "udp"
## NOTE: because the "udp" protocol does not respond to requests, it requires
## a send/expect string pair (see below).
# protocol = "tcp"
## Set timeout
# timeout = "1s"
## Set read timeout (only used if expecting a response)
# read_timeout = "1s"
## The following options are required for UDP checks. For TCP, they are
## optional. The plugin will send the given string to the server and then
## expect to receive the given 'expect' string back.
## string sent to the server
# send = "ssh"
## expected string in answer
# expect = "ssh"
================================================
FILE: integrations/Net_Response/dashboards/dashboard-by-ziv.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Net Response Dashboard",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelsOfSeriesToRows",
"showHeader": true,
"sortColumn": "value",
"sortOrder": "descend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "efe3939b-33ec-415b-982f-92f0218626ec",
"layout": {
"h": 6,
"i": "efe3939b-33ec-415b-982f-92f0218626ec",
"isResizable": true,
"w": 24,
"x": 0,
"y": 1
},
"name": "Targets",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#3fc453",
"text": "up"
},
"type": "special"
},
{
"match": {
"from": 1
},
"result": {
"color": "#ff656b",
"text": "down"
},
"type": "range"
}
]
},
"overrides": [
{
"matcher": {
"value": "A"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#417505",
"text": "UP"
},
"type": "special"
},
{
"match": {
"from": 1,
"special": 1
},
"result": {
"color": "#e90f0f",
"text": "DOWN"
},
"type": "range"
}
]
}
}
],
"targets": [
{
"expr": "max(net_response_result_code{ident=~\"$ident\"}) without (protocol, source)",
"legend": "",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "11b58922-d7a4-418f-912f-50500d312ed5",
"layout": {
"h": 7,
"i": "11b58922-d7a4-418f-912f-50500d312ed5",
"isResizable": true,
"w": 24,
"x": 0,
"y": 7
},
"name": "Latency (s)",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "max(net_response_response_time{ident=~\"$ident\"}) without (protocol, source)",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(net_response_result_code, ident)",
"multi": true,
"name": "ident",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328182968000
}
================================================
FILE: integrations/Net_Response/dashboards/net_response_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "TCP detection by UlricQin",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"aggrDimension": "target",
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "73c6eaf9-1685-4a7a-bf53-3d52afa1792e",
"layout": {
"h": 15,
"i": "73c6eaf9-1685-4a7a-bf53-3d52afa1792e",
"isResizable": true,
"w": 24,
"x": 0,
"y": 0
},
"name": "Targets",
"options": {
"standardOptions": {},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"value": "A"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"from": 1,
"special": 1
},
"result": {
"color": "#e90f0f",
"text": "DOWN"
},
"type": "range"
}
]
}
},
{
"matcher": {
"value": "C"
},
"properties": {
"standardOptions": {
"decimals": 3,
"util": "milliseconds"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#f10c0c"
},
"type": "range"
},
{
"match": {
"to": 1
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "max(net_response_result_code) by (target)",
"legend": "UP?",
"refId": "A"
},
{
"expr": "max(net_response_response_time) by (target) * 1000",
"legend": "Latency(ms)",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {
"indexByName": {
"target": 0
}
}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328183994000
}
================================================
FILE: integrations/Net_Response/markdown/README.md
================================================
# net_response
网络探测插件,通常用于监控本机某个端口是否在监听,或远端某个端口是否能连通。因为 Prometheus 生态的时序库只能存储 float64 类型的值,所以网络探测插件探测的结果也是 float64 类型的值,但是这个值的含义是不同的,具体含义如下:
```
- 0: Success
- 1: Timeout
- 2: ConnectionFailed
- 3: ReadFailed
- 4: StringMismatch
```
如果一切正常,这个值是 0,如果有异常,这个值是 1-4 之间的值,具体含义如上。这个值对应的指标名字是 `net_response_result_code`。
## Configuration
配置文件位于 categraf 的 `conf/input.net_response/net_response.toml`。最核心的配置就是 targets 部分,用于指定探测的目标,例如:
```toml
[[instances]]
targets = [
"10.2.3.4:22",
"localhost:6379",
":9090"
]
```
- `10.2.3.4:22` 表示探测 10.2.3.4 这个机器的 22 端口是否可以连通
- `localhost:6379` 表示探测本机的 6379 端口是否可以连通
- `:9090` 表示探测本机的 9090 端口是否可以连通
监控数据或告警事件中只是一个 IP 和端口,接收告警的人看到了,可能不清楚这是哪个业务的模块告警了,可以附加一些更有价值的信息放到标签里,比如:
```toml
labels = { region="cloud", product="n9e" }
```
标识了这是 cloud 这个 region,n9e 这个产品,这俩标签会附到时序数据上,告警的时候自然也会报出来。
完整配置样例如下:
```toml
[mappings]
# "127.0.0.1:22"= {region="local",ssh="test"}
# "127.0.0.1:22"= {region="local",ssh="redis"}
[[instances]]
targets = [
# "127.0.0.1:22",
# "localhost:6379",
# ":9090"
]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Protocol, must be "tcp" or "udp"
## NOTE: because the "udp" protocol does not respond to requests, it requires
## a send/expect string pair (see below).
# protocol = "tcp"
## Set timeout
# timeout = "1s"
## Set read timeout (only used if expecting a response)
# read_timeout = "1s"
## The following options are required for UDP checks. For TCP, they are
## optional. The plugin will send the given string to the server and then
## expect to receive the given 'expect' string back.
## string sent to the server
# send = "ssh"
## expected string in answer
# expect = "ssh"
```
================================================
FILE: integrations/Net_Response/metrics/categraf.json
================================================
[
{
"id": 0,
"uuid": 1717556328185013000,
"collector": "Categraf",
"typ": "Net_Response",
"name": "NET 探测结果状态码",
"unit": "none",
"note": "0 值表示正常,大于 0 就是异常,各个值的含义如下:\n\n- 0: Success\n- 1: Timeout\n- 2: ConnectionFailed\n- 3: ReadFailed\n- 4: StringMismatch",
"lang": "zh_CN",
"expression": "net_response_result_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "NET 探测结果状态码",
"note": "0 值表示正常,大于 0 就是异常,各个值的含义如下:\n\n- 0: Success\n- 1: Timeout\n- 2: ConnectionFailed\n- 3: ReadFailed\n- 4: StringMismatch"
},
{
"lang": "en_US",
"name": "NET Probe Result Status Code",
"note": "A value of 0 means normal, and a value greater than 0 means abnormal. The meanings of each value are as follows:\n\n- 0: Success\n- 1: Timeout\n- 2: ConnectionFailed\n- 3: ReadFailed\n- 4: StringMismatch"
}
]
},
{
"id": 0,
"uuid": 1717556328186975000,
"collector": "Categraf",
"typ": "Net_Response",
"name": "NET 探测耗时",
"unit": "seconds",
"note": "",
"lang": "zh_CN",
"expression": "net_response_response_time",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "NET 探测耗时",
"note": ""
},
{
"lang": "en_US",
"name": "NET Probe Response Time",
"note": ""
}
]
}
]
================================================
FILE: integrations/Netstat_Filter/collect/netstat_filter/netstat_filter.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# laddr_ip = ""
# laddr_port = 0
# raddr_ip = ""
# raddr_port = 0
================================================
FILE: integrations/Netstat_Filter/markdown/README.md
================================================
# netstat_filter
该插件采集网络连接情况,并根据用户指定的条件进行过滤统计,以便监控用户关心的连接情况。
## 指标列表
tcp_established
tcp_syn_sent
tcp_syn_recv
tcp_fin_wait1
tcp_fin_wait2
tcp_time_wait
tcp_close
tcp_close_wait
tcp_last_ack
tcp_listen
tcp_closing
tcp_none
tcp_send_queue
tcp_recv_queue
## 功能说明
对源IP、源端口、目标IP和目标端口过滤后,对网卡recv-Q、send-Q进行采集。该指标可以很好地反映出指定连接的质量,例如rtt时间过长,导致收到服务端ack确认很慢,就会使send-Q长期大于0,可以及时通过监控发现,从而提前优化网络或程序。
当过滤结果为多个连接时会将send和recv值进行加和
例如:
配置文件``raddr_port = 11883``
当本地和不同IP的11883都有连接建立的情况下,会将多条连接的结果进行加和;在并发多连接的情况下,同样会合并加和。总之,过滤条件越粗略,被加和的连接数就越多。
多条规则请复制``[[instances]]``进行配置
## 注意事项
netstat_filter_tcp_send_queue和netstat_filter_tcp_recv_queue指标目前只支持linux。windows用户默认为0。
================================================
FILE: integrations/Nginx/collect/nginx/nginx.toml
================================================
# # collect interval
# interval = 15
[[instances]]
## An array of Nginx stub_status URI to gather stats.
urls = [
# "http://192.168.0.216:8000/nginx_status",
# "https://www.baidu.com/ngx_status"
]
## append some labels for series
# labels = { region="cloud", product="n9e" }
## interval = global.interval * interval_times
# interval_times = 1
## Set response_timeout (default 5 seconds)
response_timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
#username = "admin"
#password = "admin"
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: integrations/Nginx/collect/nginx_upstream_check/nginx_upstream_check.toml
================================================
# # collect interval
# interval = 15
[[instances]]
targets = [
# "http://127.0.0.1/status?format=json",
# "http://10.2.3.56/status?format=json"
]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Set http_proxy (categraf uses the system-wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
## Interface to use when dialing an address
# interface = "eth0"
## HTTP Request Method
# method = "GET"
## Set timeout (default 5 seconds)
# timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: integrations/Nginx/dashboards/nginx_stub_status.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Nginx Stub",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f29b8521-eb9f-41d5-8a79-1e222baabf9d",
"layout": {
"h": 7,
"i": "f29b8521-eb9f-41d5-8a79-1e222baabf9d",
"isResizable": true,
"w": 2,
"x": 0,
"y": 0
},
"links": [],
"name": "Requests",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"text": "DOWN"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "nginx_requests{server=\"$server\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c0d3d10a-fd3b-485c-97e4-9f68ffc7a026",
"layout": {
"h": 7,
"i": "c0d3d10a-fd3b-485c-97e4-9f68ffc7a026",
"isResizable": true,
"w": 6,
"x": 2,
"y": 0
},
"links": [],
"name": "Active connections",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_active{server=\"$server\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "abbce8f8-222f-4e07-9e5e-fc85e7780672",
"layout": {
"h": 7,
"i": "abbce8f8-222f-4e07-9e5e-fc85e7780672",
"isResizable": true,
"w": 6,
"x": 8,
"y": 0
},
"links": [],
"name": "Waiting connections",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_waiting{server=\"$server\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "52f77144-19ba-4349-a7de-cedeb41ac3d7",
"layout": {
"h": 7,
"i": "52f77144-19ba-4349-a7de-cedeb41ac3d7",
"isResizable": true,
"w": 5,
"x": 14,
"y": 0
},
"links": [],
"name": "Reading connections",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_reading{server=\"$server\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4c02d0ab-7dc7-466d-a610-be5810b7a1e6",
"layout": {
"h": 7,
"i": "4c02d0ab-7dc7-466d-a610-be5810b7a1e6",
"isResizable": true,
"w": 5,
"x": 19,
"y": 0
},
"links": [],
"name": "Writing connections",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_writing{server=\"$server\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5e837a2b-b919-4ee5-8edf-b6bb490030ff",
"layout": {
"h": 7,
"i": "b13dce58-7f2a-4680-a9e4-507f7d5a2af8",
"isResizable": true,
"w": 5,
"x": 0,
"y": 7
},
"links": [],
"name": "handled",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_handled{server=\"$server\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_active,server)",
"multi": false,
"name": "server",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328194546000
}
================================================
FILE: integrations/Nginx/dashboards/nginx_upstream_check.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Nginx Upstream",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f29b8521-eb9f-41d5-8a79-1e222baabf9d",
"layout": {
"h": 7,
"i": "f29b8521-eb9f-41d5-8a79-1e222baabf9d",
"isResizable": true,
"w": 2,
"x": 0,
"y": 0
},
"links": [],
"name": "Requests",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"text": "DOWN"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "nginx_upstream_check_status_code{target=\"$target\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c0d3d10a-fd3b-485c-97e4-9f68ffc7a026",
"layout": {
"h": 7,
"i": "c0d3d10a-fd3b-485c-97e4-9f68ffc7a026",
"isResizable": true,
"w": 6,
"x": 2,
"y": 0
},
"links": [],
"name": "Rise check",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_upstream_check_rise{target=\"$target\",upstream=\"$upstream\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "abbce8f8-222f-4e07-9e5e-fc85e7780672",
"layout": {
"h": 7,
"i": "abbce8f8-222f-4e07-9e5e-fc85e7780672",
"isResizable": true,
"w": 6,
"x": 8,
"y": 0
},
"links": [],
"name": "Fall Check",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "nginx_upstream_check_fall{target=\"$target\",upstream=\"$upstream\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_upstream_check_status_code,target)",
"multi": false,
"name": "target",
"reg": "/http:\\/\\//",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_upstream_check_status_code,upstream)",
"name": "upstream",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328195658000
}
================================================
FILE: integrations/Nginx/dashboards/nginx_vts.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Nginx VTS",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2bed0dff-e7c7-4d8b-bf22-e7e4452300d8",
"layout": {
"h": 4,
"i": "2bed0dff-e7c7-4d8b-bf22-e7e4452300d8",
"w": 12,
"x": 0,
"y": 0
},
"links": [],
"name": "Server Connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(nginx_vts_main_connections{instance=~\"$Instance\", status=~\"active|writing|reading|waiting\"}) by (status)",
"legend": "{{status}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "69d6240e-0c69-45b4-83ae-350d38d18f4c",
"layout": {
"h": 4,
"i": "69d6240e-0c69-45b4-83ae-350d38d18f4c",
"w": 3,
"x": 12,
"y": 0
},
"links": [],
"name": "active",
"options": {},
"targets": [
{
"expr": "sum(irate(nginx_vts_main_connections{status=\"active\"}[1m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d7666059-71fd-49f3-8cba-96cdbfadce4d",
"layout": {
"h": 4,
"i": "d7666059-71fd-49f3-8cba-96cdbfadce4d",
"w": 3,
"x": 15,
"y": 0
},
"links": [],
"name": "writing",
"options": {},
"targets": [
{
"expr": "sum(irate(nginx_vts_main_connections{status=\"writing\"}[1m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6dca89ce-f2de-4b2b-a826-9fc6ae0cce28",
"layout": {
"h": 4,
"i": "6dca89ce-f2de-4b2b-a826-9fc6ae0cce28",
"w": 3,
"x": 18,
"y": 0
},
"links": [],
"name": "read",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_main_connections{instance=\"$instance\",status=\"reading\"}[1m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "39b4c42c-5418-4386-837a-8b36464e83bf",
"layout": {
"h": 4,
"i": "39b4c42c-5418-4386-837a-8b36464e83bf",
"w": 3,
"x": 21,
"y": 0
},
"links": [],
"name": "waiting",
"options": {},
"targets": [
{
"expr": "sum(irate(nginx_vts_main_connections{status=\"waiting\"}[1m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "97381677-fb79-473e-b2b1-cd7d21452546",
"layout": {
"h": 6,
"i": "97381677-fb79-473e-b2b1-cd7d21452546",
"w": 6,
"x": 0,
"y": 4
},
"links": [],
"name": "Server Requests",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_server_requests_total{instance=~\"$Instance\", host=~\"$Host\", code!=\"total\"}[5m])) by (code)",
"legend": "{{ code }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "This one is providing aggregated error codes, but it's still possible to graph these per upstream.",
"id": "6139b81f-d2de-4ecf-8ec3-41b94713ec48",
"layout": {
"h": 6,
"i": "6139b81f-d2de-4ecf-8ec3-41b94713ec48",
"w": 6,
"x": 6,
"y": 4
},
"links": [],
"name": "Upstream Requests",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_upstream_requests_total{instance=~\"$Instance\", upstream=~\"^$Upstream$\", backend=~\"^$Backend$\", code!=\"total\"}[5m])) by (code)",
"legend": "{{ code }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2d09b8b7-dc80-455e-b809-5a46d64a6263",
"layout": {
"h": 6,
"i": "2d09b8b7-dc80-455e-b809-5a46d64a6263",
"w": 6,
"x": 12,
"y": 4
},
"links": [],
"name": "Request delta/sec (BACKEND)",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_upstream_requests_total{backend=~\"$Backend\", instance=~\"$Instance\", code!=\"total\"} [1m])) by (code)",
"legend": "{{code}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3447df45-823c-4a52-bebf-7003736ca138",
"layout": {
"h": 6,
"i": "3447df45-823c-4a52-bebf-7003736ca138",
"w": 6,
"x": 18,
"y": 4
},
"links": [],
"name": "Request delta/sec (FILTER)",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_filter_requests_total{filter=~\"country::$Host\", filter_name=~\"$Country\", instance=~\"$Instance\", direction!=\"total\"} [1m])) by (direction)",
"legend": "{{direction}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9c830846-110c-49df-8fa7-0662899c5804",
"layout": {
"h": 7,
"i": "9c830846-110c-49df-8fa7-0662899c5804",
"w": 24,
"x": 0,
"y": 10
},
"links": [],
"name": "Response times (FILTER)",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_filter_request_seconds{filter=~\"country::$Host\", filter_name=~\"$Country\", instance=~\"$Instance\"} [1m])) by (filter_name) * 1000",
"legend": "{{filter_name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9785673c-0343-4796-9091-4f1f0df10cd7",
"layout": {
"h": 6,
"i": "9785673c-0343-4796-9091-4f1f0df10cd7",
"w": 8,
"x": 0,
"y": 17
},
"links": [],
"name": "bandwidth delta/sec (FILTER)",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_filter_bytes_total{filter=~\"country::$Host\", filter_name=~\"$Country\", instance=~\"$Instance\"} [1m])) by (direction)",
"legend": "{{direction}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "56bae540-1e16-49e0-82df-33d0b0602c5f",
"layout": {
"h": 6,
"i": "56bae540-1e16-49e0-82df-33d0b0602c5f",
"w": 8,
"x": 8,
"y": 17
},
"links": [],
"name": "Server Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_server_bytes_total{instance=~\"$Instance\", host=~\"$Host\"}[5m])) by (direction)",
"legend": "{{ direction }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9124e32c-7c06-4f2d-ba35-390a1274b289",
"layout": {
"h": 6,
"i": "9124e32c-7c06-4f2d-ba35-390a1274b289",
"w": 8,
"x": 16,
"y": 17
},
"links": [],
"name": "Upstream Bytes",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_upstream_bytes_total{instance=~\"$Instance\", upstream=~\"^$Upstream$\", backend=~\"^$Backend$\"}[5m])) by (direction)",
"legend": "{{ direction }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ed58e88d-4130-4d96-8e73-62be1d13909a",
"layout": {
"h": 7,
"i": "ed58e88d-4130-4d96-8e73-62be1d13909a",
"w": 12,
"x": 0,
"y": 23
},
"links": [],
"name": "Upstream Backend Response",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(nginx_vts_upstream_response_seconds{instance=~\"$Instance\", upstream=~\"^$Upstream$\", backend=~\"^$Backend$\"}) by (backend)",
"legend": "{{ backend }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "75d3533d-156a-41ec-ae72-d12ca6a5f900",
"layout": {
"h": 7,
"i": "75d3533d-156a-41ec-ae72-d12ca6a5f900",
"w": 12,
"x": 12,
"y": 23
},
"links": [],
"name": "Server Cache",
"options": {
"legend": {
"displayMode": "hidden"
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(irate(nginx_vts_server_cache_total{instance=~\"$Instance\", host=~\"$Host\"}[5m])) by (status)",
"legend": "{{ status }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "nginx_vts_filter_bytes_total",
"multi": true,
"name": "Country",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_vts_server_bytes_total, instance)",
"multi": false,
"name": "Instance",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_vts_server_requests_total{instance=~\"$Instance\"}, host)",
"multi": false,
"name": "Host",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_vts_upstream_requests_total{instance=~\"$Instance\"}, upstream)",
"multi": false,
"name": "Upstream",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(nginx_vts_upstream_requests_total{instance=~\"$Instance\", upstream=~\"$Upstream\"}, backend)",
"multi": false,
"name": "Backend",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328197145000
}
================================================
FILE: integrations/Nginx/markdown/README.md
================================================
# Nginx
Nginx 监控有多种方式,最推荐的是 vts 插件:
**[http_stub_status_module](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)**
配置样例如下:
```toml
[[instances]]
## An array of Nginx stub_status URI to gather stats.
urls = [
# "http://192.168.0.216:8000/nginx_status",
# "https://www.baidu.com/ngx_status"
]
## append some labels for series
# labels = { region="cloud", product="n9e" }
## interval = global.interval * interval_times
# interval_times = 1
## Set response_timeout (default 5 seconds)
response_timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
#username = "admin"
#password = "admin"
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
**[nginx_upstream_check](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx_upstream_check/README.md)**
配置样例如下:
```toml
[[instances]]
targets = [
# "http://127.0.0.1/status?format=json",
# "http://10.2.3.56/status?format=json"
]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Set http_proxy (categraf uses the system wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
## Interface to use when dialing an address
# interface = "eth0"
## HTTP Request Method
# method = "GET"
## Set timeout (default 5 seconds)
# timeout = "5s"
## Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
## Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
## Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
## Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
**[nginx vts](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx_vts/README.md)**
nginx_vts 已经支持输出 prometheus 格式的数据,所以,其实已经不需要这个采集插件了,直接用 categraf 的 prometheus 采集插件,读取 nginx_vts 的 prometheus 数据即可。配置样例如下:
```toml
[[instances]]
urls = [
"http://IP:PORT/vts/format/prometheus"
]
labels = {job="nginx-vts"}
```
# nginx_upstream_check 插件
### 应用场景
一般用于业务系统做对内或对外路由映射时使用代理服务,是运维最常见且最重要的代理工具。
### 部署场景
需要在装有nginx服务的虚拟机启用此插件。
### 采集原理
- 该采集插件是读取 [nginx_upstream_check](https://github.com/yaoweibin/nginx_upstream_check_module) 的状态输出。[nginx_upstream_check](https://github.com/yaoweibin/nginx_upstream_check_module) 可以周期性检查 upstream 中的各个 server 是否存活,如果检查失败,就会标记为 `down`,如果检查成功,就标记为 `up`。
### 注意事项
- 由于 TSDB 通常无法处理字符串,所以 Categraf 会做转换,将 `down` 转换为 2, `up` 转换为 1,其他状态转换为 0,使用 `nginx_upstream_check_status_code` 这个指标来表示,所以,我们可能需要这样的告警规则:`nginx_upstream_check_status_code != 1`
### 前置条件
#### 条件1:nginx服务需要启用nginx_upstream_check_module模块
```
推荐源码编译方式安装模块,如不清楚要安装哪些模块,可参考:
cd /opt/nginx-1.20.1 && ./configure \
--prefix=/usr/share/nginx \
--sbin-path=/usr/sbin/nginx \
--modules-path=/usr/lib64/nginx/modules \
--conf-path=/etc/nginx/nginx.conf \
--error-log-path=/var/log/nginx/error.log \
--http-log-path=/var/log/nginx/access.log \
--http-client-body-temp-path=/var/lib/nginx/tmp/client_body \
--http-proxy-temp-path=/var/lib/nginx/tmp/proxy \
--http-fastcgi-temp-path=/var/lib/nginx/tmp/fastcgi \
--http-uwsgi-temp-path=/var/lib/nginx/tmp/uwsgi \
--http-scgi-temp-path=/var/lib/nginx/tmp/scgi \
--pid-path=/var/run/nginx.pid \
--lock-path=/run/lock/subsys/nginx \
--user=nginx \
--group=nginx \
--with-compat \
--with-threads \
--with-http_addition_module \
--with-http_auth_request_module \
--with-http_dav_module \
--with-http_flv_module \
--with-http_gunzip_module \
--with-http_gzip_static_module \
--with-http_mp4_module \
--with-http_random_index_module \
--with-http_realip_module \
--with-http_secure_link_module \
--with-http_slice_module \
--with-http_ssl_module \
--with-http_stub_status_module \
--with-http_sub_module \
--with-http_v2_module \
--with-mail \
--with-mail_ssl_module \
--with-stream \
--with-stream_realip_module \
--with-stream_ssl_module \
--with-stream_ssl_preread_module \
--with-select_module \
--with-poll_module \
--with-file-aio \
--with-http_xslt_module=dynamic \
--with-http_image_filter_module=dynamic \
--with-http_perl_module=dynamic \
--with-stream=dynamic \
--with-mail=dynamic \
--with-http_xslt_module=dynamic \
--add-module=/etc/nginx/third-modules/nginx_upstream_check_module \
--add-module=/etc/nginx/third-modules/ngx_devel_kit-0.3.0 \
--add-module=/etc/nginx/third-modules/lua-nginx-module-0.10.13 \
--add-module=/etc/nginx/third-modules/nginx-module-vts \
--add-module=/etc/nginx/third-modules/ngx-fancyindex-0.5.2
# 根据cpu核数
make -j2
make install
注意:第三方模块nginx_upstream_check_module lua-nginx-module nginx-module-vts 都是相关插件所必备的依赖。
```
#### 条件2:nginx启用check_status配置
```
[root@aliyun categraf]# cat /etc/nginx/conf.d/nginx-upstream.domains.com.conf
server {
listen 80;
listen 443 ssl;
server_name nginx-upstream.domains.com;
include /etc/nginx/ssl_conf/domains.com.conf;
location / {
check_status;
include /etc/nginx/ip_whitelist.conf;
}
access_log /var/log/nginx/nginx-upstream.domains.com.access.log main;
error_log /var/log/nginx/nginx-upstream.domains.com.error.log warn;
}
```
浏览器访问https://nginx-upstream.domains.com?format=json出现:

浏览器访问https://nginx-upstream.domains.com出现:

#### 条件3:在需要启用upstream监控的域名配置下进行配置
例如:
```
[root@aliyun upstream_conf]# cat upstream_n9e.conf
upstream n9e {
server 127.0.0.1:18000 weight=10 max_fails=2 fail_timeout=5s;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=18000;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
[root@aliyun upstream_conf]# cat upstream_n9e_server_api.conf
upstream n9e-server-api {
server 127.0.0.1:19000 weight=10 max_fails=2 fail_timeout=5s;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=19000;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
[root@aliyun upstream_conf]# cat upstream_vm.conf
upstream vm {
server 127.0.0.1:8428 weight=10 max_fails=2 fail_timeout=5s;
keepalive 20;
check interval=3000 rise=2 fall=5 timeout=1000 type=tcp default_down=false port=8428;
check_http_send "HEAD / HTTP/1.0\r\n\r\n";
check_http_expect_alive http_2xx http_3xx;
}
```
### 配置场景
```
本配置启用或数据定义如下功能:
增加自定义标签,可通过自定义标签筛选数据及更加精确的告警推送。
响应超时时间为5秒。
targets字段填写条件2所定义好的域名。
```
### 修改 nginx_upstream_check.toml 文件配置
```
[root@aliyun conf]# cat input.nginx_upstream_check/nginx_upstream_check.toml
# # collect interval
# interval = 15
[[instances]]
# 这个配置最关键,是要给出获取 status 信息的接口地址
targets = [
"https://nginx-upstream.domains.com/?format=json"
]
# 标签这个配置请注意
# 如果 Categraf 和 Nginx 是在一台机器上,target 可能配置的是 127.0.0.1
# 如果 Nginx 有多台机器,每台机器都有 Categraf 来采集本机的 Nginx 的 Status 信息
# 可能会导致时序数据标签相同,不易区分,当然,Categraf 会自带 ident 标签,该标签标识本机机器名
# 如果大家觉得 ident 标签不够用,可以用下面 labels 配置,附加 instance、region 之类的标签
# # append some labels for series
labels = { cloud="my-cloud", region="my-region",azone="az1", product="my-product" }
# # interval = global.interval * interval_times
# interval_times = 1
### Set http_proxy (categraf uses the system wide proxy settings if it is not set)
# http_proxy = "http://localhost:8888"
### Interface to use when dialing an address
# interface = "eth0"
### HTTP Request Method
# method = "GET"
### Set timeout (default 5 seconds)
# timeout = "5s"
### Whether to follow redirects from the server (defaults to false)
# follow_redirects = false
### Optional HTTP Basic Auth Credentials
# username = "username"
# password = "pa$$word"
### Optional headers
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
### Optional TLS Config
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
### Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
### 测试配置
```
./categraf --test --inputs nginx_upstream_check
```
### 重启服务
```
重启categraf服务生效
systemctl daemon-reload && systemctl restart categraf && systemctl status categraf
查看启动日志是否有错误
journalctl -f -n 500 -u categraf | grep -E 'E!|W!'
```
### 检查数据呈现
等待1-2分钟后数据就会在图表中展示出来,如图:

### 监控告警规则配置
```
一般查看后端是否异常为关键检查对象,nginx_upstream_check_status_code返回1代表正常,返回2代表异常(实际测试可从上图看出)。
nginx_upstream_check_status_code!=1则视为异常需立即告警,级别为一级告警,执行频率为60秒,持续时长为60秒,留观时长2分钟,重复发送频率5分钟,最大发送次数0次,使用企业微信应用及电话语音通道将告警内容发送给系统运维组,此规则运用到周一到周日全天。
```
================================================
FILE: integrations/Nginx/metrics/categraf.json
================================================
[
{
"id": 0,
"uuid": 1717556328198593000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 当前空闲连接数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)",
"lang": "zh_CN",
"expression": "nginx_waiting",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 当前空闲连接数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
},
{
"lang": "en_US",
"name": "Nginx stub _ status Number of current idle connections",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328200723000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 正在回写 response 的连接数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)",
"lang": "zh_CN",
"expression": "nginx_writing",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 正在回写 response 的连接数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
},
{
"lang": "en_US",
"name": "Nginx stub _ status The number of connections that are writing back response",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328202610000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 正在处理的活动连接数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)\n\nReading + Writing + Waiting 的总和",
"lang": "zh_CN",
"expression": "nginx_active",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 正在处理的活动连接数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)\n\nReading + Writing + Waiting 的总和"
},
{
"lang": "en_US",
"name": "Nginx stub _ status Number of active connections being processed",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md) \n \nSum of Reading + Writing + Waiting"
}
]
},
{
"id": 0,
"uuid": 1717556328204562000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 正在读取 request header 的连接数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)",
"lang": "zh_CN",
"expression": "nginx_reading",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 正在读取 request header 的连接数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
},
{
"lang": "en_US",
"name": "Nginx stub _ status is reading the number of connections to the request header",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328206518000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 每秒 accept 的新连接数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)",
"lang": "zh_CN",
"expression": "irate(nginx_accepts[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 每秒 accept 的新连接数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
},
{
"lang": "en_US",
"name": "Nginx stub _ status New connections accepted per second",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328208392000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 每秒 handle 的新连接数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)",
"lang": "zh_CN",
"expression": "irate(nginx_handled[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 每秒 handle 的新连接数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
},
{
"lang": "en_US",
"name": "Nginx stub _ status New connections per second handle",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328210396000,
"collector": "Categraf",
"typ": "Nginx",
"name": "Nginx stub_status 每秒处理的请求数",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)\n\n如果有 keep-alive 连接的情况,一个连接上会处理多个请求。",
"lang": "zh_CN",
"expression": "irate(nginx_requests[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Nginx stub_status 每秒处理的请求数",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md)\n\n如果有 keep-alive 连接的情况,一个连接上会处理多个请求。"
},
{
"lang": "en_US",
"name": "Nginx stub _ status requests processed per second",
"note": "[Documentation] (https://github.com/flashcatcloud/categraf/blob/main/inputs/nginx/README.md) \n \nIf there is a keep-alive connection, multiple requests will be processed on one connection."
}
]
}
]
================================================
FILE: integrations/Oracle/alerts/oracle_alert.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
2
],
"cluster": "",
"name": "Oracle: 数据库归档日志量大于100",
"note": "Job:{{$labels.job}};Address:{{$labels.address}};Region:{{$labels.region}};",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "oracle_archivelog_count\u003e100",
"severity": 2
}
]
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"wecom",
"email",
"sms"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 5,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328215417000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
2
],
"cluster": "",
"name": "Oracle: 数据库总连接数大于500",
"note": "Job:{{$labels.job}};Address:{{$labels.address}};Region:{{$labels.region}};",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2,
3
],
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "oracle_process_count \u003e 500",
"severity": 2
},
{
"prom_ql": "oracle_process_count \u003e 800",
"severity": 3
}
]
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"wecom",
"email",
"sms"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 5,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328215986000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
2
],
"cluster": "",
"name": "Oracle: 数据库活跃连接数大于100",
"note": "Job:{{$labels.job}};Address:{{$labels.address}};Region:{{$labels.region}};",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "oracle_sessions_value{address='localhost:1111/erpdb',status='ACTIVE',type='USER'}\u003e100",
"severity": 2
}
]
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"wecom",
"email",
"sms"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 5,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328216432000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
2
],
"cluster": "",
"name": "Oracle: 监控数据采集失败,可能已经挂了",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 0,
"prom_for_duration": 90,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "oracle_up != 1",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"wecom",
"email"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 2,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328216858000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
2
],
"cluster": "",
"name": "Oracle: 表空间使用率大于95%",
"note": "Job:{{$labels.job}};Address:{{$labels.address}};Tablespace:{{$labels.tablespace}};Region:{{$labels.region}};",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 180,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "oracle_tablespace_bytes/oracle_tablespace_max_bytes*100\u003e95",
"severity": 2
}
]
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "00:00",
"enable_etimes": [
"00:00"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"wecom",
"email",
"sms"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 5,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328217386000
}
]
================================================
FILE: integrations/Oracle/collect/oracle/oracle.toml
================================================
# # collect interval
# interval = 15
#[[instances]]
# address = "10.1.2.3:1521/orcl"
# username = "monitor"
# password = "123456"
# is_sys_dba = false
# is_sys_oper = false
# disable_connection_pool = false
# max_open_connections = 5
# # interval = global.interval * interval_times
# interval_times = 1
# labels = { region="cloud" }
# [[instances.metrics]]
# measurement = "sessions"
# label_fields = [ "status", "type" ]
# metric_fields = [ "value" ]
# timeout = "3s"
# request = '''
# SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type
# '''
# [[instances]]
# address = "192.168.10.10:1521/orcl"
# username = "monitor"
# password = "123456"
# is_sys_dba = false
# is_sys_oper = false
# disable_connection_pool = false
# max_open_connections = 5
# # labels = { region="local" }
[[metrics]]
measurement = "sessions"
label_fields = [ "status", "type" ]
metric_fields = [ "value" ]
timeout = "3s"
request = '''
SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type
'''
[[metrics]]
measurement = "lock"
metric_fields = [ "cnt" ]
timeout = "3s"
request = '''
SELECT COUNT(*) AS cnt
FROM ALL_OBJECTS A, V$LOCKED_OBJECT B, SYS.GV_$SESSION C
WHERE A.OBJECT_ID = B.OBJECT_ID
AND B.PROCESS = C.PROCESS
'''
[[metrics]]
measurement = "slow_queries"
metric_fields = [ "p95_time_usecs" , "p99_time_usecs"]
timeout = "3s"
request = '''
select percentile_disc(0.95) within group (order by elapsed_time) as p95_time_usecs,
percentile_disc(0.99) within group (order by elapsed_time) as p99_time_usecs
from v$sql where last_active_time >= sysdate - 5/(24*60)
'''
[[metrics]]
measurement = "resource"
label_fields = [ "resource_name" ]
metric_fields = [ "current_utilization", "limit_value" ]
timeout = "3s"
request = '''
SELECT resource_name,current_utilization,CASE WHEN TRIM(limit_value) LIKE 'UNLIMITED' THEN '-1' ELSE TRIM(limit_value) END as limit_value FROM v$resource_limit
'''
[[metrics]]
measurement = "asm_diskgroup"
label_fields = [ "name" ]
metric_fields = [ "total", "free" ]
timeout = "3s"
request = '''
SELECT name,total_mb*1024*1024 as total,free_mb*1024*1024 as free FROM v$asm_diskgroup_stat where exists (select 1 from v$datafile where name like '+%')
'''
IgnoreZeroResult = true
[[metrics]]
measurement = "activity"
metric_fields = [ "value" ]
field_to_append = "name"
timeout = "3s"
request = '''
SELECT name, value FROM v$sysstat WHERE name IN ('parse count (total)', 'execute count', 'user commits', 'user rollbacks')
'''
[[metrics]]
measurement = "process"
metric_fields = [ "count" ]
timeout = "3s"
request = '''
SELECT COUNT(*) as count FROM v$process
'''
[[metrics]]
measurement = "wait_time"
metric_fields = [ "value" ]
label_fields = ["wait_class"]
timeout = "3s"
request = '''
SELECT
n.wait_class as WAIT_CLASS,
round(m.time_waited/m.INTSIZE_CSEC,3) as VALUE
FROM
v$waitclassmetric m, v$system_wait_class n
WHERE
m.wait_class_id=n.wait_class_id AND n.wait_class != 'Idle'
'''
[[metrics]]
measurement = "tablespace"
label_fields = [ "tablespace", "type" ]
metric_fields = [ "bytes", "max_bytes", "free" ]
timeout = "3s"
request = '''
SELECT
dt.tablespace_name as tablespace,
dt.contents as type,
dt.block_size * dtum.used_space as bytes,
dt.block_size * dtum.tablespace_size as max_bytes,
dt.block_size * (dtum.tablespace_size - dtum.used_space) as free
FROM dba_tablespace_usage_metrics dtum, dba_tablespaces dt
WHERE dtum.tablespace_name = dt.tablespace_name
ORDER by tablespace
'''
[[metrics]]
measurement = "sysmetric"
metric_fields = [ "value" ]
field_to_append = "metric_name"
timeout = "3s"
request = '''
select METRIC_NAME,VALUE from v$sysmetric where group_id=2
'''
[[metrics]]
measurement = "applylag"
metric_fields = [ "value" ]
timeout = "3s"
request = '''
SELECT TO_NUMBER(EXTRACT(SECOND FROM TO_DSINTERVAL (value))) as value FROM v$dataguard_stats WHERE name = 'apply lag'
'''
================================================
FILE: integrations/Oracle/dashboards/oracle_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Oracle by categraf",
"ident": "",
"tags": "categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "16c3b81f-38ea-472e-ba9d-58f3218413c9",
"layout": {
"h": 1,
"i": "16c3b81f-38ea-472e-ba9d-58f3218413c9",
"w": 24,
"x": 0,
"y": 0
},
"name": "Activities",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "6834fcfd-6448-4848-9f63-72350d818a39",
"layout": {
"h": 3,
"i": "6834fcfd-6448-4848-9f63-72350d818a39",
"w": 6,
"x": 6,
"y": 1
},
"name": "execute count / second",
"options": {
"standardOptions": {
"decimals": 1
}
},
"targets": [
{
"expr": "rate(oracle_activity_execute_count_value{ident=\"$ident\", instance=\"$instance\"}[2m])",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1754fda2-fa98-481e-ba86-520f1d7ebc0d",
"layout": {
"h": 3,
"i": "1754fda2-fa98-481e-ba86-520f1d7ebc0d",
"w": 6,
"x": 12,
"y": 1
},
"name": "user commits / second",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "rate(oracle_activity_user_commits_value{ident=\"$ident\", instance=\"$instance\"}[2m])",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "18357a10-cab4-4795-a4a4-fd960d37ce95",
"layout": {
"h": 3,
"i": "18357a10-cab4-4795-a4a4-fd960d37ce95",
"w": 6,
"x": 18,
"y": 1
},
"name": "user rollbacks / second",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "rate(oracle_activity_user_rollbacks_value{ident=\"$ident\", instance=\"$instance\"}[2m])",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8e5f9501-7bc0-4b77-9178-3ab875202f43",
"layout": {
"h": 3,
"i": "8e5f9501-7bc0-4b77-9178-3ab875202f43",
"w": 6,
"x": 0,
"y": 1
},
"name": "status",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#5ea70f",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f60f0f",
"text": "DOWN"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "oracle_up{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "aa019cdc-109a-4d3d-9549-9abc20720343",
"layout": {
"h": 1,
"i": "aa019cdc-109a-4d3d-9549-9abc20720343",
"w": 24,
"x": 0,
"y": 4
},
"name": "Waits",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "51451443-eb34-4bdc-8fc5-1f0ee35eb73c",
"layout": {
"h": 7,
"i": "51451443-eb34-4bdc-8fc5-1f0ee35eb73c",
"w": 24,
"x": 0,
"y": 5
},
"name": "Time waited",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_wait_time_value{ident=\"$ident\", instance=\"$instance\"}",
"legend": "{{wait_class}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "108aa978-21be-45f4-92a6-f125f977965c",
"layout": {
"h": 1,
"i": "108aa978-21be-45f4-92a6-f125f977965c",
"w": 24,
"x": 0,
"y": 12
},
"name": "Tablespace",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cf2454bd-0cf4-4f1a-a96a-b043db94da1f",
"layout": {
"h": 7,
"i": "cf2454bd-0cf4-4f1a-a96a-b043db94da1f",
"w": 12,
"x": 0,
"y": 13
},
"name": "Used Percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_tablespace_bytes{ident=\"$ident\", instance=\"$instance\"}/oracle_tablespace_max_bytes{ident=\"$ident\", instance=\"$instance\"}",
"legend": "{{tablespace}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8f4aa32b-1ab6-49d2-94c5-83f408dd3cc4",
"layout": {
"h": 7,
"i": "8f4aa32b-1ab6-49d2-94c5-83f408dd3cc4",
"w": 12,
"x": 12,
"y": 13
},
"name": "Free space",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_tablespace_free{ident=\"$ident\", instance=\"$instance\"}",
"legend": "{{tablespace}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "91c8d4ca-109e-4380-9222-92cffdcc5381",
"layout": {
"h": 1,
"i": "91c8d4ca-109e-4380-9222-92cffdcc5381",
"w": 24,
"x": 0,
"y": 20
},
"name": "IO and TPS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "08552fed-10d2-4408-809e-eabc705db9f5",
"layout": {
"h": 7,
"i": "08552fed-10d2-4408-809e-eabc705db9f5",
"w": 8,
"x": 0,
"y": 21
},
"name": "IO Requests / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_io_requests_per_second_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c42236d6-d18a-40bb-84dc-d287b1d0ac25",
"layout": {
"h": 7,
"i": "c42236d6-d18a-40bb-84dc-d287b1d0ac25",
"w": 8,
"x": 8,
"y": 21
},
"name": "TPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_user_transaction_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "3a564e64-4bed-4381-ab08-517b51f6cc66",
"layout": {
"h": 7,
"i": "3a564e64-4bed-4381-ab08-517b51f6cc66",
"w": 8,
"x": 16,
"y": 21
},
"name": "IO Bytes / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_io_megabytes_per_second_value{ident=\"$ident\", instance=\"$instance\"}*1024*1024",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "34bc0a3c-23ee-4792-9552-0994fb027464",
"layout": {
"h": 1,
"i": "34bc0a3c-23ee-4792-9552-0994fb027464",
"w": 24,
"x": 0,
"y": 28
},
"name": "Connections",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f46676da-b625-458e-b8d2-9079441ac3d6",
"layout": {
"h": 7,
"i": "f46676da-b625-458e-b8d2-9079441ac3d6",
"w": 24,
"x": 0,
"y": 29
},
"name": "Sessions",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sessions_value{ident=\"$ident\", instance=\"$instance\",status=\"ACTIVE\"}",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "f8a61c95-0d00-4d38-a9d1-5813f70443da",
"layout": {
"h": 1,
"i": "f8a61c95-0d00-4d38-a9d1-5813f70443da",
"w": 24,
"x": 0,
"y": 36
},
"name": "Hit Ratio",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f35e0768-204e-43c8-8d43-32f34a391bf8",
"layout": {
"h": 7,
"i": "f35e0768-204e-43c8-8d43-32f34a391bf8",
"w": 6,
"x": 0,
"y": 37
},
"name": "Buffer Cache Hit Ratio",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_buffer_cache_hit_ratio_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4239c9e1-0bf3-42ae-a7a5-8db2c38f1900",
"layout": {
"h": 7,
"i": "4239c9e1-0bf3-42ae-a7a5-8db2c38f1900",
"w": 6,
"x": 6,
"y": 37
},
"name": "Redo Allocation Hit Ratio",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_redo_allocation_hit_ratio_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c40640c2-31b8-4bec-a88e-8a0f346da2a8",
"layout": {
"h": 7,
"i": "c40640c2-31b8-4bec-a88e-8a0f346da2a8",
"w": 6,
"x": 12,
"y": 37
},
"name": "Row Cache Hit Ratio",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_row_cache_hit_ratio_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "93c7f8d2-093c-47fc-93e8-97b47bfcff80",
"layout": {
"h": 7,
"i": "93c7f8d2-093c-47fc-93e8-97b47bfcff80",
"w": 6,
"x": 18,
"y": 37
},
"name": "Library Cache Hit Ratio",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_library_cache_hit_ratio_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "9857bf37-1e40-4cf5-adbc-8331f5e128c8",
"layout": {
"h": 1,
"i": "9857bf37-1e40-4cf5-adbc-8331f5e128c8",
"w": 24,
"x": 0,
"y": 44
},
"name": "Physical Read Write",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b47bcfb9-2d26-454d-982a-039b769d405b",
"layout": {
"h": 7,
"i": "b47bcfb9-2d26-454d-982a-039b769d405b",
"w": 6,
"x": 0,
"y": 45
},
"name": "Physical Read Write Bytes / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_physical_read_bytes_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
},
{
"expr": "oracle_sysmetric_physical_write_bytes_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "0c4ea45a-913f-4464-9c31-eb026a365729",
"layout": {
"h": 7,
"i": "0c4ea45a-913f-4464-9c31-eb026a365729",
"w": 6,
"x": 6,
"y": 45
},
"name": "Physical Read Write Total Bytes / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_physical_read_total_bytes_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
},
{
"expr": "oracle_sysmetric_physical_write_total_bytes_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2bbfa751-4ac8-4ab9-affe-04b68e98daec",
"layout": {
"h": 7,
"i": "2bbfa751-4ac8-4ab9-affe-04b68e98daec",
"w": 6,
"x": 12,
"y": 45
},
"name": "Physical RW IO Requests / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_physical_read_io_requests_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
},
{
"expr": "oracle_sysmetric_physical_write_io_requests_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2ec5b041-dbd1-4013-bac8-bb0ac6fb5df6",
"layout": {
"h": 7,
"i": "2ec5b041-dbd1-4013-bac8-bb0ac6fb5df6",
"w": 6,
"x": 18,
"y": 45
},
"name": "Physical RW Total IO Requests / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "oracle_sysmetric_physical_read_total_io_requests_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "A"
},
{
"expr": "oracle_sysmetric_physical_write_total_io_requests_per_sec_value{ident=\"$ident\", instance=\"$instance\"}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(oracle_up,ident)",
"name": "ident",
"options": [
"tt-fc-log00.nj"
],
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(oracle_up{ident=\"$ident\"},instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328218855000
}
================================================
FILE: integrations/Oracle/markdown/README.md
================================================
# Oracle plugin
Oracle 插件,用于监控 Oracle 数据库。默认无法跑在 Windows 上。如果你的 Oracle 部署在 Windows 上,也没问题,使用部署在 Linux 上的 Categraf 远程监控 Windows 上的 Oracle,也行得通。
Oracle 插件的核心监控原理,就是执行下面 [这些 SQL 语句](https://github.com/flashcatcloud/categraf/blob/main/conf/input.oracle/metric.toml),然后解析出结果,上报到监控服务端。
以其中一个为例:
```toml
[[metrics]]
measurement = "activity"
metric_fields = [ "value" ]
field_to_append = "name"
timeout = "3s"
request = '''
SELECT name, value FROM v$sysstat WHERE name IN ('parse count (total)', 'execute count', 'user commits', 'user rollbacks')
'''
```
- measurement:指标类别
- label_fields:作为 label 的字段
- metric_fields:作为 metric 的字段,因为是作为 metric 的字段,所以这个字段的值必须是数字
- field_to_append:表示这个字段附加到 metric_name 后面,作为 metric_name 的一部分
- timeout:超时时间
- request:具体查询的 SQL 语句
如果你想监控的指标,默认没有采集,只需要增加自定义的 `[[metrics]]` 配置即可。
================================================
FILE: integrations/PHP/collect/phpfpm/phpfpm.toml
================================================
# # collect interval
# interval = 15
[[instances]]
## An array of PHP-FPM status endpoints (HTTP URLs, fcgi addresses or Unix socket paths) to gather stats from.
urls = [
## HTTP: the URL must start with http:// or https://, ie:
# "http://localhost/status",
# "https://www.baidu.com/phpfpm-status",
## fcgi: the URL must start with fcgi:// or cgi://, and port must be present, ie:
# "fcgi://127.0.0.1:9001",
# "cgi://192.168.0.1:9000/status",
## Unix socket: path to fpm socket, ie:
# "/run/php/php7.2-fpm.sock",
## or using a custom fpm status path:
# "/var/run/php5-fpm.sock:/fpm-custom-status-path",
## glob patterns are also supported:
# "/var/run/php*.sock"
]
## append some labels for series
# labels = { region="cloud", product="n9e" }
## interval = global.interval * interval_times
# interval_times = 1
## Set response_timeout (default 5 seconds); applies to HTTP URLs only
response_timeout = "5s"
## Whether to follow redirects from the server (defaults to false); applies to HTTP URLs only
# follow_redirects = false
## Optional HTTP Basic Auth credentials; applies to HTTP URLs only
#username = "admin"
#password = "admin"
## Optional headers; applies to HTTP URLs only
# headers = ["X-From", "categraf", "X-Xyz", "abc"]
## Optional TLS config; applies to HTTP URLs only
# use_tls = false
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
================================================
FILE: integrations/PHP/markdown/README.md
================================================
# PHP-FPM
*PHP-FPM* (PHP FastCGI Process Manager) 监控采集插件,由telegraf的phpfpm改造而来。
该插件需要更改 phpfpm 的配置文件,开启 `pm.status_path` 配置项:
```
pm.status_path = /status
```
## Configuration
请参考配置[示例](https://github.com/flashcatcloud/categraf/blob/main/conf/input.phpfpm/phpfpm.toml)文件
### 注意事项:
1. 如下配置 仅生效于HTTP的url
- response_timeout
- username & password
- headers
- TLS config
2. 如果使用 Unix socket,需要保证 categraf 和 socket path 在同一个主机上,且 categraf 运行用户拥有读取该 path 的权限。
## 监控大盘和告警规则
待更新...
================================================
FILE: integrations/Ping/alerts/ping_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "PING address detection failed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "ping_result_code != 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328224555000
}
]
================================================
FILE: integrations/Ping/collect/ping/ping.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# send ping packets to
targets = [
# "www.baidu.com",
# "127.0.0.1",
# "10.4.5.6",
# "10.4.5.7"
]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## Number of ping packets to send per interval. Corresponds to the "-c"
## option of the ping command.
# count = 1
## Time to wait between sending ping packets in seconds. Operates like the
## "-i" option of the ping command.
# ping_interval = 1.0
## If set, the time to wait for a ping response in seconds. Operates like
## the "-W" option of the ping command.
# timeout = 3.0
## Interface or source address to send ping from. Operates like the -I or -S
## option of the ping command.
# interface = ""
## Use only IPv6 addresses when resolving a hostname.
# ipv6 = false
## Number of data bytes to be sent. Corresponds to the "-s"
## option of the ping command.
# size = 56
# maximum number of concurrent goroutines used for pinging
# concurrency = 50
================================================
FILE: integrations/Ping/dashboards/ping_by_categraf_a.json
================================================
{
"id": 0,
"group_id": 0,
"name": "PING detection by UlricQin",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"aggrDimension": "target",
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1677138f-0f33-485c-8ee1-2db24cabbf54",
"layout": {
"h": 15,
"i": "1677138f-0f33-485c-8ee1-2db24cabbf54",
"isResizable": true,
"w": 24,
"x": 0,
"y": 0
},
"name": "Ping",
"options": {
"standardOptions": {},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"value": "A"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"from": 1,
"special": 1
},
"result": {
"color": "#e90f0f",
"text": "DOWN"
},
"type": "range"
}
]
}
},
{
"matcher": {
"value": "B"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#f30a0a"
},
"type": "range"
},
{
"match": {
"special": 0
},
"result": {
"color": "#2c9d3d"
},
"type": "special"
}
]
},
"type": "special"
},
{
"matcher": {
"value": "C"
},
"properties": {
"standardOptions": {
"util": "milliseconds"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 100
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"to": 300
},
"result": {
"color": "#ff8286"
},
"type": "range"
},
{
"match": {
"from": 1000,
"to": null
},
"result": {
"color": "#f00808"
},
"type": "range"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "max(ping_result_code) by (target)",
"legend": "UP?",
"refId": "A"
},
{
"expr": "max(ping_percent_packet_loss) by (target)",
"legend": "Packet Loss %",
"refId": "B"
},
{
"expr": "max(ping_maximum_response_ms) by (target) ",
"legend": "Latency(ms)",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328225512000
}
================================================
FILE: integrations/Ping/dashboards/ping_by_categraf_b.json
================================================
{
"id": 0,
"group_id": 0,
"name": "PING大盘2.0",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceName": "flashcat_prometheus",
"datasourceValue": "${datasource}",
"id": "cc788533-f60a-4fe7-bea5-9bdb20389bc9",
"layout": {
"h": 11,
"i": "cc788533-f60a-4fe7-bea5-9bdb20389bc9",
"isResizable": true,
"w": 7,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "连通性",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 1
},
"result": {
"color": "#ff656b",
"text": "DOWN"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "max(ping_result_code) by (target,subnet)",
"legend": "源地址: {{subnet}} 目标地址:{{target}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"columns": [],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true,
"sortOrder": "descend"
},
"datasourceCate": "prometheus",
"datasourceName": "flashcat_prometheus",
"datasourceValue": "${datasource}",
"id": "0372da5a-d139-4fc4-92e5-bbf77dc6ee3b",
"layout": {
"h": 11,
"i": "0372da5a-d139-4fc4-92e5-bbf77dc6ee3b",
"isResizable": true,
"w": 8,
"x": 7,
"y": 0
},
"maxPerRow": 4,
"name": "延迟",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": -1
},
"result": {
"color": "#ff656b"
},
"type": "special"
},
{
"match": {
"from": 0,
"to": 5
},
"result": {
"color": "#61d071"
},
"type": "range"
},
{
"match": {
"from": 5,
"to": 100
},
"result": {
"color": "#ecd245"
},
"type": "range"
},
{
"match": {
"from": 100,
"to": 200
},
"result": {
"color": "#ffae39"
},
"type": "range"
}
]
},
"overrides": [
{
"matcher": {
"value": "A"
},
"properties": {
"valueMappings": []
}
}
],
"targets": [
{
"expr": "max(ping_maximum_response_ms) by (target,subnet)",
"legend": "源地址: {{subnet}} 目标地址:{{target}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"donut": false,
"labelWithName": false,
"legengPosition": "hidden"
},
"datasourceCate": "prometheus",
"datasourceName": "flashcat_prometheus",
"datasourceValue": "${datasource}",
"id": "4b8d51bf-01cf-4007-8c96-8f21378bee3f",
"layout": {
"h": 11,
"i": "4b8d51bf-01cf-4007-8c96-8f21378bee3f",
"isResizable": true,
"w": 9,
"x": 15,
"y": 0
},
"maxPerRow": 4,
"name": "TTL",
"targets": [
{
"expr": "max(ping_ttl) by (target,subnet)",
"legend": "探测源: {{subnet}} 目标地址: {{target}} TTL",
"refId": "A"
}
],
"type": "pie",
"version": "2.0.0"
},
{
"custom": {
"calc": "last",
"colorDomain": [
0,
50
],
"colorDomainAuto": false,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceName": "flashcat_prometheus",
"datasourceValue": "${datasource}",
"id": "95ad7fba-c794-47fc-aec3-dde0a4531829",
"layout": {
"h": 12,
"i": "95ad7fba-c794-47fc-aec3-dde0a4531829",
"isResizable": true,
"w": 24,
"x": 0,
"y": 11
},
"maxPerRow": 4,
"name": "丢包率",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(ping_percent_packet_loss) by (subnet,target)",
"legend": "目标地址: {{target}}",
"refId": "B"
}
],
"type": "hexbin",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "200a02f9-1132-4345-a251-3e497a2e01d1",
"layout": {
"h": 1,
"i": "200a02f9-1132-4345-a251-3e497a2e01d1",
"isResizable": false,
"w": 24,
"x": 0,
"y": 23
},
"name": "",
"panels": [],
"type": "row"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328226744000
}
================================================
FILE: integrations/Ping/markdown/README.md
================================================
# ping
ping 监控插件,探测远端目标地址能否 ping 通,如果机器没有禁 ping,这就是一个很好用的探测机器存活的手段
## Configuration
categraf 的 `conf/input.ping/ping.toml`。
要探测的机器配置到 targets 中,targets 是个数组,可以配置多个,当然也可以拆成多个 `[[instances]]` 配置段,比如:
```
[[instances]]
targets = [ "10.4.5.6" ]
labels = { region="cloud", product="n9e" }
[[instances]]
targets = [ "10.4.5.7" ]
labels = { region="cloud", product="zbx" }
```
上例中是 ping 两个地址,为了信息更丰富,附加了 region 和 product 标签
## File Limit
```sh
systemctl edit categraf
```
Increase the number of open files:
```ini
[Service]
LimitNOFILE=8192
```
Restart Categraf:
```sh
systemctl restart categraf
```
### Linux Permissions
On most systems, ping requires the `CAP_NET_RAW` capability, or Categraf must be run as root.
With systemd:
```sh
systemctl edit categraf
```
```ini
[Service]
CapabilityBoundingSet=CAP_NET_RAW
AmbientCapabilities=CAP_NET_RAW
```
```sh
systemctl restart categraf
```
Without systemd:
```sh
setcap cap_net_raw=eip /usr/bin/categraf
```
See [`man 7 capabilities`][man 7 capabilities] for more information about
setting capabilities.
[man 7 capabilities]: http://man7.org/linux/man-pages/man7/capabilities.7.html
### Other OS Permissions
When using `method = "native"`, you will need permissions similar to the executable ping program for your OS.
================================================
FILE: integrations/Ping/metrics/categraf.json
================================================
[
{
"id": 0,
"uuid": 1717556328227717000,
"collector": "Categraf",
"typ": "Ping",
"name": "Ping ttl 时间",
"unit": "seconds",
"note": "Time To Live,指的是报文在网络中能够“存活”的限制时间",
"lang": "zh_CN",
"expression": "ping_ttl",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Ping ttl 时间",
"note": "Time To Live,指的是报文在网络中能够“存活”的限制时间"
},
{
"lang": "en_US",
"name": "Ping ttl time",
"note": "Time To Live refers to the limited time that a packet can \"survive\" in the network"
}
]
},
{
"id": 0,
"uuid": 1717556328229868000,
"collector": "Categraf",
"typ": "Ping",
"name": "Ping 丢包率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "ping_percent_packet_loss",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Ping 丢包率",
"note": ""
},
{
"lang": "en_US",
"name": "Ping packet loss rate",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328231796000,
"collector": "Categraf",
"typ": "Ping",
"name": "Ping 平均耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "ping_average_response_ms",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Ping 平均耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Ping average time consumed",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328233567000,
"collector": "Categraf",
"typ": "Ping",
"name": "Ping 探测结果状态码",
"unit": "none",
"note": "值为 0 就是正常,非 0 值就是异常。如果 Ping 失败,Categraf 日志中理应会有异常日志",
"lang": "zh_CN",
"expression": "ping_result_code",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Ping 探测结果状态码",
"note": "值为 0 就是正常,非 0 值就是异常。如果 Ping 失败,Categraf 日志中理应会有异常日志"
},
{
"lang": "en_US",
"name": "Ping probe result status code",
"note": "A value of 0 is normal, and a non-0 value is abnormal. If the Ping fails, there should be an exception log in the Categraf log"
}
]
},
{
"id": 0,
"uuid": 1717556328235586000,
"collector": "Categraf",
"typ": "Ping",
"name": "Ping 最大耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "ping_maximum_response_ms",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Ping 最大耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Ping maximum time consumption",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328237578000,
"collector": "Categraf",
"typ": "Ping",
"name": "Ping 最小耗时",
"unit": "milliseconds",
"note": "",
"lang": "zh_CN",
"expression": "ping_minimum_response_ms",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "Ping 最小耗时",
"note": ""
},
{
"lang": "en_US",
"name": "Ping minimum time consumption",
"note": ""
}
]
}
]
================================================
FILE: integrations/PostgreSQL/alerts/postgresql_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "postgresql读取时间过高",
"note": "数据库中花费在读取文件的时间,这个值很高说明内存较小,需要频繁从磁盘读入数据文件",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "postgresql_blk_read_time\u003e3000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328241737000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Postgresql down",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "postgresql_up!=1",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328242260000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "postgresql写入时间过高",
"note": "数据库中花费在写数据文件的时间,pg中脏页一般写入page cache,如果这个值较高,则说明cache较小,操作系统的cache需要更积极的写入",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "postgresql_blk_write_time \u003e 1000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328242692000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "postgresql有死锁",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "postgresql_deadlocks \u003e 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328243230000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Postgresql缓存命中率低于50%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "postgresql_blks_hit*100/(postgresql_blks_hit+postgresql_blks_read) \u003c 50",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328243658000
}
]
================================================
FILE: integrations/PostgreSQL/collect/postgresql/postgresql.toml
================================================
# Read metrics from one or many postgresql servers
# # collect interval
# interval = 15
[[instances]]
## specify address via a url matching:
## postgres://[pqgotest[:password]]@localhost[/dbname]?sslmode=[disable|verify-ca|verify-full]
## or a simple string:
## host=localhost user=pqgotest password=... sslmode=... dbname=app_production
##
## All connection parameters are optional.
##
## Without the dbname parameter, the driver will default to a database
## with the same name as the user. This dbname is just for instantiating a
## connection with the server and doesn't restrict the databases we are trying
## to grab metrics for.
##
# address = "host=localhost user=postgres sslmode=disable"
## A custom name for the database that will be used as the "server" tag in the
## measurement output. If not specified, a default one generated from
## the connection address is used.
# outputaddress = "db01"
## connection configuration.
## max_lifetime - specify the maximum lifetime of a connection.
## default is forever (0s)
# max_lifetime = "0s"
## A list of databases to explicitly ignore. If not specified, metrics for all
## databases are gathered. Do NOT use with the 'databases' option.
# ignored_databases = ["postgres", "template0", "template1"]
## A list of databases to pull metrics about. If not specified, metrics for all
## databases are gathered. Do NOT use with the 'ignored_databases' option.
# databases = ["app_production", "testing"]
## Whether to use prepared statements when connecting to the database.
## This should be set to false when connecting through a PgBouncer instance
## with pool_mode set to transaction.
#prepared_statements = true
# [[instances.metrics]]
# measurement = "sessions"
# label_fields = [ "status", "type" ]
# metric_fields = [ "value" ]
# timeout = "3s"
# request = '''
# SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type
# '''
================================================
FILE: integrations/PostgreSQL/dashboards/postgresql_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "postgresql by categraf",
"ident": "",
"tags": "categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "文档",
"url": "https://github.com/ccfos/nightingale/tree/main/integrations/PostgreSQL/markdown/"
}
],
"panels": [
{
"collapsed": true,
"id": "7f38f76f-5e62-4660-a104-a7f1e6e73aeb",
"layout": {
"h": 1,
"i": "7f38f76f-5e62-4660-a104-a7f1e6e73aeb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "pg_stat_database",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 32,
"value": 46
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "65571b64-d249-40f1-8c57-ef826118c7a9",
"layout": {
"h": 4,
"i": "65571b64-d249-40f1-8c57-ef826118c7a9",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"name": "连接数",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "postgresql_numbackends{server=\"$server\",db=\"$db\"}",
"legend": "{{db}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 32,
"value": 46
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "命中率低于99%,就需要适当调大shared buffer",
"id": "bdf5299c-b621-4d98-bcf8-fa2ebd12b7d4",
"layout": {
"h": 4,
"i": "0fb92b82-63c4-4edd-a452-fb51150757ab",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"name": "缓存命中率",
"options": {
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "postgresql_blks_hit{server=\"$server\",db=\"$db\"}/(postgresql_blks_hit{server=\"$server\",db=\"$db\"}+postgresql_blks_read{server=\"$server\",db=\"$db\"})",
"legend": "{{db}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 32,
"value": 46
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "死锁的数量,如果这个值很大说明业务逻辑有问题",
"id": "2a559648-ad51-40d1-a6ff-f641467b57ea",
"layout": {
"h": 4,
"i": "d78ae395-1d53-481a-9566-9f4913616330",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"name": "死锁数",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "postgresql_deadlocks{server=\"$server\",db=\"$db\"}",
"legend": "{{db}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 32,
"value": 46
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "与恢复冲突取消的查询次数,只会在备机上发生",
"id": "87a74ec2-c6d2-49e8-a1ea-4dbf1252a33d",
"layout": {
"h": 4,
"i": "a666823e-455f-459e-bbfe-169341b77ddc",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"name": "冲突数",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "postgresql_conflicts{server=\"$server\",db=\"$db\"}",
"legend": "{{db}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "rollback 过多,则需要检查业务逻辑,表明业务逻辑存在问题",
"id": "3dea2872-5904-4abf-b0f8-64afe82b617a",
"layout": {
"h": 4,
"i": "443aee8c-d1b8-485b-b74f-13fa58b6e7ae",
"isResizable": true,
"w": 12,
"x": 0,
"y": 5
},
"name": "事务统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_xact_commit{server=\"$server\",db=\"$db\"}[5m])",
"legend": "commit",
"refId": "A"
},
{
"expr": "increase(postgresql_xact_rollback{server=\"$server\",db=\"$db\"}[5m])",
"legend": "rollback",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "returned ,代表执行查询遍历行数\nfetched,代表返回给客户端的行数\nreturned 远大于 fetched,代表查询效率低,存在全表扫描,应增加索引进行优化",
"id": "dec2c25d-ae70-4e47-bc4f-6dbaa9e72683",
"layout": {
"h": 4,
"i": "8b85bc95-091a-4868-9bec-b495609d7a04",
"isResizable": true,
"w": 12,
"x": 12,
"y": 5
},
"name": "数据查询统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_tup_returned{server=\"$server\",db=\"$db\"}[5m])",
"legend": "returned",
"refId": "A"
},
{
"expr": "increase(postgresql_tup_fetched{server=\"$server\",db=\"$db\"}[5m])",
"legend": "fetched",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "048d2652-40f1-42ab-a435-7e02d51c98c4",
"layout": {
"h": 4,
"i": "e08d94bb-1a72-4104-8ba0-11ab7317a192",
"isResizable": true,
"w": 12,
"x": 0,
"y": 9
},
"name": "数据更新统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_tup_inserted{server=\"$server\",db=\"$db\"}[5m])",
"legend": "inserted",
"refId": "A"
},
{
"expr": "increase(postgresql_tup_updated{server=\"$server\",db=\"$db\"}[5m])",
"legend": "updated",
"refId": "B"
},
{
"expr": "increase(postgresql_tup_deleted{server=\"$server\",db=\"$db\"}[5m])",
"legend": "deleted",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "temp_files较高说明存在很多排序,hash,或者聚合这种操作,可以增大work_mem减少临时文件的产生,并且同时这些操作的性能也会有较大的提升",
"id": "c89d1e4b-d8ad-402d-bee4-f6d362b4634a",
"layout": {
"h": 4,
"i": "971c5838-7408-4ec7-ae5e-33dcd0f20210",
"isResizable": true,
"w": 12,
"x": 12,
"y": 9
},
"name": "生成临时文件统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_temp_files{server=\"$server\",db=\"$db\"}[5m])",
"legend": "temp_files",
"refId": "A"
},
{
"expr": "increase(postgresql_temp_bytes{server=\"$server\",db=\"$db\"}[5m])",
"legend": "temp_bytes",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "数据库中花费在读取文件的时间,这个值很高说明内存较小,需要频繁从磁盘读入数据文件\n数据库中花费在写数据文件的时间,pg中脏页一般写入page cache,如果这个值较高,则说明cache较小,操作系统的cache需要更积极的写入",
"id": "7ae2c26a-cc2b-435d-9a16-52722c37bedf",
"layout": {
"h": 4,
"i": "71e28edd-8961-4a7a-be78-3a9a526c89bb",
"isResizable": true,
"w": 24,
"x": 0,
"y": 13
},
"name": "数据库读写时间统计",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_blk_read_time{server=\"$server\",db=\"$db\"}[5m])",
"legend": "blk_read_time",
"refId": "A"
},
{
"expr": "increase(postgresql_blk_write_time{server=\"$server\",db=\"$db\"}[5m])",
"legend": "blk_write_time",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "72575aba-dbf5-4a05-8bd6-64170bff617e",
"layout": {
"h": 1,
"i": "72575aba-dbf5-4a05-8bd6-64170bff617e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 17
},
"name": "pg_stat_bgwriter",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "postgresql_checkpoints_timed:指超过checkpoint_timeout的时间后触发的检查点次数\npostgresql_checkpoints_req:手动触发checkpoint或者因为WAL文件数量达到max_wal_size时也会增加,如果这个值大于postgresql_checkpoints_timed说明checkpoint_timeout设置的不合理",
"id": "600a9687-0a1e-4531-b9bc-ccf8589566a2",
"layout": {
"h": 5,
"i": "c3d3ccee-719f-4280-8921-367ea5343849",
"isResizable": true,
"w": 12,
"x": 0,
"y": 18
},
"name": "checkpoint分布",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_checkpoints_timed{server=\"$server\",db=\"$db\"}[5m])",
"legend": "postgresql_checkpoints_timed",
"refId": "A"
},
{
"expr": "increase(postgresql_checkpoints_req{server=\"$server\",db=\"$db\"}[5m])",
"legend": "postgresql_checkpoints_req",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "checkpoint_write_time: 从shared buffer 中write到page cache花费的时间\ncheckpoint_sync_time : checkpoint调用fsync将脏数据刷到磁盘花费的时间,如果这个值很长,容易造成IO抖动,需要增加checkpoint_timeout或者checkpoint_completion_target",
"id": "561012f8-481a-482a-8504-4bdd8c62e8a6",
"layout": {
"h": 5,
"i": "178d8220-3c22-4ea3-909d-71c73fbb3a78",
"isResizable": true,
"w": 12,
"x": 12,
"y": 18
},
"name": "checkpoint 写文件时间分布",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_checkpoint_write_time{server=\"$server\",db=\"$db\"}[5m])",
"legend": "checkpoint_write_time",
"refId": "A"
},
{
"expr": "increase(postgresql_checkpoint_sync_time{server=\"$server\",db=\"$db\"}[5m])",
"legend": "checkpoint_sync_time",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "buffers_checkpoint: checkpoint写数据块数量\nbuffers_clean:bgwriter写入数据块的数量\nbuffers_backend:通过backend写入的块数量, 此值过大,代表shared buffer没有维护好,后端进程不得不自己去进行刷盘的动作\nbuffers_backend_fsync: backend需要fsync的次数\n",
"id": "af9e2a7f-ce09-4de2-8625-e779c770d46d",
"layout": {
"h": 5,
"i": "23b6fc64-1a19-422e-b28e-453570af339c",
"isResizable": true,
"w": 24,
"x": 0,
"y": 23
},
"name": "数据块写入分布",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "increase(postgresql_buffers_checkpoint{server=\"$server\",db=\"$db\"}[5m])",
"legend": "buffers_checkpoint",
"refId": "A"
},
{
"expr": "increase(postgresql_buffers_clean{server=\"$server\",db=\"$db\"}[5m])",
"legend": "buffers_clean",
"refId": "B"
},
{
"expr": "increase(postgresql_buffers_backend{server=\"$server\",db=\"$db\"}[5m])",
"legend": "buffers_backend",
"refId": "C"
},
{
"expr": "increase(postgresql_buffers_backend_fsync{server=\"$server\",db=\"$db\"}[5m])",
"legend": "buffers_backend_fsync",
"refId": "D"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(postgresql_datid,server)",
"name": "server",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(postgresql_datid{server=\"$server\"},db)",
"name": "db",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328244926000
}
================================================
FILE: integrations/PostgreSQL/markdown/README.md
================================================
# PostgreSQL
categraf 作为一个 client 连上 pg,采集相关指标,首先要确保用户授权。举例:
```
create user categraf with password 'categraf';
alter user categraf set default_transaction_read_only=on;
grant usage on schema public to categraf;
grant select on all tables in schema public to categraf ;
```
## 配置文件示例
```toml
[[instances]]
address = "host=192.168.11.181 port=5432 user=postgres password=123456789 sslmode=disable"
## specify address via a url matching:
## postgres://[pqgotest[:password]]@localhost[/dbname]?sslmode=[disable|verify-ca|verify-full]
## or a simple string:
## host=localhost user=pqgotest password=... sslmode=... dbname=app_production
##
## All connection parameters are optional.
##
## Without the dbname parameter, the driver will default to a database
## with the same name as the user. This dbname is just for instantiating a
## connection with the server and doesn't restrict the databases we are trying
## to grab metrics for.
##
# address = "host=localhost user=postgres sslmode=disable"
## A custom name for the database that will be used as the "server" tag in the
## measurement output. If not specified, a default one generated from
## the connection address is used.
# outputaddress = "db01"
## connection configuration.
## max_lifetime - specify the maximum lifetime of a connection.
## default is forever (0s)
# max_lifetime = "0s"
## A list of databases to explicitly ignore. If not specified, metrics for all
## databases are gathered. Do NOT use with the 'databases' option.
# ignored_databases = ["postgres", "template0", "template1"]
## A list of databases to pull metrics about. If not specified, metrics for all
## databases are gathered. Do NOT use with the 'ignored_databases' option.
# databases = ["app_production", "testing"]
## Whether to use prepared statements when connecting to the database.
## This should be set to false when connecting through a PgBouncer instance
## with pool_mode set to transaction.
# prepared_statements = true
#
# [[instances.metrics]]
# measurement = "sessions"
# label_fields = [ "status", "type" ]
# metric_fields = [ "value" ]
# timeout = "3s"
# request = '''
# SELECT status, type, COUNT(*) as value FROM v$session GROUP BY status, type
# '''
```
================================================
FILE: integrations/Procstat/alerts/categraf-procstat.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "process handle limit is too low",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_rlimit_num_fds_soft \u003c 2048",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328261765000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "there is a process count of 0, indicating that a certain process may have crashed",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "procstat_lookup_count == 0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328262311000
}
]
================================================
FILE: integrations/Procstat/collect/procstat/procstat.toml
================================================
# # collect interval
# interval = 15
# [[instances]]
# # executable name (ie, pgrep <search_exec_substring>)
# search_exec_substring = "nginx"
# # pattern as argument for pgrep (ie, pgrep -f <search_cmdline_substring>)
# search_cmdline_substring = "n9e server"
# # windows service name
# search_win_service = ""
# # search process with specific user, option with exec_substring or cmdline_substring
# search_user = ""
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
# # mode to use when calculating CPU usage. can be one of 'solaris' or 'irix'
# mode = "irix"
# sum of threads/fd/io/cpu/mem, min of uptime/limit
gather_total = true
# will append pid as tag
gather_per_pid = false
# gather jvm metrics only when jstat is ready
# gather_more_metrics = [
# "threads",
# "fd",
# "io",
# "uptime",
# "cpu",
# "mem",
# "limit",
# "jvm"
# ]
================================================
FILE: integrations/Procstat/dashboards/categraf-procstat.json
================================================
{
"name": "Procstat",
"tags": "Categraf OS",
"ident": "",
"uuid": 1717556328263679000,
"configs": {
"panels": [
{
"type": "stat",
"id": "be9aac6c-4401-4c61-8c43-574cf314ffef",
"layout": {
"h": 4,
"i": "be9aac6c-4401-4c61-8c43-574cf314ffef",
"isResizable": true,
"w": 5,
"x": 0,
"y": 0
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_lookup_count{ident=~\"$ident\", search_string=~\"$search_string\"}",
"instant": true,
"legend": "{{ident}} {{search_string}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Count Now",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"columns": [
"ident",
"search_string",
"value"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true,
"sortColumn": "ident",
"sortOrder": "ascend",
"tableLayout": "fixed",
"nowrap": false
},
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"id": "da621e2c-ae2b-4375-9a66-2bec7832490b",
"layout": {
"h": 4,
"i": "79db82d9-5f46-4c45-bb9f-c23f94d99e0a",
"isResizable": true,
"w": 9,
"x": 5,
"y": 0
},
"name": "Proc Count Table",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#fa0c0c"
},
"type": "special"
},
{
"match": {
"from": 1
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "procstat_lookup_count{ident=~\"$ident\", search_string=~\"$search_string\"}",
"instant": true,
"legend": "{{ident}} {{search_string}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"type": "timeseries",
"id": "881c04fd-8804-432e-9b34-b4761590de20",
"layout": {
"h": 4,
"i": "24b55362-d900-43c0-98d5-f2e994bf22a6",
"isResizable": true,
"w": 10,
"x": 14,
"y": 0
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_lookup_count{ident=~\"$ident\", search_string=~\"$search_string\"}",
"instant": false,
"legend": "{{ident}} {{search_string}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Count Trend",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "stat",
"id": "ffeb0fc6-ee02-4fdd-a8e3-ec2b9db9c23c",
"layout": {
"h": 4,
"i": "acd6e7b5-99f5-4d9b-9124-8072c14e5fea",
"isResizable": true,
"w": 5,
"x": 0,
"y": 4
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_uptime_minimum{ident=~\"$ident\", search_string=~\"$search_string\"}",
"instant": true,
"legend": "{{ident}} {{search_string}}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Proc Uptime",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "value",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "auto"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#2c9d3d",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "seconds",
"decimals": 2
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
}
}
]
},
{
"type": "timeseries",
"id": "c642a30a-da86-402c-87bf-c2f98616bf95",
"layout": {
"h": 4,
"i": "c642a30a-da86-402c-87bf-c2f98616bf95",
"isResizable": true,
"w": 9,
"x": 5,
"y": 4
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_cpu_usage_total{ident=~\"$ident\", search_string=~\"$search_string\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "CPU Util",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "cbc2444e-49c7-45e1-b64e-cd1282b5a419",
"layout": {
"h": 4,
"i": "198846a2-4794-4ba9-9c2d-137bce22f266",
"isResizable": true,
"w": 10,
"x": 14,
"y": 4
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_mem_usage_total{ident=~\"$ident\", search_string=~\"$search_string\"}",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Mem Util",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "d2bff162-5801-4d85-94d7-d63145d5b935",
"layout": {
"h": 4,
"i": "a208e192-cf74-468b-9bcb-cb81c8d78d24",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_num_fds_total{ident=~\"$ident\", search_string=~\"$search_string\"}/procstat_rlimit_num_fds_soft_minimum{ident=~\"$ident\", search_string=~\"$search_string\"}*100",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "FD soft Util",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "46a71143-84b5-4dde-87db-2f0403df6519",
"layout": {
"h": 4,
"i": "22dfb5e4-1d17-4e06-a9b4-b25cb33d1c20",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "procstat_num_fds_total{ident=~\"$ident\", search_string=~\"$search_string\"}/procstat_rlimit_num_fds_hard_minimum{ident=~\"$ident\", search_string=~\"$search_string\"}*100",
"refId": "A",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "FD hard Util",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "percent",
"decimals": 2
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "3dda4eb5-a27f-4d54-9547-ae8f0ac9bb96",
"layout": {
"h": 4,
"i": "3dda4eb5-a27f-4d54-9547-ae8f0ac9bb96",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "rate(procstat_read_bytes_total{ident=~\"$ident\", search_string=~\"$search_string\"}[$__rate_interval])",
"refId": "A",
"maxDataPoints": 240,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Read bytes",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "c97403f4-618d-4037-8ea7-5deb32eb8d56",
"layout": {
"h": 4,
"i": "ae0dc449-8263-4f38-8c52-d50b3cb3f1b4",
"isResizable": true,
"w": 12,
"x": 12,
"y": 12
},
"version": "3.1.0",
"datasourceCate": "prometheus",
"datasourceValue": "${Datasource}",
"targets": [
{
"expr": "rate(procstat_write_bytes_total{ident=~\"$ident\", search_string=~\"$search_string\"}[$__rate_interval])",
"refId": "A",
"maxDataPoints": 240,
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Write bytes",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"placement": "bottom",
"behaviour": "showItem",
"selectMode": "single"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#634CD9",
"value": null,
"type": "base"
}
]
},
"thresholdsStyle": {
"mode": "dashed"
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"pointSize": 5
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"definition": "prometheus",
"name": "Datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${Datasource}"
},
"definition": "label_values(system_load_norm_1, ident)",
"label": "Host",
"multi": true,
"name": "ident",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${Datasource}"
},
"definition": "label_values(procstat_lookup_count{ident=~\"$ident\"}, search_string)",
"label": "Proc",
"multi": true,
"name": "search_string",
"type": "query"
}
],
"version": "3.0.0"
}
}
================================================
FILE: integrations/Procstat/markdown/readme.md
================================================
# 进程监控
使用 categraf procstat 插件。
## 配置文件
位置:categraf 的 `conf/input.procstat/procstat.toml`
样例配置:
```toml
[[instances]]
# # executable name (ie, pgrep <search_exec_substring>)
search_exec_substring = "nginx"
# # pattern as argument for pgrep (ie, pgrep -f <search_cmdline_substring>)
# search_cmdline_substring = "n9e server"
# # windows service name
# search_win_service = ""
# # search process with specific user, option with exec_substring or cmdline_substring
# search_user = ""
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
# # mode to use when calculating CPU usage. can be one of 'solaris' or 'irix'
# mode = "irix"
# sum of threads/fd/io/cpu/mem, min of uptime/limit
gather_total = true
# will append pid as tag
gather_per_pid = false
# gather jvm metrics only when jstat is ready
# gather_more_metrics = [
# "threads",
# "fd",
# "io",
# "uptime",
# "cpu",
# "mem",
# "limit",
# "jvm"
# ]
```
机器上有很多进程,要监控进程是否存活以及进程的资源占用,首先得告诉 categraf,要监控的进程是啥。所以,本插件一开始的几个配置,就是做进程过滤的,用来告诉 categraf 要监控的进程是哪些。
- search_exec_substring 配置一个查询字符串,相当于执行 `pgrep <search_exec_substring>`
- search_cmdline_substring 配置一个查询字符串,相当于执行 `pgrep -f <search_cmdline_substring>`
- search_win_service 配置一个 windows 服务名,相当于执行 `sc query <search_win_service>`
上例默认是采集 nginx。默认只会采集一个指标:procstat_lookup_count,表示通过这些过滤条件,查询到的进程的数量。那显然,如果 `procstat_lookup_count <= 0` 就说明进程不存在了。
## CPU 利用率计算
在计算 CPU 利用率的时候有两种模式:irix(默认)、solaris。如果是 irix 模式,CPU 利用率会出现大于 100% 的情况,如果是 solaris 模式,会考虑 CPU 核数,所以 CPU 利用率不会大于 100%。
## 采集更多指标
`gather_more_metrics` 默认没有打开,即不会采集进程资源利用情况。如果想要采集,就打开 `gather_more_metrics` 这个配置即可。其中最为特殊的是 `jvm`,如果想要采集 jvm 指标,需要先安装好 jstat,然后再打开 `jvm` 这个配置。
## gather_total
比如进程名字是 mysql 的进程,同时可能运行了多个,我们想知道这个机器上的所有 mysql 的进程占用的总的 cpu、mem、fd 等,就设置 gather_total = true,当然,对于 uptime 和 limit 的采集,gather_total 的时候是取的多个进程的最小值。
## gather_per_pid
还是拿 mysql 举例,一个机器上可能同时运行了多个,我们可能想知道每个 mysql 进程的资源占用情况,此时就要启用 gather_per_pid 的配置,设置为 true,此时会采集每个进程的资源占用情况,并附上 pid 作为标签来区分。
================================================
FILE: integrations/Procstat/metrics/categraf.json
================================================
[
{
"id": 0,
"uuid": 1717556328266104000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 CPU 利用率(单进程)",
"unit": "percent",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)\n\nCPU 利用率有两个模式,一个是 solaris,一个是 irix,默认是 irix,irix 模式下,CPU 利用率可能会超过 100%,solaris 会考虑 CPU 核数,solaris 模式的 CPU 利用率不会超过 100%。",
"lang": "zh_CN",
"expression": "procstat_cpu_usage",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 CPU 利用率(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)\n\nCPU 利用率有两个模式,一个是 solaris,一个是 irix,默认是 irix,irix 模式下,CPU 利用率可能会超过 100%,solaris 会考虑 CPU 核数,solaris 模式的 CPU 利用率不会超过 100%。"
},
{
"lang": "en_US",
"name": "Process CPU utilization (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)\n\nThere are two modes of CPU utilization, one is solaris and the other is irix. The default is irix. In irix mode, the CPU utilization may exceed 100%. solaris will consider the number of CPU cores, and the CPU utilization in solaris mode will not exceed 100%."
}
]
},
{
"id": 0,
"uuid": 1717556328268566000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 CPU 总利用率(匹配到的所有进程加和)",
"unit": "percent",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)\n\nCPU 利用率有两个模式,一个是 solaris,一个是 irix,默认是 irix,irix 模式下,CPU 利用率可能会超过 100%,solaris 会考虑 CPU 核数,solaris 模式的 CPU 利用率不会超过 100%。",
"lang": "zh_CN",
"expression": "procstat_cpu_usage_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 CPU 总利用率(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)\n\nCPU 利用率有两个模式,一个是 solaris,一个是 irix,默认是 irix,irix 模式下,CPU 利用率可能会超过 100%,solaris 会考虑 CPU 核数,solaris 模式的 CPU 利用率不会超过 100%。"
},
{
"lang": "en_US",
"name": "Total process CPU utilization (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)\n\nThere are two modes of CPU utilization, one is solaris and the other is irix. The default is irix. In irix mode, the CPU utilization may exceed 100%. solaris will consider the number of CPU cores, and the CPU utilization in solaris mode will not exceed 100%."
}
]
},
{
"id": 0,
"uuid": 1717556328270644000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒写入字节总数(匹配到的所有进程加和)",
"unit": "bytesSecIEC",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_write_bytes_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒写入字节总数(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total number of bytes written per second by process IO (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328272873000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒写入字节数(单进程)",
"unit": "bytesSecIEC",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_write_bytes[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒写入字节数(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Number of bytes written per second by process IO (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328274907000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒写入次数总数(匹配到的所有进程加和)",
"unit": "sishort",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_write_count_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒写入次数总数(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total number of process IO writes per second (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328277039000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒写入次数(单进程)",
"unit": "sishort",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_write_count[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒写入次数(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Process IO writes per second (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328278789000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒读取字节总数(匹配到的所有进程加和)",
"unit": "bytesSecIEC",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_read_bytes_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒读取字节总数(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total number of bytes read per second by process IO (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328280740000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒读取字节数(单进程)",
"unit": "bytesSecIEC",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_read_bytes[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒读取字节数(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Number of bytes read per second by process IO (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328282919000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒读取次数总数(匹配到的所有进程加和)",
"unit": "sishort",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_read_count_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒读取次数总数(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total number of process IO reads per second (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328285199000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 IO 每秒读取次数(单进程)",
"unit": "sishort",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "irate(procstat_read_count[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 IO 每秒读取次数(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Process IO reads per second (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328287298000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 Memory 利用率(单进程)",
"unit": "percent",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_mem_usage",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 Memory 利用率(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Process Memory utilization (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328289271000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 Memory 总利用率(匹配到的所有进程加和)",
"unit": "percent",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_mem_usage_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 Memory 总利用率(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total process memory utilization (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328291283000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 rlimit fd 软限制数量(匹配到的所有进程中的最小值)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_rlimit_num_fds_soft_minimum",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 rlimit fd 软限制数量(匹配到的所有进程中的最小值)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Process rlimit fd Number of soft limits (minimum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328293536000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程 rlimit fd 软限制数量(单进程)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_rlimit_num_fds_soft",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程 rlimit fd 软限制数量(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Process rlimit fd Number of soft limits (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328295723000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程启动时长(匹配到的所有进程的最小值)",
"unit": "seconds",
"note": "启动了多久",
"lang": "zh_CN",
"expression": "procstat_uptime_minimum",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程启动时长(匹配到的所有进程的最小值)",
"note": "启动了多久"
},
{
"lang": "en_US",
"name": "Process uptime (minimum of all processes matched to)",
"note": "How long the process has been running"
}
]
},
{
"id": 0,
"uuid": 1717556328297802000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程启动时长(单进程)",
"unit": "seconds",
"note": "启动了多久",
"lang": "zh_CN",
"expression": "procstat_uptime",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程启动时长(单进程)",
"note": "启动了多久"
},
{
"lang": "en_US",
"name": "Process uptime (single process)",
"note": "How long the process has been running"
}
]
},
{
"id": 0,
"uuid": 1717556328299836000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程数量(根据匹配条件查到的进程数量)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_lookup_count",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程数量(根据匹配条件查到的进程数量)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Number of processes (the number of processes found according to matching conditions)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328301689000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程文件句柄总打开数(匹配到的所有进程加和)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_num_fds_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程文件句柄总打开数(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total number of process file handles open (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328303570000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程文件句柄打开数(单进程)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_num_fds",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程文件句柄打开数(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Number of process file handle openings (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328305425000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程线程总数(匹配到的所有进程加和)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_num_threads_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程线程总数(匹配到的所有进程加和)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Total number of process threads (sum of all processes matched to)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
},
{
"id": 0,
"uuid": 1717556328307331000,
"collector": "Categraf",
"typ": "Procstat",
"name": "进程线程数(单进程)",
"unit": "none",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)",
"lang": "zh_CN",
"expression": "procstat_num_threads",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "进程线程数(单进程)",
"note": "[文档](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
},
{
"lang": "en_US",
"name": "Number of process threads (single process)",
"note": "[Documentation](https://github.com/flashcatcloud/categraf/blob/main/inputs/procstat/README.md)"
}
]
}
]
================================================
FILE: integrations/Prometheus/collect/prometheus/prometheus.toml
================================================
# # collect interval
# interval = 15
[[instances]]
urls = [
# "http://localhost:19000/metrics"
]
url_label_key = "instance"
url_label_value = "{{.Host}}"
## Scrape Services available in Consul Catalog
# [instances.consul]
# enabled = false
# agent = "http://localhost:8500"
# query_interval = "5m"
# [[instances.consul.query]]
# name = "a service name"
# tag = "a service tag"
# url = 'http://{{if ne .ServiceAddress ""}}{{.ServiceAddress}}{{else}}{{.Address}}{{end}}:{{.ServicePort}}/{{with .ServiceMeta.metrics_path}}{{.}}{{else}}metrics{{end}}'
# [instances.consul.query.tags]
# host = "{{.Node}}"
# bearer_token_string = ""
# e.g. /run/secrets/kubernetes.io/serviceaccount/token
# bearer_token_file = ""
# # basic auth
# username = ""
# password = ""
# headers = ["X-From", "categraf"]
# # interval = global.interval * interval_times
# interval_times = 1
# labels = {}
# support glob
# ignore_metrics = [ "go_*" ]
# support glob
# ignore_label_keys = []
# timeout for every url
# timeout = "3s"
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/Prometheus/markdown/README.md
================================================
# prometheus
prometheus 插件的作用,就是抓取 `/metrics` 接口的数据,上报给服务端。通常,各类 exporter 会暴露 `/metrics` 接口数据,越来越多的开源组件也会内置 prometheus SDK,吐出 prometheus 格式的监控数据,比如 rabbitmq 插件,其 README 中就有介绍。
这个插件 fork 自 telegraf/prometheus,做了一些删减改造,仍然支持通过 consul 做服务发现,管理所有的目标地址,删掉了 Kubernetes 部分,Kubernetes 部分准备放到其他插件里实现。
增加了两个配置:url_label_key 和 url_label_value。为了标识监控数据是从哪个 scrape url 拉取的,会为监控数据附一个标签来标识这个 url,默认的标签 KEY 是用 instance,当然,也可以改成别的,不过不建议。url_label_value 是标签值,支持 go template 语法,如果为空,就是整个 url 的内容,也可以通过模板变量只取一部分,比如 `http://localhost:9104/metrics`,只想取 IP 和端口部分,就可以写成:
```ini
url_label_value = "{{.Host}}"
```
如果 HTTP scheme 部分和 `/metrics` Path 部分都想取,可以这么写:
```ini
url_label_value = "{{.Scheme}}://{{.Host}}{{.Path}}"
```
相关变量是用这个方法生成的,供大家参考:
```go
// GenerateLabel builds the label (key, value) pair that identifies the scrape
// URL u. The key is always ul.LabelKey. The value is u.String() when
// ul.LabelValue is empty; otherwise it is the output of executing
// ul.LabelValueTpl against the URL components collected in dict below.
// If template execution fails, the error is returned with empty key and value.
func (ul *UrlLabel) GenerateLabel(u *url.URL) (string, string, error) {
if ul.LabelValue == "" {
// No label value configured: use the whole URL as the label value.
return ul.LabelKey, u.String(), nil
}
// Variables exposed to the label value template (e.g. {{.Host}}, {{.Scheme}}).
dict := map[string]string{
"Scheme": u.Scheme,
"Host": u.Host,
"Hostname": u.Hostname(),
"Port": u.Port(),
"Path": u.Path,
"Query": u.RawQuery,
"Fragment": u.Fragment,
}
var buffer bytes.Buffer
// NOTE(review): LabelValueTpl is presumably parsed from LabelValue elsewhere;
// that step is not shown in this snippet.
err := ul.LabelValueTpl.Execute(&buffer, dict)
if err != nil {
return "", "", err
}
return ul.LabelKey, buffer.String(), nil
}
```
================================================
FILE: integrations/RabbitMQ/alerts/alerts.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] Cluster Operator Unavailable Replicas",
"note": "There are pods that are either running but not yet available or pods that still have not been created.",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_deployment_status_replicas_unavailable \u003e 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328314217000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] Container Restarts",
"note": "Over the last 10 minutes a rabbitmq container was restarted",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "increase(kube_pod_container_status_restarts_total[10m]) \u003e= 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328315028000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] File Descriptors Near Limit",
"note": "The file descriptors are near to the limit",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum (max_over_time(rabbitmq_process_open_fds[5m]))/sum (rabbitmq_process_max_fds)\u003e 0.8\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328315753000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] High Connection Churn",
"note": "There is a high connection churn",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "( sum(rate(rabbitmq_connections_closed_total[5m]) ) + sum(rate(rabbitmq_connections_opened_total[5m]) ) )/sum (rabbitmq_connections) \u003e 0.1 unless sum (rabbitmq_connections) \u003c 100\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328316400000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] Insufficient Established Erlang Distribution Links",
"note": "Insufficient established erlang distribution links",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "count (erlang_vm_dist_node_state == 3) \u003c count (rabbitmq_build_info) * (count (rabbitmq_build_info) - 1)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328316947000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] Low Disk Watermark Predicted",
"note": "The predicted free disk space in 24 hours from now is low",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 3600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "( predict_linear(rabbitmq_disk_space_available_bytes[24h], 60*60*24) \u003c rabbitmq_disk_space_available_limit_bytes )and( count_over_time(rabbitmq_disk_space_available_limit_bytes[2h] offset 22h) \u003e 0)\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328317379000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] No MajorityOfNodesReady",
"note": "Too many nodes are not ready (half or fewer of the replicas are ready)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_statefulset_status_replicas_ready\u003c=kube_statefulset_replicas/ 2 unless kube_statefulset_replicas== 0\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328317808000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] Persistent Volume Missing",
"note": "There is at least one pvc not bound",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "kube_persistentvolumeclaim_status_phase{phase=\"Bound\"} == 0\n",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328318244000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] TCP Sockets Near Limit",
"note": "The TCP sockets are near to the limit",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 600,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum (max_over_time(rabbitmq_process_open_tcp_sockets[5m]))/sum (rabbitmq_process_max_tcp_sockets)\u003e 0.8\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328318775000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "[RabbitMQ] Unroutable Messages",
"note": "There were unroutable messages within the last 5 minutes in RabbitMQ cluster",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 300,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum (increase(rabbitmq_channel_messages_unroutable_dropped_total[5m])) \u003e= 1 or sum (increase(rabbitmq_channel_messages_unroutable_returned_total[5m])) \u003e= 1\n",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328321515000
}
]
================================================
FILE: integrations/RabbitMQ/collect/rabbitmq/rabbitmq.toml
================================================
# As of 3.8.0, RabbitMQ ships with built-in Prometheus & Grafana support.
# Support for Prometheus metric collector ships in the rabbitmq_prometheus plugin.
# The plugin exposes all RabbitMQ metrics on a dedicated TCP port, in Prometheus text format.
#
# enable prometheus plugin:
# `rabbitmq-plugins enable rabbitmq_prometheus`
# `curl http://localhost:15692/metrics`
#
# then use categraf prometheus plugin scrape http://localhost:15692/metrics instead of this rabbitmq plugin
# # collect interval
# interval = 15
[[instances]]
# # Management Plugin url
# url = "http://localhost:15672"
# username = "guest"
# password = "guest"
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
## Optional request timeouts
##
## ResponseHeaderTimeout, if non-zero, specifies the amount of time to wait
## for a server's response headers after fully writing the request.
# header_timeout = "3s"
##
## client_timeout specifies a time limit for requests made by this client.
## Includes connection time, any redirects, and reading the response body.
# client_timeout = "4s"
## A list of nodes to gather as the rabbitmq_node measurement. If not
## specified, metrics for all nodes are gathered.
# nodes = ["rabbit@node1", "rabbit@node2"]
## A list of exchanges to gather as the rabbitmq_exchange measurement. If not
## specified, metrics for all exchanges are gathered.
# exchanges = ["categraf"]
## Metrics to include and exclude. Globs accepted.
## Note that an empty array for both will include all metrics
## Currently the following metrics are supported: "exchange", "federation", "node", "overview", "queue"
# metric_include = []
# metric_exclude = []
## Queues to include and exclude. Globs accepted.
## Note that an empty array for both will include all queues
# queue_name_include = []
# queue_name_exclude = []
## Federation upstreams to include and exclude specified as an array of glob
## pattern strings. Federation links can also be limited by the queue and
## exchange filters.
# federation_upstream_include = []
# federation_upstream_exclude = []
# # interval = global.interval * interval_times
# interval_times = 1
# important! use global unique string to specify instance
# labels = { instance="rabbitmq-001" }
================================================
FILE: integrations/RabbitMQ/dashboards/rabbitmq_CN_v3.8_gt.json
================================================
{
"id": 0,
"group_id": 0,
"name": "RabbitMQ 3.8+ 中文版",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "4466a232-248d-45a8-bf4d-05d5139c7346",
"layout": {
"h": 1,
"i": "4466a232-248d-45a8-bf4d-05d5139c7346",
"w": 24,
"x": 0,
"y": 0
},
"name": "Overview",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "队列中已准备好被消费但尚未被消费者获取的消息总数。建议监控阈值:>1000时警告(可能消费能力不足),>10000时严重(需检查消费者状态)。该指标用于评估消息积压情况,若持续增长需排查消费者处理能力或网络延迟问题。",
"id": "a20b5d06-4343-457a-89f2-33a52c4dec04",
"layout": {
"h": 3,
"i": "a20b5d06-4343-457a-89f2-33a52c4dec04",
"w": 7,
"x": 0,
"y": 1
},
"name": "待处理消息数",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10000
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 100000
},
"result": {
"color": "#f50a0a"
},
"type": "range"
},
{
"match": {
"to": 9999
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "消息队列每秒接收的新消息数量。建议监控阈值:持续低于预期值时警告(可能生产者异常),突发高峰时需关注消费者处理能力。该指标反映系统写入负载,异常波动可能影响整体吞吐量。",
"id": "893409b0-4ca0-450b-a0c9-f48eddf0e243",
"layout": {
"h": 3,
"i": "893409b0-4ca0-450b-a0c9-f48eddf0e243",
"w": 5,
"x": 7,
"y": 1
},
"name": "每秒接收消息数",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "当前连接到消息队列并正在发布消息的生产者数量。建议监控阈值:突降为0时告警(可能生产者故障),异常激增时需关注资源竞争。该指标反映系统写入端的稳定性,适用于容量规划与异常检测场景。",
"id": "d596e7f0-5095-420e-bc38-674001dcf5f4",
"layout": {
"h": 3,
"i": "d596e7f0-5095-420e-bc38-674001dcf5f4",
"w": 4,
"x": 12,
"y": 1
},
"name": "活跃生产者数量",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channels * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) - sum(rabbitmq_channel_consumers * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description":"当前与消息队列系统建立的客户端连接总数(包括生产者、消费者和管理工具)。建议监控阈值:突降至0时告警(可能网络隔离),异常高峰时需检查资源限制或DDoS攻击。该指标反映系统整体访问压力,是容量评估和故障排查的关键指标之一。",
"id": "9f6d7dee-666d-4e1b-90d0-129b2e5ba085",
"layout": {
"h": 3,
"i": "9f6d7dee-666d-4e1b-90d0-129b2e5ba085",
"w": 4,
"x": 16,
"y": 1
},
"name": "活跃连接数",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_connections * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "当前消息队列系统中存在的队列(或主题)总数。建议监控阈值:突增:可能因配置错误导致自动创建大量队列(需检查自动化策略);突降:可能误删除或系统元数据故障;持续增长:需关注资源消耗(如内存/文件句柄)。",
"id": "910eae0f-2b78-4d10-a780-8f997f6e96cb",
"layout": {
"h": 3,
"i": "910eae0f-2b78-4d10-a780-8f997f6e96cb",
"w": 4,
"x": 20,
"y": 1
},
"name": "队列数量",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queues * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "当前已被消费者接收但尚未明确确认(ACK)的消息总数。该指标直接反映系统的消息处理可靠性和消费者健康状态。",
"id": "2e8cd60f-51b0-46b2-8c0f-bf55604d340d",
"layout": {
"h": 3,
"i": "2e8cd60f-51b0-46b2-8c0f-bf55604d340d",
"w": 7,
"x": 0,
"y": 2
},
"name": "未确认消息数",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 99
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 500
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unacked * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "成功投递给消费者的消息数量。该指标反映系统的实时吞吐能力,是衡量消费者处理效率和服务端性能的关键指标。",
"id": "4b242c1e-85d5-48b3-8cce-b467209245ec",
"layout": {
"h": 3,
"i": "4b242c1e-85d5-48b3-8cce-b467209245ec",
"w": 5,
"x": 7,
"y": 2
},
"name": "出站消息速率(消息/秒)",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_redelivered_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_get_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_get_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b87b1c6b-644a-42f4-915a-bd6857540f70",
"layout": {
"h": 3,
"i": "b87b1c6b-644a-42f4-915a-bd6857540f70",
"w": 4,
"x": 12,
"y": 2
},
"name": "消息消费者",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channel_consumers * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "在消息队列(如 RabbitMQ)中,通道是客户端与服务器之间的轻量级虚拟连接,用于高效传输消息。多个通道可以共享同一个 TCP 连接,减少资源消耗并提高并发性能。",
"id": "65103bcf-fb21-488d-a06b-7b0ea130ca4d",
"layout": {
"h": 3,
"i": "65103bcf-fb21-488d-a06b-7b0ea130ca4d",
"w": 4,
"x": 16,
"y": 2
},
"name": "消息通道",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channels * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c6454712-e265-4387-ae86-9ac865af46f2",
"layout": {
"h": 3,
"i": "c6454712-e265-4387-ae86-9ac865af46f2",
"w": 4,
"x": 20,
"y": 2
},
"name": "Nodes",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 3,
"to": null
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 8
},
"result": {
"color": "#e70909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "1712a96f-bcde-4d33-90b6-e2eba20527b9",
"layout": {
"h": 1,
"i": "1712a96f-bcde-4d33-90b6-e2eba20527b9",
"w": 24,
"x": 0,
"y": 5
},
"name": "Nodes",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"columns": [
"rabbitmq_cluster",
"rabbitmq_node",
"rabbitmq_version",
"erlang_version"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1b3fdea0-1921-48ae-b11f-31cc27f816b0",
"layout": {
"h": 3,
"i": "1b3fdea0-1921-48ae-b11f-31cc27f816b0",
"w": 24,
"x": 0,
"y": 6
},
"name": "nodes",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "RabbitMQ 在内存压力下会主动阻塞消息发布者(Publisher),以避免内存耗尽导致服务崩溃。此指标表示当前剩余可用内存距离触发阻塞机制的阈值。",
"id": "b91068ed-0914-4a8d-91dd-9ffb3b692516",
"layout": {
"h": 7,
"i": "b91068ed-0914-4a8d-91dd-9ffb3b692516",
"w": 8,
"x": 0,
"y": 7
},
"name": "发布者阻塞阈值(剩余可用内存)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_resident_memory_limit_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_resident_memory_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "RabbitMQ 在磁盘空间不足时会主动阻塞消息发布者(Publisher),以防止因磁盘写满导致服务崩溃。此指标表示当前剩余可用磁盘空间距离触发阻塞机制的阈值。",
"id": "a74ca489-6101-49bc-9560-4cde368bc47e",
"layout": {
"h": 7,
"i": "a74ca489-6101-49bc-9560-4cde368bc47e",
"w": 8,
"x": 8,
"y": 7
},
"name": "发布者阻塞阈值(剩余可用磁盘空间)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_disk_space_available_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "RabbitMQ 依赖文件描述符(File Descriptors)来管理网络连接、队列、通道等资源。此指标表示当前系统剩余可用的文件描述符数量,若不足可能导致新连接被拒绝或服务异常。",
"id": "100ed9ae-2b84-4f2b-82b2-0444dd28deed",
"layout": {
"h": 3,
"i": "100ed9ae-2b84-4f2b-82b2-0444dd28deed",
"w": 8,
"x": 16,
"y": 7
},
"name": "可用文件句柄数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_process_max_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_open_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "RabbitMQ 使用TCP套接字处理客户端连接(如AMQP、MQTT、STOMP等协议)。此指标表示当前可用的TCP套接字资源数量,若不足可能导致新连接被拒绝。",
"id": "a4859891-2538-47fc-946b-7ae3aa507c51",
"layout": {
"h": 3,
"i": "a4859891-2538-47fc-946b-7ae3aa507c51",
"w": 8,
"x": 16,
"y": 8
},
"name": "可用TCP套接字数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_process_max_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_open_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fec98a71-0615-4782-bf8d-960529e243f9",
"layout": {
"h": 1,
"i": "fec98a71-0615-4782-bf8d-960529e243f9",
"w": 24,
"x": 0,
"y": 14
},
"name": "QUEUED MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示队列中已准备好但尚未被消费者接收的消息数量。此指标反映消费者的处理能力,若持续增长可能表明消费者处理速度不足或存在阻塞。",
"id": "145dc75a-d3b8-491f-9ba6-6da787c8e265",
"layout": {
"h": 3,
"i": "145dc75a-d3b8-491f-9ba6-6da787c8e265",
"w": 12,
"x": 0,
"y": 15
},
"name": "待消费消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示已被消费者接收但尚未收到确认(ACK/NACK)的消息数量。此指标反映消费者的处理效率,若持续增长可能表明消费者处理延迟或存在故障。",
"id": "a6ca328a-8b19-488e-a70d-74372f994901",
"layout": {
"h": 3,
"i": "a6ca328a-8b19-488e-a70d-74372f994901",
"w": 12,
"x": 12,
"y": 15
},
"name": "待确认消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unacked * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "500c51ed-a0d6-41d9-903f-d000e289dc2b",
"layout": {
"h": 1,
"i": "500c51ed-a0d6-41d9-903f-d000e289dc2b",
"w": 24,
"x": 0,
"y": 18
},
"name": "INCOMING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示生产者每秒向队列或交换机发布的消息数量。此指标反映系统的写入负载,突增可能影响消息积压或处理延迟。",
"id": "ef4352a4-c281-4596-b89e-d79a565ca112",
"layout": {
"h": 3,
"i": "ef4352a4-c281-4596-b89e-d79a565ca112",
"w": 12,
"x": 0,
"y": 19
},
"name": "每秒发布消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示服务器每秒向生产者发送的发布确认(Publisher Confirm)数量。此指标反映消息持久化或路由完成的效率,高确认速率通常表明系统处理能力良好。",
"id": "f0932549-d4a7-4eb1-a86e-35eb49569f29",
"layout": {
"h": 3,
"i": "f0932549-d4a7-4eb1-a86e-35eb49569f29",
"w": 12,
"x": 12,
"y": 19
},
"name": "每秒确认发布的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_confirmed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示交换机每秒成功路由到目标队列的消息数量。此指标反映消息分发的效率,若数值低于发布速率,可能存在路由键不匹配或队列未绑定等问题。",
"id": "f47766cb-3cdb-4f67-896f-77af421ff404",
"layout": {
"h": 3,
"i": "f47766cb-3cdb-4f67-896f-77af421ff404",
"w": 12,
"x": 0,
"y": 20
},
"name": "每秒路由到队列的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示服务器每秒未能向生产者发送确认(Publisher Confirm)的消息数量。此指标反映消息持久化或路由的潜在问题,持续较高的数值可能表明性能瓶颈或故障。",
"id": "644302ba-3c99-4787-8023-93770b0a9e6c",
"layout": {
"h": 3,
"i": "644302ba-3c99-4787-8023-93770b0a9e6c",
"w": 12,
"x": 12,
"y": 20
},
"name": "每秒未确认发布的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unconfirmed[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示交换机因无法匹配任何绑定队列而每秒丢弃的消息数量。此指标反映路由键配置错误或消费者缺失问题,需检查交换机和队列的绑定关系。",
"id": "0a3eadfd-d84c-4070-9527-b4c62fc20787",
"layout": {
"h": 3,
"i": "0a3eadfd-d84c-4070-9527-b4c62fc20787",
"w": 12,
"x": 0,
"y": 21
},
"name": "每秒丢弃的不可路由消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unroutable_dropped_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示因无法路由到任何队列(且启用了 mandatory 标志)而每秒被退回给生产者的消息数量。此指标反映路由配置错误或消费者缺失,需检查交换机的绑定规则和生产者的错误处理逻辑。",
"id": "a14e8796-9614-4204-b5b8-5d1a47f356d8",
"layout": {
"h": 3,
"i": "a14e8796-9614-4204-b5b8-5d1a47f356d8",
"w": 12,
"x": 12,
"y": 21
},
"name": "每秒回退给生产者的不可路由消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unroutable_returned_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2405258c-c08f-4e49-960a-5c9a12b29f12",
"layout": {
"h": 1,
"i": "2405258c-c08f-4e49-960a-5c9a12b29f12",
"w": 24,
"x": 0,
"y": 24
},
"name": "OUTGOING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者每秒成功接收的消息数量。此指标反映系统的实时吞吐量,若数值低于预期,可能因消费者处理能力不足、网络延迟或队列积压导致,需优化消费者性能或扩容。",
"id": "be0f1872-172c-4bc3-a901-4b645ebf5abe",
"layout": {
"h": 3,
"i": "be0f1872-172c-4bc3-a901-4b645ebf5abe",
"w": 12,
"x": 0,
"y": 25
},
"name": "每秒投递的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(\n (rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\n (rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})\n) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示因消费者未确认(NACK或超时)而每秒重新投递到队列的消息数量。此指标反映消费者处理异常或性能瓶颈,高频重投递可能需优化消费者逻辑或调整超时参数。",
"id": "eff794ca-e844-4a12-b230-690aadefa53f",
"layout": {
"h": 3,
"i": "eff794ca-e844-4a12-b230-690aadefa53f",
"w": 12,
"x": 12,
"y": 25
},
"name": "每秒重投递的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_redelivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者在手动确认(manual acknowledgment)模式下每秒接收的消息数量。此指标反映消费者处理消息的效率,若数值偏低可能因业务逻辑复杂或确认延迟导致,需优化处理逻辑或调整确认策略。",
"id": "2ed4be63-4fe9-462f-bc2f-967319bc3626",
"layout": {
"h": 3,
"i": "2ed4be63-4fe9-462f-bc2f-967319bc3626",
"w": 12,
"x": 0,
"y": 26
},
"name": "每秒手动确认投递的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者在自动确认(auto acknowledgment)模式下每秒接收的消息数量。此指标反映系统在无需手动确认情况下的消息处理能力,若数值异常可能因消费者处理速度不足或消息积压导致,需关注消费者性能或调整并发设置。",
"id": "116c44e0-c8e1-4f02-8eae-2140997e2280",
"layout": {
"h": 3,
"i": "116c44e0-c8e1-4f02-8eae-2140997e2280",
"w": 12,
"x": 12,
"y": 26
},
"name": "每秒自动确认投递的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者每秒成功确认(acknowledged)的消息数量。此指标反映消费者处理消息的完成效率,若数值偏低可能因业务处理延迟、网络问题或确认机制异常导致,需检查消费者性能或确认逻辑是否合理。",
"id": "a1184534-3226-4c9a-ba6c-6d5258998518",
"layout": {
"h": 3,
"i": "a1184534-3226-4c9a-ba6c-6d5258998518",
"w": 12,
"x": 0,
"y": 27
},
"name": "每秒确认的消息数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_acked_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者在自动确认(auto acknowledgment)模式下每秒执行的轮询(polling)操作次数。此指标反映消费者从队列获取消息的频率,若数值异常可能因消费者处理能力不足或网络延迟导致,需优化轮询间隔或提升消费者性能。",
"id": "2f34e8c7-e7fb-4695-afce-034a10081437",
"layout": {
"h": 3,
"i": "2f34e8c7-e7fb-4695-afce-034a10081437",
"w": 12,
"x": 12,
"y": 27
},
"name": "每秒自动确认模式下的轮询操作数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者每秒执行轮询(polling)操作但未获取到任何消息的次数。此指标反映队列的空闲程度或消费者轮询效率,若数值过高可能因队列消息不足或轮询间隔不合理导致,需调整消费者轮询策略或检查生产者消息生成情况。",
"id": "c7b1f4b8-5069-480f-a1f7-1c85dbc389c1",
"layout": {
"h": 3,
"i": "c7b1f4b8-5069-480f-a1f7-1c85dbc389c1",
"w": 12,
"x": 0,
"y": 28
},
"name": "每秒无结果的轮询操作数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_empty_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示消费者在手动确认(manual acknowledgment)模式下每秒执行的轮询(polling)操作次数。此指标反映消费者从队列获取消息并需显式确认(ACK/NACK)的处理频率,若数值异常可能因消费者处理延迟或消息积压导致,需优化确认逻辑或调整并发处理能力。",
"id": "4c2f60db-3cb4-4926-944c-022cf876eec2",
"layout": {
"h": 3,
"i": "4c2f60db-3cb4-4926-944c-022cf876eec2",
"w": 12,
"x": 12,
"y": 28
},
"name": "每秒手动确认模式下的轮询操作数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b6aa6f06-924f-4575-b4e0-d116ab744ea1",
"layout": {
"h": 1,
"i": "b6aa6f06-924f-4575-b4e0-d116ab744ea1",
"w": 24,
"x": 0,
"y": 31
},
"name": "QUEUES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示当前 RabbitMQ 集群或节点上存在的队列总数量。此指标反映系统整体负载和资源占用情况,若数值过高可能增加管理复杂度或影响性能,需结合队列消息数、消费者数量等指标评估是否需要合并或清理闲置队列。",
"id": "c26434ca-065f-4088-81c6-ef8f0cbca552",
"layout": {
"h": 3,
"i": "c26434ca-065f-4088-81c6-ef8f0cbca552",
"w": 12,
"x": 0,
"y": 32
},
"name": "队列总数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_queues * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 上创建(声明)的新队列数量。此指标反映应用程序动态管理队列的活跃程度,若数值过高可能因频繁创建/删除队列导致资源开销增加,需检查是否存在不必要的临时队列或优化队列生命周期管理。",
"id": "ff021951-7991-4c3e-a667-8cd11e5c444c",
"layout": {
"h": 3,
"i": "ff021951-7991-4c3e-a667-8cd11e5c444c",
"w": 4,
"x": 12,
"y": 32
},
"name": "每秒队列声明数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_declared_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 中新创建的队列数量。此指标反映系统处理新队列请求的频率,数值异常高可能表明应用程序存在频繁创建队列的行为,需结合业务逻辑判断是否合理,避免因过多队列导致资源浪费或性能下降。",
"id": "00bbb4fc-5cdf-4b29-a440-ff4da4325a0c",
"layout": {
"h": 3,
"i": "00bbb4fc-5cdf-4b29-a440-ff4da4325a0c",
"w": 4,
"x": 16,
"y": 32
},
"name": "每秒队列创建数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_created_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 中被删除的队列数量。此指标反映系统清理或回收队列资源的频率。若数值异常高,可能表明应用程序存在频繁创建后立即删除队列的行为,需检查是否存在资源管理不当或临时队列滥用的情况,以避免不必要的性能开销。",
"id": "f802e41d-14fe-4193-a5cf-c31957b146f7",
"layout": {
"h": 3,
"i": "f802e41d-14fe-4193-a5cf-c31957b146f7",
"w": 4,
"x": 20,
"y": 32
},
"name": "每秒队列删除数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_deleted_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "5c7acadf-f9ff-4db9-a284-a206be245733",
"layout": {
"h": 1,
"i": "5c7acadf-f9ff-4db9-a284-a206be245733",
"w": 24,
"x": 0,
"y": 35
},
"name": "CHANNELS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示当前 RabbitMQ 中所有活跃的 AMQP 通道数量。通道是客户端与服务器通信的轻量级连接,每个连接可以包含多个通道。此指标反映系统当前的并发通信负载,数值过高可能增加服务器资源消耗,需结合连接数监控以避免性能瓶颈或内存泄漏问题。",
"id": "362c622f-3fd8-4bdd-8dce-7ebf335f42f9",
"layout": {
"h": 3,
"i": "362c622f-3fd8-4bdd-8dce-7ebf335f42f9",
"w": 12,
"x": 0,
"y": 36
},
"name": "通道总数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_channels * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 中新创建的 AMQP 通道数量。通道用于客户端与服务器之间的高效通信,但过多频繁创建可能导致资源消耗增加。若此指标异常升高,可能表明应用程序存在短生命周期通道滥用或连接管理不当,需优化以避免性能下降或内存压力。",
"id": "3a1a643b-8e2a-4ed1-8621-e5fae3ebc7c2",
"layout": {
"h": 3,
"i": "3a1a643b-8e2a-4ed1-8621-e5fae3ebc7c2",
"w": 6,
"x": 12,
"y": 36
},
"name": "每秒通道创建数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channels_opened_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 中被关闭的 AMQP 通道数量。通道关闭是正常通信结束或异常中断的结果。若此指标与通道创建速率(Channels opened/s)长期不匹配,可能反映连接泄漏、客户端未正确释放资源或频繁重连问题,需结合错误日志排查潜在隐患。",
"id": "01f1771a-dd54-45e5-aa1d-b6ee2111c53b",
"layout": {
"h": 3,
"i": "01f1771a-dd54-45e5-aa1d-b6ee2111c53b",
"w": 6,
"x": 18,
"y": 36
},
"name": "每秒通道关闭数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channels_closed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "419f22e8-08cc-4d91-8bc9-7c1055368146",
"layout": {
"h": 1,
"i": "419f22e8-08cc-4d91-8bc9-7c1055368146",
"w": 24,
"x": 0,
"y": 39
},
"name": "CONNECTIONS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 中被关闭的客户端连接数量。连接关闭可能是正常终止(如客户端主动断开)或异常中断(如网络问题、心跳超时)。若此指标突增或与连接创建速率(Connections opened/s)不匹配,可能表明客户端存在异常重连、资源未释放或服务端主动断开(如流量限制、认证失败等),需结合日志分析具体原因。",
"id": "c14fd6f4-d3de-4811-82c6-2b20c9146e89",
"layout": {
"h": 3,
"i": "c14fd6f4-d3de-4811-82c6-2b20c9146e89",
"w": 6,
"x": 18,
"y": 40
},
"name": "每秒连接关闭数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_connections_closed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示 RabbitMQ 当前维护的所有客户端 TCP 连接数量。该指标直接反映系统负载和资源占用情况。若数值异常高(接近系统限制),可能导致文件描述符耗尽、内存压力增大或性能下降,需检查客户端连接池配置或是否存在连接泄漏问题。正常情况下应与业务流量趋势一致。",
"id": "1d06679b-c603-4cbf-85a7-7c8f4594258f",
"layout": {
"h": 3,
"i": "1d06679b-c603-4cbf-85a7-7c8f4594258f",
"w": 12,
"x": 0,
"y": 40
},
"name": "当前活跃连接总数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_connections * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "表示每秒在 RabbitMQ 中新建立的客户端连接数量。该指标反映客户端连接请求的活跃程度。若数值突增,可能因客户端频繁重连(如配置错误、网络抖动)或突发流量导致,需结合连接关闭速率(Connections closed/s)分析是否存在异常短连接。长期过高可能增加服务端负载,需优化客户端连接复用策略。",
"id": "8c91f7fe-79e9-454a-9dd3-6f214a29e1eb",
"layout": {
"h": 3,
"i": "8c91f7fe-79e9-454a-9dd3-6f214a29e1eb",
"w": 6,
"x": 12,
"y": 40
},
"name": "每秒新建连接数",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_connections_opened_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"name": "prom",
"type": "datasource",
"hide": false,
"definition": "prometheus",
"defaultValue": 1
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(rabbitmq_identity_info, rabbitmq_cluster)",
"name": "rabbitmq_cluster",
"type": "query"
}
],
"version": "3.0.0",
"graphTooltip": "sharedCrosshair"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328326911000
}
================================================
FILE: integrations/RabbitMQ/dashboards/rabbitmq_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "RabbitMQ 3.8",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "4466a232-248d-45a8-bf4d-05d5139c7346",
"layout": {
"h": 1,
"i": "4466a232-248d-45a8-bf4d-05d5139c7346",
"w": 24,
"x": 0,
"y": 0
},
"name": "Overview",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "a20b5d06-4343-457a-89f2-33a52c4dec04",
"layout": {
"h": 3,
"i": "a20b5d06-4343-457a-89f2-33a52c4dec04",
"w": 7,
"x": 0,
"y": 1
},
"name": "Ready messages",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10000
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 100000
},
"result": {
"color": "#f50a0a"
},
"type": "range"
},
{
"match": {
"to": 9999
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "893409b0-4ca0-450b-a0c9-f48eddf0e243",
"layout": {
"h": 3,
"i": "893409b0-4ca0-450b-a0c9-f48eddf0e243",
"w": 5,
"x": 7,
"y": 1
},
"name": "Incoming messages / s",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "d596e7f0-5095-420e-bc38-674001dcf5f4",
"layout": {
"h": 3,
"i": "d596e7f0-5095-420e-bc38-674001dcf5f4",
"w": 4,
"x": 12,
"y": 1
},
"name": "Publishers",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channels * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) - sum(rabbitmq_channel_consumers * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9f6d7dee-666d-4e1b-90d0-129b2e5ba085",
"layout": {
"h": 3,
"i": "9f6d7dee-666d-4e1b-90d0-129b2e5ba085",
"w": 4,
"x": 16,
"y": 1
},
"name": "Connections",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_connections * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "910eae0f-2b78-4d10-a780-8f997f6e96cb",
"layout": {
"h": 3,
"i": "910eae0f-2b78-4d10-a780-8f997f6e96cb",
"w": 4,
"x": 20,
"y": 1
},
"name": "Queues",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queues * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2e8cd60f-51b0-46b2-8c0f-bf55604d340d",
"layout": {
"h": 3,
"i": "2e8cd60f-51b0-46b2-8c0f-bf55604d340d",
"w": 7,
"x": 0,
"y": 2
},
"name": "Unacknowledged messages",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 99
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 500
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unacked * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4b242c1e-85d5-48b3-8cce-b467209245ec",
"layout": {
"h": 3,
"i": "4b242c1e-85d5-48b3-8cce-b467209245ec",
"w": 5,
"x": 7,
"y": 2
},
"name": "Outgoing messages / s",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_redelivered_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_get_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_get_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b87b1c6b-644a-42f4-915a-bd6857540f70",
"layout": {
"h": 3,
"i": "b87b1c6b-644a-42f4-915a-bd6857540f70",
"w": 4,
"x": 12,
"y": 2
},
"name": "Consumers",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channel_consumers * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "65103bcf-fb21-488d-a06b-7b0ea130ca4d",
"layout": {
"h": 3,
"i": "65103bcf-fb21-488d-a06b-7b0ea130ca4d",
"w": 4,
"x": 16,
"y": 2
},
"name": "Channels",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channels * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c6454712-e265-4387-ae86-9ac865af46f2",
"layout": {
"h": 3,
"i": "c6454712-e265-4387-ae86-9ac865af46f2",
"w": 4,
"x": 20,
"y": 2
},
"name": "Nodes",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 3,
"to": null
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 8
},
"result": {
"color": "#e70909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "1712a96f-bcde-4d33-90b6-e2eba20527b9",
"layout": {
"h": 1,
"i": "1712a96f-bcde-4d33-90b6-e2eba20527b9",
"w": 24,
"x": 0,
"y": 5
},
"name": "Nodes",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"columns": [
"rabbitmq_cluster",
"rabbitmq_node",
"rabbitmq_version",
"erlang_version"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1b3fdea0-1921-48ae-b11f-31cc27f816b0",
"layout": {
"h": 3,
"i": "1b3fdea0-1921-48ae-b11f-31cc27f816b0",
"w": 24,
"x": 0,
"y": 6
},
"name": "nodes",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "If the value is zero or less, the memory alarm will be triggered and all publishing connections across all cluster nodes will be blocked.\n\nThis value can temporarily go negative because the memory alarm is triggered with a slight delay.\n\nThe kernel's view of the amount of memory used by the node can differ from what the node itself can observe. This means that this value can be negative for a sustained period of time.\n\nBy default nodes use resident set size (RSS) to compute how much memory they use. This strategy can be changed (see the guides below).\n\n* [Alarms](https://www.rabbitmq.com/alarms.html)\n* [Memory Alarms](https://www.rabbitmq.com/memory.html)\n* [Reasoning About Memory Use](https://www.rabbitmq.com/memory-use.html)\n* [Blocked Connection Notifications](https://www.rabbitmq.com/connection-blocked.html)",
"id": "b91068ed-0914-4a8d-91dd-9ffb3b692516",
"layout": {
"h": 7,
"i": "b91068ed-0914-4a8d-91dd-9ffb3b692516",
"w": 8,
"x": 0,
"y": 7
},
"name": "Memory available before publishers blocked",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_resident_memory_limit_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_resident_memory_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "This metric is reported for the partition where the RabbitMQ data directory is stored.\n\nIf the value is zero or less, the disk alarm will be triggered and all publishing connections across all cluster nodes will be blocked.\n\nThis value can temporarily go negative because the free disk space alarm is triggered with a slight delay.\n\n* [Alarms](https://www.rabbitmq.com/alarms.html)\n* [Disk Space Alarms](https://www.rabbitmq.com/disk-alarms.html)\n* [Disk Space](https://www.rabbitmq.com/production-checklist.html#resource-limits-disk-space)\n* [Persistence Configuration](https://www.rabbitmq.com/persistence-conf.html)\n* [Blocked Connection Notifications](https://www.rabbitmq.com/connection-blocked.html)",
"id": "a74ca489-6101-49bc-9560-4cde368bc47e",
"layout": {
"h": 7,
"i": "a74ca489-6101-49bc-9560-4cde368bc47e",
"w": 8,
"x": 8,
"y": 7
},
"name": "Disk space available before publishers blocked",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_disk_space_available_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "When this value reaches zero, new connections will not be accepted and disk write operations may fail.\n\nClient libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.\n\n* [Open File Handles Limit](https://www.rabbitmq.com/production-checklist.html#resource-limits-file-handle-limit)",
"id": "100ed9ae-2b84-4f2b-82b2-0444dd28deed",
"layout": {
"h": 3,
"i": "100ed9ae-2b84-4f2b-82b2-0444dd28deed",
"w": 8,
"x": 16,
"y": 7
},
"name": "File descriptors available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_process_max_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_open_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "When this value reaches zero, new connections will not be accepted.\n\nClient libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.\n\n* [Networking and RabbitMQ](https://www.rabbitmq.com/networking.html)",
"id": "a4859891-2538-47fc-946b-7ae3aa507c51",
"layout": {
"h": 3,
"i": "a4859891-2538-47fc-946b-7ae3aa507c51",
"w": 8,
"x": 16,
"y": 8
},
"name": "TCP sockets available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_process_max_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_open_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fec98a71-0615-4782-bf8d-960529e243f9",
"layout": {
"h": 1,
"i": "fec98a71-0615-4782-bf8d-960529e243f9",
"w": 24,
"x": 0,
"y": 14
},
"name": "QUEUED MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of ready messages ready to be delivered to consumers.\n\nAim to keep this value as low as possible. RabbitMQ behaves best when messages are flowing through it. It's OK for publishers to occasionally outpace consumers, but the expectation is that consumers will eventually process all ready messages.\n\nIf this metric keeps increasing, your system will eventually run out of memory and/or disk space. Consider using TTL or Queue Length Limit to prevent unbounded message growth.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Consumers](https://www.rabbitmq.com/consumers.html)\n* [Queue Length Limit](https://www.rabbitmq.com/maxlength.html)\n* [Time-To-Live and Expiration](https://www.rabbitmq.com/ttl.html)",
"id": "145dc75a-d3b8-491f-9ba6-6da787c8e265",
"layout": {
"h": 3,
"i": "145dc75a-d3b8-491f-9ba6-6da787c8e265",
"w": 12,
"x": 0,
"y": 15
},
"name": "Messages ready to be delivered to consumers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The total number of messages that are either in-flight to consumers, currently being processed by consumers or simply waiting for the consumer acknowledgements to be processed by the queue. Until the queue processes the message acknowledgement, the message will remain unacknowledged.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Confirms and Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)",
"id": "a6ca328a-8b19-488e-a70d-74372f994901",
"layout": {
"h": 3,
"i": "a6ca328a-8b19-488e-a70d-74372f994901",
"w": 12,
"x": 12,
"y": 15
},
"name": "Messages pending consumer acknowledgement",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unacked * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "500c51ed-a0d6-41d9-903f-d000e289dc2b",
"layout": {
"h": 1,
"i": "500c51ed-a0d6-41d9-903f-d000e289dc2b",
"w": 24,
"x": 0,
"y": 18
},
"name": "INCOMING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The incoming message rate before any routing rules are applied.\n\nIf this value is lower than the number of messages published to queues, it may indicate that some messages are delivered to more than one queue.\n\nIf this value is higher than the number of messages published to queues, messages cannot be routed and will either be dropped or returned to publishers.\n\n* [Publishers](https://www.rabbitmq.com/publishers.html)",
"id": "ef4352a4-c281-4596-b89e-d79a565ca112",
"layout": {
"h": 3,
"i": "ef4352a4-c281-4596-b89e-d79a565ca112",
"w": 12,
"x": 0,
"y": 19
},
"name": "Messages published / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages confirmed by the broker to publishers. Publishers must opt-in to receive message confirmations.\n\nIf this metric is consistently at zero it may suggest that publisher confirms are not used by clients. The safety of published messages is likely to be at risk.\n\n* [Publisher Confirms](https://www.rabbitmq.com/confirms.html#publisher-confirms)\n* [Publisher Confirms and Data Safety](https://www.rabbitmq.com/publishers.html#data-safety)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "f0932549-d4a7-4eb1-a86e-35eb49569f29",
"layout": {
"h": 3,
"i": "f0932549-d4a7-4eb1-a86e-35eb49569f29",
"w": 12,
"x": 12,
"y": 19
},
"name": "Messages confirmed to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_confirmed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages received from publishers and successfully routed to the master queue replicas.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Publishers](https://www.rabbitmq.com/publishers.html)",
"id": "f47766cb-3cdb-4f67-896f-77af421ff404",
"layout": {
"h": 3,
"i": "f47766cb-3cdb-4f67-896f-77af421ff404",
"w": 12,
"x": 0,
"y": 20
},
"name": "Messages routed to queues / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages received from publishers that have publisher confirms enabled and the broker has not confirmed yet.\n\n* [Publishers](https://www.rabbitmq.com/publishers.html)\n* [Confirms and Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "644302ba-3c99-4787-8023-93770b0a9e6c",
"layout": {
"h": 3,
"i": "644302ba-3c99-4787-8023-93770b0a9e6c",
"w": 12,
"x": 12,
"y": 20
},
"name": "Messages unconfirmed to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unconfirmed[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages that cannot be routed and are dropped. \n\nAny value above zero means message loss and likely suggests a routing problem on the publisher end.\n\n* [Unroutable Message Handling](https://www.rabbitmq.com/publishers.html#unroutable)",
"id": "0a3eadfd-d84c-4070-9527-b4c62fc20787",
"layout": {
"h": 3,
"i": "0a3eadfd-d84c-4070-9527-b4c62fc20787",
"w": 12,
"x": 0,
"y": 21
},
"name": "Unroutable messages dropped / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unroutable_dropped_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages that cannot be routed and are returned back to publishers.\n\nSustained values above zero may indicate a routing problem on the publisher end.\n\n* [Unroutable Message Handling](https://www.rabbitmq.com/publishers.html#unroutable)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "a14e8796-9614-4204-b5b8-5d1a47f356d8",
"layout": {
"h": 3,
"i": "a14e8796-9614-4204-b5b8-5d1a47f356d8",
"w": 12,
"x": 12,
"y": 21
},
"name": "Unroutable messages returned to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unroutable_returned_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2405258c-c08f-4e49-960a-5c9a12b29f12",
"layout": {
"h": 1,
"i": "2405258c-c08f-4e49-960a-5c9a12b29f12",
"w": 24,
"x": 0,
"y": 24
},
"name": "OUTGOING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages delivered to consumers. It includes messages that have been redelivered.\n\nThis metric does not include messages that have been fetched by consumers using `basic.get` (consumed by polling).\n\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "be0f1872-172c-4bc3-a901-4b645ebf5abe",
"layout": {
"h": 3,
"i": "be0f1872-172c-4bc3-a901-4b645ebf5abe",
"w": 12,
"x": 0,
"y": 25
},
"name": "Messages delivered / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(\n (rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\n (rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})\n) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages that have been redelivered to consumers. It includes messages that have been requeued automatically and redelivered due to channel exceptions or connection closures.\n\nHaving some redeliveries is expected, but if this metric is consistently non-zero, it is worth investigating why.\n\n* [Negative Acknowledgement and Requeuing of Deliveries](https://www.rabbitmq.com/confirms.html#consumer-nacks-requeue)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "eff794ca-e844-4a12-b230-690aadefa53f",
"layout": {
"h": 3,
"i": "eff794ca-e844-4a12-b230-690aadefa53f",
"w": 12,
"x": 12,
"y": 25
},
"name": "Messages redelivered / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_redelivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of message deliveries to consumers that use manual acknowledgement mode.\n\nWhen this mode is used, RabbitMQ waits for consumers to acknowledge messages before more messages can be delivered.\n\nThis is the safest way of consuming messages.\n\n* [Consumer Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "2ed4be63-4fe9-462f-bc2f-967319bc3626",
"layout": {
"h": 3,
"i": "2ed4be63-4fe9-462f-bc2f-967319bc3626",
"w": 12,
"x": 0,
"y": 26
},
"name": "Messages delivered with manual ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of message deliveries to consumers that use automatic acknowledgement mode.\n\nWhen this mode is used, RabbitMQ does not wait for consumers to acknowledge message deliveries.\n\nThis mode is fire-and-forget and does not offer any delivery safety guarantees. It tends to provide higher throughput and it may lead to consumer overload and higher consumer memory usage.\n\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "116c44e0-c8e1-4f02-8eae-2140997e2280",
"layout": {
"h": 3,
"i": "116c44e0-c8e1-4f02-8eae-2140997e2280",
"w": 12,
"x": 12,
"y": 26
},
"name": "Messages delivered auto ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of message acknowledgements coming from consumers that use manual acknowledgement mode.\n\n* [Consumer Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "a1184534-3226-4c9a-ba6c-6d5258998518",
"layout": {
"h": 3,
"i": "a1184534-3226-4c9a-ba6c-6d5258998518",
"w": 12,
"x": 0,
"y": 27
},
"name": "Messages acknowledged / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_acked_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages delivered to polling consumers that use automatic acknowledgement mode.\n\nThe use of polling consumers is highly inefficient and therefore strongly discouraged.\n\n* [Fetching individual messages](https://www.rabbitmq.com/consumers.html#fetching)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "2f34e8c7-e7fb-4695-afce-034a10081437",
"layout": {
"h": 3,
"i": "2f34e8c7-e7fb-4695-afce-034a10081437",
"w": 12,
"x": 12,
"y": 27
},
"name": "Polling operations with auto ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of polling consumer operations that yield no result.\n\nAny value above zero means that RabbitMQ resources are wasted by polling consumers.\n\nCompare this metric to the other polling consumer metrics to see the inefficiency rate.\n\nThe use of polling consumers is highly inefficient and therefore strongly discouraged.\n\n* [Fetching individual messages](https://www.rabbitmq.com/consumers.html#fetching)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "c7b1f4b8-5069-480f-a1f7-1c85dbc389c1",
"layout": {
"h": 3,
"i": "c7b1f4b8-5069-480f-a1f7-1c85dbc389c1",
"w": 12,
"x": 0,
"y": 28
},
"name": "Polling operations that yield no result / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_empty_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages delivered to polling consumers that use manual acknowledgement mode.\n\nThe use of polling consumers is highly inefficient and therefore strongly discouraged.\n\n* [Fetching individual messages](https://www.rabbitmq.com/consumers.html#fetching)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "4c2f60db-3cb4-4926-944c-022cf876eec2",
"layout": {
"h": 3,
"i": "4c2f60db-3cb4-4926-944c-022cf876eec2",
"w": 12,
"x": 12,
"y": 28
},
"name": "Polling operations with manual ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b6aa6f06-924f-4575-b4e0-d116ab744ea1",
"layout": {
"h": 1,
"i": "b6aa6f06-924f-4575-b4e0-d116ab744ea1",
"w": 24,
"x": 0,
"y": 31
},
"name": "QUEUES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of queue masters per node. \n\nThis metric makes it easy to see sub-optimal queue distribution in a cluster.\n\n* [Queue Masters, Data Locality](https://www.rabbitmq.com/ha.html#master-migration-data-locality)\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "c26434ca-065f-4088-81c6-ef8f0cbca552",
"layout": {
"h": 3,
"i": "c26434ca-065f-4088-81c6-ef8f0cbca552",
"w": 12,
"x": 0,
"y": 32
},
"name": "Total queues",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_queues * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of queue declarations performed by clients.\n\nLow sustained values above zero are to be expected. High rates may be indicative of queue churn or high rates of connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "ff021951-7991-4c3e-a667-8cd11e5c444c",
"layout": {
"h": 3,
"i": "ff021951-7991-4c3e-a667-8cd11e5c444c",
"w": 4,
"x": 12,
"y": 32
},
"name": "Queues declared / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_declared_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of new queues created (as opposed to redeclarations).\n\nLow sustained values above zero are to be expected. High rates may be indicative of queue churn or high rates of connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "00bbb4fc-5cdf-4b29-a440-ff4da4325a0c",
"layout": {
"h": 3,
"i": "00bbb4fc-5cdf-4b29-a440-ff4da4325a0c",
"w": 4,
"x": 16,
"y": 32
},
"name": "Queues created / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_created_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of queues deleted.\n\nLow sustained values above zero are to be expected. High rates may be indicative of queue churn or high rates of connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "f802e41d-14fe-4193-a5cf-c31957b146f7",
"layout": {
"h": 3,
"i": "f802e41d-14fe-4193-a5cf-c31957b146f7",
"w": 4,
"x": 20,
"y": 32
},
"name": "Queues deleted / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_deleted_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "5c7acadf-f9ff-4db9-a284-a206be245733",
"layout": {
"h": 1,
"i": "5c7acadf-f9ff-4db9-a284-a206be245733",
"w": 24,
"x": 0,
"y": 35
},
"name": "CHANNELS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of channels on all currently opened connections.\n\nIf this metric grows monotonically it is highly likely a channel leak in one of the applications. Confirm channel leaks by using the _Channels opened_ and _Channels closed_ metrics.\n\n* [Channel Leak](https://www.rabbitmq.com/channels.html#channel-leaks)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "362c622f-3fd8-4bdd-8dce-7ebf335f42f9",
"layout": {
"h": 3,
"i": "362c622f-3fd8-4bdd-8dce-7ebf335f42f9",
"w": 12,
"x": 0,
"y": 36
},
"name": "Total channels",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_channels * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of new channels opened by applications across all connections. Channels are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of channel churn or mass connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [High Channel Churn](https://www.rabbitmq.com/channels.html#high-channel-churn)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "3a1a643b-8e2a-4ed1-8621-e5fae3ebc7c2",
"layout": {
"h": 3,
"i": "3a1a643b-8e2a-4ed1-8621-e5fae3ebc7c2",
"w": 6,
"x": 12,
"y": 36
},
"name": "Channels opened / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channels_opened_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of channels closed by applications across all connections. Channels are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of channel churn or mass connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [High Channel Churn](https://www.rabbitmq.com/channels.html#high-channel-churn)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "01f1771a-dd54-45e5-aa1d-b6ee2111c53b",
"layout": {
"h": 3,
"i": "01f1771a-dd54-45e5-aa1d-b6ee2111c53b",
"w": 6,
"x": 18,
"y": 36
},
"name": "Channels closed / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channels_closed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "419f22e8-08cc-4d91-8bc9-7c1055368146",
"layout": {
"h": 1,
"i": "419f22e8-08cc-4d91-8bc9-7c1055368146",
"w": 24,
"x": 0,
"y": 39
},
"name": "CONNECTIONS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of connections closed. Connections are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of connection churn or mass connection recovery.\n\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "c14fd6f4-d3de-4811-82c6-2b20c9146e89",
"layout": {
"h": 3,
"i": "c14fd6f4-d3de-4811-82c6-2b20c9146e89",
"w": 6,
"x": 18,
"y": 40
},
"name": "Connections closed / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_connections_closed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of client connections.\n\nIf this metric grows monotonically it is highly likely a connection leak in one of the applications. Confirm connection leaks by using the _Connections opened_ and _Connections closed_ metrics.\n\n* [Connection Leak](https://www.rabbitmq.com/connections.html#monitoring)\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "1d06679b-c603-4cbf-85a7-7c8f4594258f",
"layout": {
"h": 3,
"i": "1d06679b-c603-4cbf-85a7-7c8f4594258f",
"w": 12,
"x": 0,
"y": 40
},
"name": "Total connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_connections * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of new connections opened by clients. Connections are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of connection churn or mass connection recovery.\n\n* [Connection Leak](https://www.rabbitmq.com/connections.html#monitoring)\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "8c91f7fe-79e9-454a-9dd3-6f214a29e1eb",
"layout": {
"h": 3,
"i": "8c91f7fe-79e9-454a-9dd3-6f214a29e1eb",
"w": 6,
"x": 12,
"y": 40
},
"name": "Connections opened / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_connections_opened_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(rabbitmq_identity_info, rabbitmq_cluster)",
"name": "rabbitmq_cluster",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328323134000
}
================================================
FILE: integrations/RabbitMQ/dashboards/rabbitmq_v3.8_gt.json
================================================
{
"id": 0,
"group_id": 0,
"name": "RabbitMQ 3.8+",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "4466a232-248d-45a8-bf4d-05d5139c7346",
"layout": {
"h": 1,
"i": "4466a232-248d-45a8-bf4d-05d5139c7346",
"w": 24,
"x": 0,
"y": 0
},
"name": "Overview",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "a20b5d06-4343-457a-89f2-33a52c4dec04",
"layout": {
"h": 3,
"i": "a20b5d06-4343-457a-89f2-33a52c4dec04",
"w": 7,
"x": 0,
"y": 1
},
"name": "Ready messages",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10000
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 100000
},
"result": {
"color": "#f50a0a"
},
"type": "range"
},
{
"match": {
"to": 9999
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "893409b0-4ca0-450b-a0c9-f48eddf0e243",
"layout": {
"h": 3,
"i": "893409b0-4ca0-450b-a0c9-f48eddf0e243",
"w": 5,
"x": 7,
"y": 1
},
"name": "Incoming messages / s",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "d596e7f0-5095-420e-bc38-674001dcf5f4",
"layout": {
"h": 3,
"i": "d596e7f0-5095-420e-bc38-674001dcf5f4",
"w": 4,
"x": 12,
"y": 1
},
"name": "Publishers",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channels * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) - sum(rabbitmq_channel_consumers * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9f6d7dee-666d-4e1b-90d0-129b2e5ba085",
"layout": {
"h": 3,
"i": "9f6d7dee-666d-4e1b-90d0-129b2e5ba085",
"w": 4,
"x": 16,
"y": 1
},
"name": "Connections",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_connections * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "910eae0f-2b78-4d10-a780-8f997f6e96cb",
"layout": {
"h": 3,
"i": "910eae0f-2b78-4d10-a780-8f997f6e96cb",
"w": 4,
"x": 20,
"y": 1
},
"name": "Queues",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queues * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2e8cd60f-51b0-46b2-8c0f-bf55604d340d",
"layout": {
"h": 3,
"i": "2e8cd60f-51b0-46b2-8c0f-bf55604d340d",
"w": 7,
"x": 0,
"y": 2
},
"name": "Unacknowledged messages",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 99
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 500
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unacked * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4b242c1e-85d5-48b3-8cce-b467209245ec",
"layout": {
"h": 3,
"i": "4b242c1e-85d5-48b3-8cce-b467209245ec",
"w": 5,
"x": 7,
"y": 2
},
"name": "Outgoing messages / s",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_redelivered_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_get_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\nsum(rate(rabbitmq_channel_get_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b87b1c6b-644a-42f4-915a-bd6857540f70",
"layout": {
"h": 3,
"i": "b87b1c6b-644a-42f4-915a-bd6857540f70",
"w": 4,
"x": 12,
"y": 2
},
"name": "Consumers",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channel_consumers * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "65103bcf-fb21-488d-a06b-7b0ea130ca4d",
"layout": {
"h": 3,
"i": "65103bcf-fb21-488d-a06b-7b0ea130ca4d",
"w": 4,
"x": 16,
"y": 2
},
"name": "Channels",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_channels * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c6454712-e265-4387-ae86-9ac865af46f2",
"layout": {
"h": 3,
"i": "c6454712-e265-4387-ae86-9ac865af46f2",
"w": 4,
"x": 20,
"y": 2
},
"name": "Nodes",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 3,
"to": null
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 8
},
"result": {
"color": "#e70909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "1712a96f-bcde-4d33-90b6-e2eba20527b9",
"layout": {
"h": 1,
"i": "1712a96f-bcde-4d33-90b6-e2eba20527b9",
"w": 24,
"x": 0,
"y": 5
},
"name": "Nodes",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"columns": [
"rabbitmq_cluster",
"rabbitmq_node",
"rabbitmq_version",
"erlang_version"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1b3fdea0-1921-48ae-b11f-31cc27f816b0",
"layout": {
"h": 3,
"i": "1b3fdea0-1921-48ae-b11f-31cc27f816b0",
"w": 24,
"x": 0,
"y": 6
},
"name": "nodes",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "If the value is zero or less, the memory alarm will be triggered and all publishing connections across all cluster nodes will be blocked.\n\nThis value can temporarily go negative because the memory alarm is triggered with a slight delay.\n\nThe kernel's view of the amount of memory used by the node can differ from what the node itself can observe. This means that this value can be negative for a sustained period of time.\n\nBy default nodes use resident set size (RSS) to compute how much memory they use. This strategy can be changed (see the guides below).\n\n* [Alarms](https://www.rabbitmq.com/alarms.html)\n* [Memory Alarms](https://www.rabbitmq.com/memory.html)\n* [Reasoning About Memory Use](https://www.rabbitmq.com/memory-use.html)\n* [Blocked Connection Notifications](https://www.rabbitmq.com/connection-blocked.html)",
"id": "b91068ed-0914-4a8d-91dd-9ffb3b692516",
"layout": {
"h": 7,
"i": "b91068ed-0914-4a8d-91dd-9ffb3b692516",
"w": 8,
"x": 0,
"y": 7
},
"name": "Memory available before publishers blocked",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_resident_memory_limit_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_resident_memory_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "This metric is reported for the partition where the RabbitMQ data directory is stored.\n\nIf the value is zero or less, the disk alarm will be triggered and all publishing connections across all cluster nodes will be blocked.\n\nThis value can temporarily go negative because the free disk space alarm is triggered with a slight delay.\n\n* [Alarms](https://www.rabbitmq.com/alarms.html)\n* [Disk Space Alarms](https://www.rabbitmq.com/disk-alarms.html)\n* [Disk Space](https://www.rabbitmq.com/production-checklist.html#resource-limits-disk-space)\n* [Persistence Configuration](https://www.rabbitmq.com/persistence-conf.html)\n* [Blocked Connection Notifications](https://www.rabbitmq.com/connection-blocked.html)",
"id": "a74ca489-6101-49bc-9560-4cde368bc47e",
"layout": {
"h": 7,
"i": "a74ca489-6101-49bc-9560-4cde368bc47e",
"w": 8,
"x": 8,
"y": 7
},
"name": "Disk space available before publishers blocked",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_disk_space_available_bytes * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "When this value reaches zero, new connections will not be accepted and disk write operations may fail.\n\nClient libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.\n\n* [Open File Handles Limit](https://www.rabbitmq.com/production-checklist.html#resource-limits-file-handle-limit)",
"id": "100ed9ae-2b84-4f2b-82b2-0444dd28deed",
"layout": {
"h": 3,
"i": "100ed9ae-2b84-4f2b-82b2-0444dd28deed",
"w": 8,
"x": 16,
"y": 7
},
"name": "File descriptors available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_process_max_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_open_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "When this value reaches zero, new connections will not be accepted.\n\nClient libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.\n\n* [Networking and RabbitMQ](https://www.rabbitmq.com/networking.html)",
"id": "a4859891-2538-47fc-946b-7ae3aa507c51",
"layout": {
"h": 3,
"i": "a4859891-2538-47fc-946b-7ae3aa507c51",
"w": 8,
"x": 16,
"y": 8
},
"name": "TCP sockets available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_process_max_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) -\n(rabbitmq_process_open_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fec98a71-0615-4782-bf8d-960529e243f9",
"layout": {
"h": 1,
"i": "fec98a71-0615-4782-bf8d-960529e243f9",
"w": 24,
"x": 0,
"y": 14
},
"name": "QUEUED MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of ready messages ready to be delivered to consumers.\n\nAim to keep this value as low as possible. RabbitMQ behaves best when messages are flowing through it. It's OK for publishers to occasionally outpace consumers, but the expectation is that consumers will eventually process all ready messages.\n\nIf this metric keeps increasing, your system will eventually run out of memory and/or disk space. Consider using TTL or Queue Length Limit to prevent unbounded message growth.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Consumers](https://www.rabbitmq.com/consumers.html)\n* [Queue Length Limit](https://www.rabbitmq.com/maxlength.html)\n* [Time-To-Live and Expiration](https://www.rabbitmq.com/ttl.html)",
"id": "145dc75a-d3b8-491f-9ba6-6da787c8e265",
"layout": {
"h": 3,
"i": "145dc75a-d3b8-491f-9ba6-6da787c8e265",
"w": 12,
"x": 0,
"y": 15
},
"name": "Messages ready to be delivered to consumers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The total number of messages that are either in-flight to consumers, currently being processed by consumers or simply waiting for the consumer acknowledgements to be processed by the queue. Until the queue processes the message acknowledgement, the message will remain unacknowledged.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Confirms and Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)",
"id": "a6ca328a-8b19-488e-a70d-74372f994901",
"layout": {
"h": 3,
"i": "a6ca328a-8b19-488e-a70d-74372f994901",
"w": 12,
"x": 12,
"y": 15
},
"name": "Messages pending consumer acknowledgement",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unacked * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "500c51ed-a0d6-41d9-903f-d000e289dc2b",
"layout": {
"h": 1,
"i": "500c51ed-a0d6-41d9-903f-d000e289dc2b",
"w": 24,
"x": 0,
"y": 18
},
"name": "INCOMING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The incoming message rate before any routing rules are applied.\n\nIf this value is lower than the number of messages published to queues, it may indicate that some messages are delivered to more than one queue.\n\nIf this value is higher than the number of messages published to queues, messages cannot be routed and will either be dropped or returned to publishers.\n\n* [Publishers](https://www.rabbitmq.com/publishers.html)",
"id": "ef4352a4-c281-4596-b89e-d79a565ca112",
"layout": {
"h": 3,
"i": "ef4352a4-c281-4596-b89e-d79a565ca112",
"w": 12,
"x": 0,
"y": 19
},
"name": "Messages published / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages confirmed by the broker to publishers. Publishers must opt-in to receive message confirmations.\n\nIf this metric is consistently at zero it may suggest that publisher confirms are not used by clients. The safety of published messages is likely to be at risk.\n\n* [Publisher Confirms](https://www.rabbitmq.com/confirms.html#publisher-confirms)\n* [Publisher Confirms and Data Safety](https://www.rabbitmq.com/publishers.html#data-safety)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "f0932549-d4a7-4eb1-a86e-35eb49569f29",
"layout": {
"h": 3,
"i": "f0932549-d4a7-4eb1-a86e-35eb49569f29",
"w": 12,
"x": 12,
"y": 19
},
"name": "Messages confirmed to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_confirmed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages received from publishers and successfully routed to the master queue replicas.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Publishers](https://www.rabbitmq.com/publishers.html)",
"id": "f47766cb-3cdb-4f67-896f-77af421ff404",
"layout": {
"h": 3,
"i": "f47766cb-3cdb-4f67-896f-77af421ff404",
"w": 12,
"x": 0,
"y": 20
},
"name": "Messages routed to queues / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_published_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages received from publishers that have publisher confirms enabled and the broker has not confirmed yet.\n\n* [Publishers](https://www.rabbitmq.com/publishers.html)\n* [Confirms and Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "644302ba-3c99-4787-8023-93770b0a9e6c",
"layout": {
"h": 3,
"i": "644302ba-3c99-4787-8023-93770b0a9e6c",
"w": 12,
"x": 12,
"y": 20
},
"name": "Messages unconfirmed to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unconfirmed[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages that cannot be routed and are dropped. \n\nAny value above zero means message loss and likely suggests a routing problem on the publisher end.\n\n* [Unroutable Message Handling](https://www.rabbitmq.com/publishers.html#unroutable)",
"id": "0a3eadfd-d84c-4070-9527-b4c62fc20787",
"layout": {
"h": 3,
"i": "0a3eadfd-d84c-4070-9527-b4c62fc20787",
"w": 12,
"x": 0,
"y": 21
},
"name": "Unroutable messages dropped / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unroutable_dropped_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages that cannot be routed and are returned back to publishers.\n\nSustained values above zero may indicate a routing problem on the publisher end.\n\n* [Unroutable Message Handling](https://www.rabbitmq.com/publishers.html#unroutable)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "a14e8796-9614-4204-b5b8-5d1a47f356d8",
"layout": {
"h": 3,
"i": "a14e8796-9614-4204-b5b8-5d1a47f356d8",
"w": 12,
"x": 12,
"y": 21
},
"name": "Unroutable messages returned to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_unroutable_returned_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2405258c-c08f-4e49-960a-5c9a12b29f12",
"layout": {
"h": 1,
"i": "2405258c-c08f-4e49-960a-5c9a12b29f12",
"w": 24,
"x": 0,
"y": 24
},
"name": "OUTGOING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages delivered to consumers. It includes messages that have been redelivered.\n\nThis metric does not include messages that have been fetched by consumers using `basic.get` (consumed by polling).\n\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "be0f1872-172c-4bc3-a901-4b645ebf5abe",
"layout": {
"h": 3,
"i": "be0f1872-172c-4bc3-a901-4b645ebf5abe",
"w": 12,
"x": 0,
"y": 25
},
"name": "Messages delivered / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(\n (rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) +\n (rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"})\n) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages that have been redelivered to consumers. It includes messages that have been requeued automatically and redelivered due to channel exceptions or connection closures.\n\nHaving some redeliveries is expected, but if this metric is consistently non-zero, it is worth investigating why.\n\n* [Negative Acknowledgement and Requeuing of Deliveries](https://www.rabbitmq.com/confirms.html#consumer-nacks-requeue)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "eff794ca-e844-4a12-b230-690aadefa53f",
"layout": {
"h": 3,
"i": "eff794ca-e844-4a12-b230-690aadefa53f",
"w": 12,
"x": 12,
"y": 25
},
"name": "Messages redelivered / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_redelivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of message deliveries to consumers that use manual acknowledgement mode.\n\nWhen this mode is used, RabbitMQ waits for consumers to acknowledge messages before more messages can be delivered.\n\nThis is the safest way of consuming messages.\n\n* [Consumer Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "2ed4be63-4fe9-462f-bc2f-967319bc3626",
"layout": {
"h": 3,
"i": "2ed4be63-4fe9-462f-bc2f-967319bc3626",
"w": 12,
"x": 0,
"y": 26
},
"name": "Messages delivered with manual ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_delivered_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of message deliveries to consumers that use automatic acknowledgement mode.\n\nWhen this mode is used, RabbitMQ does not wait for consumers to acknowledge message deliveries.\n\nThis mode is fire-and-forget and does not offer any delivery safety guarantees. It tends to provide higher throughput and it may lead to consumer overload and higher consumer memory usage.\n\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "116c44e0-c8e1-4f02-8eae-2140997e2280",
"layout": {
"h": 3,
"i": "116c44e0-c8e1-4f02-8eae-2140997e2280",
"w": 12,
"x": 12,
"y": 26
},
"name": "Messages delivered auto ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_delivered_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of message acknowledgements coming from consumers that use manual acknowledgement mode.\n\n* [Consumer Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "a1184534-3226-4c9a-ba6c-6d5258998518",
"layout": {
"h": 3,
"i": "a1184534-3226-4c9a-ba6c-6d5258998518",
"w": 12,
"x": 0,
"y": 27
},
"name": "Messages acknowledged / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_messages_acked_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages delivered to polling consumers that use automatic acknowledgement mode.\n\nThe use of polling consumers is highly inefficient and therefore strongly discouraged.\n\n* [Fetching individual messages](https://www.rabbitmq.com/consumers.html#fetching)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "2f34e8c7-e7fb-4695-afce-034a10081437",
"layout": {
"h": 3,
"i": "2f34e8c7-e7fb-4695-afce-034a10081437",
"w": 12,
"x": 12,
"y": 27
},
"name": "Polling operations with auto ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of polling consumer operations that yield no result.\n\nAny value above zero means that RabbitMQ resources are wasted by polling consumers.\n\nCompare this metric to the other polling consumer metrics to see the inefficiency rate.\n\nThe use of polling consumers is highly inefficient and therefore strongly discouraged.\n\n* [Fetching individual messages](https://www.rabbitmq.com/consumers.html#fetching)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "c7b1f4b8-5069-480f-a1f7-1c85dbc389c1",
"layout": {
"h": 3,
"i": "c7b1f4b8-5069-480f-a1f7-1c85dbc389c1",
"w": 12,
"x": 0,
"y": 28
},
"name": "Polling operations that yield no result / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_empty_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of messages delivered to polling consumers that use manual acknowledgement mode.\n\nThe use of polling consumers is highly inefficient and therefore strongly discouraged.\n\n* [Fetching individual messages](https://www.rabbitmq.com/consumers.html#fetching)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "4c2f60db-3cb4-4926-944c-022cf876eec2",
"layout": {
"h": 3,
"i": "4c2f60db-3cb4-4926-944c-022cf876eec2",
"w": 12,
"x": 12,
"y": 28
},
"name": "Polling operations with manual ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channel_get_ack_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b6aa6f06-924f-4575-b4e0-d116ab744ea1",
"layout": {
"h": 1,
"i": "b6aa6f06-924f-4575-b4e0-d116ab744ea1",
"w": 24,
"x": 0,
"y": 31
},
"name": "QUEUES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of queue masters per node. \n\nThis metric makes it easy to see sub-optimal queue distribution in a cluster.\n\n* [Queue Masters, Data Locality](https://www.rabbitmq.com/ha.html#master-migration-data-locality)\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "c26434ca-065f-4088-81c6-ef8f0cbca552",
"layout": {
"h": 3,
"i": "c26434ca-065f-4088-81c6-ef8f0cbca552",
"w": 12,
"x": 0,
"y": 32
},
"name": "Total queues",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_queues * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of queue declarations performed by clients.\n\nLow sustained values above zero are to be expected. High rates may be indicative of queue churn or high rates of connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "ff021951-7991-4c3e-a667-8cd11e5c444c",
"layout": {
"h": 3,
"i": "ff021951-7991-4c3e-a667-8cd11e5c444c",
"w": 4,
"x": 12,
"y": 32
},
"name": "Queues declared / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_declared_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of new queues created (as opposed to redeclarations).\n\nLow sustained values above zero are to be expected. High rates may be indicative of queue churn or high rates of connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "00bbb4fc-5cdf-4b29-a440-ff4da4325a0c",
"layout": {
"h": 3,
"i": "00bbb4fc-5cdf-4b29-a440-ff4da4325a0c",
"w": 4,
"x": 16,
"y": 32
},
"name": "Queues created / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_created_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of queues deleted.\n\nLow sustained values above zero are to be expected. High rates may be indicative of queue churn or high rates of connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "f802e41d-14fe-4193-a5cf-c31957b146f7",
"layout": {
"h": 3,
"i": "f802e41d-14fe-4193-a5cf-c31957b146f7",
"w": 4,
"x": 20,
"y": 32
},
"name": "Queues deleted / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queues_deleted_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "5c7acadf-f9ff-4db9-a284-a206be245733",
"layout": {
"h": 1,
"i": "5c7acadf-f9ff-4db9-a284-a206be245733",
"w": 24,
"x": 0,
"y": 35
},
"name": "CHANNELS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of channels on all currently opened connections.\n\nIf this metric grows monotonically it is highly likely a channel leak in one of the applications. Confirm channel leaks by using the _Channels opened_ and _Channels closed_ metrics.\n\n* [Channel Leak](https://www.rabbitmq.com/channels.html#channel-leaks)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "362c622f-3fd8-4bdd-8dce-7ebf335f42f9",
"layout": {
"h": 3,
"i": "362c622f-3fd8-4bdd-8dce-7ebf335f42f9",
"w": 12,
"x": 0,
"y": 36
},
"name": "Total channels",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_channels * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of new channels opened by applications across all connections. Channels are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of channel churn or mass connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [High Channel Churn](https://www.rabbitmq.com/channels.html#high-channel-churn)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "3a1a643b-8e2a-4ed1-8621-e5fae3ebc7c2",
"layout": {
"h": 3,
"i": "3a1a643b-8e2a-4ed1-8621-e5fae3ebc7c2",
"w": 6,
"x": 12,
"y": 36
},
"name": "Channels opened / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channels_opened_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of channels closed by applications across all connections. Channels are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of channel churn or mass connection recovery. Confirm connection recovery rates by using the _Connections opened_ metric.\n\n* [High Channel Churn](https://www.rabbitmq.com/channels.html#high-channel-churn)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "01f1771a-dd54-45e5-aa1d-b6ee2111c53b",
"layout": {
"h": 3,
"i": "01f1771a-dd54-45e5-aa1d-b6ee2111c53b",
"w": 6,
"x": 18,
"y": 36
},
"name": "Channels closed / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_channels_closed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "419f22e8-08cc-4d91-8bc9-7c1055368146",
"layout": {
"h": 1,
"i": "419f22e8-08cc-4d91-8bc9-7c1055368146",
"w": 24,
"x": 0,
"y": 39
},
"name": "CONNECTIONS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of connections closed. Connections are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of connection churn or mass connection recovery.\n\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "c14fd6f4-d3de-4811-82c6-2b20c9146e89",
"layout": {
"h": 3,
"i": "c14fd6f4-d3de-4811-82c6-2b20c9146e89",
"w": 6,
"x": 18,
"y": 40
},
"name": "Connections closed / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_connections_closed_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Total number of client connections.\n\nIf this metric grows monotonically it is highly likely a connection leak in one of the applications. Confirm connection leaks by using the _Connections opened_ and _Connections closed_ metrics.\n\n* [Connection Leak](https://www.rabbitmq.com/connections.html#monitoring)\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "1d06679b-c603-4cbf-85a7-7c8f4594258f",
"layout": {
"h": 3,
"i": "1d06679b-c603-4cbf-85a7-7c8f4594258f",
"w": 12,
"x": 0,
"y": 40
},
"name": "Total connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_connections * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"lineInterpolation": "smooth",
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "The rate of new connections opened by clients. Connections are expected to be long-lived.\n\nLow sustained values above zero are to be expected. High rates may be indicative of connection churn or mass connection recovery.\n\n* [Connection Leak](https://www.rabbitmq.com/connections.html#monitoring)\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "8c91f7fe-79e9-454a-9dd3-6f214a29e1eb",
"layout": {
"h": 3,
"i": "8c91f7fe-79e9-454a-9dd3-6f214a29e1eb",
"w": 6,
"x": 12,
"y": 40
},
"name": "Connections opened / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_connections_opened_total[60s]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}) by(rabbitmq_node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(rabbitmq_identity_info, rabbitmq_cluster)",
"name": "rabbitmq_cluster",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328326982000
}
================================================
FILE: integrations/RabbitMQ/dashboards/rabbitmq_v3.8_lt.json
================================================
{
"id": 0,
"group_id": 0,
"name": "RabbitMQ 3.8-",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "e83c8286-1579-43d9-bbaa-f4ebfd81ff03",
"layout": {
"h": 1,
"i": "e83c8286-1579-43d9-bbaa-f4ebfd81ff03",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Overview",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9c1c09a9-65b9-468f-9fe8-4a7abe16ce60",
"layout": {
"h": 3,
"i": "9c1c09a9-65b9-468f-9fe8-4a7abe16ce60",
"isResizable": true,
"w": 7,
"x": 0,
"y": 1
},
"name": "Ready messages",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 10000
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 100000
},
"result": {
"color": "#f50a0a"
},
"type": "range"
},
{
"match": {
"to": 9999
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "751ca66f-8275-411d-9bfe-e1cf967d90fe",
"layout": {
"h": 3,
"i": "751ca66f-8275-411d-9bfe-e1cf967d90fe",
"isResizable": true,
"w": 5,
"x": 7,
"y": 1
},
"name": "Incoming messages / s",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_publish{ident=\"$rabbitmq_cluster\"}[60s]))",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3fb62e0e-0c28-4226-91ca-83405e0ca91f",
"layout": {
"h": 3,
"i": "3fb62e0e-0c28-4226-91ca-83405e0ca91f",
"isResizable": true,
"w": 4,
"x": 12,
"y": 1
},
"name": "Publishers",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_overview_channels{ident=\"$rabbitmq_cluster\"}) - sum(rabbitmq_overview_consumers{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "dba4f405-ad01-45bb-96cf-c7d066fa101b",
"layout": {
"h": 3,
"i": "dba4f405-ad01-45bb-96cf-c7d066fa101b",
"isResizable": true,
"w": 4,
"x": 16,
"y": 1
},
"name": "Connections",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_overview_connections{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a44a473f-6c7d-4e48-aaca-e4becef11051",
"layout": {
"h": 3,
"i": "a44a473f-6c7d-4e48-aaca-e4becef11051",
"isResizable": true,
"w": 4,
"x": 20,
"y": 1
},
"name": "Queues",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_overview_queues {ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7e3281c6-98a5-467d-8871-3ca634cf7bb8",
"layout": {
"h": 3,
"i": "7e3281c6-98a5-467d-8871-3ca634cf7bb8",
"isResizable": true,
"w": 7,
"x": 0,
"y": 4
},
"name": "Unacknowledged messages",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 99
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#4a90e2"
},
"type": "range"
},
{
"match": {
"from": 500
},
"result": {
"color": "#d0021b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unack {ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "94d400cb-5a67-4533-9c67-655ca9471466",
"layout": {
"h": 3,
"i": "94d400cb-5a67-4533-9c67-655ca9471466",
"isResizable": true,
"w": 5,
"x": 7,
"y": 4
},
"name": "Outgoing messages / s",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 50
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rate(rabbitmq_overview_messages_redelivered{ident=\"$rabbitmq_cluster\"}[60s]) )+\nsum(rate(rabbitmq_overview_messages_delivered{ident=\"$rabbitmq_cluster\"}[60s]) )",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d89d49d8-6b75-49f0-b0c9-4d65e1705bfa",
"layout": {
"h": 3,
"i": "d89d49d8-6b75-49f0-b0c9-4d65e1705bfa",
"isResizable": true,
"w": 4,
"x": 12,
"y": 4
},
"name": "Consumers",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_overview_consumers{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c5523db6-56c7-4d5b-bae4-73fd4fa02074",
"layout": {
"h": 3,
"i": "c5523db6-56c7-4d5b-bae4-73fd4fa02074",
"isResizable": true,
"w": 4,
"x": 16,
"y": 4
},
"name": "Channels",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 10
},
"result": {
"color": "#417505"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(rabbitmq_overview_channels{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "background",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "aefb7825-2a50-42ed-8f45-d10a87161563",
"layout": {
"h": 3,
"i": "aefb7825-2a50-42ed-8f45-d10a87161563",
"isResizable": true,
"w": 4,
"x": 20,
"y": 4
},
"name": "Nodes",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 3,
"to": null
},
"result": {
"color": "#417505"
},
"type": "range"
},
{
"match": {
"from": 8
},
"result": {
"color": "#e70909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(rabbitmq_node_uptime{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "90aabc6f-94a3-4093-819d-f7339839fd3b",
"layout": {
"h": 1,
"i": "90aabc6f-94a3-4093-819d-f7339839fd3b",
"isResizable": false,
"w": 24,
"x": 0,
"y": 7
},
"name": "Nodes",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"ident",
"node",
"region",
"url"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1c67db00-88d5-41df-a17a-bd122bfcb1ff",
"layout": {
"h": 3,
"i": "1c67db00-88d5-41df-a17a-bd122bfcb1ff",
"isResizable": true,
"w": 24,
"x": 0,
"y": 8
},
"name": "nodes",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "rabbitmq_node_uptime{ident=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "If the value is zero or less, the memory alarm will be triggered and all publishing connections across all cluster nodes will be blocked.\n\nThis value can temporarily go negative because the memory alarm is triggered with a slight delay.\n\nThe kernel's view of the amount of memory used by the node can differ from what the node itself can observe. This means that this value can be negative for a sustained period of time.\n\nBy default nodes use resident set size (RSS) to compute how much memory they use. This strategy can be changed (see the guides below).\n\n* [Alarms](https://www.rabbitmq.com/alarms.html)\n* [Memory Alarms](https://www.rabbitmq.com/memory.html)\n* [Reasoning About Memory Use](https://www.rabbitmq.com/memory-use.html)\n* [Blocked Connection Notifications](https://www.rabbitmq.com/connection-blocked.html)",
"id": "ad20de77-8f3a-46d7-bab9-adcb8150e916",
"layout": {
"h": 7,
"i": "ad20de77-8f3a-46d7-bab9-adcb8150e916",
"isResizable": true,
"w": 8,
"x": 0,
"y": 11
},
"name": "Memory available before publishers blocked",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_node_mem_limit{ident=\"$rabbitmq_cluster\"} - rabbitmq_node_mem_total{ident=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "This metric is reported for the partition where the RabbitMQ data directory is stored.\n\nIf the value is zero or less, the disk alarm will be triggered and all publishing connections across all cluster nodes will be blocked.\n\nThis value can temporarily go negative because the free disk space alarm is triggered with a slight delay.\n\n* [Alarms](https://www.rabbitmq.com/alarms.html)\n* [Disk Space Alarms](https://www.rabbitmq.com/disk-alarms.html)\n* [Disk Space](https://www.rabbitmq.com/production-checklist.html#resource-limits-disk-space)\n* [Persistence Configuration](https://www.rabbitmq.com/persistence-conf.html)\n* [Blocked Connection Notifications](https://www.rabbitmq.com/connection-blocked.html)",
"id": "12ca67d1-3953-4095-a706-e27bbd8cf5e1",
"layout": {
"h": 7,
"i": "12ca67d1-3953-4095-a706-e27bbd8cf5e1",
"isResizable": true,
"w": 8,
"x": 8,
"y": 11
},
"name": "Disk space available before publishers blocked",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_node_disk_free{ident=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "When this value reaches zero, new connections will not be accepted and disk write operations may fail.\n\nClient libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.\n\n* [Open File Handles Limit](https://www.rabbitmq.com/production-checklist.html#resource-limits-file-handle-limit)",
"id": "0f26721e-42ff-4bb5-8124-ed1d9bafaeff",
"layout": {
"h": 3,
"i": "0f26721e-42ff-4bb5-8124-ed1d9bafaeff",
"isResizable": true,
"w": 8,
"x": 16,
"y": 11
},
"name": "File descriptors available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_node_fd_total {ident=\"$rabbitmq_cluster\"}) -\n(rabbitmq_node_fd_used{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "When this value reaches zero, new connections will not be accepted.\n\nClient libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.\n\n* [Networking and RabbitMQ](https://www.rabbitmq.com/networking.html)",
"id": "b4d6c2dc-413a-43d8-bb53-d084233ce984",
"layout": {
"h": 3,
"i": "b4d6c2dc-413a-43d8-bb53-d084233ce984",
"isResizable": true,
"w": 8,
"x": 16,
"y": 14
},
"name": "TCP sockets available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(rabbitmq_node_sockets_total {ident=\"$rabbitmq_cluster\"}) -\n(rabbitmq_node_sockets_used{ident=\"$rabbitmq_cluster\"})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "566edc72-b5c4-4e91-95a5-05c01b5a4fbf",
"layout": {
"h": 1,
"i": "566edc72-b5c4-4e91-95a5-05c01b5a4fbf",
"isResizable": false,
"w": 24,
"x": 0,
"y": 18
},
"name": "QUEUED MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total number of messages ready to be delivered to consumers.\n\nAim to keep this value as low as possible. RabbitMQ behaves best when messages are flowing through it. It's OK for publishers to occasionally outpace consumers, but the expectation is that consumers will eventually process all ready messages.\n\nIf this metric keeps increasing, your system will eventually run out of memory and/or disk space. Consider using TTL or Queue Length Limit to prevent unbounded message growth.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Consumers](https://www.rabbitmq.com/consumers.html)\n* [Queue Length Limit](https://www.rabbitmq.com/maxlength.html)\n* [Time-To-Live and Expiration](https://www.rabbitmq.com/ttl.html)",
"id": "e8880d5b-4ed2-4f0d-a23d-03bfe4094eb8",
"layout": {
"h": 3,
"i": "e8880d5b-4ed2-4f0d-a23d-03bfe4094eb8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"name": "Messages ready to be delivered to consumers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_ready {ident=\"$rabbitmq_cluster\"}) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The total number of messages that are either in-flight to consumers, currently being processed by consumers or simply waiting for the consumer acknowledgements to be processed by the queue. Until the queue processes the message acknowledgement, the message will remain unacknowledged.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Confirms and Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)",
"id": "e48c24d4-fcee-437e-bbe7-3f61ad29cf12",
"layout": {
"h": 3,
"i": "e48c24d4-fcee-437e-bbe7-3f61ad29cf12",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"name": "Messages pending consumer acknowledgement",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rabbitmq_queue_messages_unack {ident=\"$rabbitmq_cluster\"}) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b38c82fc-188e-47e8-97a9-30a9ab630bca",
"layout": {
"h": 1,
"i": "b38c82fc-188e-47e8-97a9-30a9ab630bca",
"isResizable": false,
"w": 24,
"x": 0,
"y": 22
},
"name": "INCOMING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The incoming message rate before any routing rules are applied.\n\nIf this value is lower than the number of messages published to queues, it may indicate that some messages are delivered to more than one queue.\n\nIf this value is higher than the number of messages published to queues, messages cannot be routed and will either be dropped or returned to publishers.\n\n* [Publishers](https://www.rabbitmq.com/publishers.html)",
"id": "2cc48e5a-3cf3-4707-8d7b-718528dffdac",
"layout": {
"h": 3,
"i": "2cc48e5a-3cf3-4707-8d7b-718528dffdac",
"isResizable": true,
"w": 12,
"x": 0,
"y": 23
},
"name": "Messages published / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_publish{ident=\"$rabbitmq_cluster\"}[60s]) )by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of messages confirmed by the broker to publishers. Publishers must opt-in to receive message confirmations.\n\nIf this metric is consistently at zero it may suggest that publisher confirms are not used by clients. The safety of published messages is likely to be at risk.\n\n* [Publisher Confirms](https://www.rabbitmq.com/confirms.html#publisher-confirms)\n* [Publisher Confirms and Data Safety](https://www.rabbitmq.com/publishers.html#data-safety)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "ab889d4e-8523-47c5-8513-2bdd8c4176eb",
"layout": {
"h": 3,
"i": "ab889d4e-8523-47c5-8513-2bdd8c4176eb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 23
},
"name": "Messages confirmed to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_publish{ident=\"$rabbitmq_cluster\"}[60s])) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of messages received from publishers and successfully routed to the master queue replicas.\n\n* [Queues](https://www.rabbitmq.com/queues.html)\n* [Publishers](https://www.rabbitmq.com/publishers.html)",
"id": "770a9c4e-111e-4129-ad3a-08f9aca80628",
"layout": {
"h": 3,
"i": "770a9c4e-111e-4129-ad3a-08f9aca80628",
"isResizable": true,
"w": 12,
"x": 0,
"y": 26
},
"name": "Messages routed to queues / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_publish{ident=\"$rabbitmq_cluster\"}[60s])) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of messages received from publishers that have publisher confirms enabled and the broker has not confirmed yet.\n\n* [Publishers](https://www.rabbitmq.com/publishers.html)\n* [Confirms and Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [When Will Published Messages Be Confirmed by the Broker?](https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed)",
"id": "f942bdc4-1b89-48f9-84ad-6ff1448f9764",
"layout": {
"h": 3,
"i": "f942bdc4-1b89-48f9-84ad-6ff1448f9764",
"isResizable": true,
"w": 12,
"x": 12,
"y": 26
},
"name": "Messages unconfirmed to publishers / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_unack{ident=\"$rabbitmq_cluster\"}[60s])) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of messages that cannot be routed and are dropped. \n\nAny value above zero means message loss and likely suggests a routing problem on the publisher end.\n\n* [Unroutable Message Handling](https://www.rabbitmq.com/publishers.html#unroutable)",
"id": "893a01fa-9310-4f8b-a321-ea65a73bc74f",
"layout": {
"h": 3,
"i": "893a01fa-9310-4f8b-a321-ea65a73bc74f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 29
},
"name": "Unroutable messages dropped / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_overview_return_unroutable{ident=\"$rabbitmq_cluster\"}[60s])) by (ident)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "8e0ab8be-7b82-4757-bc9c-8b4a5d940c63",
"layout": {
"h": 1,
"i": "8e0ab8be-7b82-4757-bc9c-8b4a5d940c63",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "OUTGOING MESSAGES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of messages delivered to consumers. It includes messages that have been redelivered.\n\nThis metric does not include messages that have been fetched by consumers using `basic.get` (consumed by polling).\n\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "104ae6cf-c39b-449b-8241-e2ff1bfa17d0",
"layout": {
"h": 3,
"i": "104ae6cf-c39b-449b-8241-e2ff1bfa17d0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 33
},
"name": "Messages delivered / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(\n (rate(rabbitmq_queue_messages_deliver{ident=\"$rabbitmq_cluster\"}[60s]) ) \n) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of messages that have been redelivered to consumers. It includes messages that have been requeued automatically and redelivered due to channel exceptions or connection closures.\n\nHaving some redeliveries is expected, but if this metric is consistently non-zero, it is worth investigating why.\n\n* [Negative Acknowledgement and Requeuing of Deliveries](https://www.rabbitmq.com/confirms.html#consumer-nacks-requeue)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "bb0db7aa-e816-43a7-97a3-2798ab9fa04f",
"layout": {
"h": 3,
"i": "bb0db7aa-e816-43a7-97a3-2798ab9fa04f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 33
},
"name": "Messages redelivered / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_redeliver{ident=\"$rabbitmq_cluster\"}[60s])) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of message deliveries to consumers that use manual acknowledgement mode.\n\nWhen this mode is used, RabbitMQ waits for consumers to acknowledge messages before more messages can be delivered.\n\nThis is the safest way of consuming messages.\n\n* [Consumer Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "63128d34-dfc3-4ff1-932f-2c533203d8c4",
"layout": {
"h": 3,
"i": "63128d34-dfc3-4ff1-932f-2c533203d8c4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 36
},
"name": "Messages delivered with ack / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_ack{ident=\"$rabbitmq_cluster\"}[60s]) ) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The rate of message acknowledgements coming from consumers that use manual acknowledgement mode.\n\n* [Consumer Acknowledgements](https://www.rabbitmq.com/confirms.html)\n* [Consumer Prefetch](https://www.rabbitmq.com/consumer-prefetch.html)\n* [Consumer Acknowledgement Modes, Prefetch and Throughput](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch-throughput)\n* [Consumers](https://www.rabbitmq.com/consumers.html)",
"id": "43c7aaf9-1a12-4d51-bc9a-1ae9c74b5852",
"layout": {
"h": 3,
"i": "43c7aaf9-1a12-4d51-bc9a-1ae9c74b5852",
"isResizable": true,
"w": 12,
"x": 12,
"y": 36
},
"name": "Messages acknowledged / s",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(rabbitmq_queue_messages_ack{ident=\"$rabbitmq_cluster\"}[60s])) by(node)",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a4135c37-c958-47e9-b5c9-74862d1325fa",
"layout": {
"h": 1,
"i": "a4135c37-c958-47e9-b5c9-74862d1325fa",
"isResizable": false,
"w": 24,
"x": 0,
"y": 39
},
"name": "QUEUES",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total number of queue masters per node. \n\nThis metric makes it easy to see sub-optimal queue distribution in a cluster.\n\n* [Queue Masters, Data Locality](https://www.rabbitmq.com/ha.html#master-migration-data-locality)\n* [Queues](https://www.rabbitmq.com/queues.html)",
"id": "982f2841-be7a-488a-92d1-22c75551ed46",
"layout": {
"h": 3,
"i": "982f2841-be7a-488a-92d1-22c75551ed46",
"isResizable": true,
"w": 12,
"x": 0,
"y": 40
},
"name": "Total queues",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_overview_queues {ident=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2ac43291-3149-4c9a-8812-a9087e682f9c",
"layout": {
"h": 1,
"i": "2ac43291-3149-4c9a-8812-a9087e682f9c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 43
},
"name": "CHANNELS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total number of channels on all currently opened connections.\n\nIf this metric grows monotonically it is highly likely a channel leak in one of the applications. Confirm channel leaks by using the _Channels opened_ and _Channels closed_ metrics.\n\n* [Channel Leak](https://www.rabbitmq.com/channels.html#channel-leaks)\n* [Channels](https://www.rabbitmq.com/channels.html)",
"id": "0146d1d3-d08b-46b8-b392-e1daa3da23b4",
"layout": {
"h": 3,
"i": "0146d1d3-d08b-46b8-b392-e1daa3da23b4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 44
},
"name": "Total channels",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_overview_channels{ident=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a559739f-e971-4929-b84f-e64522b19ea5",
"layout": {
"h": 1,
"i": "a559739f-e971-4929-b84f-e64522b19ea5",
"isResizable": false,
"w": 24,
"x": 0,
"y": 47
},
"name": "CONNECTIONS",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total number of client connections.\n\nIf this metric grows monotonically it is highly likely a connection leak in one of the applications. Confirm connection leaks by using the _Connections opened_ and _Connections closed_ metrics.\n\n* [Connection Leak](https://www.rabbitmq.com/connections.html#monitoring)\n* [Connections](https://www.rabbitmq.com/connections.html)",
"id": "a64f0f0d-2814-4626-845c-6c09e1de1d51",
"layout": {
"h": 3,
"i": "a64f0f0d-2814-4626-845c-6c09e1de1d51",
"isResizable": true,
"w": 12,
"x": 0,
"y": 48
},
"name": "Total connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rabbitmq_overview_connections {ident=\"$rabbitmq_cluster\"}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(rabbitmq_node_uptime, ident)",
"name": "rabbitmq_cluster",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328330392000
}
================================================
FILE: integrations/RabbitMQ/markdown/README.md
================================================
# RabbitMQ
高版本(3.8以上版本)的 RabbitMQ,已经内置支持了暴露 Prometheus 协议的监控数据。所以,直接使用 categraf 的 prometheus 插件即可采集。开启 RabbitMQ Prometheus 访问:
```bash
rabbitmq-plugins enable rabbitmq_prometheus
```
启用成功后,RabbitMQ 默认会在 15692 端口开启监听,访问 `http://localhost:15692/metrics` 即可看到符合 Prometheus 协议的监控数据。
如果是低于 3.8 的版本,则仍需使用 categraf 的 rabbitmq 插件来采集监控数据。
================================================
FILE: integrations/Redis/alerts/redis_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis eviction rate",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum(rate(redis_evicted_keys[5m])) / sum(redis_keyspace_keys)) \u003e 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighKeysEvictionRatio"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328335595000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis memory usage rate",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_maxmemory \u003e 0 and (redis_used_memory / redis_maxmemory) \u003e 0.85",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighMemoryUsage"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328337406000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis Ping latency (above 100 milliseconds)",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_ping_use_seconds \u003e 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighPingLatency"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328338174000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Low Redis hit rate",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(redis_keyspace_hits[5m])\n/\n(rate(redis_keyspace_misses[5m]) + rate(redis_keyspace_hits[5m]))\n\u003c 0.9",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowHitRatio"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328339001000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Redis connection refused",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(rate(redis_rejected_connections[5m])) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisRejectedConnHigh"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328339447000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Redis has just been restarted, please be aware",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_uptime_in_seconds \u003c 600",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowUptime"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328339909000
}
]
================================================
FILE: integrations/Redis/alerts/redis_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis client connection count - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(redis_connected_clients / redis_config_maxclients) \u003e 0.85",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighClientsUsage"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328342758000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis eviction rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(sum(rate(redis_evicted_keys_total[5m])) / sum(redis_db_keys)) \u003e 0.1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighKeysEvictionRatio"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328349758000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis latency - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(rate(redis_commands_duration_seconds_total[5m])) / sum(rate(redis_commands_processed_total[5m])) \u003e 0.25",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighResponseTime"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328350541000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High Redis memory usage rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_memory_max_bytes \u003e 0 and (redis_memory_used_bytes / redis_memory_max_bytes) \u003e 0.85",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisHighMemoryUsage"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328352303000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Low Redis hit rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "rate(redis_keyspace_hits_total[5m])\n/\n(rate(redis_keyspace_misses_total[5m]) + rate(redis_keyspace_hits_total[5m]))\n\u003c 0.9",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowHitRatio"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328353276000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Redis connection refused - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "(rate(redis_rejected_connections_total[5m])) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisRejectedConnHigh"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328353766000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Redis has just been restarted, please be aware - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "redis_uptime_in_seconds \u003c 600",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=RedisLowUptime"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328354294000
}
]
================================================
FILE: integrations/Redis/collect/redis/redis.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# address = "127.0.0.1:6379"
# username = ""
# password = ""
# pool_size = 2
# # Optional. Specify redis commands to retrieve values
# commands = [
# {command = ["get", "sample-key1"], metric = "custom_metric_name1"},
# {command = ["get", "sample-key2"], metric = "custom_metric_name2"}
# ]
# # interval = global.interval * interval_times
# interval_times = 1
# important! use a globally unique string to identify this instance
# labels = { instance="n9e-10.2.3.4:6379" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/Redis/collect/redis_sentinel/redis_sentinel.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# [protocol://][:password]@address[:port]
# e.g. servers = ["tcp://localhost:26379"]
servers = []
# # interval = global.interval * interval_times
# interval_times = 1
# attach extra dimensions to the collected metrics via labels
# labels = {}
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/Redis/dashboards/FilterByAddress.json
================================================
{
"name": "Redis by address",
"tags": "Redis Categraf",
"configs": {
"panels": [
{
"collapsed": true,
"id": "2ecb82c6-4d1a-41b5-8cdc-0284db16bd54",
"layout": {
"h": 1,
"i": "2ecb82c6-4d1a-41b5-8cdc-0284db16bd54",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"alignItems": "center",
"bgColor": "rgba(0, 0, 0, 0)",
"content": " ",
"justifyContent": "center",
"textColor": "#000000",
"textDarkColor": "#FFFFFF",
"textSize": 12
},
"id": "b5acc352-a2bd-4afc-b6cd-d6db0905f807",
"layout": {
"h": 3,
"i": "b5acc352-a2bd-4afc-b6cd-d6db0905f807",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"maxPerRow": 4,
"name": "",
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5eb6fbcf-4260-40d0-ad6a-540e54a1f922",
"layout": {
"h": 3,
"i": "2a02e1d4-2ed3-4bd2-9fa0-69bb10f13888",
"isResizable": true,
"w": 5,
"x": 4,
"y": 1
},
"maxPerRow": 4,
"name": "Redis Uptime",
"options": {
"standardOptions": {
"decimals": 2,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "rgba(63, 196, 83, 1)",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 600
},
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"type": "range"
},
{
"match": {
"from": 600
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "redis_uptime_in_seconds{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8ccada5e-02f3-4efc-9b36-2a367612e4cb",
"layout": {
"h": 3,
"i": "8ccada5e-02f3-4efc-9b36-2a367612e4cb",
"isResizable": true,
"w": 5,
"x": 9,
"y": 1
},
"maxPerRow": 4,
"name": "Connected Clients",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 500
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"from": 500
},
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "redis_connected_clients{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "716dc7e7-c9ec-4195-93f6-db1c572ae8b0",
"layout": {
"h": 3,
"i": "716dc7e7-c9ec-4195-93f6-db1c572ae8b0",
"isResizable": true,
"w": 5,
"x": 14,
"y": 1
},
"maxPerRow": 4,
"name": "Memory Used",
"options": {
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"to": 128000000
},
"result": {
"color": "#079e05"
},
"type": "range"
},
{
"match": {
"from": 128000000
},
"result": {
"color": "#f10909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "redis_used_memory{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 0,
"colorMode": "background",
"graphMode": "none",
"orientation": "vertical",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c6948161-db07-42df-beb1-765ee9c071a9",
"layout": {
"h": 3,
"i": "c6948161-db07-42df-beb1-765ee9c071a9",
"isResizable": true,
"w": 5,
"x": 19,
"y": 1
},
"maxPerRow": 4,
"name": "Max Memory Limit",
"options": {
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "rgba(63, 196, 83, 1)",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "redis_maxmemory{address=~\"$address\"}",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "bd54cf4f-1abb-4945-8aab-f89aec16daef",
"layout": {
"h": 1,
"i": "bd54cf4f-1abb-4945-8aab-f89aec16daef",
"isResizable": false,
"w": 24,
"x": 0,
"y": 4
},
"name": "Commands",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "3d5f8c4e-0ddf-4d68-9f6d-2cc57d864a8e",
"layout": {
"h": 5,
"i": "3d5f8c4e-0ddf-4d68-9f6d-2cc57d864a8e",
"isResizable": true,
"w": 8,
"x": 0,
"y": 5
},
"maxPerRow": 4,
"name": "Commands Executed / sec",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(redis_total_commands_processed{address=~\"$address\"}[5m])",
"legend": "{{address}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "344a874d-c34d-4d2d-9bb4-46e0912cd9f5",
"layout": {
"h": 5,
"i": "344a874d-c34d-4d2d-9bb4-46e0912cd9f5",
"isResizable": true,
"w": 8,
"x": 8,
"y": 5
},
"maxPerRow": 4,
"name": "Hits / Misses per Sec",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "irate(redis_keyspace_hits{address=~\"$address\"}[5m])",
"legend": "{{address}} hits",
"maxDataPoints": 240
},
{
"expr": "irate(redis_keyspace_misses{address=~\"$address\"}[5m])",
"legend": "{{address}} misses",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "3c83cd35-585c-4070-a210-1f17345f13f4",
"layout": {
"h": 5,
"i": "3c83cd35-585c-4070-a210-1f17345f13f4",
"isResizable": true,
"w": 8,
"x": 16,
"y": 5
},
"maxPerRow": 4,
"name": "Top Commands",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "topk(5, irate(redis_cmdstat_calls{address=~\"$address\"}[1m]))",
"legend": "{{address}} {{command}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "1ea61073-a46d-4d7c-b072-fcdcbc5ac084",
"layout": {
"h": 1,
"i": "1ea61073-a46d-4d7c-b072-fcdcbc5ac084",
"isResizable": false,
"w": 24,
"x": 0,
"y": 10
},
"name": "Keys",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b2b4451c-4f8a-438a-8c48-69c95c68361e",
"layout": {
"h": 5,
"i": "b2b4451c-4f8a-438a-8c48-69c95c68361e",
"isResizable": true,
"w": 8,
"x": 0,
"y": 11
},
"maxPerRow": 4,
"name": "Total Items per DB",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(redis_keyspace_keys{address=~\"$address\"}) by (address, db)",
"legend": "{{address}} {{db}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "894b9beb-e764-441c-ae04-13e5dbbb901d",
"layout": {
"h": 5,
"i": "894b9beb-e764-441c-ae04-13e5dbbb901d",
"isResizable": true,
"w": 8,
"x": 8,
"y": 11
},
"maxPerRow": 4,
"name": "Expired / Evicted",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(rate(redis_expired_keys{address=~\"$address\"}[5m])) by (address)",
"legend": "{{address}} expired",
"maxDataPoints": 240
},
{
"expr": "sum(rate(redis_evicted_keys{address=~\"$address\"}[5m])) by (address)",
"legend": "{{address}} evicted",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f721a641-28c7-4e82-a37c-ec17704a0c57",
"layout": {
"h": 5,
"i": "f721a641-28c7-4e82-a37c-ec17704a0c57",
"isResizable": true,
"w": 8,
"x": 16,
"y": 11
},
"maxPerRow": 4,
"name": "Expiring vs Not-Expiring Keys",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(redis_keyspace_keys{address=~\"$address\"}) - sum(redis_keyspace_expires{address=~\"$address\"}) ",
"legend": "{{address}} not expiring",
"maxDataPoints": 240
},
{
"expr": "sum(redis_keyspace_expires{address=~\"$address\"}) ",
"legend": "{{address}} expiring",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "60ff41ed-9d41-40ee-a13b-c968f3ca49d0",
"layout": {
"h": 1,
"i": "60ff41ed-9d41-40ee-a13b-c968f3ca49d0",
"isResizable": false,
"w": 24,
"x": 0,
"y": 16
},
"name": "Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1841950c-e867-4a62-b846-78754dc0e34d",
"layout": {
"h": 7,
"i": "1841950c-e867-4a62-b846-78754dc0e34d",
"isResizable": true,
"w": 24,
"x": 0,
"y": 17
},
"maxPerRow": 4,
"name": "Network I/O",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(rate(redis_total_net_input_bytes{address=~\"$address\"}[5m]))",
"legend": "input",
"maxDataPoints": 240
},
{
"expr": "sum(rate(redis_total_net_output_bytes{address=~\"$address\"}[5m]))",
"legend": "output",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(redis_uptime_in_seconds,address)",
"hide": false,
"multi": true,
"name": "address",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1732008163114399
}
================================================
FILE: integrations/Redis/dashboards/redis_by_categraf.json
================================================
{
"name": "Redis by instance",
"tags": "Redis Categraf",
"ident": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "2ecb82c6-4d1a-41b5-8cdc-0284db16bd54",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "2ecb82c6-4d1a-41b5-8cdc-0284db16bd54",
"isResizable": false
},
"name": "Basic Info",
"type": "row"
},
{
"type": "text",
"id": "b5acc352-a2bd-4afc-b6cd-d6db0905f807",
"layout": {
"h": 3,
"w": 4,
"x": 0,
"y": 1,
"i": "b5acc352-a2bd-4afc-b6cd-d6db0905f807",
"isResizable": true
},
"version": "3.0.0",
"name": "",
"maxPerRow": 4,
"custom": {
"textColor": "#000000",
"textDarkColor": "#FFFFFF",
"bgColor": "rgba(0, 0, 0, 0)",
"textSize": 12,
"justifyContent": "center",
"alignItems": "center",
"content": " "
}
},
{
"type": "stat",
"id": "5eb6fbcf-4260-40d0-ad6a-540e54a1f922",
"layout": {
"h": 3,
"w": 5,
"x": 4,
"y": 1,
"i": "2a02e1d4-2ed3-4bd2-9fa0-69bb10f13888",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "redis_uptime_in_seconds{instance=~\"$instance\"}",
"maxDataPoints": 240,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Redis Uptime",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(63, 196, 83, 1)",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"type": "range",
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"match": {
"to": 600
}
},
{
"type": "range",
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"match": {
"from": 600
}
}
],
"standardOptions": {
"util": "seconds",
"decimals": 2
}
}
},
{
"type": "stat",
"id": "8ccada5e-02f3-4efc-9b36-2a367612e4cb",
"layout": {
"h": 3,
"w": 5,
"x": 9,
"y": 1,
"i": "8ccada5e-02f3-4efc-9b36-2a367612e4cb",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "redis_connected_clients{instance=~\"$instance\"}",
"maxDataPoints": 240,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Connected Clients",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"type": "range",
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"match": {
"to": 500
}
},
{
"type": "range",
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"match": {
"from": 500
}
}
],
"standardOptions": {}
}
},
{
"type": "stat",
"id": "716dc7e7-c9ec-4195-93f6-db1c572ae8b0",
"layout": {
"h": 3,
"w": 5,
"x": 14,
"y": 1,
"i": "716dc7e7-c9ec-4195-93f6-db1c572ae8b0",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "redis_used_memory{instance=~\"$instance\"}",
"maxDataPoints": 240,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Memory Used",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
},
"valueMappings": [
{
"match": {
"to": 128000000
},
"result": {
"color": "#079e05"
},
"type": "range"
},
{
"match": {
"from": 128000000
},
"result": {
"color": "#f10909"
},
"type": "range"
}
],
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
}
}
},
{
"type": "stat",
"id": "c6948161-db07-42df-beb1-765ee9c071a9",
"layout": {
"h": 3,
"w": 5,
"x": 19,
"y": 1,
"i": "c6948161-db07-42df-beb1-765ee9c071a9",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "redis_maxmemory{instance=~\"$instance\"}",
"maxDataPoints": 240,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Max Memory Limit",
"maxPerRow": 4,
"custom": {
"textMode": "valueAndName",
"graphMode": "none",
"colorMode": "background",
"calc": "lastNotNull",
"valueField": "Value",
"colSpan": 0,
"textSize": {},
"orientation": "vertical"
},
"options": {
"thresholds": {
"steps": [
{
"color": "rgba(63, 196, 83, 1)",
"value": null,
"type": "base"
}
]
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 1
}
}
},
{
"collapsed": true,
"id": "bd54cf4f-1abb-4945-8aab-f89aec16daef",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 4,
"i": "bd54cf4f-1abb-4945-8aab-f89aec16daef",
"isResizable": false
},
"name": "Commands",
"type": "row"
},
{
"type": "timeseries",
"id": "3d5f8c4e-0ddf-4d68-9f6d-2cc57d864a8e",
"layout": {
"h": 5,
"w": 8,
"x": 0,
"y": 5,
"i": "3d5f8c4e-0ddf-4d68-9f6d-2cc57d864a8e",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "rate(redis_total_commands_processed{instance=~\"$instance\"}[5m])",
"maxDataPoints": 240,
"legend": "{{instance}}"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Commands Executed / sec",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "344a874d-c34d-4d2d-9bb4-46e0912cd9f5",
"layout": {
"h": 5,
"w": 8,
"x": 8,
"y": 5,
"i": "344a874d-c34d-4d2d-9bb4-46e0912cd9f5",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "irate(redis_keyspace_hits{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} hits",
"maxDataPoints": 240
},
{
"expr": "irate(redis_keyspace_misses{instance=~\"$instance\"}[5m])",
"legend": "{{instance}} misses",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Hits / Misses per Sec",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "noraml",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "3c83cd35-585c-4070-a210-1f17345f13f4",
"layout": {
"h": 5,
"w": 8,
"x": 16,
"y": 5,
"i": "3c83cd35-585c-4070-a210-1f17345f13f4",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "topk(5, irate(redis_cmdstat_calls{instance=~\"$instance\"}[1m]))",
"legend": "{{instance}} {{command}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Top Commands",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "1ea61073-a46d-4d7c-b072-fcdcbc5ac084",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 10,
"i": "1ea61073-a46d-4d7c-b072-fcdcbc5ac084",
"isResizable": false
},
"name": "Keys",
"type": "row"
},
{
"type": "timeseries",
"id": "b2b4451c-4f8a-438a-8c48-69c95c68361e",
"layout": {
"h": 5,
"w": 8,
"x": 0,
"y": 11,
"i": "b2b4451c-4f8a-438a-8c48-69c95c68361e",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "sum(redis_keyspace_keys{instance=~\"$instance\"}) by (instance, db)",
"legend": "{{instance}} {{db}}",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Total Items per DB",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "894b9beb-e764-441c-ae04-13e5dbbb901d",
"layout": {
"h": 5,
"w": 8,
"x": 8,
"y": 11,
"i": "894b9beb-e764-441c-ae04-13e5dbbb901d",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "sum(rate(redis_expired_keys{instance=~\"$instance\"}[5m])) by (instance)",
"legend": "{{instance}} expired",
"maxDataPoints": 240
},
{
"expr": "sum(rate(redis_evicted_keys{instance=~\"$instance\"}[5m])) by (instance)",
"legend": "{{instance}} evicted",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Expired / Evicted",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"type": "timeseries",
"id": "f721a641-28c7-4e82-a37c-ec17704a0c57",
"layout": {
"h": 5,
"w": 8,
"x": 16,
"y": 11,
"i": "f721a641-28c7-4e82-a37c-ec17704a0c57",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "sum(redis_keyspace_keys{instance=~\"$instance\"}) - sum(redis_keyspace_expires{instance=~\"$instance\"}) ",
"legend": "{{instance}} not expiring",
"maxDataPoints": 240
},
{
"expr": "sum(redis_keyspace_expires{instance=~\"$instance\"}) ",
"legend": "{{instance}} expiring",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Expiring vs Not-Expiring Keys",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "noraml",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
},
{
"collapsed": true,
"id": "60ff41ed-9d41-40ee-a13b-c968f3ca49d0",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 16,
"i": "60ff41ed-9d41-40ee-a13b-c968f3ca49d0",
"isResizable": false
},
"name": "Network",
"type": "row"
},
{
"type": "timeseries",
"id": "1841950c-e867-4a62-b846-78754dc0e34d",
"layout": {
"h": 7,
"w": 24,
"x": 0,
"y": 17,
"i": "1841950c-e867-4a62-b846-78754dc0e34d",
"isResizable": true
},
"version": "3.0.0",
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"targets": [
{
"expr": "sum(rate(redis_total_net_input_bytes{instance=~\"$instance\"}[5m]))",
"legend": "input",
"maxDataPoints": 240
},
{
"expr": "sum(rate(redis_total_net_output_bytes{instance=~\"$instance\"}[5m]))",
"legend": "output",
"maxDataPoints": 240
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"name": "Network I/O",
"maxPerRow": 4,
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden",
"behaviour": "showItem"
},
"standardOptions": {
"util": "bytesIEC",
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#6C53B1",
"value": null,
"type": "base"
}
]
}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"spanNulls": false,
"lineWidth": 2,
"fillOpacity": 0.3,
"gradientMode": "opacity",
"stack": "off",
"scaleDistribution": {
"type": "linear"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
]
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"name": "instance",
"type": "query",
"hide": false,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(redis_uptime_in_seconds,instance)",
"multi": true,
"allOption": true
}
],
"version": "3.0.0"
},
"uuid": 1717556328355498000
}
================================================
FILE: integrations/Redis/dashboards/redis_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Redis Overview - exporter",
"ident": "",
"tags": "Redis Prometheus",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "7e236455-0927-4695-8f19-3d911d0c83eb",
"layout": {
"h": 1,
"i": "7e236455-0927-4695-8f19-3d911d0c83eb",
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "d610dcec-a0f2-49dc-a368-a9fda3450f80",
"layout": {
"h": 3,
"i": "d610dcec-a0f2-49dc-a368-a9fda3450f80",
"w": 6,
"x": 0,
"y": 1
},
"name": "Redis Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"targets": [
{
"expr": "min(redis_uptime_in_seconds{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "339e6670-4597-4608-9f49-f7bdb243f7f1",
"layout": {
"h": 3,
"i": "339e6670-4597-4608-9f49-f7bdb243f7f1",
"w": 6,
"x": 6,
"y": 1
},
"name": "Connected Clients",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(redis_connected_clients{instance=~\"$instance\"})"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "321099de-9061-4027-b77a-a44885c11ec3",
"layout": {
"h": 3,
"i": "321099de-9061-4027-b77a-a44885c11ec3",
"w": 6,
"x": 12,
"y": 1
},
"name": "Memory Used",
"options": {
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"to": 128000000
},
"result": {
"color": "#079e05"
},
"type": "range"
},
{
"match": {
"from": 128000000
},
"result": {
"color": "#f10909"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "redis_memory_used_bytes{instance=~\"$instance\"}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "3301dac6-df01-4373-84fb-b175ff2c7bfb",
"layout": {
"h": 3,
"i": "3301dac6-df01-4373-84fb-b175ff2c7bfb",
"w": 6,
"x": 18,
"y": 1
},
"name": "Max Memory Limit",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "redis_memory_max_bytes{instance=~\"$instance\"}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "631895f0-8eba-42da-a82b-203aacf71855",
"layout": {
"h": 1,
"i": "631895f0-8eba-42da-a82b-203aacf71855",
"w": 24,
"x": 0,
"y": 4
},
"name": "Commands",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "02b519fd-3ddb-4a5e-b5f3-0ac00e1392e2",
"layout": {
"h": 7,
"i": "02b519fd-3ddb-4a5e-b5f3-0ac00e1392e2",
"w": 8,
"x": 0,
"y": 5
},
"name": "Commands Executed / sec",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(redis_commands_processed_total{instance=~\"$instance\"}[5m])"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "e65b838d-e38d-42c0-80fb-fb7ecad37445",
"layout": {
"h": 7,
"i": "e65b838d-e38d-42c0-80fb-fb7ecad37445",
"w": 8,
"x": 8,
"y": 5
},
"name": "Hits / Misses per Sec",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(redis_keyspace_hits_total{instance=~\"$instance\"}[5m])",
"legend": "hits"
},
{
"expr": "irate(redis_keyspace_misses_total{instance=~\"$instance\"}[5m])",
"legend": "misses"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "64301a42-adb8-4f12-9192-ae764f067305",
"layout": {
"h": 7,
"i": "64301a42-adb8-4f12-9192-ae764f067305",
"w": 8,
"x": 16,
"y": 5
},
"name": "Top Commands",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "topk(5, irate(redis_commands_total{instance=~\"$instance\"} [1m]))",
"legend": "{{cmd}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "2f8d0391-ecd6-4c35-acd7-99a340fa64bd",
"layout": {
"h": 1,
"i": "2f8d0391-ecd6-4c35-acd7-99a340fa64bd",
"w": 24,
"x": 0,
"y": 12
},
"name": "Keys",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "834d84f2-da11-49fc-8107-c253afcc4d67",
"layout": {
"h": 7,
"i": "834d84f2-da11-49fc-8107-c253afcc4d67",
"w": 8,
"x": 0,
"y": 13
},
"name": "Total Items per DB",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum (redis_db_keys{instance=~\"$instance\"}) by (db)",
"legend": "{{db}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "e3503fcc-31c5-4f66-a31c-fbb421d03280",
"layout": {
"h": 7,
"i": "e3503fcc-31c5-4f66-a31c-fbb421d03280",
"w": 8,
"x": 8,
"y": 13
},
"name": "Expired / Evicted",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(redis_expired_keys_total{instance=~\"$instance\"}[5m])) by (instance)",
"legend": "expired"
},
{
"expr": "sum(rate(redis_evicted_keys_total{instance=~\"$instance\"}[5m])) by (instance)",
"legend": "evicted"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "25daf589-9d61-476c-8f06-dd42a30f048d",
"layout": {
"h": 7,
"i": "25daf589-9d61-476c-8f06-dd42a30f048d",
"w": 8,
"x": 16,
"y": 13
},
"name": "Expiring vs Not-Expiring Keys",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(redis_db_keys{instance=~\"$instance\"}) - sum(redis_db_keys_expiring{instance=~\"$instance\"}) ",
"legend": "not expiring"
},
{
"expr": "sum(redis_db_keys_expiring{instance=~\"$instance\"}) ",
"legend": "expiring"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "c86fdeb4-768c-4aa2-8a2c-204296316090",
"layout": {
"h": 1,
"i": "c86fdeb4-768c-4aa2-8a2c-204296316090",
"w": 24,
"x": 0,
"y": 20
},
"name": "Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "dd402e9d-7aff-4d8a-9e16-c338033d8a4d",
"layout": {
"h": 7,
"i": "dd402e9d-7aff-4d8a-9e16-c338033d8a4d",
"w": 24,
"x": 0,
"y": 21
},
"name": "Network I/O",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(redis_net_input_bytes_total{instance=~\"$instance\"}[5m]))",
"legend": "input"
},
{
"expr": "sum(rate(redis_net_output_bytes_total{instance=~\"$instance\"}[5m]))",
"legend": "output"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(redis_uptime_in_seconds,instance)",
"name": "instance",
"selected": "10.206.0.16:6379",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328357035000
}
================================================
FILE: integrations/Redis/markdown/README.md
================================================
# redis
redis 的监控原理,就是连上 redis,执行 info 命令,解析结果,整理成监控数据上报。
## Configuration
redis 插件的配置在 `conf/input.redis/redis.toml`,最简单的配置如下:
```toml
[[instances]]
address = "127.0.0.1:6379"
username = ""
password = ""
labels = { instance="n9e-10.23.25.2:6379" }
```
如果要监控多个 redis 实例,就增加 instances 即可:
```toml
[[instances]]
address = "10.23.25.2:6379"
username = ""
password = ""
labels = { instance="n9e-10.23.25.2:6379" }
[[instances]]
address = "10.23.25.3:6379"
username = ""
password = ""
labels = { instance="n9e-10.23.25.3:6379" }
```
建议通过 labels 配置附加一个 instance 标签,便于后面复用监控大盘。
## redis 集群如何监控
其实,redis 集群的监控,还是去监控每个 redis 实例。
如果一个 redis 集群有 3 个实例,对于业务应用来讲,发起一个请求,可能随机请求到某一个实例上去了,这个是没问题的,但是对于监控 client 而言,显然是希望到所有实例上获取数据的。
当然,如果多个 redis 实例组成了集群,我们希望有个标识来标识这个集群,这个时候,可以通过 labels 来实现,比如给每个实例增加一个 redis_clus 的标签,值为集群名字即可。
# redis_sentinel
forked from [telegraf/redis_sentinel](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/redis_sentinel)
================================================
FILE: integrations/SMART/collect/smart/smart.toml
================================================
# Read metrics from storage devices supporting S.M.A.R.T.
[[instances]]
## Optionally specify the path to the smartctl executable
# path_smartctl = "/usr/bin/smartctl"
## Optionally specify the path to the nvme-cli executable
# path_nvme = "/usr/bin/nvme"
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
## ["auto-on"] - automatically find and enable additional vendor specific disk info
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
# enable_extensions = ["auto-on"]
## On most platforms, the CLI utilities used here require root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
## Sudo must be configured to allow the user running the collector to run smartctl or nvme-cli
## without a password.
# use_sudo = true
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl.
## smartctl versions 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to
## "never" depending on your disks.
# nocheck = "standby"
## Gather all returned S.M.A.R.T. attribute metrics and the detailed
## information from each drive into the 'smart_attribute' measurement.
# attributes = true
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
# excludes = [ "/dev/pass6" ]
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
## and all found will be included except for the excluded in excludes.
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
# devices = ["/dev/nvme0 -d nvme", "/dev/nvme0"]
## Timeout for the cli command to complete.
timeout = "30s"
## Optionally call smartctl and nvme-cli with a specific concurrency policy.
## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes.
## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of
## SMART data - one individual array drive at a time. In such a case, please set this configuration option
## to "sequential" to get readings for all drives.
## valid options: concurrent, sequential
# read_method = "concurrent"
================================================
FILE: integrations/SMART/dashboards/smart.json
================================================
{
"id": 0,
"group_id": 0,
"name": "S.M.A.R.T Dashboard",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "2a2c3cec-5699-4860-a31d-14814371482c",
"layout": {
"h": 1,
"i": "2a2c3cec-5699-4860-a31d-14814371482c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "【 General 】",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "设备数量",
"id": "c9d829c1-240f-46c1-8269-82879081eea2",
"layout": {
"h": 4,
"i": "c9d829c1-240f-46c1-8269-82879081eea2",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Disk Drives Monitored",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(smart_device_temp_c{ident=\"$ident\"})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"shop",
"ident",
"device",
"capacity",
"serial_no",
"wwn",
"model",
"enabled",
"value"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"id": "24fbb1bf-9817-4fa0-9525-17dd4fa2a710",
"layout": {
"h": 8,
"i": "24fbb1bf-9817-4fa0-9525-17dd4fa2a710",
"isResizable": true,
"w": 20,
"x": 4,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Disk Drives List",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "smart_device_temp_c{ident=\"$ident\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"id": "b9f030af-0c68-45e5-9f64-d4b4216c3ef0",
"layout": {
"h": 4,
"i": "b9f030af-0c68-45e5-9f64-d4b4216c3ef0",
"isResizable": true,
"w": 4,
"x": 0,
"y": 5
},
"links": [],
"maxPerRow": 4,
"name": "Unhealthy Disks",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(smart_device_temp_c{ident=~\"$ident\"})-count(smart_device_health_ok{ident=~\"$ident\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "e17b3260-b43d-4733-9e89-c91caf3689cf",
"layout": {
"h": 1,
"i": "e17b3260-b43d-4733-9e89-c91caf3689cf",
"isResizable": false,
"w": 24,
"x": 0,
"y": 9
},
"name": "【 Temperature 】",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "",
"id": "2df514d0-6858-48d4-b2fc-ed6c4932cb33",
"layout": {
"h": 11,
"i": "2df514d0-6858-48d4-b2fc-ed6c4932cb33",
"isResizable": true,
"w": 12,
"x": 0,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Temperature History",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"repeat": null,
"targets": [
{
"expr": "label_replace(smart_device_temp_c{ ident=\"$ident\"}, \"ident\", \"$1\", \"ident\", \"([^.]+).*\")",
"legend": "【{{ident}}】 {{device}} {{model}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "avg",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"id": "e028227f-cf3d-4f4e-abc6-96e16d2508c3",
"layout": {
"h": 11,
"i": "e028227f-cf3d-4f4e-abc6-96e16d2508c3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 10
},
"links": [],
"maxPerRow": 4,
"name": "Temperature Bar",
"options": {
"standardOptions": {
"decimals": 1,
"max": "80",
"min": "0",
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(label_replace(smart_device_temp_c{ ident=~\"$ident\"}, \"ident\", \"$1\", \"ident\", \"([^.]+).*\"), \"device\", \"$1\", \"device\", \"(.*)\")",
"legend": "{{ident}}: {{device}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "7ce33f1a-85a0-4d0b-a696-caf679e1fa82",
"layout": {
"h": 1,
"i": "7ce33f1a-85a0-4d0b-a696-caf679e1fa82",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "【 Wear and Tear 】",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "通电时间",
"id": "c270b82c-610b-4f58-bba3-0ed2838cc925",
"layout": {
"h": 6,
"i": "c270b82c-610b-4f58-bba3-0ed2838cc925",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Power On Hours",
"options": {
"standardOptions": {
"decimals": 2,
"max": "17520",
"min": 0,
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smart_attribute_power_on_hours{ident=~\"$ident\" }, \"ident\", \"$1\", \"ident\", \"([^.]+).*\")",
"legend": "【{{ident}} {{device}}】",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "通电次数",
"id": "33f83d64-23be-4d58-807a-4a3570da8b97",
"layout": {
"h": 6,
"i": "33f83d64-23be-4d58-807a-4a3570da8b97",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Power Cycle Count",
"options": {
"standardOptions": {
"decimals": 0,
"max": 2000,
"min": 0,
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smart_attribute_power_cycle_count{ ident=~\"$ident\"}, \"ident\", \"$1\", \"ident\", \"([^.]+).*\")",
"legend": "【{{ident}} {{device}}】",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "非预期掉电次数统计",
"id": "20700be8-a0d8-4cef-84c5-cecef88bcbc9",
"layout": {
"h": 6,
"i": "20700be8-a0d8-4cef-84c5-cecef88bcbc9",
"isResizable": true,
"w": 12,
"x": 0,
"y": 28
},
"links": [],
"maxPerRow": 4,
"name": "Power Loss Count",
"options": {
"standardOptions": {
"max": 2500,
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smart_attribute_unexpect_power_loss_ct{ ident=~\"$ident\"}, \"ident\", \"$1\", \"ident\", \"([^.]+).*\")",
"legend": "【{{ident}} {{device}}】",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "c9474027-b555-4085-89e8-56c2521919c7",
"layout": {
"h": 1,
"i": "c9474027-b555-4085-89e8-56c2521919c7",
"isResizable": false,
"w": 24,
"x": 0,
"y": 34
},
"name": "【 Errors 】",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "",
"id": "9f364792-45b4-444c-be76-20714396eadc",
"layout": {
"h": 6,
"i": "9f364792-45b4-444c-be76-20714396eadc",
"isResizable": true,
"w": 12,
"x": 0,
"y": 35
},
"links": [],
"maxPerRow": 4,
"name": "Raw Read Error",
"options": {
"standardOptions": {
"max": 20000,
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smart_device_read_error_rate{ ident=~\"$ident\" }, \"ident\", \"$1\", \"ident\", \"([^.]+).*\")",
"legend": "【{{ident}} {{device}}】",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "Aggregated graph for all types of errors",
"id": "074ffe0a-7bda-4ac0-867a-08ba44a8c4fb",
"layout": {
"h": 6,
"i": "074ffe0a-7bda-4ac0-867a-08ba44a8c4fb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 35
},
"links": [],
"maxPerRow": 4,
"name": "Total Errors",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum( smart_device_reallocated_sectors_count{ ident=~\"$ident\"})",
"legend": "smart_device_reallocated_sectors_count",
"refId": "B"
},
{
"expr": "sum( smart_attribute_reallocated_event_count{ ident=~\"$ident\" })",
"legend": "smart_attribute_reallocated_event_count",
"refId": "C"
},
{
"expr": "sum(smart_device_read_error_rate{ ident=~\"$ident\" })",
"legend": "smart_device_read_error_rate",
"refId": "D"
},
{
"expr": "sum(smart_device_pending_sector_count{ ident=~\"$ident\" })",
"legend": "smart_device_pending_sector_count",
"refId": "E"
},
{
"expr": "sum(smart_device_uncorrectable_errors{ ident=~\"$ident\"})",
"legend": "smart_device_uncorrectable_errors",
"refId": "F"
},
{
"expr": "sum(smart_device_udma_crc_errors{ ident=~\"$ident\" })",
"legend": "smart_device_udma_crc_errors",
"refId": "G"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": 4,
"description": "",
"id": "f2d6fe6e-3cd5-4b5a-9978-6dcf3be45b39",
"layout": {
"h": 6,
"i": "f2d6fe6e-3cd5-4b5a-9978-6dcf3be45b39",
"isResizable": true,
"w": 12,
"x": 0,
"y": 41
},
"links": [],
"maxPerRow": 4,
"name": "UDMA CRC Error",
"options": {
"standardOptions": {
"max": 2500,
"util": "none"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smart_device_udma_crc_errors{ ident=~\"$ident\"}, \"ident\", \"$1\", \"ident\", \"([^.]+).*\")",
"legend": "【{{ident}} {{device}}】",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "5e2273be-0948-47b6-acc4-5d460384813e",
"layout": {
"h": 1,
"i": "5e2273be-0948-47b6-acc4-5d460384813e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 47
},
"name": "【 SAS-specific Errors 】",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "mean",
"version": "3.0.0"
},
"description": "Grown defects are drive blocks that have been marked bad once the drive has been in use",
"id": "2d4d0c4e-9bc6-4dd6-bd28-7b0ce47c2bd1",
"layout": {
"h": 6,
"i": "2d4d0c4e-9bc6-4dd6-bd28-7b0ce47c2bd1",
"isResizable": true,
"w": 12,
"x": 0,
"y": 48
},
"links": [],
"maxPerRow": 4,
"name": "SAS grown defects",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 2000,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#FADE2A",
"value": 800
},
{
"color": "#F2495C",
"value": 2000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smartmon_sas_grown_defects_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" }, \"instance\", \"$1\", \"instance\", \"([^.]+).*\")",
"legend": "{{name}} 【{{instance}} {{disk}}】",
"refId": "A"
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"description": "Aggregated graph for all types of SAS errors",
"id": "9f80056a-5d5b-41e7-90aa-5ba75e71cbf4",
"layout": {
"h": 6,
"i": "9f80056a-5d5b-41e7-90aa-5ba75e71cbf4",
"isResizable": true,
"w": 12,
"x": 12,
"y": 48
},
"links": [],
"maxPerRow": 4,
"name": "SAS Total Errors",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(smartmon_sas_grown_defects_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" } )",
"legend": "sas_grown_defects_count",
"refId": "K"
},
{
"expr": "sum(smartmon_sas_non_medium_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" } )",
"legend": "sas_non_medium_errors_count",
"refId": "L"
},
{
"expr": "sum(smartmon_sas_read_uncorrected_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" } )",
"legend": "sas_read_uncorrected_errors_count",
"refId": "M"
},
{
"expr": "sum(smartmon_sas_verify_uncorrected_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" })",
"legend": "sas_verify_uncorrected_errors_count",
"refId": "A"
},
{
"expr": "sum(smartmon_sas_write_uncorrected_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" })",
"legend": "sas_write_uncorrected_errors_count",
"refId": "B"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "mean",
"version": "3.0.0"
},
"description": "SCSI command errors which might be related to bad cable, backplane or card",
"id": "f24e5f2d-0b7c-43bc-9a12-c7f4fa62d25a",
"layout": {
"h": 6,
"i": "f24e5f2d-0b7c-43bc-9a12-c7f4fa62d25a",
"isResizable": true,
"w": 12,
"x": 0,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "SAS Non-Medium Errors",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 2000,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#FADE2A",
"value": 800
},
{
"color": "#F2495C",
"value": 2000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(smartmon_sas_non_medium_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" }, \"instance\", \"$1\", \"instance\", \"([^.]+).*\")",
"legend": "{{name}} 【{{instance}} {{disk}}】",
"refId": "A"
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"calc": "mean",
"version": "3.0.0"
},
"description": "Specifies the counter that contains the total number of blocks for which an uncorrected data error has occurred",
"id": "7882d995-95c3-411d-9a9e-237d60c52ba1",
"layout": {
"h": 6,
"i": "7882d995-95c3-411d-9a9e-237d60c52ba1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "SAS Uncorrected Errors Total (read/write/verify)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"max": 2000,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#FADE2A",
"value": 800
},
{
"color": "#F2495C",
"value": 2000
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "label_replace(sum (smartmon_sas_read_uncorrected_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" } + smartmon_sas_write_uncorrected_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" } + smartmon_sas_verify_uncorrected_errors_count_raw_value{ instance=~\"$instance\", disk=~\"$disk\" }) by (instance, disk, name), \"instance\", \"$1\", \"instance\", \"([^.]+).*\")",
"legend": "{{name}} 【{{instance}} {{disk}}】",
"refId": "A"
}
],
"type": "barGauge",
"version": "3.0.0"
}
],
"var": [
{
"datasource": {
"cate": "prometheus",
"value": 4
},
"definition": "label_values(smart_device_temp_c,ident)",
"multi": false,
"name": "ident",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": 4
},
"definition": "label_values(smart_device_temp_c, device)",
"multi": false,
"name": "device",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328362726000
}
================================================
FILE: integrations/SMART/markdown/README.md
================================================
# S.M.A.R.T. 插件
从[telegraf](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/smart/README.md) fork,略作改动
Get metrics using the command line utility `smartctl` for
S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage
devices. SMART is a monitoring system included in computer hard disk drives
(HDDs) and solid-state drives (SSDs) that detects and reports on various
indicators of drive reliability, with the intent of enabling the anticipation of
hardware failures. See smartmontools (<https://www.smartmontools.org/>).
SMART information is separated between different measurements: `smart_device` is
used for general information, while `smart_attribute` stores the detailed
attribute information if `attributes = true` is enabled in the plugin
configuration.
If no devices are specified, the plugin will scan for SMART devices via the
following command:
```sh
smartctl --scan
```
Metrics will be reported from the following `smartctl` command:
```sh
smartctl --info --attributes --health -n <nocheck> --format=brief <device>
```
This plugin supports _smartmontools_ version 5.41 and above, but v. 5.41 and
v. 5.42 might require setting `nocheck`, see the comment in the sample
configuration. Also, NVMe capabilities were introduced in version 6.5.
To enable SMART on a storage device run:
```sh
smartctl -s on <device>
```
## NVMe vendor specific attributes
For NVMe disk type, plugin can use command line utility `nvme-cli`. It has a
feature to easily access vendor specific attributes. This plugin supports
nvme-cli version 1.5 and above (<https://github.com/linux-nvme/nvme-cli>). In
case of `nvme-cli` absence NVMe vendor specific metrics will not be obtained.
Vendor specific SMART metrics for NVMe disks may be reported from the following
`nvme` command:
```sh
nvme <vendor> smart-log-add <device>
```
Note that vendor plugins for `nvme-cli` could require different naming
convention and report format.
To see installed plugin extensions, depending on the nvme-cli version, look at
the bottom of:
```sh
nvme help
```
To gather disk vendor id (vid) `id-ctrl` could be used:
```sh
nvme id-ctrl <device>
```
Association between a vid and company can be found there:
<https://pcisig.com/membership/member-companies>.
Devices affiliation to being NVMe or non NVMe will be determined thanks to:
```sh
smartctl --scan
```
and:
```sh
smartctl --scan -d nvme
```
## Configuration
```toml @示例
# Read metrics from storage devices supporting S.M.A.R.T.
[[instances]]
## Optionally specify the path to the smartctl executable
# path_smartctl = "/usr/bin/smartctl"
## Optionally specify the path to the nvme-cli executable
# path_nvme = "/usr/bin/nvme"
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
## ["auto-on"] - automatically find and enable additional vendor specific disk info
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
# enable_extensions = ["auto-on"]
## On most platforms used cli utilities requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
## Sudo must be configured to allow the categraf user to run smartctl or nvme-cli
## without a password.
use_sudo = true
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to
## "never" depending on your disks.
# nocheck = "standby"
## Gather all returned S.M.A.R.T. attribute metrics and the detailed
## information from each drive into the 'smart_attribute' measurement.
attributes = true
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
# excludes = [ "/dev/pass6" ]
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
## and all found will be included except for the excluded in excludes.
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
# devices = ["/dev/nvme0 -d nvme", "/dev/nvme0"]
## Timeout for the cli command to complete.
timeout = "30s"
## Optionally call smartctl and nvme-cli with a specific concurrency policy.
## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes.
## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of
## SMART data - one individual array drive at the time. In such case please set this configuration option
## to "sequential" to get readings for all drives.
## valid options: concurrent, sequential
# read_method = "concurrent"
```
## Permissions
采集需要sudo权限
## Metrics
- smart_device:
- tags:
- capacity
- device
- enabled
- model
- serial_no
- wwn
- fields:
- exit_status
- health_ok
- media_wearout_indicator
- percent_lifetime_remain
- read_error_rate
- seek_error
- temp_c
- udma_crc_errors
- wear_leveling_count
- smart_attribute:
- tags:
- capacity
- device
- enabled
- fail
- flags
- id
- model
- name
- serial_no
- wwn
- fields:
- exit_status
- threshold
- value
- worst
- critical_warning
- temperature_celsius
- available_spare
- available_spare_threshold
- percentage_used
- data_units_read
- data_units_written
- host_read_commands
- host_write_commands
- controller_busy_time
- power_cycle_count
- power_on_hours
- unsafe_shutdowns
- media_and_data_integrity_errors
- error_information_log_entries
- warning_temperature_time
- critical_temperature_time
- program_fail_count
- erase_fail_count
- wear_leveling_count
- end_to_end_error_detection_count
- crc_error_count
- media_wear_percentage
- host_reads
- timed_workload_timer
- thermal_throttle_status
- retry_buffer_overflow_count
- pll_lock_loss_count
### Flags
The interpretation of the tag `flags` is:
- `K` auto-keep
- `C` event count
- `R` error rate
- `S` speed/performance
- `O` updated online
- `P` prefailure warning
### Exit Status
The `exit_status` field captures the exit status of the used cli utilities
command which is defined by a bitmask. For the interpretation of the bitmask see
the man page for smartctl or nvme-cli.
## Device Names
Device names, e.g., `/dev/sda`, are _not persistent_, and may be
subject to change across reboots or system changes. Instead, you can use the
_World Wide Name_ (WWN) or serial number to identify devices. On Linux block
devices can be referenced by the WWN in the following location:
`/dev/disk/by-id/`.
## Troubleshooting
If you expect to see more SMART metrics than this plugin shows, be sure to use a
proper version of smartctl or nvme-cli utility which has the functionality to
gather desired data. Also, check your device capability because not all SMART
metrics are mandatory. For example the number of temperature sensors depends on
the device specification.
If this plugin is not working as expected for your SMART enabled device,
please run these commands and include the output in a bug report:
For non NVMe devices (from smartctl version >= 7.0 this will also return NVMe
devices by default):
```sh
smartctl --scan
```
For NVMe devices:
```sh
smartctl --scan -d nvme
```
Run the following command replacing your configuration setting for NOCHECK and
the DEVICE (name of the device could be taken from the previous command):
```sh
smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE
```
If you try to gather vendor specific metrics, please provide this command
and replace vendor and device to match your case:
```sh
nvme VENDOR smart-log-add DEVICE
```
If you have specified devices array in configuration file, and categraf only
shows data from one device, you should change the plugin configuration to
sequentially gather disk attributes instead of collecting it in separate threads
(goroutines). To do this find in plugin configuration read_method and change it
to sequential:
```toml
## Optionally call smartctl and nvme-cli with a specific concurrency policy.
## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes.
## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of
## SMART data - one individual array drive at the time. In such case please set this configuration option
## to "sequential" to get readings for all drives.
## valid options: concurrent, sequential
read_method = "sequential"
```
## Example Output
```text
smart_device_health_ok agent_hostname=1.2.3.4 device=nvme0 model=INTEL_SSDPE2KX040T8 serial_no=PHLJ830200CH4P0DGN 1
smart_device_temp_c agent_hostname=1.2.3.4 device=nvme0 model=INTEL_SSDPE2KX040T8 serial_no=PHLJ830200CH4P0DGN 53
smart_attribute_program_fail_count agent_hostname=1.2.3.4 device=nvme0 model= name=Program_Fail_Count serial_no=PHLJ830200CH4P0DGN 0
smart_attribute_erase_fail_count agent_hostname=1.2.3.4 device=nvme0 model= name=Erase_Fail_Count serial_no=PHLJ830200CH4P0DGN 0
smart_attribute_wear_leveling_count agent_hostname=1.2.3.4 device=nvme0 model= name=Wear_Leveling_Count serial_no=PHLJ830200CH4P0DGN 34360328200
```
================================================
FILE: integrations/SNMP/collect/snmp/Cisco.toml
================================================
[[instances]]
agents = ["udp://127.0.0.1"]
timeout = "5s"
version = 2
## Path to mib files
## Used by the gosmi translator.
## To add paths when translating with netsnmp, use the MIBDIRS environment variable
##path = ["/usr/share/snmp/DCN"]
##translator = "gosmi"
community = "public"
agent_host_tag = "DCN"
retries = 3
max_repetitions = 100
##运行时间
[[instances.field]]
oid = "1.3.6.1.2.1.1.3.0"
name = "sys_uptime"
conversion = "float(2)"
[[instances.field]]
oid = "1.3.6.1.4.1.6339.100.1.11.10.0"
name = "cpu_usage"
[[instances.field]]
oid = "1.3.6.1.4.1.6339.100.1.11.6.0"
name = "mem_max"
[[instances.field]]
oid = "1.3.6.1.4.1.6339.100.1.11.7.0"
name = "mem_use"
#端口总和
[[instances.field]]
name = "TotalPorts"
oid = "1.3.6.1.4.1.6339.100.3.1.0"
##设备名称
[[instances.field]]
oid = "1.3.6.1.2.1.1.5.0"
name = "sys_name"
is_tag = true
##产品型号
[[instances.field]]
name = "sys_pm"
oid = "1.3.6.1.4.1.6339.100.25.1.1.1.0"
is_tag = true
#本机IP
[[instances.field]]
name = "LocalIP"
oid = "1.3.6.1.2.1.4.20.1.1"
is_tag = true
#接口表信息
[[instances.table]]
name = "interface"
inherit_tags = ["sys_name","sys_pm","LocalIP"]
#各个端口
[[instances.table.field]]
name = "ifDescr"
oid = "1.3.6.1.2.1.2.2.1.2"
is_tag = true
[[instances.table.field]]
name = "ifSpeed"
oid = "1.3.6.1.2.1.2.2.1.5"
conversion = "float(6)"
#is_tag = true
[[instances.table.field]]
name = "ifOperStatus"
oid = "1.3.6.1.2.1.2.2.1.8"
#is_tag = true
[[instances.table.field]]
name = "ifOutOctets"
oid = "1.3.6.1.2.1.2.2.1.16"
[[instances.table.field]]
name = "ifInOctets"
oid = "1.3.6.1.2.1.2.2.1.10"
#聚合状态
#oid = "1.3.6.1.4.1.6339.100.14.2.1.4.1.1.*"
#聚合端口
#oid = "1.3.6.1.4.1.6339.100.14.3.1.2"
================================================
FILE: integrations/SNMP/collect/snmp/snmp.toml
================================================
# Retrieves SNMP values from remote agents
[[instances]]
## Agent addresses to retrieve values from.
## format: agents = ["<scheme://><hostname>:<port>"]
## scheme: optional, either udp, udp4, udp6, tcp, tcp4, tcp6.
## default is udp
## port: optional
## example: agents = ["udp://127.0.0.1:161"]
## agents = ["tcp://127.0.0.1:161"]
## agents = ["udp4://v4only-snmp-agent"]
#agents = ["udp://127.0.0.1:161"]
agents = [
#
]
## Timeout for each request.
# timeout = "5s"
## SNMP version; can be 1, 2, or 3.
# version = 2
## Unconnected UDP socket
## When true, SNMP responses are accepted from any address not just
## the requested address. This can be useful when gathering from
## redundant/failover systems.
# unconnected_udp_socket = false
## Path to mib files
## Used by the gosmi translator.
## To add paths when translating with netsnmp, use the MIBDIRS environment variable
# path = ["/usr/share/snmp/mibs"]
## SNMP community string.
# community = "public"
## Agent host tag
# agent_host_tag = "agent_host"
## Number of retries to attempt.
# retries = 3
## The GETBULK max-repetitions parameter.
# max_repetitions = 10
## SNMPv3 authentication and encryption options.
##
## Security Name.
# sec_name = "myuser"
## Authentication protocol; one of "MD5", "SHA", "SHA224", "SHA256", "SHA384", "SHA512" or "".
# auth_protocol = "MD5"
## Authentication password.
# auth_password = "pass"
## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv".
# sec_level = "authNoPriv"
## Context Name.
# context_name = ""
## Privacy protocol used for encrypted messages; one of "DES", "AES", "AES192", "AES192C", "AES256", "AES256C", or "".
### Protocols "AES192", "AES192C", "AES256", and "AES256C" require the underlying net-snmp tools
### to be compiled with --enable-blumenthal-aes (http://www.net-snmp.org/docs/INSTALL.html)
# priv_protocol = ""
## Privacy password used for encrypted messages.
# priv_password = ""
## Add fields and tables defining the variables you wish to collect. This
## example collects the system uptime and interface variables. Reference the
## full plugin documentation for configuration details.
#[[instances.field]]
#oid = "RFC1213-MIB::sysUpTime.0"
#name = "uptime"
#[[instances.field]]
#oid = "RFC1213-MIB::sysName.0"
#name = "source"
#is_tag = true
# filters = ["A:ifIndex:^2$","B:ifOperStatus:1", "C:ifDescr:^eno*"]
# filters_expression = "(A && B) || C"
#[[instances.table]]
#oid = "IF-MIB::ifTable"
#name = "interface"
#inherit_tags = ["source"]
#[[instances.table.field]]
#oid = "IF-MIB::ifDescr"
#name = "ifDescr"
#is_tag = true
================================================
FILE: integrations/SNMP/collect/snmp/snmp.toml.example
================================================
# Retrieves SNMP values from remote agents
[[instances]]
## Agent addresses to retrieve values from.
## format: agents = ["<scheme://><hostname>:<port>"]
## scheme: optional, either udp, udp4, udp6, tcp, tcp4, tcp6.
## default is udp
## port: optional
## example: agents = ["udp://127.0.0.1:161"]
## agents = ["tcp://127.0.0.1:161"]
## agents = ["udp4://v4only-snmp-agent"]
#agents = ["udp://127.0.0.1:161"]
# metrics_pass = ["*2$"]
agents = [
# "udp://10.206.0.16:161",
]
## Timeout for each request.
timeout = "5s"
## SNMP version; can be 1, 2, or 3.
version = 2
## Unconnected UDP socket
## When true, SNMP responses are accepted from any address not just
## the requested address. This can be useful when gathering from
## redundant/failover systems.
# unconnected_udp_socket = false
## Path to mib files
## Used by the gosmi translator.
## To add paths when translating with netsnmp, use the MIBDIRS environment variable
# path = ["/usr/share/snmp/mibs"]
## SNMP community string.
community = "public"
## Agent host tag
agent_host_tag = "agent_hostname"
## Number of retries to attempt.
retries = 3
## The GETBULK max-repetitions parameter.
# max_repetitions = 10
## SNMPv3 authentication and encryption options.
##
## Security Name.
# sec_name = "myuser"
## Authentication protocol; one of "MD5", "SHA", "SHA224", "SHA256", "SHA384", "SHA512" or "".
# auth_protocol = "MD5"
## Authentication password.
# auth_password = "pass"
## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv".
# sec_level = "authNoPriv"
## Context Name.
# context_name = ""
## Privacy protocol used for encrypted messages; one of "DES", "AES", "AES192", "AES192C", "AES256", "AES256C", or "".
### Protocols "AES192", "AES192C", "AES256", and "AES256C" require the underlying net-snmp tools
### to be compiled with --enable-blumenthal-aes (http://www.net-snmp.org/docs/INSTALL.html)
# priv_protocol = ""
## Privacy password used for encrypted messages.
# priv_password = ""
## Add fields and tables defining the variables you wish to collect. This
## example collects the system uptime and interface variables. Reference the
## full plugin documentation for configuration details.
[[instances.field]]
#oid = "RFC1213-MIB::sysUpTime.0"
oid = ".1.3.6.1.2.1.1.3.0"
name = "uptime"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.11.9.0" # %
#oid = "UCD-SNMP-MIB::ssCpuUser.0"
name = "cpu_user"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.11.10.0" # %
name = "cpu_sys"
[[instances.field]]
oid = "1.3.6.1.4.1.2021.11.11.0" # %
name = "cpu_idle"
[[instances.field]]
oid = ".1.3.6.1.2.1.25.2.2.0"
name = "mem_total"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.4.11.0"
name = "mem_free"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.4.13.0"
name = "mem_shared"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.4.14.0"
name = "mem_buffer"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.4.15.0"
name = "mem_cached"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.10.1.3.1"
name = "cpu_load1"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.10.1.3.2"
name = "cpu_load5"
[[instances.field]]
oid = ".1.3.6.1.4.1.2021.10.1.3.3"
name = "cpu_load15"
# network
[[instances.table]]
oid = "IF-MIB::ifTable"
name = "interface"
inherit_tags = ["source"]
index_as_tag = true
include_filter = ["ifIndex:2","ifIndex:4"]
[[instances.table.field]]
oid = "IF-MIB::ifDescr"
name = "ifDescr"
is_tag = true
[[instances.table.field]]
oid = "IF-MIB::ifPhysAddress"
name = "ifPhysAddress"
is_tag = true
================================================
FILE: integrations/SNMP/dashboards/dashboards.json
================================================
{
"id": 0,
"group_id": 0,
"name": "SNMP Stats",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "系统启动时间,timetick转换为秒,大盘自动转换为时 天 周 月……",
"id": "d5e905cf-da22-48be-9fca-1f92695ca730",
"layout": {
"h": 3,
"i": "d5e905cf-da22-48be-9fca-1f92695ca730",
"isResizable": true,
"w": 8,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "snmp_uptime /100",
"legend": "In",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "端口每秒最大出流量 单位是bit,大盘自动转换",
"id": "26ae7fc1-230e-451e-9415-ea93ae8b2abb",
"layout": {
"h": 3,
"i": "26ae7fc1-230e-451e-9415-ea93ae8b2abb",
"isResizable": true,
"w": 8,
"x": 8,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Max Out (Current)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "max(irate(snmp_interface_ifOutOctets[5m]))",
"legend": "Out",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "端口每秒最大入流量",
"id": "c5496f39-e194-401c-888c-556292e39254",
"layout": {
"h": 3,
"i": "c5496f39-e194-401c-888c-556292e39254",
"isResizable": true,
"w": 8,
"x": 16,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "Max In (Current)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "max(irate(snmp_interface_ifInOctets[5m]))",
"legend": "In",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "d755c99d-a323-41e6-8117-6bc006bef8b7",
"layout": {
"h": 3,
"i": "bd2cd5b0-50ac-42d7-b29d-ea89ceb015a7",
"isResizable": true,
"w": 8,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "CPU 使用率 %",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "snmp_sys_cpu_usage",
"legend": "Out",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "内存使用率 ",
"id": "c3991b49-1ad8-4f63-87b8-d41bbf729833",
"layout": {
"h": 3,
"i": "109aad94-79bd-4aec-b8ac-db73cb6601a8",
"isResizable": true,
"w": 8,
"x": 8,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "内存使用率 %",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "snmp_sys_mem_usage",
"legend": "mem_usage",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前并发连接数",
"id": "024c8f3e-b632-4177-9a71-396d81ede19e",
"layout": {
"h": 3,
"i": "55c508d9-acaa-4bd2-a473-7b6176a5a44a",
"isResizable": true,
"w": 8,
"x": 16,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "并发连接数(当前)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "snmp_sys_mem_usage",
"legend": "mem_usage",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "最近10分钟平均每秒新建连接数",
"id": "49e0453c-328c-4499-9097-a6bb52d92ad1",
"layout": {
"h": 3,
"i": "c7ddd2b8-5803-4de0-a8e7-3466020684e9",
"isResizable": true,
"w": 8,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "每秒新建连接数",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "snmp_sys_session_rate",
"legend": "mem_usage",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "最近1分钟总计出流量",
"id": "ad0d02bb-9eb0-47d8-8529-1ce66150e4f9",
"layout": {
"h": 3,
"i": "ad0d02bb-9eb0-47d8-8529-1ce66150e4f9",
"isResizable": true,
"w": 8,
"x": 8,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Total Out",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "max(delta(snmp_interface_ifOutOctets[1m]))",
"legend": "Out",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "最近1分钟总计入流量",
"id": "616de58a-70a7-4c0b-b0f2-5151b9f0e9c5",
"layout": {
"h": 3,
"i": "616de58a-70a7-4c0b-b0f2-5151b9f0e9c5",
"isResizable": true,
"w": 8,
"x": 16,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Total In",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "max(delta(snmp_interface_ifInOctets[1m]))",
"legend": "In",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "入方向1m内丢包数",
"id": "fcdf91de-394f-4d9c-abb9-d750583ce6cf",
"layout": {
"h": 3,
"i": "da757839-0fc0-4d1b-b486-7070aa3d70f8",
"isResizable": true,
"w": 8,
"x": 0,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Drop Packets (In)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(irate(snmp_interface_ifInDiscards[1m]))",
"legend": "In",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "出方向1m的丢包数",
"id": "b5d3c6de-6512-40d2-b167-a1e00bfaa795",
"layout": {
"h": 3,
"i": "ab9b1070-a81a-4a30-9dcd-7071dfafcdc6",
"isResizable": true,
"w": 8,
"x": 8,
"y": 9
},
"links": [],
"maxPerRow": 4,
"name": "Drop Packets (Out)",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(irate(snmp_interface_ifOutDiscards[1m]))",
"legend": "In",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Max: 10MB \nRed Status: 9MB",
"id": "a925c635-0ff8-4984-a8b0-ca99948960f0",
"layout": {
"h": 9,
"i": "a925c635-0ff8-4984-a8b0-ca99948960f0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "Out (Current)",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "irate(snmp_interface_ifOutOctets[5m])",
"legend": "{{ifDescr}}({{ifIndex}})",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Max: 10MB \nRed Status: 9MB",
"id": "86345d37-977a-44e0-96ca-e46fe388b529",
"layout": {
"h": 9,
"i": "86345d37-977a-44e0-96ca-e46fe388b529",
"isResizable": true,
"w": 12,
"x": 12,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "In (Current)",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "irate(snmp_interface_ifInOctets[5m])",
"legend": "{{ifDescr}}({{ifIndex}})",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4bd4c3ad-e00f-4eac-89fc-6b146f36d922",
"layout": {
"h": 10,
"i": "4bd4c3ad-e00f-4eac-89fc-6b146f36d922",
"isResizable": true,
"w": 24,
"x": 0,
"y": 21
},
"links": [],
"maxPerRow": 4,
"name": "Out / In",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "irate(snmp_interface_ifOutOctets[5m])",
"legend": "Out: {{ifDescr}} ",
"refId": "A"
},
{
"expr": "-irate(snmp_interface_ifInOctets[5m])",
"legend": "In: {{ifDescr}} ",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"columns": [
"shop",
"role",
"ifIndex",
"value"
],
"displayMode": "labelsOfSeriesToRows",
"showHeader": true,
"sortColumn": "ifDescr",
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b663d53d-41a9-4e28-a60e-6ced1b9d46ce",
"layout": {
"h": 10,
"i": "b663d53d-41a9-4e28-a60e-6ced1b9d46ce",
"isResizable": true,
"w": 24,
"x": 0,
"y": 31
},
"links": [],
"maxPerRow": 4,
"name": "Interface Status",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "#ce4f52",
"text": "DOWN"
},
"type": "special"
},
{
"match": {
"special": 3
},
"result": {
"color": "#9470ff",
"text": "TESTING"
},
"type": "special"
},
{
"match": {
"special": 4
},
"result": {
"color": "#000000",
"text": "UNKNOWN"
},
"type": "special"
},
{
"match": {
"special": 5
},
"result": {
"color": "#ffae39",
"text": "DORMANT"
},
"type": "special"
},
{
"match": {
"special": 6
},
"result": {
"color": "#e6c627",
"text": "NotPresent"
},
"type": "special"
},
{
"match": {
"special": 7
},
"result": {
"color": "#ff8286",
"text": "LowerLayerDown"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "snmp_interface_ifOperStatus",
"legend": "out",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [
{
"defaultValue": "",
"definition": "prometheus",
"hide": false,
"label": "datasource",
"name": "datasource",
"type": "datasource"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328367142000
}
================================================
FILE: integrations/SNMP/dashboards/switch branch.json
================================================
{
"id": 0,
"group_id": 0,
"name": "各个接入交换机",
"ident": "h",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"graphTooltip": "default",
"graphZoom": "default",
"links": [],
"panels": [
{
"custom": {
"aggrDimension": [
"sys_name",
"sys_pm",
"LocalIP"
],
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true,
"sortColumn": "sys_name",
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "4f20d79f-8092-43eb-8298-7492b8fc7b4a",
"layout": {
"h": 3,
"i": "4f20d79f-8092-43eb-8298-7492b8fc7b4a",
"isResizable": true,
"w": 13,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "设备基本信息",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "A"
},
"properties": {
"standardOptions": {
"util": "humantimeSeconds"
},
"valueMappings": [
{
"match": {
"to": 86400
},
"result": {
"color": "rgba(250, 6, 6, 1)"
},
"type": "range"
},
{
"match": {
"to": 2592000
},
"result": {
"color": "rgba(230, 198, 39, 1)"
},
"type": "range"
},
{
"match": {
"from": 2592000
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
}
}
],
"targets": [
{
"expr": "snmp_sys_uptime",
"instant": false,
"legend": "设备在线",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"ifDescr": false
},
"indexByName": {
"ifDescr": 0
},
"renameByName": {
"LocalIP": "管理IP地址",
"ifDescr": "",
"sys_name": "设备名称",
"sys_pm": "设备型号"
}
}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {
"title": 24,
"value": 24
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "1ce09285-bf26-4518-acbd-26db4e1292a2",
"layout": {
"h": 4,
"i": "1ce09285-bf26-4518-acbd-26db4e1292a2",
"isResizable": true,
"w": 4,
"x": 13,
"y": 0
},
"maxPerRow": 4,
"name": "Panel Title",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "snmp_TotalPorts",
"legend": "设备端口数",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"textMode": "valueAndName",
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "b0a707e6-2059-4158-8e44-ee19db45e12e",
"layout": {
"h": 4,
"i": "b0a707e6-2059-4158-8e44-ee19db45e12e",
"isResizable": true,
"w": 7,
"x": 17,
"y": 0
},
"maxPerRow": 4,
"name": "设备资源占用",
"options": {
"standardOptions": {
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#3FC453",
"type": "base",
"value": null
},
{
"color": "#FF9919",
"value": 60
},
{
"color": "#FF656B",
"value": 80
}
]
}
},
"targets": [
{
"expr": "snmp_cpu_usage",
"legend": "CPU使用率",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "snmp_mem_use / snmp_mem_max*100",
"legend": "内存使用率",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "gauge",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"ifDescr"
],
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true,
"sortColumn": "A",
"sortOrder": "descend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "89d8708e-051d-4bc6-b846-17ade266c77b",
"layout": {
"h": 11,
"i": "89d8708e-051d-4bc6-b846-17ade266c77b",
"isResizable": true,
"w": 13,
"x": 0,
"y": 3
},
"maxPerRow": 4,
"name": "设备端口状态",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "C"
},
"properties": {
"valueMappings": [
{
"match": {
"special": 1,
"to": 1
},
"result": {
"color": "rgba(63, 196, 83, 1)",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 2
},
"result": {
"color": "#000000",
"text": "DOWN"
},
"type": "special"
}
]
}
},
{
"matcher": {
"id": "byFrameRefID",
"value": "B"
},
"properties": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"match": {
"special": 1000
},
"result": {
"color": "rgba(108, 155, 248, 1)",
"text": "千兆"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#000000",
"text": "无连接"
},
"type": "special"
},
{
"match": {
"special": 100
},
"result": {
"color": "rgba(18, 238, 51, 1)",
"text": "百兆"
},
"type": "special"
},
{
"match": {
"special": 200
},
"result": {
"color": "rgba(33, 112, 44, 1)",
"text": "接入汇聚"
},
"type": "special"
},
{
"match": {
"special": 2000
},
"result": {
"color": "rgba(4, 62, 175, 1)",
"text": "核心堆叠"
},
"type": "special"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "snmp_interface_ifSpeed",
"legend": "端口速率",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "snmp_interface_ifOperStatus",
"legend": "端口状态",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"DCN": "IP地址",
"ifDescr": "端口名称",
"ifSpeed": "端口速率",
"sysname": "设备名称"
}
}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"fontBackground": false,
"reverseColorOrder": false,
"textMode": "value",
"valueField": "ifDescr"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "2307e3b9-af4c-4ffa-aae2-987d7c010f00",
"layout": {
"h": 10,
"i": "2307e3b9-af4c-4ffa-aae2-987d7c010f00",
"isResizable": true,
"w": 11,
"x": 13,
"y": 4
},
"maxPerRow": 4,
"name": "端口状态",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "snmp_interface_ifOperStatus",
"legend": "{{LocalIP}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"barMaxWidth": null,
"calc": "lastNotNull",
"colorField": "__name__",
"xAxisField": "ifDescr",
"yAxisField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "7178cb95-1023-483e-bc27-8dffe6749524",
"layout": {
"h": 6,
"i": "7178cb95-1023-483e-bc27-8dffe6749524",
"isResizable": true,
"w": 24,
"x": 0,
"y": 14
},
"maxPerRow": 4,
"name": "各个端口流量汇总",
"options": {
"standardOptions": {
"util": "bytesSI"
}
},
"targets": [
{
"expr": "snmp_interface_ifInOctets",
"legend": "端口IN流量",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "snmp_interface_ifOutOctets",
"legend": "端口OUT流量",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barchart",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.75,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": true,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${jieru}",
"id": "ccb183c2-8918-464f-beda-511dac02390d",
"layout": {
"h": 8,
"i": "ccb183c2-8918-464f-beda-511dac02390d",
"isResizable": true,
"w": 24,
"x": 0,
"y": 20
},
"maxPerRow": 4,
"name": "核心堆叠口流量汇总",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off",
"standardOptions": {
"util": "none"
}
}
}
],
"targets": [
{
"expr": "irate(snmp_interface_ifInOctets{ifDescr=~\"Vlan256\"}[5m])",
"legend": "IN",
"maxDataPoints": 240,
"refId": "A",
"time": {
"end": "now",
"start": "now-6h"
}
},
{
"expr": "-irate(snmp_interface_ifOutOctets{ ifDescr=~\"Vlan256\"}[5m])",
"legend": "OUT",
"maxDataPoints": 240,
"refId": "B",
"time": {
"end": "now",
"start": "now-6h"
}
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"defaultValue": "",
"definition": "prometheus",
"label": "数据源",
"name": "jieru",
"type": "datasource"
},
{
"allOption": true,
"allValue": "",
"datasource": {
"cate": "prometheus",
"value": "${jieru}"
},
"definition": "label_values(LocalIP)",
"label": "ip地址",
"multi": true,
"name": "ipadd",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${jieru}"
},
"definition": "label_values(sys_name)",
"label": "设备名称",
"multi": true,
"name": "sys_name",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328371021000
}
================================================
FILE: integrations/SNMP/dashboards/switch main.json
================================================
{
"id": 0,
"group_id": 0,
"name": "网络交换机监控大盘",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"graphTooltip": "default",
"graphZoom": "default",
"links": [],
"panels": [
{
"custom": {
"calc": "lastNotNull",
"colSpan": 8,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {
"title": 20,
"value": 20
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"id": "190d574b-502d-482f-8b6f-981730dba70e",
"layout": {
"h": 4,
"i": "190d574b-502d-482f-8b6f-981730dba70e",
"isResizable": true,
"w": 4,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "设备数量",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "rgba(99, 76, 217, 1)",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "count(snmp_sys_uptime)",
"instant": false,
"legend": "设备在线",
"maxDataPoints": 20,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"aggrDimension": [
"sys_name",
"LocalIP",
"sys_pm"
],
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"linkMode": "appendLinkColumn",
"links": [
{
"title": "详情",
"url": "http://182.182.61.13:17000/dashboards/h?ipadd=${__field.labels.LocalIP}\u0026jieru=1"
}
],
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": 1,
"id": "8b107213-6d65-4877-b3e2-31435bb8758c",
"layout": {
"h": 14,
"i": "8b107213-6d65-4877-b3e2-31435bb8758c",
"isResizable": true,
"w": 24,
"x": 0,
"y": 4
},
"maxPerRow": 4,
"name": "设备概览",
"options": {
"standardOptions": {},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "A"
},
"properties": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"to": 20
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"to": 50
},
"result": {
"color": "rgba(230, 198, 39, 1)"
},
"type": "range"
},
{
"match": {
"to": 100
},
"result": {
"color": "rgba(252, 4, 4, 1)"
},
"type": "range"
}
]
}
},
{
"matcher": {
"id": "byFrameRefID",
"value": "B"
},
"properties": {
"standardOptions": {
"decimals": 0,
"util": "percentUnit"
},
"valueMappings": [
{
"match": {
"to": 0.6
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"to": 0.8
},
"result": {
"color": "rgba(230, 198, 39, 1)"
},
"type": "range"
},
{
"match": {
"from": 0.8,
"to": 1
},
"result": {
"color": "rgba(253, 0, 0, 1)"
},
"type": "range"
}
]
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "C"
},
"properties": {
"standardOptions": {
"util": "humantimeSeconds"
},
"valueMappings": [
{
"match": {
"to": 86400
},
"result": {
"color": "rgba(250, 4, 4, 1)"
},
"type": "range"
},
{
"match": {
"to": 2592000
},
"result": {
"color": "rgba(230, 198, 39, 1)"
},
"type": "range"
},
{
"match": {
"from": 2592000
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "snmp_cpu_usage",
"legend": "CPU使用率",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "snmp_mem_use/snmp_mem_max",
"legend": "内存使用率",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "snmp_sys_uptime",
"legend": "开机时间",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"DCN": "IP地址",
"LocalIP": "IP地址",
"sys_name": "设备名称",
"sys_pm": "设备型号",
"sysname": "设备名称"
}
}
}
],
"type": "table",
"version": "3.0.0"
}
],
"var": [],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328376415000
}
================================================
FILE: integrations/SNMP/markdown/README.md
================================================
# snmp
> 监控网络设备,主要是通过 SNMP 协议,Categraf、Telegraf、Datadog-Agent、snmp_exporter 都提供了这个能力。
Categraf 从 v0.2.13 版本开始把 Telegraf 的 snmp 插件集成了进来,推荐大家采用这个插件来监控网络设备。这个插件的核心逻辑是:要采集什么指标,直接配置对应的 oid 即可,而且可以把一些 oid 采集到的数据当做时序数据的标签,非常非常灵活。
当然,弊端也有,因为 SNMP 体系里有大量的私有 oid,比如不同的设备获取 CPU、内存利用率的 oid 都不一样,这就需要为不同的型号的设备采用不同的配置,维护起来比较麻烦,需要大量的积累。这里我倡议大家把不同的设备型号的采集配置积累到 [这里](https://github.com/flashcatcloud/categraf/tree/main/inputs/snmp),每个型号一个文件夹,长期积累下来,那将是利人利己的好事。不知道如何提 PR 的可以联系我们。
另外,也不用太悲观,针对网络设备而言,大部分监控数据的采集都是通用 oid 就可以搞定的,举个例子:
```toml
interval = 120
[[instances]]
agents = ["udp://172.30.15.189:161"]
interval_times = 1
timeout = "5s"
version = 2
community = "public"
agent_host_tag = "switch_ip"
retries = 1
[[instances.field]]
oid = "RFC1213-MIB::sysUpTime.0"
name = "uptime"
[[instances.field]]
oid = "RFC1213-MIB::sysName.0"
name = "source"
is_tag = true
[[instances.table]]
oid = "IF-MIB::ifTable"
name = "interface"
inherit_tags = ["source"]
[[instances.table.field]]
oid = "IF-MIB::ifDescr"
name = "ifDescr"
is_tag = true
```
上面的样例是 v2 版本的配置,如果是 v3 版本,校验方式举例:
```toml
version = 3
sec_name = "managev3user"
auth_protocol = "SHA"
auth_password = "example.Demo.c0m"
```
另外,snmp 的采集,建议大家部署单独的 Categraf 来做,因为不同监控对象采集频率可能不同,比如边缘交换机,我们 5min 采集一次就够了,核心交换机可以配置得频繁一些,比如 60s 或者 120s。
> 注意:如果采集得过于频繁,有些老款的交换机可能会被打挂,或者被限流,被限流的结果就是在图上看到断点。
## 扩展阅读
- [SNMP(简单网络管理协议)简介](https://flashcat.cloud/blog/snmp-introduction/)
- [SNMP命令相关参数介绍](https://flashcat.cloud/blog/snmp-command-arguments/)
- [通过 Categraf SNMP 插件采集监控数据](https://flashcat.cloud/blog/snmp-metrics-collect-by-categraf/)
## 排错
要想通过 categraf 采集到 snmp 数据,首先要保证 categraf 所在的机器能够连通网络设备,可以通过 snmpget 命令来做测试:
```bash
snmpget -v2c -c public 172.30.15.189 RFC1213-MIB::sysUpTime.0
```
如果 snmpget 都跑不通,就得先解决这个问题,比如 snmpd 没有启动、防火墙限制了 snmp 的访问,或者 snmpget 命令没有安装,等等。这些问题,gpt 和 google 都可以解决,这里不再赘述。
================================================
FILE: integrations/SQLServer/collect/sqlserver/sqlserver.toml
================================================
# # collect interval
# interval = 15
[[instances]]
## Specify instances to monitor with a list of connection strings.
## All connection parameters are optional.
## By default, the host is localhost, listening on default port, TCP 1433.
## for Windows, the user is the currently running AD user (SSO).
## See https://github.com/denisenkom/go-mssqldb for detailed connection
## parameters, in particular, tls connections can be created like so:
## "encrypt=true;certificate=;hostNameInCertificate="
# servers = ["Server=server.xxx.com;Port=1433;User Id=monitor;Password=xxxxxx;app name=categraf;log=1;"]
# servers = [ ]
## Authentication method
## valid methods: "connection_string", "AAD"
# auth_method = "connection_string"
## "database_type" enables a specific set of queries depending on the database type. If specified, it replaces azuredb = true/false and query_version = 2
## In the config file, the sql server plugin section should be repeated each with a set of servers for a specific database_type.
## Possible values for database_type are - "SQLServer" or "AzureSQLDB" or "AzureSQLManagedInstance" or "AzureSQLPool"
database_type = "SQLServer"
## A list of queries to include. If not specified, all the below listed queries are used.
include_query = []
## A list of queries to explicitly ignore.
exclude_query = ["SQLServerAvailabilityReplicaStates", "SQLServerDatabaseReplicaStates"]
## Queries enabled by default for database_type = "SQLServer" are -
## SQLServerPerformanceCounters, SQLServerWaitStatsCategorized, SQLServerDatabaseIO, SQLServerProperties, SQLServerMemoryClerks,
## SQLServerSchedulers, SQLServerRequests, SQLServerVolumeSpace, SQLServerCpu, SQLServerAvailabilityReplicaStates, SQLServerDatabaseReplicaStates,
## SQLServerRecentBackups
## Following are old config settings
## You may use them only if you are using the earlier flavor of queries; however, it is recommended to use
## the new mechanism of identifying the database_type and thereby using its corresponding queries.
## Optional parameter, setting this to 2 will use a new version
## of the collection queries that break compatibility with the original
## dashboards.
## Version 2 - is compatible from SQL Server 2012 and later versions and also for SQL Azure DB
# query_version = 2
## Toggling this to true will emit an additional metric called "sqlserver_telegraf_health".
## This metric tracks the count of attempted queries and successful queries for each SQL instance specified in "servers".
## The purpose of this metric is to assist with identifying and diagnosing any connectivity or query issues.
## This setting/metric is optional and is disabled by default.
# health_metric = false
## Possible queries across different versions of the collectors
## Queries enabled by default for specific Database Type
## database_type = SQLServer by default collects the following queries
## - SQLServerPerformanceCounters
## - SQLServerWaitStatsCategorized
## - SQLServerDatabaseIO
## - SQLServerProperties
## - SQLServerMemoryClerks
## - SQLServerSchedulers
## - SQLServerRequests
## - SQLServerVolumeSpace
## - SQLServerCpu
## - SQLServerRecentBackups
## and following as optional (if mentioned in the include_query list)
## - SQLServerAvailabilityReplicaStates
## - SQLServerDatabaseReplicaStates
## Version 2 by default collects the following queries
## Version 2 is being deprecated, please consider using database_type.
## - PerformanceCounters
## - WaitStatsCategorized
## - DatabaseIO
## - ServerProperties
## - MemoryClerk
## - Schedulers
## - SqlRequests
## - VolumeSpace
## - Cpu
## Version 1 by default collects the following queries
## Version 1 is deprecated, please consider using database_type.
## - PerformanceCounters
## - WaitStatsCategorized
## - CPUHistory
## - DatabaseIO
## - DatabaseSize
## - DatabaseStats
## - DatabaseProperties
## - MemoryClerk
## - VolumeSpace
## - PerformanceMetrics
================================================
FILE: integrations/SQLServer/dashboards/sqlserver.json
================================================
{
"id": 0,
"group_id": 0,
"name": "SQLServer",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"aggrDimension": [],
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"linkMode": "appendLinkColumn",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "28f81145-c0d2-49bc-9b6c-28e969c25537",
"layout": {
"h": 3,
"i": "28f81145-c0d2-49bc-9b6c-28e969c25537",
"isResizable": true,
"w": 24,
"x": 0,
"y": 0
},
"maxPerRow": 4,
"name": "Server resource overview",
"options": {
"standardOptions": {}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID",
"value": "F"
},
"properties": {
"standardOptions": {
"util": "seconds"
},
"valueMappings": [
{
"match": {
"from": 0,
"special": 0,
"to": 100
},
"result": {
"color": "#ff9919"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#3fc453"
},
"type": "range"
}
]
}
},
{
"matcher": {
"id": "byFrameRefID",
"value": "D"
},
"properties": {
"standardOptions": {
"util": "none"
}
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "H"
},
"properties": {
"standardOptions": {
"util": "bytesSecSI"
}
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "B"
},
"properties": {
"standardOptions": {
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 102400000,
"textValue": "200",
"to": null
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 0,
"to": 100000000
},
"result": {
"color": "#ffae39"
},
"type": "range"
}
]
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "G"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#3fc453"
},
"type": "special"
}
]
},
"type": "special"
},
{
"matcher": {
"id": "byFrameRefID",
"value": "I"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#3fc453"
},
"type": "special"
},
{
"match": {
"from": 1
},
"result": {
"color": "#ff9919"
},
"type": "range"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "sqlserver_performance_value{counter=\"Total Server Memory (KB)\",serverName=\"$instance\"}",
"legend": "Total RAM",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sqlserver_performance_value{counter=\"Total Server Memory (KB)\",serverName=\"$instance\"}-sqlserver_performance_value{counter=\"Used memory (KB)\",serverName=\"$instance\"}",
"legend": "RAM available",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sqlserver_performance_value{counter=\"Free Space in tempdb (KB)\",serverName=\"$instance\"}",
"legend": "Pagefile available size",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "sqlserver_performance_value{counter=\"Page life expectancy\",serverName=\"$instance\"}",
"legend": "Total page faults",
"maxDataPoints": 240,
"refId": "D"
},
{
"expr": "sqlserver_performance_value{counter=\"Batch Requests/sec\",serverName=\"$instance\"}",
"legend": "Batch reqs /sec",
"maxDataPoints": 240,
"refId": "E"
},
{
"expr": "sqlserver_performance_value{counter=\"Page life expectancy\",serverName=\"$instance\"}",
"legend": "Page life expectancy(sec)",
"maxDataPoints": 240,
"refId": "F"
},
{
"expr": "sqlserver_performance_value{counter=\"Number of Deadlocks/sec\",serverName=\"$instance\"}",
"legend": "Deadlocks",
"maxDataPoints": 240,
"refId": "G"
},
{
"expr": "sqlserver_performance_value{counter=\"Errors/sec\",serverName=\"$instance\"}",
"legend": "User errors /sec",
"maxDataPoints": 240,
"refId": "H"
},
{
"expr": "mssql_kill_connection_errors{job=~\"$Job\"}-0",
"legend": "Kill conn errors /sec",
"maxDataPoints": 240,
"refId": "I"
},
{
"expr": "sqlserver_cpu_system_idle_cpu{serverName=\"$instance\"}",
"legend": "sqlserver_cpu_system_idle",
"maxDataPoints": 240,
"refId": "K"
},
{
"expr": "sqlserver_up{serverName=\"$instance\"}",
"legend": "sqlserver_up",
"maxDataPoints": 240,
"refId": "L"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "77467d35-8002-4211-a95d-ed6278567ab4",
"layout": {
"h": 1,
"i": "77467d35-8002-4211-a95d-ed6278567ab4",
"isResizable": false,
"w": 24,
"x": 0,
"y": 3
},
"name": "Summary",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f8795e89-0e61-43ec-9c05-90424bf7eb60",
"layout": {
"h": 7,
"i": "f8795e89-0e61-43ec-9c05-90424bf7eb60",
"isResizable": true,
"w": 7,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "当前数据库连接",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "table"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sqlserver_performance_value{counter=\"Logical Connections\",serverName=\"$instance\"}",
"legend": "{{sql_instance}} ",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9247c782-4abf-4423-84ba-6453d5491e23",
"layout": {
"h": 7,
"i": "9247c782-4abf-4423-84ba-6453d5491e23",
"isResizable": true,
"w": 9,
"x": 7,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "DB Log growth since last restart",
"options": {
"legend": {
"behaviour": "showItem",
"columns": [
"last"
],
"displayMode": "list",
"placement": "right"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sqlserver_performance_value{counter=\"Log Growths\",serverName=\"$instance\"}",
"legend": "{{instance}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "47686441-a9d6-4ede-8901-5d05ce3ef8b8",
"layout": {
"h": 7,
"i": "47686441-a9d6-4ede-8901-5d05ce3ef8b8",
"isResizable": true,
"w": 8,
"x": 16,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Number of Deadlocks/sec",
"options": {
"legend": {
"behaviour": "showItem",
"columns": [
"last"
],
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sqlserver_performance_value{counter=\"Number of Deadlocks/sec\",serverName=\"$instance\"}",
"legend": "Number of Deadlocks/sec",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"baseColor": "#9470FF",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "de76a102-cf5e-40b6-9e2d-833446b48ec2",
"layout": {
"h": 7,
"i": "28d8f090-b9c1-4bf9-a2ff-fa4f1d3da661",
"isResizable": true,
"w": 7,
"x": 0,
"y": 11
},
"links": [],
"maxPerRow": 4,
"name": "硬盘空闲空间",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sqlserver_volume_space_available_space_bytes{serverName=\"$instance\"}",
"instant": true,
"legend": "硬盘空闲空间",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "barGauge",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9f7ac94a-cc31-4a9b-a242-3ef3daaccd9e",
"layout": {
"h": 7,
"i": "db5406f3-7877-42b5-a5d8-23d02be39d4f",
"isResizable": true,
"w": 9,
"x": 7,
"y": 11
},
"links": [],
"maxPerRow": 4,
"name": "CPU",
"options": {
"legend": {
"behaviour": "showItem",
"columns": [
"last"
],
"displayMode": "list",
"placement": "right"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sqlserver_cpu_sqlserver_process_cpu{serverName=\"$instance\"}",
"legend": "CPU",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "875eba8d-b99d-462d-99e9-00bd9682e713",
"layout": {
"h": 7,
"i": "92d37458-9b5a-4f61-8779-9e5f23e98d27",
"isResizable": true,
"w": 8,
"x": 16,
"y": 11
},
"links": [],
"maxPerRow": 4,
"name": "Total wait time of I/O stall",
"options": {
"legend": {
"behaviour": "showItem",
"columns": [
"last"
],
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sqlserver_database_io_read_latency_ms{serverName=\"$instance\",database_name=\"h3cloud\"}",
"instant": false,
"legend": "io_read_latency_ms",
"maxDataPoints": 240,
"refId": "A",
"time": {
"end": "now",
"start": "now-1h"
}
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "bb996a6d-ddec-46c3-8c64-dd2a57a13fd4",
"layout": {
"h": 1,
"i": "bb996a6d-ddec-46c3-8c64-dd2a57a13fd4",
"isResizable": false,
"w": 24,
"x": 0,
"y": 18
},
"name": "Database I/O wait of stall detailed",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "9c5037d4-563c-412f-b4e9-103d90ecc62d",
"layout": {
"h": 8,
"i": "9c5037d4-563c-412f-b4e9-103d90ecc62d",
"isResizable": true,
"w": 24,
"x": 0,
"y": 19
},
"links": [],
"maxPerRow": 3,
"name": "Database wait by I/O stall ",
"options": {
"legend": {
"behaviour": "showItem",
"columns": [
"last"
],
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"repeat": "database",
"targets": [
{
"expr": "sqlserver_database_io_rg_write_stall_ms{serverName=\"$instance\"}+sqlserver_database_io_rg_read_stall_ms{serverName=\"$instance\"}",
"legend": "{{database_name}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"label": "DS_PROMETHEUS",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(sqlserver_up,sql_instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328380733000
}
================================================
FILE: integrations/SQLServer/markdown/README.md
================================================
# sqlserver
forked from telegraf/sqlserver. 这个插件的作用是获取sqlserver的监控指标,这里去掉了Azure相关部分监控,只保留了本地部署sqlserver情况。
# 使用
按照下面的方法创建监控账号,用于读取监控数据:

```sql
USE master;
CREATE LOGIN [categraf] WITH PASSWORD = N'mystrongpassword';
GRANT VIEW SERVER STATE TO [categraf];
GRANT VIEW ANY DEFINITION TO [categraf];
```

连接 SQL Server 的连接串示例如下(请将地址、数据库名、账号和密码替换为实际值):

```
Data Source=10.19.1.1;Initial Catalog=hc;User ID=sa;Password=mystrongpassword;
```
================================================
FILE: integrations/SpringBoot/alerts/alerts.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "springboot HEAP内存使用率大于20%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(jvm_memory_used_bytes{instance=~\".+\", area=\"heap\"})*100/sum(jvm_memory_max_bytes{instance=~\".+\", area=\"heap\"}) \u003e20",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328389228000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "springboot HTTP请求延迟大于10s",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "max(http_server_requests_seconds_max{instance=~\".+\", status!~\"5..\"}) \u003e 10",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328389834000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "springboot HTTP错误数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(rate(http_server_requests_seconds_count{instance=~\".+\", status=~\"5.+\"}[1m])) != 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328390482000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "springboot NONHEAP内存使用率大于30%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(jvm_memory_used_bytes{instance=~\".+\", area=\"nonheap\"})*100/sum(jvm_memory_max_bytes{instance=~\".+\", area=\"nonheap\"}) \u003e 30",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328391132000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "springboot 事件错误数",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase(logback_events_total{instance=\"$instance\"}[1m]) \u003e 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328391712000
}
]
================================================
FILE: integrations/SpringBoot/dashboards/JVM(Actuator)withapplicationname.json
================================================
{
"id": 0,
"group_id": 0,
"name": "JVM (Actuator) with application name",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "98ba9b06-9d1e-44b9-a626-f4e704cc39ac",
"layout": {
"h": 1,
"i": "98ba9b06-9d1e-44b9-a626-f4e704cc39ac",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Quick Facts",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c325f6ba-bca2-42f1-a518-1d3077b54a54",
"layout": {
"h": 3,
"i": "c325f6ba-bca2-42f1-a518-1d3077b54a54",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Start time",
"options": {
"standardOptions": {
"util": "datetimeMilliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "process_start_time_seconds{instance=\"$instance\"}*1000",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "03627849-92c0-4a5c-9558-738dc6bd6186",
"layout": {
"h": 3,
"i": "03627849-92c0-4a5c-9558-738dc6bd6186",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Heap used",
"options": {
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{instance=\"$instance\", area=\"heap\"})*100/sum(jvm_memory_max_bytes{instance=\"$instance\", area=\"heap\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4e85e1c3-9623-4e1c-bbb2-a4824455c5b3",
"layout": {
"h": 3,
"i": "4e85e1c3-9623-4e1c-bbb2-a4824455c5b3",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Non-Heap used",
"options": {
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{instance=\"$instance\", area=\"nonheap\"})*100/sum(jvm_memory_max_bytes{instance=\"$instance\", area=\"nonheap\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "95bc69c0-0b29-4dc5-8404-c0ca0741918f",
"layout": {
"h": 3,
"i": "95bc69c0-0b29-4dc5-8404-c0ca0741918f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 2
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "process_uptime_seconds{instance=\"$instance\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "abe6c0c1-8b88-450a-908f-0f51cc973761",
"layout": {
"h": 1,
"i": "abe6c0c1-8b88-450a-908f-0f51cc973761",
"isResizable": false,
"w": 24,
"x": 0,
"y": 5
},
"name": "I/O Overview",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4d5bedf5-2b3c-425b-a4be-b5e7f36300af",
"layout": {
"h": 7,
"i": "4d5bedf5-2b3c-425b-a4be-b5e7f36300af",
"isResizable": true,
"w": 6,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Rate",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_count{instance=\"$instance\"}[5m]))",
"legend": "HTTP",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "68f7255b-6637-4967-906d-a6d4c9a61d88",
"layout": {
"h": 7,
"i": "68f7255b-6637-4967-906d-a6d4c9a61d88",
"isResizable": true,
"w": 6,
"x": 6,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Errors",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_count{instance=\"$instance\", status=~\"5..\"}[1m]))",
"legend": "HTTP - 5xx",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "07d11fe1-7005-420f-bbfc-16f304294ef7",
"layout": {
"h": 7,
"i": "07d11fe1-7005-420f-bbfc-16f304294ef7",
"isResizable": true,
"w": 6,
"x": 12,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_sum{instance=\"$instance\", status!~\"5..\"}[1m]))/sum(rate(http_server_requests_seconds_count{instance=\"$instance\", status!~\"5..\"}[1m]))",
"legend": "HTTP - AVG",
"refId": "A"
},
{
"expr": "max(http_server_requests_seconds_max{instance=\"$instance\", status!~\"5..\"})",
"legend": "HTTP - MAX",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "caf07cdd-42e4-427b-b511-ee967246330a",
"layout": {
"h": 7,
"i": "caf07cdd-42e4-427b-b511-ee967246330a",
"isResizable": true,
"w": 6,
"x": 18,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Utilisation",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "tomcat_threads_busy{instance=\"$instance\"} or tomcat_threads_busy_threads{instance=\"$instance\"}",
"legend": "TOMCAT - BSY",
"refId": "A"
},
{
"expr": "tomcat_threads_current{instance=\"$instance\"} or tomcat_threads_current_threads{instance=\"$instance\"}",
"legend": "TOMCAT - CUR",
"refId": "B"
},
{
"expr": "tomcat_threads_config_max{instance=\"$instance\"} or tomcat_threads_config_max_threads{instance=\"$instance\"}",
"legend": "TOMCAT - MAX",
"refId": "C"
},
{
"expr": "jetty_threads_busy{instance=\"$instance\"}",
"legend": "JETTY - BSY",
"refId": "D"
},
{
"expr": "jetty_threads_current{instance=\"$instance\"}",
"legend": "JETTY - CUR",
"refId": "E"
},
{
"expr": "jetty_threads_config_max{instance=\"$instance\"}",
"legend": "JETTY - MAX",
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "83e3307c-b862-471b-a14f-9f88c8b67dbf",
"layout": {
"h": 1,
"i": "83e3307c-b862-471b-a14f-9f88c8b67dbf",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "JVM Memory",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1aeab3d4-de80-4795-ae7e-57c9b085a074",
"layout": {
"h": 7,
"i": "1aeab3d4-de80-4795-ae7e-57c9b085a074",
"isResizable": true,
"w": 6,
"x": 0,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Heap",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{instance=\"$instance\", area=\"heap\"})",
"legend": "used",
"refId": "A"
},
{
"expr": "sum(jvm_memory_committed_bytes{instance=\"$instance\", area=\"heap\"})",
"legend": "committed",
"refId": "B"
},
{
"expr": "sum(jvm_memory_max_bytes{instance=\"$instance\", area=\"heap\"})",
"legend": "max",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "dd9a63b6-9a33-416f-8dbe-dbbae1ff3fed",
"layout": {
"h": 7,
"i": "dd9a63b6-9a33-416f-8dbe-dbbae1ff3fed",
"isResizable": true,
"w": 6,
"x": 6,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Non-Heap",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{instance=\"$instance\", area=\"nonheap\"})",
"legend": "used",
"refId": "A"
},
{
"expr": "sum(jvm_memory_committed_bytes{instance=\"$instance\", area=\"nonheap\"})",
"legend": "committed",
"refId": "B"
},
{
"expr": "sum(jvm_memory_max_bytes{instance=\"$instance\", area=\"nonheap\"})",
"legend": "max",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "df058502-188f-4711-b1da-3240a39cb35b",
"layout": {
"h": 7,
"i": "df058502-188f-4711-b1da-3240a39cb35b",
"isResizable": true,
"w": 6,
"x": 12,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Total",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{instance=\"$instance\"})",
"legend": "used",
"refId": "A"
},
{
"expr": "sum(jvm_memory_committed_bytes{instance=\"$instance\"})",
"legend": "committed",
"refId": "B"
},
{
"expr": "sum(jvm_memory_max_bytes{instance=\"$instance\"})",
"legend": "max",
"refId": "C"
},
{
"expr": "process_memory_rss_bytes{instance=\"$instance\"}",
"legend": "rss",
"refId": "E"
},
{
"expr": "process_memory_pss_bytes{instance=\"$instance\"}",
"legend": "pss",
"refId": "F"
},
{
"expr": "process_memory_swap_bytes{instance=\"$instance\"}",
"legend": "swap",
"refId": "G"
},
{
"expr": "process_memory_swappss_bytes{instance=\"$instance\"}",
"legend": "swappss",
"refId": "H"
},
{
"expr": "process_memory_pss_bytes{instance=\"$instance\"} + process_memory_swap_bytes{instance=\"$instance\"}",
"legend": "phys (pss+swap)",
"refId": "I"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1607d97f-074a-457c-90f2-c6ccbf5ffef3",
"layout": {
"h": 7,
"i": "1607d97f-074a-457c-90f2-c6ccbf5ffef3",
"isResizable": true,
"w": 6,
"x": 18,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Native Memory",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(process_memory_pss_bytes{instance=\"$instance\"} + process_memory_swap_bytes{instance=\"$instance\"} - on(instance) sum(jvm_memory_committed_bytes{instance=\"$instance\"}) by(instance)) \u003e= 0",
"legend": "native",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "ea05f925-19aa-4ba4-8daa-4133e939c5fb",
"layout": {
"h": 1,
"i": "ea05f925-19aa-4ba4-8daa-4133e939c5fb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "JVM Misc",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8e65b992-ea8b-40b0-b201-23c8e54436a5",
"layout": {
"h": 7,
"i": "8e65b992-ea8b-40b0-b201-23c8e54436a5",
"isResizable": true,
"w": 6,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "CPU",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "system_cpu_usage{instance=\"$instance\"}",
"legend": "system",
"refId": "A"
},
{
"expr": "process_cpu_usage{instance=\"$instance\"}",
"legend": "process",
"refId": "B"
},
{
"expr": "avg_over_time(process_cpu_usage{instance=\"$instance\"}[1h])",
"legend": "process-1h",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c8db3f1c-b857-473b-be1a-4d4d5b1d7618",
"layout": {
"h": 7,
"i": "c8db3f1c-b857-473b-be1a-4d4d5b1d7618",
"isResizable": true,
"w": 6,
"x": 6,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "File Descriptors",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "process_open_fds{instance=\"$instance\"}",
"legend": "open",
"refId": "A"
},
{
"expr": "process_max_fds{instance=\"$instance\"}",
"legend": "max",
"refId": "B"
},
{
"expr": "process_files_open{instance=\"$instance\"} or process_files_open_files{instance=\"$instance\"}",
"legend": "open",
"refId": "C"
},
{
"expr": "process_files_max{instance=\"$instance\"} or process_files_max_files{instance=\"$instance\"}",
"legend": "max",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d1e449d5-31e3-4df9-a69a-c22edab3a44e",
"layout": {
"h": 7,
"i": "d1e449d5-31e3-4df9-a69a-c22edab3a44e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Threads",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_threads_live{instance=\"$instance\"} or jvm_threads_live_threads{instance=\"$instance\"}",
"legend": "live",
"refId": "A"
},
{
"expr": "jvm_threads_daemon{instance=\"$instance\"} or jvm_threads_daemon_threads{instance=\"$instance\"}",
"legend": "daemon",
"refId": "B"
},
{
"expr": "jvm_threads_peak{instance=\"$instance\"} or jvm_threads_peak_threads{instance=\"$instance\"}",
"legend": "peak",
"refId": "C"
},
{
"expr": "process_threads{instance=\"$instance\"}",
"legend": "process",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b8ef3a3-f687-4cf9-8348-da9e60bfa0f5",
"layout": {
"h": 7,
"i": "3b8ef3a3-f687-4cf9-8348-da9e60bfa0f5",
"isResizable": true,
"w": 6,
"x": 18,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Thread States",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_threads_states_threads{instance=\"$instance\"}",
"legend": "{{state}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2e02407c-9beb-460a-bca8-9b4dfd2e3a37",
"layout": {
"h": 8,
"i": "2e02407c-9beb-460a-bca8-9b4dfd2e3a37",
"isResizable": true,
"w": 24,
"x": 0,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "Log Events (5m)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(logback_events_total{instance=\"$instance\"}[5m])",
"legend": "{{level}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "84242a2f-64ec-4b71-8a9e-5441eb3ac271",
"layout": {
"h": 1,
"i": "84242a2f-64ec-4b71-8a9e-5441eb3ac271",
"isResizable": false,
"w": 24,
"x": 0,
"y": 37
},
"name": "JVM Memory Pools (Heap)",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "22a3c8aa-4452-46c9-9439-4a5c524ab79e",
"layout": {
"h": 8,
"i": "22a3c8aa-4452-46c9-9439-4a5c524ab79e",
"isResizable": true,
"w": 24,
"x": 0,
"y": 38
},
"links": [],
"maxPerRow": 3,
"name": "$jvm_memory_pool_heap",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"repeat": "jvm_memory_pool_heap",
"targets": [
{
"expr": "jvm_memory_used_bytes{instance=\"$instance\", id=\"$jvm_memory_pool_heap\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_memory_committed_bytes{instance=\"$instance\", id=\"$jvm_memory_pool_heap\"}",
"legend": "commited",
"refId": "B"
},
{
"expr": "jvm_memory_max_bytes{instance=\"$instance\", id=\"$jvm_memory_pool_heap\"}",
"legend": "max",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "5b6abe6d-7496-454c-9ab9-fcb7f068f626",
"layout": {
"h": 1,
"i": "5b6abe6d-7496-454c-9ab9-fcb7f068f626",
"isResizable": false,
"w": 24,
"x": 0,
"y": 102
},
"name": "JVM Memory Pools (Non-Heap)",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6ebac55a-4d88-4f3e-8414-5df8e27d3d60",
"layout": {
"h": 7,
"i": "6ebac55a-4d88-4f3e-8414-5df8e27d3d60",
"isResizable": true,
"w": 24,
"x": 0,
"y": 103
},
"links": [],
"maxPerRow": 3,
"name": "$jvm_memory_pool_nonheap",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"repeat": "jvm_memory_pool_nonheap",
"targets": [
{
"expr": "jvm_memory_used_bytes{instance=\"$instance\", id=\"$jvm_memory_pool_nonheap\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_memory_committed_bytes{instance=\"$instance\", id=\"$jvm_memory_pool_nonheap\"}",
"legend": "commited",
"refId": "B"
},
{
"expr": "jvm_memory_max_bytes{instance=\"$instance\", id=\"$jvm_memory_pool_nonheap\"}",
"legend": "max",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "ec6d25b8-cfe1-42c0-afc4-c32886432636",
"layout": {
"h": 1,
"i": "ec6d25b8-cfe1-42c0-afc4-c32886432636",
"isResizable": false,
"w": 24,
"x": 0,
"y": 152
},
"name": "Garbage Collection",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "adf56723-9689-4202-a7dd-1d9d844a5e66",
"layout": {
"h": 7,
"i": "adf56723-9689-4202-a7dd-1d9d844a5e66",
"isResizable": true,
"w": 8,
"x": 0,
"y": 153
},
"links": [],
"maxPerRow": 4,
"name": "Collections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_pause_seconds_count{instance=\"$instance\"}[5m])",
"legend": "{{action}} ({{cause}})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7caea97e-1cfb-45c6-892f-fe89377bd8fa",
"layout": {
"h": 7,
"i": "7caea97e-1cfb-45c6-892f-fe89377bd8fa",
"isResizable": true,
"w": 8,
"x": 8,
"y": 153
},
"links": [],
"maxPerRow": 4,
"name": "Pause Durations",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_pause_seconds_sum{instance=\"$instance\"}[5m])/rate(jvm_gc_pause_seconds_count{instance=\"$instance\"}[5m])",
"legend": "avg {{action}} ({{cause}})",
"refId": "A"
},
{
"expr": "jvm_gc_pause_seconds_max{instance=\"$instance\"}",
"legend": "max {{action}} ({{cause}})",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "00223434-f6e0-4c53-8562-38bcb46948f4",
"layout": {
"h": 7,
"i": "00223434-f6e0-4c53-8562-38bcb46948f4",
"isResizable": true,
"w": 8,
"x": 16,
"y": 153
},
"links": [],
"maxPerRow": 4,
"name": "Allocated/Promoted",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_memory_allocated_bytes_total{instance=\"$instance\"}[5m])",
"legend": "allocated",
"refId": "A"
},
{
"expr": "rate(jvm_gc_memory_promoted_bytes_total{instance=\"$instance\"}[1m])",
"legend": "promoted",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "a199f2f8-136c-44f4-aea5-da20e0e479fb",
"layout": {
"h": 1,
"i": "a199f2f8-136c-44f4-aea5-da20e0e479fb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 160
},
"name": "Classloading",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "903e6501-6f64-4860-a12a-4e07a7424ef8",
"layout": {
"h": 7,
"i": "903e6501-6f64-4860-a12a-4e07a7424ef8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 161
},
"links": [],
"maxPerRow": 4,
"name": "Classes loaded",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_classes_loaded{instance=\"$instance\"} or jvm_classes_loaded_classes{instance=\"$instance\"}",
"legend": "loaded",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "dc21d536-f4c0-48c0-99f1-eef82b30e979",
"layout": {
"h": 7,
"i": "dc21d536-f4c0-48c0-99f1-eef82b30e979",
"isResizable": true,
"w": 12,
"x": 12,
"y": 161
},
"links": [],
"maxPerRow": 4,
"name": "Class delta (5m)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "delta(jvm_classes_loaded{application=\"$application\",instance=\"$instance\"}[5m]) or delta(jvm_classes_loaded_classes{application=\"$application\",instance=\"$instance\"}[5m])",
"legend": "delta",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "6c164737-84da-4d51-911b-38e207bef57e",
"layout": {
"h": 1,
"i": "6c164737-84da-4d51-911b-38e207bef57e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 168
},
"name": "Buffer Pools",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9ac2c14e-9871-4b84-a5cb-976f8dfe6c19",
"layout": {
"h": 7,
"i": "9ac2c14e-9871-4b84-a5cb-976f8dfe6c19",
"isResizable": true,
"w": 6,
"x": 0,
"y": 169
},
"links": [],
"maxPerRow": 4,
"name": "Direct Buffers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_buffer_memory_used_bytes{instance=\"$instance\", id=\"direct\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_buffer_total_capacity_bytes{instance=\"$instance\", id=\"direct\"}",
"legend": "capacity",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f38df099-4559-4239-b283-21bd3949c81d",
"layout": {
"h": 7,
"i": "f38df099-4559-4239-b283-21bd3949c81d",
"isResizable": true,
"w": 6,
"x": 6,
"y": 169
},
"links": [],
"maxPerRow": 4,
"name": "Direct Buffers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_buffer_count{instance=\"$instance\", id=\"direct\"} or jvm_buffer_count_buffers{instance=\"$instance\", id=\"direct\"}",
"legend": "count",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef8f0a38-f98e-428c-a601-186dccafd2c1",
"layout": {
"h": 7,
"i": "ef8f0a38-f98e-428c-a601-186dccafd2c1",
"isResizable": true,
"w": 6,
"x": 12,
"y": 169
},
"links": [],
"maxPerRow": 4,
"name": "Mapped Buffers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_buffer_memory_used_bytes{instance=\"$instance\", id=\"mapped\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_buffer_total_capacity_bytes{instance=\"$instance\", id=\"mapped\"}",
"legend": "capacity",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3abe8423-f8fd-44b8-8321-1753943d4e9c",
"layout": {
"h": 7,
"i": "3abe8423-f8fd-44b8-8321-1753943d4e9c",
"isResizable": true,
"w": 6,
"x": 18,
"y": 169
},
"links": [],
"maxPerRow": 4,
"name": "Mapped Buffers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "jvm_buffer_count{instance=\"$instance\", id=\"mapped\"} or jvm_buffer_count_buffers{instance=\"$instance\", id=\"mapped\"}",
"legend": "count",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"label": "datasource",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes, application)",
"multi": false,
"name": "application",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes{application=\"$application\", }, instance)",
"multi": false,
"name": "instance",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes{application=\"$application\",instance=\"$instance\",area=\"heap\", },id)",
"multi": false,
"name": "jvm_memory_pool_heap",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes{application=\"$application\",instance=\"$instance\",area=\"nonheap\"}, id)",
"multi": false,
"name": "jvm_memory_pool_nonheap",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328396955000
}
================================================
FILE: integrations/SpringBoot/dashboards/JVM.json
================================================
{
"name": "JVM仪表盘",
"tags": "Actuator",
"configs": {
"links": [],
"panels": [
{
"collapsed": true,
"id": "98ba9b06-9d1e-44b9-a626-f4e704cc39ac",
"layout": {
"h": 1,
"i": "98ba9b06-9d1e-44b9-a626-f4e704cc39ac",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Quick Facts",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 2,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c325f6ba-bca2-42f1-a518-1d3077b54a54",
"layout": {
"h": 4,
"i": "c325f6ba-bca2-42f1-a518-1d3077b54a54",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Start time",
"options": {
"standardOptions": {
"util": "datetimeMilliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "process_start_time_seconds{ident=~\"$instance\"}*1000",
"legend": "{{ident}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 2,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "95bc69c0-0b29-4dc5-8404-c0ca0741918f",
"layout": {
"h": 4,
"i": "95bc69c0-0b29-4dc5-8404-c0ca0741918f",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "process_uptime_seconds{ident=~\"$instance\"}",
"legend": "{{ident}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 2,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "03627849-92c0-4a5c-9558-738dc6bd6186",
"layout": {
"h": 4,
"i": "03627849-92c0-4a5c-9558-738dc6bd6186",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Heap used",
"options": {
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{ident=~\"$instance\", area=\"heap\"}) by(ident)/on(ident)(sum(jvm_memory_max_bytes{ident=~\"$instance\", area=\"heap\"}) by(ident)) * 100",
"legend": "{{ident}}",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 2,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4e85e1c3-9623-4e1c-bbb2-a4824455c5b3",
"layout": {
"h": 4,
"i": "4e85e1c3-9623-4e1c-bbb2-a4824455c5b3",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Non-Heap used",
"options": {
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{ident=~\"$instance\", area=\"nonheap\"})by(ident)/on(ident)(sum(jvm_memory_max_bytes{ident=~\"$instance\", area=\"nonheap\"})by(ident))*100",
"legend": "",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "abe6c0c1-8b88-450a-908f-0f51cc973761",
"layout": {
"h": 1,
"i": "abe6c0c1-8b88-450a-908f-0f51cc973761",
"isResizable": false,
"w": 24,
"x": 0,
"y": 5
},
"name": "I/O Overview",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4d5bedf5-2b3c-425b-a4be-b5e7f36300af",
"layout": {
"h": 7,
"i": "4d5bedf5-2b3c-425b-a4be-b5e7f36300af",
"isResizable": true,
"w": 6,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Rate",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_count{ident=~\"$instance\"}[1m])) by (ident)",
"legend": "{{ident}}-HTTP",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "68f7255b-6637-4967-906d-a6d4c9a61d88",
"layout": {
"h": 7,
"i": "68f7255b-6637-4967-906d-a6d4c9a61d88",
"isResizable": true,
"w": 6,
"x": 6,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Errors",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(rate(http_server_requests_seconds_count{ident=~\"$instance\", status=~\"5..\"}[1m])) by (ident)",
"legend": "{{ident}}-HTTP - 5xx",
"maxDataPoints": 240,
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "none",
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "07d11fe1-7005-420f-bbfc-16f304294ef7",
"layout": {
"h": 7,
"i": "07d11fe1-7005-420f-bbfc-16f304294ef7",
"isResizable": true,
"w": 6,
"x": 12,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Duration",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden",
"heightInPercentage": 30,
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(http_server_requests_seconds_sum{ident=~\"$instance\", status!~\"5..\"}[1m]) by(ident)/on(ident)(sum(http_server_requests_seconds_count{ident=~\"$instance\", status!~\"5..\"}[1m]) by(ident))",
"legend": "{{ident}}-HTTP - AVG",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "max(http_server_requests_seconds_max{ident=~\"$instance\", status!~\"5..\"}) by (ident)",
"hide": false,
"legend": "{{ident}}-HTTP - MAX",
"maxDataPoints": 240,
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "caf07cdd-42e4-427b-b511-ee967246330a",
"layout": {
"h": 7,
"i": "caf07cdd-42e4-427b-b511-ee967246330a",
"isResizable": true,
"w": 6,
"x": 18,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Utilisation",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "tomcat_threads_busy{ident=~\"$instance\"} or tomcat_threads_busy_threads{ident=~\"$instance\"}",
"legend": "{{ident}}-TOMCAT - BSY",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "tomcat_threads_current{ident=~\"$instance\"} or tomcat_threads_current_threads{ident=~\"$instance\"}",
"legend": "{{ident}}-TOMCAT - CUR",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "tomcat_threads_config_max{ident=~\"$instance\"} or tomcat_threads_config_max_threads{ident=~\"$instance\"}",
"legend": "{{ident}}-TOMCAT - MAX",
"maxDataPoints": 240,
"refId": "C"
},
{
"expr": "jetty_threads_busy{ident=~\"$instance\"}",
"legend": "{{ident}}-JETTY - BSY",
"maxDataPoints": 240,
"refId": "D"
},
{
"expr": "jetty_threads_current{ident=~\"$instance\"}",
"legend": "{{ident}}-JETTY - CUR",
"maxDataPoints": 240,
"refId": "E"
},
{
"expr": "jetty_threads_config_max{ident=~\"$instance\"}",
"legend": "{{ident}}-JETTY - MAX",
"maxDataPoints": 240,
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "83e3307c-b862-471b-a14f-9f88c8b67dbf",
"layout": {
"h": 1,
"i": "83e3307c-b862-471b-a14f-9f88c8b67dbf",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "JVM Memory",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1aeab3d4-de80-4795-ae7e-57c9b085a074",
"layout": {
"h": 7,
"i": "1aeab3d4-de80-4795-ae7e-57c9b085a074",
"isResizable": true,
"w": 6,
"x": 0,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Heap",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off",
"standardOptions": {
"decimals": null
}
}
}
],
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{ident=~\"$instance\", area=\"heap\"}) by (ident)",
"legend": "{{ident}}-used",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum(jvm_memory_committed_bytes{ident=~\"$instance\", area=\"heap\"}) by (ident)",
"legend": "{{ident}}-committed",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum(jvm_memory_max_bytes{ident=~\"$instance\", area=\"heap\"}) by (ident)",
"legend": "{{ident}}-max",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "dd9a63b6-9a33-416f-8dbe-dbbae1ff3fed",
"layout": {
"h": 7,
"i": "dd9a63b6-9a33-416f-8dbe-dbbae1ff3fed",
"isResizable": true,
"w": 6,
"x": 6,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Non-Heap",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{ident=~\"$instance\", area=\"nonheap\"}) by (ident)",
"legend": "{{ident}}-used",
"maxDataPoints": 240,
"refId": "A"
},
{
"expr": "sum(jvm_memory_committed_bytes{ident=~\"$instance\", area=\"nonheap\"}) by (ident)",
"legend": "{{ident}}-committed",
"maxDataPoints": 240,
"refId": "B"
},
{
"expr": "sum(jvm_memory_max_bytes{ident=~\"$instance\", area=\"nonheap\"}) by (ident)",
"legend": "{{ident}}-max",
"maxDataPoints": 240,
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "df058502-188f-4711-b1da-3240a39cb35b",
"layout": {
"h": 7,
"i": "df058502-188f-4711-b1da-3240a39cb35b",
"isResizable": true,
"w": 6,
"x": 12,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Total",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "sum(jvm_memory_used_bytes{ident=\"$instance\"})",
"legend": "used",
"refId": "A"
},
{
"expr": "sum(jvm_memory_committed_bytes{ident=\"$instance\"})",
"legend": "committed",
"refId": "B"
},
{
"expr": "sum(jvm_memory_max_bytes{ident=\"$instance\"})",
"legend": "max",
"refId": "C"
},
{
"expr": "process_memory_rss_bytes{ident=\"$instance\"}",
"legend": "rss",
"refId": "E"
},
{
"expr": "process_memory_pss_bytes{ident=\"$instance\"}",
"legend": "pss",
"refId": "F"
},
{
"expr": "process_memory_swap_bytes{ident=\"$instance\"}",
"legend": "swap",
"refId": "G"
},
{
"expr": "process_memory_swappss_bytes{ident=\"$instance\"}",
"legend": "swappss",
"refId": "H"
},
{
"expr": "process_memory_pss_bytes{ident=\"$instance\"} + process_memory_swap_bytes{ident=\"$instance\"}",
"legend": "phys (pss+swap)",
"refId": "I"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1607d97f-074a-457c-90f2-c6ccbf5ffef3",
"layout": {
"h": 7,
"i": "1607d97f-074a-457c-90f2-c6ccbf5ffef3",
"isResizable": true,
"w": 6,
"x": 18,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "JVM Native Memory",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "(process_memory_pss_bytes{ident=\"$instance\"} + process_memory_swap_bytes{ident=\"$instance\"} - on(instance) sum(jvm_memory_committed_bytes{ident=\"$instance\"}) by(instance)) >= 0",
"legend": "native",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "ea05f925-19aa-4ba4-8daa-4133e939c5fb",
"layout": {
"h": 1,
"i": "ea05f925-19aa-4ba4-8daa-4133e939c5fb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 28
},
"name": "JVM Misc",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8e65b992-ea8b-40b0-b201-23c8e54436a5",
"layout": {
"h": 7,
"i": "8e65b992-ea8b-40b0-b201-23c8e54436a5",
"isResizable": true,
"w": 6,
"x": 0,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "CPU",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "system_cpu_usage{ident=\"$instance\"}",
"legend": "system",
"refId": "A"
},
{
"expr": "process_cpu_usage{ident=\"$instance\"}",
"legend": "process",
"refId": "B"
},
{
"expr": "avg_over_time(process_cpu_usage{ident=\"$instance\"}[1h])",
"legend": "process-1h",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c8db3f1c-b857-473b-be1a-4d4d5b1d7618",
"layout": {
"h": 7,
"i": "c8db3f1c-b857-473b-be1a-4d4d5b1d7618",
"isResizable": true,
"w": 6,
"x": 6,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "File Descriptors",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "process_open_fds{ident=\"$instance\"}",
"legend": "open",
"refId": "A"
},
{
"expr": "process_max_fds{ident=\"$instance\"}",
"legend": "max",
"refId": "B"
},
{
"expr": "process_files_open{ident=\"$instance\"} or process_files_open_files{ident=\"$instance\"}",
"legend": "open",
"refId": "C"
},
{
"expr": "process_files_max{ident=\"$instance\"} or process_files_max_files{ident=\"$instance\"}",
"legend": "max",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d1e449d5-31e3-4df9-a69a-c22edab3a44e",
"layout": {
"h": 7,
"i": "d1e449d5-31e3-4df9-a69a-c22edab3a44e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "Threads",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_threads_live{ident=\"$instance\"} or jvm_threads_live_threads{ident=\"$instance\"}",
"legend": "live",
"refId": "A"
},
{
"expr": "jvm_threads_daemon{ident=\"$instance\"} or jvm_threads_daemon_threads{ident=\"$instance\"}",
"legend": "daemon",
"refId": "B"
},
{
"expr": "jvm_threads_peak{ident=\"$instance\"} or jvm_threads_peak_threads{ident=\"$instance\"}",
"legend": "peak",
"refId": "C"
},
{
"expr": "process_threads{ident=\"$instance\"}",
"legend": "process",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3b8ef3a3-f687-4cf9-8348-da9e60bfa0f5",
"layout": {
"h": 7,
"i": "3b8ef3a3-f687-4cf9-8348-da9e60bfa0f5",
"isResizable": true,
"w": 6,
"x": 18,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "Thread States",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_threads_states_threads{ident=\"$instance\"}",
"legend": "{{state}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2e02407c-9beb-460a-bca8-9b4dfd2e3a37",
"layout": {
"h": 8,
"i": "2e02407c-9beb-460a-bca8-9b4dfd2e3a37",
"isResizable": true,
"w": 24,
"x": 0,
"y": 36
},
"links": [],
"maxPerRow": 4,
"name": "Log Events (1m)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "increase(logback_events_total{ident=\"$instance\"}[1m])",
"legend": "{{level}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "84242a2f-64ec-4b71-8a9e-5441eb3ac271",
"layout": {
"h": 1,
"i": "84242a2f-64ec-4b71-8a9e-5441eb3ac271",
"isResizable": false,
"w": 24,
"x": 0,
"y": 44
},
"name": "JVM Memory Pools (Heap)",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "22a3c8aa-4452-46c9-9439-4a5c524ab79e",
"layout": {
"h": 8,
"i": "22a3c8aa-4452-46c9-9439-4a5c524ab79e",
"isResizable": true,
"w": 24,
"x": 0,
"y": 45
},
"links": [],
"maxPerRow": 3,
"name": "$jvm_memory_pool_heap",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"repeat": "jvm_memory_pool_heap",
"targets": [
{
"expr": "jvm_memory_used_bytes{ident=\"$instance\", id=\"$jvm_memory_pool_heap\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_memory_committed_bytes{ident=\"$instance\", id=\"$jvm_memory_pool_heap\"}",
"legend": "committed",
"refId": "B"
},
{
"expr": "jvm_memory_max_bytes{ident=\"$instance\", id=\"$jvm_memory_pool_heap\"}",
"legend": "max",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "5b6abe6d-7496-454c-9ab9-fcb7f068f626",
"layout": {
"h": 1,
"i": "5b6abe6d-7496-454c-9ab9-fcb7f068f626",
"isResizable": false,
"w": 24,
"x": 0,
"y": 77
},
"name": "JVM Memory Pools (Non-Heap)",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6ebac55a-4d88-4f3e-8414-5df8e27d3d60",
"layout": {
"h": 7,
"i": "6ebac55a-4d88-4f3e-8414-5df8e27d3d60",
"isResizable": true,
"w": 24,
"x": 0,
"y": 78
},
"links": [],
"maxPerRow": 3,
"name": "$jvm_memory_pool_nonheap",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"repeat": "jvm_memory_pool_nonheap",
"targets": [
{
"expr": "jvm_memory_used_bytes{ident=\"$instance\", id=\"$jvm_memory_pool_nonheap\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_memory_committed_bytes{ident=\"$instance\", id=\"$jvm_memory_pool_nonheap\"}",
"legend": "committed",
"refId": "B"
},
{
"expr": "jvm_memory_max_bytes{ident=\"$instance\", id=\"$jvm_memory_pool_nonheap\"}",
"legend": "max",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "ec6d25b8-cfe1-42c0-afc4-c32886432636",
"layout": {
"h": 1,
"i": "ec6d25b8-cfe1-42c0-afc4-c32886432636",
"isResizable": false,
"w": 24,
"x": 0,
"y": 106
},
"name": "Garbage Collection",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "adf56723-9689-4202-a7dd-1d9d844a5e66",
"layout": {
"h": 7,
"i": "adf56723-9689-4202-a7dd-1d9d844a5e66",
"isResizable": true,
"w": 8,
"x": 0,
"y": 107
},
"links": [],
"maxPerRow": 4,
"name": "Collections",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_pause_seconds_count{ident=\"$instance\"}[1m])",
"legend": "{{action}} ({{cause}})",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7caea97e-1cfb-45c6-892f-fe89377bd8fa",
"layout": {
"h": 7,
"i": "7caea97e-1cfb-45c6-892f-fe89377bd8fa",
"isResizable": true,
"w": 8,
"x": 8,
"y": 107
},
"links": [],
"maxPerRow": 4,
"name": "Pause Durations",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_pause_seconds_sum{ident=\"$instance\"}[1m])/rate(jvm_gc_pause_seconds_count{ident=\"$instance\"}[1m])",
"legend": "avg {{action}} ({{cause}})",
"refId": "A"
},
{
"expr": "jvm_gc_pause_seconds_max{ident=\"$instance\"}",
"legend": "max {{action}} ({{cause}})",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "00223434-f6e0-4c53-8562-38bcb46948f4",
"layout": {
"h": 7,
"i": "00223434-f6e0-4c53-8562-38bcb46948f4",
"isResizable": true,
"w": 8,
"x": 16,
"y": 107
},
"links": [],
"maxPerRow": 4,
"name": "Allocated/Promoted",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_memory_allocated_bytes_total{ident=\"$instance\"}[1m])",
"legend": "allocated",
"refId": "A"
},
{
"expr": "rate(jvm_gc_memory_promoted_bytes_total{ident=\"$instance\"}[1m])",
"legend": "promoted",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "fe904abb-6089-453d-8e7e-79e2483ef9c9",
"layout": {
"h": 7,
"i": "ea58a286-37cb-48a8-92f6-f09347a04964",
"isResizable": true,
"w": 8,
"x": 8,
"y": 114
},
"links": [],
"maxPerRow": 4,
"name": "Pause Durations",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "rate(jvm_gc_pause_seconds_sum{ident=\"$instance\"}[1m])/rate(jvm_gc_pause_seconds_count{ident=\"$instance\"}[1m])",
"legend": "avg {{action}} ({{cause}})",
"refId": "A"
},
{
"expr": "jvm_gc_pause_seconds_max{ident=\"$instance\"}",
"legend": "max {{action}} ({{cause}})",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "a199f2f8-136c-44f4-aea5-da20e0e479fb",
"layout": {
"h": 1,
"i": "a199f2f8-136c-44f4-aea5-da20e0e479fb",
"isResizable": false,
"w": 24,
"x": 0,
"y": 121
},
"name": "Classloading",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "903e6501-6f64-4860-a12a-4e07a7424ef8",
"layout": {
"h": 7,
"i": "903e6501-6f64-4860-a12a-4e07a7424ef8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 122
},
"links": [],
"maxPerRow": 4,
"name": "Classes loaded",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_classes_loaded{ident=\"$instance\"} or jvm_classes_loaded_classes{ident=\"$instance\"}",
"legend": "loaded",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "dc21d536-f4c0-48c0-99f1-eef82b30e979",
"layout": {
"h": 7,
"i": "dc21d536-f4c0-48c0-99f1-eef82b30e979",
"isResizable": true,
"w": 12,
"x": 12,
"y": 122
},
"links": [],
"maxPerRow": 4,
"name": "Class delta (5m)",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "delta(jvm_classes_loaded{application=\"$application\",ident=\"$instance\"}[5m]) or delta(jvm_classes_loaded_classes{application=\"$application\",ident=\"$instance\"}[5m])",
"legend": "delta",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": false,
"id": "6c164737-84da-4d51-911b-38e207bef57e",
"layout": {
"h": 1,
"i": "6c164737-84da-4d51-911b-38e207bef57e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 129
},
"name": "Buffer Pools",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "9ac2c14e-9871-4b84-a5cb-976f8dfe6c19",
"layout": {
"h": 7,
"i": "9ac2c14e-9871-4b84-a5cb-976f8dfe6c19",
"isResizable": true,
"w": 6,
"x": 0,
"y": 130
},
"links": [],
"maxPerRow": 4,
"name": "Direct Buffers",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_buffer_memory_used_bytes{ident=\"$instance\", id=\"direct\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_buffer_total_capacity_bytes{ident=\"$instance\", id=\"direct\"}",
"legend": "capacity",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f38df099-4559-4239-b283-21bd3949c81d",
"layout": {
"h": 7,
"i": "f38df099-4559-4239-b283-21bd3949c81d",
"isResizable": true,
"w": 6,
"x": 6,
"y": 130
},
"links": [],
"maxPerRow": 4,
"name": "Direct Buffers",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_buffer_count{ident=\"$instance\", id=\"direct\"} or jvm_buffer_count_buffers{ident=\"$instance\", id=\"direct\"}",
"legend": "count",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef8f0a38-f98e-428c-a601-186dccafd2c1",
"layout": {
"h": 7,
"i": "ef8f0a38-f98e-428c-a601-186dccafd2c1",
"isResizable": true,
"w": 6,
"x": 12,
"y": 130
},
"links": [],
"maxPerRow": 4,
"name": "Mapped Buffers",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_buffer_memory_used_bytes{ident=\"$instance\", id=\"mapped\"}",
"legend": "used",
"refId": "A"
},
{
"expr": "jvm_buffer_total_capacity_bytes{ident=\"$instance\", id=\"mapped\"}",
"legend": "capacity",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3abe8423-f8fd-44b8-8321-1753943d4e9c",
"layout": {
"h": 7,
"i": "3abe8423-f8fd-44b8-8321-1753943d4e9c",
"isResizable": true,
"w": 6,
"x": 18,
"y": 130
},
"links": [],
"maxPerRow": 4,
"name": "Mapped Buffers",
"options": {
"legend": {
"behaviour": "showItem",
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"overrides": [
{
"matcher": {
"id": "byFrameRefID"
},
"properties": {
"rightYAxisDisplay": "off"
}
}
],
"targets": [
{
"expr": "jvm_buffer_count{ident=\"$instance\", id=\"mapped\"} or jvm_buffer_count_buffers{ident=\"$instance\", id=\"mapped\"}",
"legend": "count",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes, application)",
"multi": false,
"name": "application",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes{application=\"$application\"}, ident)",
"multi": true,
"name": "instance",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes{application=\"$application\",ident=~\"$instance\",area=\"heap\", },id)",
"hide": false,
"multi": false,
"name": "jvm_memory_pool_heap",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(jvm_memory_used_bytes{application=\"$application\",ident=~\"$instance\",area=\"nonheap\"}, id)",
"hide": false,
"multi": false,
"name": "jvm_memory_pool_nonheap",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"uuid": 1724670732548038000
}
================================================
FILE: integrations/SpringBoot/markdown/README.md
================================================
# SpringBoot
Java 生态的项目,如果要暴露 metrics 数据,一般可以选择 micrometer,不过 SpringBoot 项目可以直接使用 SpringBoot Actuator 暴露 metrics 数据,Actuator 底层也是使用 micrometer 来实现的,只是使用起来更加简单。
## 应用配置
在 application.properties 中加入如下配置:
```properties
management.endpoint.metrics.enabled=true
management.endpoints.web.exposure.include=*
management.endpoint.prometheus.enabled=true
management.metrics.export.prometheus.enabled=true
```
完事启动项目,访问 `http://localhost:8080/actuator/prometheus` 即可看到符合 prometheus 协议的监控数据。
## 采集配置
既然暴露了 Prometheus 协议的监控数据,那通过 categraf prometheus 插件直接采集即可。配置文件是 `conf/input.prometheus/prometheus.toml`。配置样例如下:
```toml
[[instances]]
urls = [
"http://192.168.11.177:8080/actuator/prometheus"
]
```
================================================
FILE: integrations/Switch_Legacy/collect/switch_legacy/switch_legacy.toml
================================================
# # collect interval
# interval = "300s"
switch_id_label = "ip"
[mappings]
"192.168.88.160" = "switch001.bj"
"192.168.88.161" = "switch002.bj"
[[instances]]
# # interval = global.interval * interval_times
# interval_times = 1
# use global unique string to specify instance
# labels = { region="beijing" }
ips = [
# "172.16.2.1",
# "172.16.4/24",
# "192.168.56.102-192.168.56.120"
]
community = "public"
# whether use index tag
index_tag = false
ignore_ifaces = ["Nu", "NU", "Vlan", "Vl"]
concurrency_for_address = 1000
concurrency_for_request = 4
ping_enable = true
ping_mode_fastping = true
ping_timeout_ms = 300
ping_retries = 4
# true: use gosnmp, false: use snmpwalk
snmp_mode_gosnmp = true
snmp_timeout_ms = 1000
snmp_retries = 5
gather_ping_metrics = true
gather_flow_metrics = true
gather_cpu_metrics = true
gather_mem_metrics = true
gather_oper_status = false
gather_pkt = false
gather_broadcast_pkt = false
gather_multicast_pkt = false
gather_discards = false
gather_errors = false
gather_unknown_protos = false
gather_out_qlen = false
# ignore metrics if limit > 0 and collected value > limit
speed_limit = 0
pkt_limit = 0
broadcast_pkt_limit = 0
multicast_pkt_limit = 0
discards_pkt_limit = 0
errors_pkt_limit = 0
unknown_protos_pkt_limit = 0
out_qlen_pkt_limit = 0
# [[instances.customs]]
# metric = "AnyconnectSession"
# tags = {}
# oid = "1.3.6.1.4.1.9.9.392.1.3.35.0"
# [[instances.customs]]
# metric = "ConnectionStat"
# tags = {}
# oid = "1.3.6.1.4.1.9.9.147.1.2.2.2.1.5.40.6"
# [[instances.customs]]
# metric = "TempStatus"
# tags = {}
# oid = "1.3.6.1.4.1.9.9.13.1.3.1.3.1004"
================================================
FILE: integrations/Switch_Legacy/dashboards/dashboard.json
================================================
{
"id": 0,
"group_id": 0,
"name": "网络交换机",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "1cb95ad2-81f1-4977-9924-4af57a542490",
"layout": {
"h": 1,
"i": "1cb95ad2-81f1-4977-9924-4af57a542490",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "总体概览",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6c851c33-4ea4-4b76-be59-d4c19c5b2844",
"layout": {
"h": 4,
"i": "6c851c33-4ea4-4b76-be59-d4c19c5b2844",
"isResizable": true,
"w": 3,
"x": 0,
"y": 1
},
"name": "交换机数目",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(switch_legacy_sysUpTime)",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#FF656B",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "23220dc3-ab9a-40f9-b1d3-135bb3bbb734",
"layout": {
"h": 4,
"i": "23220dc3-ab9a-40f9-b1d3-135bb3bbb734",
"isResizable": true,
"w": 8,
"x": 3,
"y": 1
},
"name": "CPU使用率排名",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "switch_legacy_cpu_util",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#FF656B",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "5bf72a5b-8273-4a40-94f9-8d671b01f16e",
"layout": {
"h": 4,
"i": "5bf72a5b-8273-4a40-94f9-8d671b01f16e",
"isResizable": true,
"w": 13,
"x": 11,
"y": 1
},
"name": "网络in流量大于300MB",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(switch_legacy_if_in) by(ip)\u003e300000000",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"donut": true,
"labelWithName": false,
"legengPosition": "hidden"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "bb0b07f1-e193-46ed-bc6e-c40d42e4eb4a",
"layout": {
"h": 4,
"i": "43fa2e2d-8c69-4cc0-bb6b-6f359395f5c9",
"isResizable": true,
"w": 3,
"x": 0,
"y": 5
},
"name": "采集耗时",
"targets": [
{
"expr": "switch_legacy_ifstat_use_time_sec",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "pie",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#FF656B",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7f36ea1c-fd51-43bf-93ab-2787d630c530",
"layout": {
"h": 4,
"i": "7f36ea1c-fd51-43bf-93ab-2787d630c530",
"isResizable": true,
"w": 8,
"x": 3,
"y": 5
},
"name": "内存使用率排名",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "switch_legacy_mem_util",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#FF656B",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6f9ce40e-7927-43b3-a34f-16772bf88b7f",
"layout": {
"h": 4,
"i": "3387f6ea-dcca-41a8-ae18-ef3d8a065d94",
"isResizable": true,
"w": 13,
"x": 11,
"y": 5
},
"name": "网络out流量大于300MB",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "max(switch_legacy_if_out) by(ip)\u003e300000000",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#FF656B",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "356fe265-4a02-411d-9b3f-1578146c49a1",
"layout": {
"h": 4,
"i": "31b68228-1f52-4ef9-aaa3-f605f490516f",
"isResizable": true,
"w": 11,
"x": 0,
"y": 9
},
"name": "设备运行时长",
"options": {
"standardOptions": {
"util": "none"
}
},
"targets": [
{
"expr": "switch_legacy_sysUpTime/100/(24*3600)",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#FF656B",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "eec57d38-9a9f-4530-a0b5-e524d867759d",
"layout": {
"h": 4,
"i": "eec57d38-9a9f-4530-a0b5-e524d867759d",
"isResizable": true,
"w": 13,
"x": 11,
"y": 9
},
"name": "设备温度",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "switch_legacy_TempStatus",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "559f99bc-2519-406e-b2b6-8c2e0895bd53",
"layout": {
"h": 1,
"i": "559f99bc-2519-406e-b2b6-8c2e0895bd53",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "单机详情",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "69a26105-9170-4f00-a3c2-0fa866f9d5b8",
"layout": {
"h": 5,
"i": "69a26105-9170-4f00-a3c2-0fa866f9d5b8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"name": "cpu使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "switch_legacy_cpu_util{ip=\"$IP\"}",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cc38fed9-f224-4684-9541-6a57e543062d",
"layout": {
"h": 5,
"i": "b90ba3f4-58ac-4fa1-ac69-f63388536126",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"name": "网络in流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "switch_legacy_if_in{ip=\"$IP\"}\u003e0",
"legend": "{{ip}}-{{ifname}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "1b84f22c-2535-4e16-b884-86e95bed444f",
"layout": {
"h": 5,
"i": "0158a0bb-6cb3-4a61-ab32-c4ff470eee8c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 19
},
"name": "内存使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "switch_legacy_mem_util{ip=\"$IP\"}",
"legend": "{{ip}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2e08574d-0378-45c1-9384-a80559e1df99",
"layout": {
"h": 5,
"i": "6d230fe8-9bd8-4c10-8fde-4e34ed3a62e3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 19
},
"name": "网络out流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "switch_legacy_if_out{ip=\"$IP\"}\u003e0",
"legend": "{{ip}}-{{ifname}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(switch_legacy_if_in,ip)",
"multi": false,
"name": "IP",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(switch_legacy_if_in{ip=\"$IP\"},ifname)",
"name": "Interface",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328402091000
}
================================================
FILE: integrations/Switch_Legacy/markdown/README.md
================================================
# switch_legacy
交换机监控插件,fork 自 [https://github.com/gaochao1/swcollector](https://github.com/gaochao1/swcollector),可以自动探测网络设备型号,获取 CPU、内存使用率,当然,还有各个网口的监控数据,这些数据都是通过通用的 oid 采集的。
## Configuration
最核心的配置就是指定 IP 列表,有三种写法:
```toml
[[instances]]
ips = [
"172.16.2.1",
"172.16.4/24",
"192.168.56.102-192.168.56.120"
]
```
该插件只支持 SNMP v2c,所以认证信息就是一个 community 字符串
## 唯一标识标签
网络设备的监控数据,默认都会带有 ip 标签,指定监控数据来源于哪个设备,如果想把监控数据当做夜莺里的监控对象,让网络设备自动出现在夜莺的监控对象表格里,只需要把 switch_id_label 设置为 ident 即可,这样一来,网络设备的 IP 信息会作为 ident 标签的值上报,夜莺会自动读取 ident 标签的值入库
## 名称映射
有时,我们看到网络设备的 IP,无法分辨是具体哪个设备,此时可以给 IP 一个映射名称:
```toml
[mappings]
"192.168.88.160" = "switch001.bj"
"192.168.88.161" = "switch002.bj"
```
这样一来,上报的监控数据就不用 IP 做标识了,而是使用 switch001.bj 这样的字符串做标识,更易读一些
## 自定义 oid
`[[instances.customs]]` 部分可以配置多个,表示自定义 oid,默认情况下,该插件采集的都是设备各个网口的监控数据以及CPU和内存的使用率,如果要采集别的 oid,就需要使用这个自定义功能
================================================
FILE: integrations/Systemd/collect/systemd/systemd.toml
================================================
# # collect interval
# interval = 15
enable=false # 设置为true 打开采集
#unit_include=".+"
#unit_exclude=""
enable_start_time_metrics=true # 是否采集 service unit 的启动时间信息,单位:秒
enable_task_metrics=true # 是否采集 service unit 的 task 指标
enable_restarts_metrics=true # 是否采集 service unit 的重启次数信息
================================================
FILE: integrations/Systemd/markdown/README.md
================================================
# systemd 插件
自 [node_exporter](https://github.com/prometheus/node_exporter/blob/master/collector/systemd_linux.go) fork 并改动
## Configuration
```toml
enable=false # 设置为true 打开采集
#unit_include=".+"
#unit_exclude=""
enable_start_time_metrics=true #是否采集service unit的启动时间信息 单位秒
enable_task_metrics=true # 是否采集service unit task的metrics
enable_restarts_metrics=true #是否采集service unit重启的次数信息
```
================================================
FILE: integrations/TDEngine/dashboards/tasokeeper3.x.json
================================================
{
"id": 0,
"group_id": 0,
"name": "TaosKeeper 3.x Prometheus Dashboard",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"calc": "last",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "85aad21e-601f-4279-b1da-5b2675a17c58",
"layout": {
"h": 4,
"i": "85aad21e-601f-4279-b1da-5b2675a17c58",
"w": 3,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "First EP",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_cluster_info_first_ep{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2f75b3a1-4105-42fd-a060-dfc88b871e6e",
"layout": {
"h": 4,
"i": "2f75b3a1-4105-42fd-a060-dfc88b871e6e",
"w": 3,
"x": 3,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Version",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_cluster_info_version{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "MNode 被选举后经过的时长",
"id": "a1f5853b-ee1a-44a8-aaa5-a432204eaaa2",
"layout": {
"h": 4,
"i": "a1f5853b-ee1a-44a8-aaa5-a432204eaaa2",
"w": 3,
"x": 6,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Master Uptime",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_cluster_info_master_uptime{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "企业版授权到期时间",
"id": "d40bdb61-9c8e-4646-8632-9048e339b009",
"layout": {
"h": 4,
"i": "d40bdb61-9c8e-4646-8632-9048e339b009",
"w": 3,
"x": 9,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Expire Time",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_grants_info_expire_time{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "last",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "企业版授权已用测点数",
"id": "f91206a8-8a02-4bd9-84f1-683a4b6eee7a",
"layout": {
"h": 4,
"i": "f91206a8-8a02-4bd9-84f1-683a4b6eee7a",
"w": 4,
"x": 12,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Used Measuring Points",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_dnodes_info_disk_engine",
"legend": "used",
"refId": "A"
},
{
"expr": "taos_dnodes_info_disk_total",
"legend": "total",
"refId": "B"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "数据库个数",
"id": "db696656-c586-42b1-a38d-030b187ef338",
"layout": {
"h": 4,
"i": "db696656-c586-42b1-a38d-030b187ef338",
"w": 2,
"x": 16,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Databases",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "count(taos_vgroups_info_status{cluster=\"$cluster\"})",
"legend": "databases",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "所有数据库的表数量之和",
"id": "9080b0e0-6f30-4d3b-b675-535664804635",
"layout": {
"h": 4,
"i": "9080b0e0-6f30-4d3b-b675-535664804635",
"w": 3,
"x": 18,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Tables",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(taos_tables_per_database{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前TDengine集群DNode数量,Alive 为存活,Total 为所有",
"id": "7148c52d-a2a8-4cc5-9bb5-b598041cf500",
"layout": {
"h": 4,
"i": "7148c52d-a2a8-4cc5-9bb5-b598041cf500",
"w": 6,
"x": 0,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "DNodes",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_cluster_info_dnodes_total",
"legend": "",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前TDengine集群MNode数量,Alive 为存活,Total 为所有",
"id": "bd41d367-c7e0-4f2c-a178-4f6af23965d0",
"layout": {
"h": 4,
"i": "bd41d367-c7e0-4f2c-a178-4f6af23965d0",
"w": 6,
"x": 6,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "MNodes",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_cluster_info_mnodes_total{cluster=\"$cluster\"}",
"legend": "",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前TDengine集群 VGroups 数量,Alive 为存活,Total 为所有",
"id": "2fea6c74-2963-4815-a584-d2fb5cffe2ba",
"layout": {
"h": 4,
"i": "2fea6c74-2963-4815-a584-d2fb5cffe2ba",
"w": 6,
"x": 12,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "VGroups",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "count(count(taos_vgroups_info_status{cluster=\"$cluster\"}) by (vgroup_id))",
"legend": "",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前TDengine集群 VNodes 数量,Alive 为存活,Total 为所有",
"id": "537931bd-6979-4eba-a5d4-9958109ff81f",
"layout": {
"h": 4,
"i": "537931bd-6979-4eba-a5d4-9958109ff81f",
"w": 6,
"x": 18,
"y": 8
},
"links": [],
"maxPerRow": 4,
"name": "VNodes",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(taos_vgroups_info_online_vnodes{cluster=\"$cluster\"})",
"legend": "",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "37d26059-acdd-4d15-a135-266d0155ff81",
"layout": {
"h": 6,
"i": "37d26059-acdd-4d15-a135-266d0155ff81",
"w": 6,
"x": 0,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "DNodes Alive Percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_cluster_info_dnodes_alive / taos_cluster_info_dnodes_total",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c109ddac-bb23-406a-b1a9-30ebf13b0c8c",
"layout": {
"h": 6,
"i": "c109ddac-bb23-406a-b1a9-30ebf13b0c8c",
"w": 6,
"x": 6,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "MNodes Alive Percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_cluster_info_mnodes_alive / taos_cluster_info_mnodes_total",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d6837203-415f-4f58-8892-79a3eff8ce8b",
"layout": {
"h": 6,
"i": "d6837203-415f-4f58-8892-79a3eff8ce8b",
"w": 6,
"x": 12,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "VGroups Alive Percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_cluster_info_vgroups_alive / taos_cluster_info_vgroups_total",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a54710e4-73b4-49c4-99d4-6a50a411823d",
"layout": {
"h": 6,
"i": "a54710e4-73b4-49c4-99d4-6a50a411823d",
"w": 6,
"x": 18,
"y": 12
},
"links": [],
"maxPerRow": 4,
"name": "VNodes Alive Percent",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_cluster_info_vnodes_alive / taos_cluster_info_vnodes_total",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "59ec36a1-b3ac-4db1-9c54-0e63e5ecb2c1",
"layout": {
"h": 6,
"i": "59ec36a1-b3ac-4db1-9c54-0e63e5ecb2c1",
"w": 6,
"x": 0,
"y": 18
},
"links": [],
"maxPerRow": 4,
"name": "Measuring Points Used Percent Alert",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_grants_info_timeseries_used / taos_grants_info_timeseries_total {cluster=\"$cluster\"}",
"legend": "percent",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "57d3394a-9382-413f-9d25-7693e7090989",
"layout": {
"h": 6,
"i": "57d3394a-9382-413f-9d25-7693e7090989",
"w": 6,
"x": 6,
"y": 18
},
"links": [],
"maxPerRow": 4,
"name": "Grants Expire Time",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_grants_info_expire_time",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "错误率(每秒错误数)",
"id": "63a3dcea-93d8-435e-a207-653ee4502c20",
"layout": {
"h": 6,
"i": "63a3dcea-93d8-435e-a207-653ee4502c20",
"w": 6,
"x": 12,
"y": 18
},
"links": [],
"maxPerRow": 4,
"name": "Error Rate",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "deriv(taos_dnodes_info_errors{cluster=\"$cluster\"}[1m])",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0010cd3a-1a0f-4f9c-9b21-438adf761750",
"layout": {
"h": 6,
"i": "0010cd3a-1a0f-4f9c-9b21-438adf761750",
"w": 4,
"x": 0,
"y": 25
},
"links": [],
"maxPerRow": 4,
"name": "DNodes Lifetime",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "taos_dnodes_info_uptime{cluster=\"$cluster\"}",
"legend": "{{dnode_ep}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "29d21947-4661-4893-b907-5e7363787a10",
"layout": {
"h": 6,
"i": "29d21947-4661-4893-b907-5e7363787a10",
"w": 10,
"x": 4,
"y": 25
},
"links": [],
"maxPerRow": 4,
"name": "DNodes Number",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_cluster_info_dnodes_total{cluster=\"$cluster\"}",
"legend": "total",
"refId": "A"
},
{
"expr": "taos_cluster_info_dnodes_alive{cluster=\"$cluster\"}",
"legend": "alive",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "eb3dbe22-6550-4937-9848-897bd7c8ff22",
"layout": {
"h": 6,
"i": "eb3dbe22-6550-4937-9848-897bd7c8ff22",
"w": 10,
"x": 14,
"y": 25
},
"links": [],
"maxPerRow": 4,
"name": "MNodes Number",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "taos_cluster_info_mnodes_total{cluster=\"$cluster\"}",
"legend": "total",
"refId": "A"
},
{
"expr": "taos_cluster_info_mnodes_alive{cluster=\"$cluster\"}",
"legend": "alive",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(taos_dn_cpu_taosd, cluster)",
"multi": false,
"name": "cluster",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(taos_dn_cpu_taosd{cluster=\"$cluster\"}, fqdn)",
"multi": false,
"name": "fqdn",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(taos_dn_cpu_taosd{cluster=\"$cluster\"}, dnodeid)",
"multi": false,
"name": "dnodeid",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(taos_vgroups_info_status{cluster=\"$cluster\"}, database_name)",
"multi": true,
"name": "database",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(taos_cluster_info_first_ep{cluster=\"$cluster\"}, value)",
"multi": false,
"name": "firstEp",
"reg": "",
"type": "query"
},
{
"allOption": false,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(taos_cluster_info_version{cluster=\"$cluster\"}, value)",
"multi": false,
"name": "version",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328408805000
}
================================================
FILE: integrations/TDEngine/markdown/README.md
================================================
# TDEngine
TDengine 也可以暴露 Prometheus 的监控数据,具体启用方法如下:
部署并启动 taosKeeper 组件后,访问其 `/metrics` 接口即可获取 Prometheus 格式的监控数据(taosKeeper 默认监听 6043 端口,请以实际部署配置为准)。
## 采集配置
既然暴露了 Prometheus 协议的监控数据,那通过 categraf prometheus 插件直接采集即可。配置文件是 `conf/input.prometheus/prometheus.toml`。配置样例如下:
```toml
[[instances]]
urls = [
"http://192.168.11.177:8080/xxxx"
]
```
================================================
FILE: integrations/TiDB/alerts/tidb-alerts.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-CPU 使用率大于 80%",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) by(instance) * 100 \u003c= 20",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328412826000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-Node_exporter_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"node_exporter\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 1,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328413298000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-PD 写盘延迟大于 1s ,leader 写盘慢会导致 leader 切换",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) \u003e 1",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328413731000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-PD 无 leader",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(etcd_server_is_leader) !=1",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {
"描述": "集群不可用,可能 pd 多数派故障,需要分析日志查看无法选举 leader 的原因。"
},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328414240000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-PD 节点之间网络延迟大于 1s",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) \u003e 1",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328414657000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-TCP 连接数大于 50000",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "node_netstat_Tcp_CurrEstab \u003e 50000",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {
"描述": "通常有连接泄露,需分析主要是与哪个对象的连接。"
},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328415098000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-TiDB_server_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"tidb\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328415532000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-blackbox_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"blackbox_exporter\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328416073000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-drainer_server_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"drainer\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328416479000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-grafana_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"grafana\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328416917000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-pd_server_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"pd\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328417394000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-pump_server_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"pump\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328417841000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tidb CPU 使用率超过可用 CPU的 80%",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "rate(process_cpu_seconds_total{job=\"tidb\"}[1m]) \u003e tidb_server_maxprocs{job=\"tidb\"} * 0.8",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328418304000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tidb 最近 15 分钟加载 schema 出现错误",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase(tidb_session_schema_lease_error_total{type=\"outdated\"}[15m]) \u003e 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328418783000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tidb 最近五分钟出现跳过 binlog",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase( tidb_server_critical_error_total[5m] ) \u003e 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328419214000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tidb 最近十分钟加载 schema 失败大于 10 次",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase( tidb_domain_load_schema_total{type=\"failed\"}[10m] ) \u003e 10",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328419672000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tiflash_server_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"tiflash\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328420192000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tikv 5分钟内内存增长大于 5G",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "process_resident_memory_bytes{job=~\"tikv\",instance=~\".*\"} - (process_resident_memory_bytes{job=~\"tikv\",instance=~\".*\"} offset 5m) \u003e 5*1024*1024*1024",
"severity": 3
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {
"描述": "通常 tikv 实例刚启动时会触发。或者有较复杂的计算 offload 到 tikv 上导致短暂的内存升高,需要分析具体 SQL 语句。"
},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328420743000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tikv 发生重启",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "changes(process_start_time_seconds{job=\"tikv\"}[5m]) \u003e 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328421448000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tikv 可用空间低于 20%",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(tikv_store_size_bytes{type=\"available\"}) by (instance) / sum(tikv_store_size_bytes{type=\"capacity\"}) by (instance) \u003c 0.2",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {
"描述": "需要检查 tikv 是否有删除数据但空间未释放的问题,如果是正常的空间使用,则需要扩容 tikv 节点。"
},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328422047000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-tikv_server_is_down",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "probe_success{group=\"tikv\"} == 0",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328422549000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-内存使用大于 80%",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "(((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) \u003e= 80",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328423070000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-挂载点状态变为只读",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "node_filesystem_readonly{fstype=~\"(ext.|xfs)\"} == 1",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328423515000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-文件系统 inode 使用率大于 80%",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "node_filesystem_files_free{fstype=~\"(ext.|xfs)\"} / node_filesystem_files{fstype=~\"(ext.|xfs)\"} * 100 \u003c 20",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328423963000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-磁盘使用率大于 80%",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "node_filesystem_avail_bytes{fstype=~\"(ext.|xfs)\", mountpoint!~\"/boot\"} / node_filesystem_size_bytes{fstype=~\"(ext.|xfs)\", mountpoint!~\"/boot\"} * 100 \u003c= 20",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328424400000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-网卡-入向有丢包",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase(net_drop_in[1m]) \u003e 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328424820000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-网卡-出向有丢包",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "increase(net_drop_out[1m]) \u003e 0",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328425282000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
7
],
"cluster": "",
"name": "TiDB-集群中出现状态为 down 的 tikv 数量大于 0",
"note": "TiDB(中控)",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "(sum(pd_cluster_status{type=\"store_down_count\"}) by (instance) \u003e 0) and (sum(etcd_server_is_leader) by (instance) \u003e 0)",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"nvwa"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328425803000
}
]
================================================
FILE: integrations/TiDB/dashboards/tidb-dashboard.json
================================================
{
"id": 0,
"group_id": 0,
"name": "tidb-Overview",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "慢sql监控",
"url": ""
},
{
"targetBlank": true,
"title": "详细指标-DBA",
"url": ""
}
],
"panels": [
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "535636fd-2f92-484c-9e23-42118801f7c9",
"layout": {
"h": 1,
"i": "535636fd-2f92-484c-9e23-42118801f7c9",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Services Status",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomain": [
1,
99999999999
],
"colorDomainAuto": false,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "d18288fc-3f6d-4520-b112-ede7948b3c0f",
"layout": {
"h": 6,
"i": "d18288fc-3f6d-4520-b112-ede7948b3c0f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 1
},
"links": [],
"name": "Service Up",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"tidb\"} == 1)",
"legend": "TiDB",
"refId": "A"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"pd\"} == 1)",
"legend": "PD",
"refId": "B"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"tikv\"} == 1)",
"legend": "TiKV",
"refId": "C"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"tiflash\"} == 1)",
"legend": "TiFlash",
"refId": "D"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"pump\"} == 1)",
"legend": "Pump",
"refId": "E"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"drainer\"} == 1)",
"legend": "Drainer",
"refId": "F"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"kafka\"} == 1)",
"legend": "Kafka",
"refId": "G"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"zookeeper\"} == 1)",
"legend": "Zookeeper",
"refId": "H"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"node_exporter\"} == 1)",
"legend": "Node_exporter",
"refId": "I"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"blackbox_exporter\"} == 1)",
"legend": "Blackbox_exporter",
"refId": "J"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"grafana\"} == 1)",
"legend": "Grafana",
"refId": "K"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"blackbox_exporter_http\"} == 1)",
"legend": "Pushgateway",
"refId": "L"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"kafka_exporter\"} == 1)",
"legend": "Kafka_exporter",
"refId": "M"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomain": [
1,
20
],
"colorDomainAuto": false,
"colorRange": [
"#ffeda0",
"#fc4e2a",
"#800026"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "955bd169-0b3c-4476-939e-89302e74097a",
"layout": {
"h": 6,
"i": "955bd169-0b3c-4476-939e-89302e74097a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 1
},
"links": [],
"name": "Service Down",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"tidb\"} == 0)",
"legend": "TiDB",
"refId": "A"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"pd\"} == 0)",
"legend": "PD",
"refId": "B"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"tikv\"} == 0)",
"legend": "TiKV",
"refId": "C"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"tiflash\"} == 0)",
"legend": "TiFlash",
"refId": "D"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"pump\"} == 0)",
"legend": "Pump",
"refId": "E"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"drainer\"} == 0)",
"legend": "Drainer",
"refId": "F"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"kafka\"} == 0)",
"legend": "Kafka",
"refId": "G"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"zookeeper\"} == 0)",
"legend": "Zookeeper",
"refId": "H"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"node_exporter\"} == 0)",
"legend": "Node_exporter",
"refId": "I"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"blackbox_exporter\"} == 0)",
"legend": "Blackbox_exporter",
"refId": "J"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"grafana\"} == 0)",
"legend": "Grafana",
"refId": "K"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"blackbox_exporter_http\"} == 0)",
"legend": "Pushgateway",
"refId": "L"
},
{
"expr": "\ncount(probe_success{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", group=\"kafka_exporter\"} == 0)",
"legend": "Kafka_exporter",
"refId": "M"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "uptime since last restart",
"id": "057219ac-0441-4e0e-b65a-a358f72fcf51",
"layout": {
"h": 6,
"i": "057219ac-0441-4e0e-b65a-a358f72fcf51",
"isResizable": true,
"w": 12,
"x": 0,
"y": 7
},
"links": [],
"name": "Uptime",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(time() - process_start_time_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",job=~\".*pd.*\"})",
"legend": "{{job}}-{{instance}}",
"refId": "A"
},
{
"expr": "(time() - process_start_time_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"tidb\"})",
"legend": "{{job}}-{{instance}}",
"refId": "B"
},
{
"expr": "(time() - process_start_time_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=~\".*tikv\"})",
"legend": "{{job}}-{{instance}}",
"refId": "C"
},
{
"expr": "tiflash_system_asynchronous_metric_Uptime{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}",
"legend": "tiflash-{{instance}}",
"refId": "D"
},
{
"expr": "(time() - process_start_time_seconds{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"ticdc\"})",
"legend": "{{job}}-{{instance}}",
"refId": "E"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomain": [
-1,
1
],
"colorDomainAuto": false,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "e4af8eb3-f52d-434e-a6cc-83359b36d5a2",
"layout": {
"h": 6,
"i": "e4af8eb3-f52d-434e-a6cc-83359b36d5a2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 7
},
"links": [],
"name": "Abnormal stores",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_disconnected_count\"})",
"legend": "Disconnect Stores",
"refId": "B"
},
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_unhealth_count\"})",
"legend": "Unhealth Stores",
"refId": "C"
},
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_low_space_count\"})",
"legend": "LowSpace Stores",
"refId": "D"
},
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_down_count\"})",
"legend": "Down Stores",
"refId": "E"
},
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_offline_count\"})",
"legend": "Offline Stores",
"refId": "F"
},
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_tombstone_count\"})",
"legend": "Tombstone Stores",
"refId": "G"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "3ed49e8a-c1f5-4e5a-ac01-c8ae1c734cc7",
"layout": {
"h": 1,
"i": "3ed49e8a-c1f5-4e5a-ac01-c8ae1c734cc7",
"isResizable": false,
"w": 24,
"x": 0,
"y": 73
},
"name": "PD",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "0e14ace8-401d-4215-bf15-09a7abf00414",
"layout": {
"h": 7,
"i": "0e14ace8-401d-4215-bf15-09a7abf00414",
"isResizable": true,
"w": 5,
"x": 0,
"y": 74
},
"links": [],
"name": "PD role",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#3fc453",
"text": "Leader"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#ce4f52",
"text": "Follower"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "pd_tso_role{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", dc=\"global\"}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "ef1dc81d-83c2-49f9-b44c-d97fdb320753",
"layout": {
"h": 7,
"i": "ef1dc81d-83c2-49f9-b44c-d97fdb320753",
"isResizable": true,
"w": 5,
"x": 5,
"y": 74
},
"links": [],
"name": "Storage capacity",
"options": {
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 0,
"to": 1000000000000000000
},
"result": {
"color": "#3fc453"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\",type=\"storage_capacity\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "d8955955-37ac-485e-935c-cac66800f436",
"layout": {
"h": 7,
"i": "d8955955-37ac-485e-935c-cac66800f436",
"isResizable": true,
"w": 5,
"x": 10,
"y": 74
},
"links": [],
"name": "Current storage size",
"options": {
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 0,
"to": 1000000000000000000
},
"result": {
"color": "#3fc453"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\",type=\"storage_size\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The current storage size and used ratio of the cluster",
"id": "a76f3318-30f7-4c17-949c-91ef452b9d82",
"layout": {
"h": 7,
"i": "a76f3318-30f7-4c17-949c-91ef452b9d82",
"isResizable": true,
"w": 5,
"x": 15,
"y": 74
},
"links": [],
"name": "Current storage used",
"options": {
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 0,
"to": 0.7
},
"result": {
"color": "#3fc453"
},
"type": "range"
},
{
"match": {
"from": 0.7,
"to": 1
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\",type=\"storage_size\"}) / sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\",type=\"storage_capacity\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "f893916f-dc2f-4e91-a20d-43fc4bf92304",
"layout": {
"h": 7,
"i": "f893916f-dc2f-4e91-a20d-43fc4bf92304",
"isResizable": true,
"w": 4,
"x": 20,
"y": 74
},
"links": [],
"name": "Normal stores",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"special": 0
},
"result": {
"color": "#ce4f52"
},
"type": "special"
},
{
"match": {
"from": 1,
"to": 9999
},
"result": {
"color": "#3fc453",
"text": ""
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"store_up_count\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "9181ea69-d2d6-48bd-b137-3dbb7365d1fa",
"layout": {
"h": 6,
"i": "9181ea69-d2d6-48bd-b137-3dbb7365d1fa",
"isResizable": true,
"w": 12,
"x": 0,
"y": 81
},
"links": [],
"name": "Region health",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "pd_regions_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\"}",
"legend": "{{type}}",
"refId": "A",
"step": null
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "noraml"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The current running schedulers",
"id": "a7c270f9-fddf-4bce-8a77-ab64dc02f4c6",
"layout": {
"h": 6,
"i": "a7c270f9-fddf-4bce-8a77-ab64dc02f4c6",
"isResizable": true,
"w": 12,
"x": 12,
"y": 81
},
"links": [],
"name": "Scheduler is running",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "pd_scheduler_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"allow\",instance=\"$pd_instance\"}",
"legend": "{{kind}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The leader count of each TiKV instance",
"id": "94a89a96-4016-450e-8d34-921e6d222e57",
"layout": {
"h": 6,
"i": "94a89a96-4016-450e-8d34-921e6d222e57",
"isResizable": true,
"w": 12,
"x": 0,
"y": 87
},
"links": [],
"name": "Store leader count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "pd_scheduler_store_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"leader_count\"}",
"legend": "{{address}}-store-{{store}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The Region count of each TiKV instance",
"id": "45d762bf-1c78-4462-a021-787cc2ef43a1",
"layout": {
"h": 6,
"i": "45d762bf-1c78-4462-a021-787cc2ef43a1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 87
},
"links": [],
"name": "Store Region count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "pd_scheduler_store_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\", type=\"region_count\"}",
"legend": "{{address}}-store-{{store}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The 99th percentile time taken to complete each kind of gRPC command",
"id": "be24f158-2662-4640-a699-ca02820448a2",
"layout": {
"h": 6,
"i": "be24f158-2662-4640-a699-ca02820448a2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 93
},
"links": [],
"name": "99% Completed commands duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$pd_instance\"}[5m])) by (grpc_method, le))*1000",
"legend": "{{grpc_method}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The time taken to handle TiDB requests",
"id": "e64e2809-e92c-4765-947e-862cda4e4f03",
"layout": {
"h": 6,
"i": "e64e2809-e92c-4765-947e-862cda4e4f03",
"isResizable": true,
"w": 12,
"x": 12,
"y": 93
},
"links": [],
"name": "PD server TSO handle time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.90, sum(rate(pd_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))*1000",
"legend": "90% tso",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(pd_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))*1000",
"legend": "99% tso",
"refId": "B"
},
{
"expr": "histogram_quantile(0.999, sum(rate(pd_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))*1000",
"legend": "99.9% tso",
"refId": "C"
},
{
"expr": "histogram_quantile(0.99999, sum(rate(pd_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))*1000",
"legend": "99.999% tso",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "PD CPU usage, calculated from process CPU running seconds",
"id": "9f2e927e-f293-46da-8688-1475343cbd1b",
"layout": {
"h": 7,
"i": "9f2e927e-f293-46da-8688-1475343cbd1b",
"isResizable": true,
"w": 12,
"x": 0,
"y": 99
},
"links": [],
"name": "CPU Usage(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(process_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",job=~\".*pd.*\"}[30s])*100",
"legend": "{{job}}-{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "PD memory usage.",
"id": "bae011b2-b545-4314-8083-9436716cd281",
"layout": {
"h": 7,
"i": "bae011b2-b545-4314-8083-9436716cd281",
"isResizable": true,
"w": 12,
"x": 12,
"y": 99
},
"links": [],
"name": "Memory Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_resident_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",job=~\".*pd.*\"}",
"legend": "process-{{job}}-{{instance}}",
"refId": "A"
},
{
"expr": "go_memstats_heap_inuse_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",job=~\".*pd.*\"}",
"legend": "HeapInuse-{{job}}-{{instance}}",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "2f1959a6-654c-48bd-9f78-38865ec7b406",
"layout": {
"h": 1,
"i": "2f1959a6-654c-48bd-9f78-38865ec7b406",
"isResizable": false,
"w": 24,
"x": 0,
"y": 202
},
"name": "TiDB",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "TiDB statement statistics.\nBold red line on right Y axis for Failed Queries per second",
"id": "bc04bd9e-39ed-4969-a969-d2a5d0c731ac",
"layout": {
"h": 7,
"i": "bc04bd9e-39ed-4969-a969-d2a5d0c731ac",
"isResizable": true,
"w": 12,
"x": 0,
"y": 203
},
"links": [],
"name": "QPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\",tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"legend": "{{type}}",
"refId": "A"
},
{
"expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\",tidb_cluster=\"$tidb_cluster\"}[1m]))",
"legend": "Total",
"refId": "B"
},
{
"expr": "sum(rate(tidb_server_execute_error_total{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\",tidb_cluster=\"$tidb_cluster\"}[1m])) ",
"legend": "Failed",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "82ef5626-87ac-4577-b888-cc51b038e91f",
"layout": {
"h": 7,
"i": "82ef5626-87ac-4577-b888-cc51b038e91f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 203
},
"links": [],
"name": "Duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.9999, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))*1000",
"legend": "9999",
"refId": "A"
},
{
"expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))*1000",
"legend": "999",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))*1000",
"legend": "99",
"refId": "C"
},
{
"expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))*1000",
"legend": "95",
"refId": "D"
},
{
"expr": "histogram_quantile(0.80, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",instance=~\"$tidb_instance\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))*1000",
"legend": "80",
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "Service Time Per Second, show service time distribution among different SQL types:\n1. Database time, the total time that the TiDB cluster is processing application requests.\n2. The service time of different SQL types.\n",
"id": "cf1426f5-85b1-413b-b2d6-a24a693f5609",
"layout": {
"h": 7,
"i": "cf1426f5-85b1-413b-b2d6-a24a693f5609",
"isResizable": true,
"w": 8,
"x": 0,
"y": 210
},
"links": [],
"name": "Database Time by SQL Type",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type!=\"internal\"}[1m]))",
"legend": "database time",
"refId": "A"
},
{
"expr": "sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type!=\"internal\"}[1m])) by (sql_type)",
"legend": "{{sql_type}}",
"refId": "G"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "Service Time Per Second, show service time distribution among different SQL phases.\n1. Database time, the total time that the TiDB cluster is processing application requests.\n2. The service time of different SQL phases.\n",
"id": "e4b40e12-1cf9-449a-86c6-a9433c1e4dae",
"layout": {
"h": 7,
"i": "e4b40e12-1cf9-449a-86c6-a9433c1e4dae",
"isResizable": true,
"w": 8,
"x": 8,
"y": 210
},
"links": [],
"name": "Database Time by SQL Phase",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type!=\"internal\"}[1m]))",
"legend": "database time",
"refId": "A"
},
{
"expr": "sum(rate(tidb_session_parse_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type=\"general\"}[1m]))",
"legend": "parse",
"refId": "D"
},
{
"expr": "sum(rate(tidb_session_compile_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type=\"general\"}[1m]))",
"legend": "compile",
"refId": "E"
},
{
"expr": "sum(rate(tidb_session_execute_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type=\"general\"}[1m]))",
"legend": "execute",
"refId": "F"
},
{
"expr": "sum(rate(tidb_server_get_token_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\"}[1m]))/1000000",
"legend": "get token",
"refId": "G"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "Service Time Per Second, show service time distribution among different KV/PD requests:\n1. Execute time, the execute time in SQL Phase\n2. Service time of different KV/PD requests",
"id": "e64105a4-56b7-4b74-948e-c2bc8cffdebd",
"layout": {
"h": 7,
"i": "e64105a4-56b7-4b74-948e-c2bc8cffdebd",
"isResizable": true,
"w": 8,
"x": 16,
"y": 210
},
"links": [],
"name": "SQL Execute Time Overview",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tidb_tikvclient_request_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", store!=\"0\"}[1m])) by (type)",
"legend": "{{type}}",
"refId": "B"
},
{
"expr": "sum(rate(pd_client_cmd_handle_cmds_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", type=\"wait\"}[1m]))",
"legend": "tso_wait",
"refId": "C"
},
{
"expr": "sum(rate(tidb_session_execute_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", sql_type=\"general\"}[1m]))",
"legend": "execute time",
"refId": "F"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "TiDB current connection counts",
"id": "648f51db-4891-42ca-a6aa-ee2cf9976b62",
"layout": {
"h": 7,
"i": "648f51db-4891-42ca-a6aa-ee2cf9976b62",
"isResizable": true,
"w": 12,
"x": 0,
"y": 217
},
"links": [],
"name": "Connection Count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tidb_server_connections{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\"}",
"legend": "{{instance}}",
"refId": "A"
},
{
"expr": "sum(tidb_server_connections{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\"})",
"legend": "total",
"refId": "B"
},
{
"expr": "sum(rate(tidb_server_handle_query_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", sql_type!=\"internal\"}[1m]))",
"legend": "active connections",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "TiDB slow query statistics with slow query durations and coprocessor waiting/executing durations",
"id": "2d1a443c-afbf-4197-b2a8-3b99d8087590",
"layout": {
"h": 7,
"i": "2d1a443c-afbf-4197-b2a8-3b99d8087590",
"isResizable": true,
"w": 12,
"x": 12,
"y": 217
},
"links": [],
"name": "Slow query",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.90, sum(rate(tidb_server_slow_query_process_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (le,sql_type))",
"legend": "all_proc_{{sql_type}}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.90, sum(rate(tidb_server_slow_query_cop_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (le,sql_type))",
"legend": "all_cop_proc_{{sql_type}}",
"refId": "B"
},
{
"expr": "histogram_quantile(0.90, sum(rate(tidb_server_slow_query_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (le,sql_type))",
"legend": "all_cop_wait_{{sql_type}}",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "TiDB failed query statistics by query type",
"id": "4df8dfb9-5151-4cc8-8560-30fb259dcb75",
"layout": {
"h": 6,
"i": "4df8dfb9-5151-4cc8-8560-30fb259dcb75",
"isResizable": true,
"w": 24,
"x": 0,
"y": 224
},
"links": [],
"name": "Failed Query OPM",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(increase(tidb_server_execute_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (type, instance)",
"legend": " {{type}}-{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "TiDB CPU usage, calculated from process CPU running seconds",
"id": "5bfbc205-53f1-4d1d-9bac-8d685bd7b9a2",
"layout": {
"h": 7,
"i": "5bfbc205-53f1-4d1d-9bac-8d685bd7b9a2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 230
},
"links": [],
"name": "CPU Usage(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "",
"value": null
},
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(process_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\", job=\"tidb\"}[30s])*100",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "a4725eba-e15d-4dee-92ac-71dc2d8dba3a",
"layout": {
"h": 7,
"i": "a4725eba-e15d-4dee-92ac-71dc2d8dba3a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 230
},
"links": [],
"name": "Memory Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "process_resident_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", job=\"tidb\"}",
"legend": "process-{{instance}}",
"refId": "A"
},
{
"expr": "go_memstats_heap_inuse_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\", job=\"tidb\"}",
"legend": "HeapInuse-{{instance}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "d661c0cc-f4f1-45b9-8fd7-69a863d57225",
"layout": {
"h": 1,
"i": "d661c0cc-f4f1-45b9-8fd7-69a863d57225",
"isResizable": false,
"w": 24,
"x": 0,
"y": 271
},
"name": "TiKV",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "Average TiDB KV request duration",
"id": "babad1cd-29ac-4fe7-8b7b-55ac7ebc2963",
"layout": {
"h": 8,
"i": "babad1cd-29ac-4fe7-8b7b-55ac7ebc2963",
"isResizable": true,
"w": 8,
"x": 0,
"y": 272
},
"links": [],
"name": "Avg TiDB KV Request Duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tidb_tikvclient_request_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", store!=\"0\"}[1m])) by (type)/ sum(rate(tidb_tikvclient_request_seconds_count{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\", store!=\"0\"}[1m])) by (type)*1000",
"legend": "{{type}}",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "Average TiKV gRPC duration",
"id": "bd36343a-d3a7-4e46-be96-5e3976741206",
"layout": {
"h": 8,
"i": "bd36343a-d3a7-4e46-be96-5e3976741206",
"isResizable": true,
"w": 8,
"x": 8,
"y": 272
},
"links": [],
"name": "Avg TiKV GRPC Duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tikv_grpc_msg_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", store!=\"0\"}[1m])) by (type)/ sum(rate(tikv_grpc_msg_duration_seconds_count{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", store!=\"0\"}[1m])) by (type)*1000",
"legend": "{{type}}",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The time consumed by processing asynchronous write requests.\nStorage async write duration = store duration + apply duration",
"id": "85d4b493-8bd9-472a-8281-23085cde1f88",
"layout": {
"h": 8,
"i": "85d4b493-8bd9-472a-8281-23085cde1f88",
"isResizable": true,
"w": 8,
"x": 16,
"y": 272
},
"links": [],
"name": "Storage Async Write Duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", type=\"write\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", type=\"write\"}[1m]))*1000",
"legend": "avg",
"refId": "C"
},
{
"expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", type=\"write\"}[1m])) by (le))*1000",
"legend": "99",
"refId": "A"
},
{
"expr": "histogram_quantile(0.999, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\",tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", type=\"write\"}[1m])) by (le))*1000",
"legend": "999",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The CPU utilization of raftstore thread",
"id": "03b56519-9926-4aa7-bff5-e1c7c8a576be",
"layout": {
"h": 7,
"i": "03b56519-9926-4aa7-bff5-e1c7c8a576be",
"isResizable": true,
"w": 12,
"x": 0,
"y": 280
},
"links": [],
"name": "Raft store CPU(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"raftstore_.*\"}[1m])) by (instance)*100",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The CPU utilization of the unified read pool",
"id": "189a88d9-5aac-4855-a427-45adc353a5fa",
"layout": {
"h": 7,
"i": "189a88d9-5aac-4855-a427-45adc353a5fa",
"isResizable": true,
"w": 12,
"x": 12,
"y": 280
},
"links": [],
"name": "Unified read pool CPU(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"unified_read_po*\"}[1m])) by (instance)*100",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "6149705b-aa38-49a7-83b4-430e0ae4e193",
"layout": {
"h": 8,
"i": "6149705b-aa38-49a7-83b4-430e0ae4e193",
"isResizable": true,
"w": 12,
"x": 0,
"y": 287
},
"links": [],
"name": "CPU(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", job=\"tikv\"}[1m])) by (instance)*100",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "80304baf-0534-4799-b093-571ec11c7e5a",
"layout": {
"h": 8,
"i": "80304baf-0534-4799-b093-571ec11c7e5a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 287
},
"links": [],
"name": "Memory",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "avg(process_resident_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tikv_instance\", job=\"tikv\"}) by (instance)",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "74a0ec87-31c6-4f6c-a0b3-912583f276b0",
"layout": {
"h": 1,
"i": "74a0ec87-31c6-4f6c-a0b3-912583f276b0",
"isResizable": false,
"w": 24,
"x": 0,
"y": 318
},
"name": "TiFlash",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "208123c5-6b6f-4b89-9c9e-53439840649f",
"layout": {
"h": 7,
"i": "208123c5-6b6f-4b89-9c9e-53439840649f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 319
},
"links": [],
"name": "Request QPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tiflash_coprocessor_request_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"legend": "{{type}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "be97f8e3-f2f8-4c6e-bb22-aa06d9364f47",
"layout": {
"h": 7,
"i": "be97f8e3-f2f8-4c6e-bb22-aa06d9364f47",
"isResizable": true,
"w": 12,
"x": 12,
"y": 319
},
"links": [],
"name": "Executor QPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tiflash_coprocessor_executor_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"legend": "{{type}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "9b1a9996-01f9-45f8-9c33-5eabb4470ddf",
"layout": {
"h": 7,
"i": "9b1a9996-01f9-45f8-9c33-5eabb4470ddf",
"isResizable": true,
"w": 12,
"x": 0,
"y": 326
},
"links": [],
"name": "Request Duration",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "histogram_quantile(0.999, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))",
"legend": "999",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))",
"legend": "99",
"refId": "B"
},
{
"expr": "histogram_quantile(0.95, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))",
"legend": "95",
"refId": "C"
},
{
"expr": "histogram_quantile(0.80, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))",
"legend": "80",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "ac51b582-b174-40de-8105-674086eca665",
"layout": {
"h": 7,
"i": "ac51b582-b174-40de-8105-674086eca665",
"isResizable": true,
"w": 12,
"x": 12,
"y": 326
},
"links": [],
"name": "Error QPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "sum(rate(tiflash_coprocessor_request_error{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (reason)",
"legend": "{{reason}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "TiFlash CPU usage calculated with process CPU running seconds.",
"id": "fc9eee95-22d3-4273-8750-4fea4ec7db64",
"layout": {
"h": 7,
"i": "fc9eee95-22d3-4273-8750-4fea4ec7db64",
"isResizable": true,
"w": 12,
"x": 0,
"y": 333
},
"links": [],
"name": "CPU Usage(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(tiflash_proxy_process_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"tiflash\"}[1m])*100",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The memory usage per TiFlash instance",
"id": "5f8b7d63-84a9-44ce-82d7-8499a50f1acf",
"layout": {
"h": 7,
"i": "5f8b7d63-84a9-44ce-82d7-8499a50f1acf",
"isResizable": true,
"w": 12,
"x": 12,
"y": 333
},
"links": [],
"name": "Memory",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tiflash_proxy_process_resident_memory_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"tiflash\"}",
"legend": "{{instance}}",
"refId": "K"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "ce791748-16dc-42fc-863e-420676b78a20",
"layout": {
"h": 1,
"i": "ce791748-16dc-42fc-863e-420676b78a20",
"isResizable": false,
"w": 24,
"x": 0,
"y": 361
},
"name": "Replication",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "6bfc7d33-d7e8-4c23-987d-bd90a18f4486",
"layout": {
"h": 7,
"i": "6bfc7d33-d7e8-4c23-987d-bd90a18f4486",
"isResizable": true,
"w": 12,
"x": 0,
"y": 362
},
"links": [],
"name": "Drainer Checkpoint TSO",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "datetimeMilliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "binlog_drainer_checkpoint_tso{}",
"legend": "drainer-{{instance}}",
"refId": "A"
},
{
"expr": "max(pd_cluster_tso{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"})",
"legend": "approximate current time (s)",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "35128ec8-f870-42bb-b2b9-ebb50de5a75d",
"layout": {
"h": 7,
"i": "35128ec8-f870-42bb-b2b9-ebb50de5a75d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 362
},
"links": [],
"name": "TiDB Server Skip Binlog Count",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tidb_server_critical_error_total",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"description": "The checkpoint ts of changefeeds.",
"id": "fb2e0749-6d67-44e6-a8ca-756fba222d7d",
"layout": {
"h": 7,
"i": "fb2e0749-6d67-44e6-a8ca-756fba222d7d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 369
},
"links": [],
"name": "CDC Changefeed checkpoint",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "datetimeMilliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "max(pd_cluster_tso{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"})",
"legend": "approximate current time (s)",
"refId": "A"
},
{
"expr": "max(ticdc_owner_checkpoint_ts{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}) by (changefeed)",
"legend": "{{changefeed}}",
"refId": "B"
},
{
"expr": "max(ticdc_processor_checkpoint_ts{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}) by (instance, changefeed)",
"legend": "{{instance}}-{{changefeed}}",
"refId": "C"
},
{
"expr": "max(ticdc_owner_barrier_ts{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}) by (changefeed)",
"legend": "{{changefeed}}-barrierTs",
"refId": "D"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "79da928c-64c4-4e1a-86c0-73c7e69bf0a0",
"layout": {
"h": 1,
"i": "79da928c-64c4-4e1a-86c0-73c7e69bf0a0",
"isResizable": false,
"w": 24,
"x": 0,
"y": 390
},
"name": "System Info",
"panels": [],
"type": "row",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomain": [
1,
99999999999
],
"colorDomainAuto": false,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "b1552049-e2fe-4994-8475-daf6933d092f",
"layout": {
"h": 5,
"i": "b1552049-e2fe-4994-8475-daf6933d092f",
"isResizable": true,
"w": 6,
"x": 0,
"y": 391
},
"links": [],
"name": "Vcores",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "count(node_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", mode=\"user\"}) by (instance)",
"legend": "{{ instance }}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomain": [
1,
900000000000000000
],
"colorDomainAuto": false,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "5b20c6c9-3985-468c-ac2f-f685a5b3cbad",
"layout": {
"h": 5,
"i": "5b20c6c9-3985-468c-ac2f-f685a5b3cbad",
"isResizable": true,
"w": 6,
"x": 6,
"y": 391
},
"links": [],
"name": "Memory",
"options": {
"standardOptions": {
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "node_memory_MemTotal_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "677c3bf1-d593-445f-8313-b45a99017b8d",
"layout": {
"h": 5,
"i": "677c3bf1-d593-445f-8313-b45a99017b8d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 391
},
"links": [],
"name": "CPU Usage(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - avg by (instance) (irate(node_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", mode=\"idle\"}[1m]) ) * 100",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "f7da66ec-80be-4c66-b99b-f0fc38d64e31",
"layout": {
"h": 5,
"i": "f7da66ec-80be-4c66-b99b-f0fc38d64e31",
"isResizable": true,
"w": 12,
"x": 0,
"y": 396
},
"links": [],
"name": "Load [1m]",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "node_load1{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "d61da9f0-0f8d-4012-b419-275de5d61cc1",
"layout": {
"h": 5,
"i": "d61da9f0-0f8d-4012-b419-275de5d61cc1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 396
},
"links": [],
"name": "Memory Available",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "node_memory_MemAvailable_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}",
"legend": "{{ instance }}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "84e5094f-f60f-4fe5-a79f-20ad9f816428",
"layout": {
"h": 5,
"i": "84e5094f-f60f-4fe5-a79f-20ad9f816428",
"isResizable": true,
"w": 12,
"x": 0,
"y": 401
},
"links": [],
"name": "Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(node_network_receive_bytes_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", device!=\"lo\"}[5m])",
"legend": "Inbound: {{instance}}-{{device}}",
"refId": "A"
},
{
"expr": "irate(node_network_transmit_bytes_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", device!=\"lo\"}[5m])",
"legend": "Outbound: {{instance}}-{{device}}",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "f2625015-e8d4-4a63-8bdb-deb99a76c331",
"layout": {
"h": 5,
"i": "f2625015-e8d4-4a63-8bdb-deb99a76c331",
"isResizable": true,
"w": 12,
"x": 12,
"y": 401
},
"links": [],
"name": "TCP Retrans",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(node_netstat_Tcp_RetransSegs{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])",
"legend": "{{instance}} - RetransSegs",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${tidb_datasource}",
"id": "3a15cdf1-596a-4329-b909-81dc1eda59fb",
"layout": {
"h": 7,
"i": "3a15cdf1-596a-4329-b909-81dc1eda59fb",
"isResizable": true,
"w": 24,
"x": 0,
"y": 406
},
"links": [],
"name": "IO Util(%)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(node_disk_io_time_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])*100",
"legend": "{{instance}} - {{device}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"hide": true,
"name": "tidb_datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${tidb_datasource}"
},
"definition": "label_values(pd_cluster_status, k8s_cluster)",
"hide": true,
"name": "k8s_cluster",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${tidb_datasource}"
},
"definition": "label_values(pd_cluster_status{k8s_cluster=\"$k8s_cluster\"}, tidb_cluster)",
"hide": true,
"multi": false,
"name": "tidb_cluster",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${tidb_datasource}"
},
"definition": "label_values(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, instance)",
"multi": false,
"name": "pd_instance",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${tidb_datasource}"
},
"definition": "label_values(tidb_server_connections{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, instance)",
"multi": true,
"name": "tidb_instance",
"reg": "",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${tidb_datasource}"
},
"definition": "label_values(tikv_engine_size_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}, instance)",
"multi": true,
"name": "tikv_instance",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328428617000
}
================================================
FILE: integrations/Tomcat/collect/tomcat/tomcat.toml
================================================
# # collect interval
# interval = 15
# Gather metrics from the Tomcat server status page.
[[instances]]
## URL of the Tomcat server status
# url = "http://127.0.0.1:8080/manager/status/all?XML=true"
url = ""
## HTTP Basic Auth Credentials
# username = "tomcat"
# password = "s3cret"
## Request timeout
# timeout = "5s"
# # interval = global.interval * interval_times
# interval_times = 1
# Important: use a globally unique string to identify this instance
# labels = { instance="192.168.1.2:8080", url="-" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/Tomcat/dashboards/tomcat_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Tomcat - categraf",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "n9e",
"url": "https://n9e.gitee.io/"
}
],
"panels": [
{
"collapsed": true,
"id": "cdc17d90-17f5-44b8-99a1-94764de61698",
"layout": {
"h": 1,
"i": "cdc17d90-17f5-44b8-99a1-94764de61698",
"w": 24,
"x": 0,
"y": 0
},
"name": "connector",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "004b1408-dcfe-40e1-8910-a1f16a574a85",
"layout": {
"h": 7,
"i": "004b1408-dcfe-40e1-8910-a1f16a574a85",
"w": 12,
"x": 0,
"y": 1
},
"name": "Traffic Bytes / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(tomcat_connector_bytes_sent{instance=\"$instance\"}[1m])",
"legend": "sent",
"refId": "A"
},
{
"expr": "rate(tomcat_connector_bytes_received{instance=\"$instance\"}[1m])",
"legend": "received",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "010aabe0-9b60-4bf3-867e-b5773703090a",
"layout": {
"h": 7,
"i": "010aabe0-9b60-4bf3-867e-b5773703090a",
"w": 12,
"x": 12,
"y": 1
},
"name": "Request count / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(tomcat_connector_request_count{instance=\"$instance\"}[1m])",
"legend": "tomcat_connector_request_count",
"refId": "A"
},
{
"expr": "rate(tomcat_connector_error_count{instance=\"$instance\"}[1m])",
"legend": "tomcat_connector_error_count",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "max_threads: The maximum number of allowed worker threads.\ncurrent_thread_count: The number of threads managed by the thread pool\ncurrent_threads_busy: The number of threads that are in use",
"id": "40ea7316-46b9-4447-8e45-6b5acb77f0d2",
"layout": {
"h": 7,
"i": "40ea7316-46b9-4447-8e45-6b5acb77f0d2",
"w": 12,
"x": 0,
"y": 3
},
"name": "Thread",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tomcat_connector_max_threads{instance=\"$instance\"}",
"legend": "max_threads",
"refId": "A"
},
{
"expr": "tomcat_connector_current_thread_count{instance=\"$instance\"}",
"legend": "current_thread_count",
"refId": "B"
},
{
"expr": "tomcat_connector_current_threads_busy{instance=\"$instance\"}",
"legend": "current_threads_busy",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c99a33ea-53d1-464d-84aa-e7be9e15cfb1",
"layout": {
"h": 7,
"i": "c99a33ea-53d1-464d-84aa-e7be9e15cfb1",
"w": 12,
"x": 12,
"y": 3
},
"name": "Processing time",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(tomcat_connector_processing_time{instance=\"$instance\"}[1m])",
"legend": "{{name}}-processing_time",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "1cca4a8b-9352-4d15-9488-ae3aee7b17e3",
"layout": {
"h": 1,
"i": "1cca4a8b-9352-4d15-9488-ae3aee7b17e3",
"w": 24,
"x": 0,
"y": 10
},
"name": "mem used",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "07059834-fb32-4dfd-88cd-a3bad0203c79",
"layout": {
"h": 7,
"i": "07059834-fb32-4dfd-88cd-a3bad0203c79",
"w": 24,
"x": 0,
"y": 11
},
"name": "Mem Used",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tomcat_jvm_memory_max{instance=\"$instance\"}",
"legend": "max",
"refId": "A"
},
{
"expr": "tomcat_jvm_memory_total{instance=\"$instance\"}",
"legend": "total",
"refId": "B"
},
{
"expr": "tomcat_jvm_memory_free{instance=\"$instance\"}",
"legend": "free",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "6e07a88a-41b4-4fe2-b11f-0f5c8cb95d52",
"layout": {
"h": 1,
"i": "6e07a88a-41b4-4fe2-b11f-0f5c8cb95d52",
"w": 24,
"x": 0,
"y": 18
},
"name": "memorypool",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "92d79cd5-7a53-4f29-a42a-34db741e3c62",
"layout": {
"h": 7,
"i": "92d79cd5-7a53-4f29-a42a-34db741e3c62",
"w": 6,
"x": 0,
"y": 19
},
"name": "Used",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tomcat_jvm_memorypool_used{instance=\"$instance\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "c6aace01-4b13-45dd-87e2-fd73cc3e5a28",
"layout": {
"h": 7,
"i": "c6aace01-4b13-45dd-87e2-fd73cc3e5a28",
"w": 6,
"x": 6,
"y": 19
},
"name": "Max",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tomcat_jvm_memorypool_max{instance=\"$instance\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8cbc2b26-290e-4d11-a56c-e2f0645179a0",
"layout": {
"h": 7,
"i": "8cbc2b26-290e-4d11-a56c-e2f0645179a0",
"w": 6,
"x": 12,
"y": 19
},
"name": "Committed",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tomcat_jvm_memorypool_committed{instance=\"$instance\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b81e4684-c03b-4ee1-86f2-45c883ace756",
"layout": {
"h": 7,
"i": "b81e4684-c03b-4ee1-86f2-45c883ace756",
"w": 6,
"x": 18,
"y": 19
},
"name": "Init",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "tomcat_jvm_memorypool_init{instance=\"$instance\"}",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(tomcat_up, instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328436213000
}
================================================
FILE: integrations/Tomcat/markdown/README.md
================================================
# tomcat
tomcat 采集器通过读取 tomcat 的管理侧接口 `/manager/status/all` 来采集数据,该接口需要鉴权。修改 `tomcat-users.xml`,增加下面的内容:
```xml
<role rolename="admin-gui" />
<user username="tomcat" password="s3cret" roles="manager-gui" />
```
此外,还需要注释文件**webapps/manager/META-INF/context.xml**的以下内容,
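```xml
<Valve className="org.apache.catalina.valves.RemoteAddrValve"
       allow="127\.\d+\.\d+\.\d+|::1|0:0:0:0:0:0:0:1" />
```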
否则 tomcat 会报以下错误,导致 tomcat 采集器无法采集到数据。
```html
403 Access Denied
You are not authorized to view this page.
By default the Manager is only accessible from a browser running on the same machine as Tomcat. If you wish to modify this restriction, you'll need to edit the Manager's context.xml file.
```
## Configuration
配置文件在 `conf/input.tomcat/tomcat.toml`
```toml
[[instances]]
## URL of the Tomcat server status
url = "http://127.0.0.1:8080/manager/status/all?XML=true"
## HTTP Basic Auth Credentials
username = "tomcat"
password = "s3cret"
## Request timeout
# timeout = "5s"
# # interval = global.interval * interval_times
# interval_times = 1
# important! use global unique string to specify instance
# labels = { instance="192.168.1.2:8080", url="-" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
```
================================================
FILE: integrations/VictoriaMetrics/alerts/alerts.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vm error logging rate 大于 0",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(rate(vm_log_messages_total{level!=\"info\"}[5m])) by (job, level) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [
"https://api.flashcat.cloud/event/push/alert/n9e?integration_key=f1258018f5595ba7bf30572f2f44c1ac973"
],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328440334000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vm-insert 和 vm-storage 组件之间连接的饱和度大于9",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max(rate(vm_rpc_send_duration_seconds_total{job_name=~\".+\"}[5m])) by(addr) / 1000 \u003e 9",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [
"https://api.flashcat.cloud/event/push/alert/n9e?integration_key=f1258018f5595ba7bf30572f2f44c1ac973"
],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328440822000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vm-insert 实例端口异常",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code{service=\"vm-insert\"}!=0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328441356000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vm-select 实例端口异常",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code{service=\"vm-select\"}!=0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [
"https://api.flashcat.cloud/event/push/alert/n9e?integration_key=f1258018f5595ba7bf30572f2f44c1ac973"
],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328441809000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vm-select请求查询延迟超过15s",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "max(vm_request_duration_seconds{job_name=~\".+\"}) by (path) \u003e 15",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [
"https://api.flashcat.cloud/event/push/alert/n9e?integration_key=f1258018f5595ba7bf30572f2f44c1ac973"
],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328442240000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vm-storage 实例端口异常",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 120,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "net_response_result_code{service=\"vm-storage\"}!=0",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 120,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328442679000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vminsert服务宕机",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "up{service=~\"vminsert.+\"} \u003c 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328443166000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vmselect服务宕机",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "up{service=~\"vmselect.+\"} \u003c 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328443676000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "vmstorage服务宕机",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "up{service=~\"vmstorage.+\"} \u003c 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 60,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328444251000
}
]
================================================
FILE: integrations/VictoriaMetrics/dashboards/victoriametrics-cluster.json
================================================
{
"id": 0,
"group_id": 0,
"name": "VictoriaMetrics - Cluster",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "Cluster Wiki",
"url": "https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Cluster-VictoriaMetrics"
},
{
"targetBlank": true,
"title": "Found a bug?",
"url": "https://github.com/VictoriaMetrics/VictoriaMetrics/issues"
},
{
"targetBlank": true,
"title": "New releases",
"url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases"
}
],
"panels": [
{
"collapsed": true,
"id": "693f2a6f-0328-4900-9bac-4d508cbcac1d",
"layout": {
"h": 1,
"i": "693f2a6f-0328-4900-9bac-4d508cbcac1d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "状态",
"panels": [],
"type": "row",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前存储中的数据点总数",
"id": "4b227cc2-3d8f-4e43-97de-0eb34537fcdf",
"layout": {
"h": 3,
"i": "4b227cc2-3d8f-4e43-97de-0eb34537fcdf",
"isResizable": true,
"w": 6,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Total datapoints",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_rows{job=~\"$storage\", type!~\"indexdb.*\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示最近 5 分钟内数据点的写入速率(包含复制因子带来的副本数据)",
"id": "9b17094d-58de-4e1a-a2c4-40aa438f483a",
"layout": {
"h": 3,
"i": "9b17094d-58de-4e1a-a2c4-40aa438f483a",
"isResizable": true,
"w": 6,
"x": 6,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Ingestion rate",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_vminsert_metrics_read_total{job=~\"$storage\", instance=~\"$instance\"}[5m])) ",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示查询模块的HTTP读请求速率",
"id": "6c44ab37-aae0-469d-ab3f-1d117f6b66b5",
"layout": {
"h": 3,
"i": "6c44ab37-aae0-469d-ab3f-1d117f6b66b5",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Read requests",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_http_requests_total{job=~\"$job\", instance=~\"$instance\", path=~\"/select/.*\"}[5m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "所有组件可用CPU总数",
"id": "c374afde-795c-4046-805b-2a0934813cf2",
"layout": {
"h": 3,
"i": "c374afde-795c-4046-805b-2a0934813cf2",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Available CPU",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示在过去一小时内插入新数据点的活动时间序列的数量,如果值变高可能会导致抓取速度变慢\n\n具体详情: https://docs.victoriametrics.com/FAQ.html#what-is-an-active-time-series",
"id": "a2a05daa-56fc-40b3-b551-282470400963",
"layout": {
"h": 3,
"i": "a2a05daa-56fc-40b3-b551-282470400963",
"isResizable": true,
"w": 6,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Active series",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(max_over_time(vm_cache_entries{job=~\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}[1h]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "已经使用的磁盘空间总量",
"id": "2f7a302b-ac69-4054-9ce9-7d18177c508c",
"layout": {
"h": 3,
"i": "2f7a302b-ac69-4054-9ce9-7d18177c508c",
"isResizable": true,
"w": 6,
"x": 6,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Disk space usage",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$storage\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "每个数据点平均占用的磁盘空间(字节)",
"id": "d96f568e-20ca-47d6-a12a-c13a8754910b",
"layout": {
"h": 3,
"i": "d96f568e-20ca-47d6-a12a-c13a8754910b",
"isResizable": true,
"w": 6,
"x": 12,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Bytes per point",
"options": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$storage\", type!~\"indexdb.*\"}) / sum(vm_rows{job=~\"$storage\", type!~\"indexdb.*\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total size of available memory for all VM components.",
"id": "67dcc1fa-5d51-44b2-af7b-45ea285e8d78",
"layout": {
"h": 3,
"i": "67dcc1fa-5d51-44b2-af7b-45ea285e8d78",
"isResizable": true,
"w": 6,
"x": 18,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Available memory",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前JOB名称,版本号,副本数",
"id": "f27606ef-b371-4d0a-a48b-4f935e1f5100",
"layout": {
"h": 5,
"i": "f27606ef-b371-4d0a-a48b-4f935e1f5100",
"isResizable": true,
"w": 12,
"x": 0,
"y": 7
},
"links": [],
"maxPerRow": 4,
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"}) by(job, short_version)",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当前组件副本数",
"id": "66c86f6a-8ef1-46ed-8f22-f04677931c28",
"layout": {
"h": 5,
"i": "66c86f6a-8ef1-46ed-8f22-f04677931c28",
"isResizable": true,
"w": 12,
"x": 12,
"y": 7
},
"links": [],
"maxPerRow": 4,
"name": "Uptime ($job)",
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"standardOptions": {
"decimals": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(min_over_time(up{job=~\"$job\", instance=~\"$instance\"}[5m])) by (job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "72a53a57-6b0e-4457-ab3d-2af04fefa79e",
"layout": {
"h": 1,
"i": "72a53a57-6b0e-4457-ab3d-2af04fefa79e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 12
},
"name": "Overview",
"panels": [],
"type": "row",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "在复制之前,每秒有多少个数据点通过协议插入到集群中",
"id": "e13f37cd-3543-434d-844c-a30154a262e4",
"layout": {
"h": 8,
"i": "e13f37cd-3543-434d-844c-a30154a262e4",
"isResizable": true,
"w": 12,
"x": 0,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Datapoints ingestion rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_rows_inserted_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by (type) \u003e 0 ",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟请求速率:\n* ' * ' -不支持的查询路径 \n* ' /write ' -写入 VictoriaMetrics \n* ' /metrics ' -查询 VictoriaMetrics 自身的系统指标 \n* ' /query ' -查询即时值 \n* ' /query_range ' -查询时间范围 \n* ' /series ' -匹配某个标签集 \n* ' /label/{}/values ' -查询标签值列表(主要是变量)",
"id": "f0cbee55-badd-4cb6-adfe-cb22c0abdc05",
"layout": {
"h": 8,
"i": "f0cbee55-badd-4cb6-adfe-cb22c0abdc05",
"isResizable": true,
"w": 12,
"x": 12,
"y": 13
},
"links": [],
"maxPerRow": 4,
"name": "Requests rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_http_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[5m])) by (path) \u003e 0",
"legend": "{{path}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示所有存储节点在过去一小时内插入新数据点的活动时间序列的数量,如果值过高可能会导致查询速度变慢。",
"id": "bf981fc8-7690-41b3-81ef-207687218564",
"layout": {
"h": 8,
"i": "bf981fc8-7690-41b3-81ef-207687218564",
"isResizable": true,
"w": 12,
"x": 0,
"y": 21
},
"links": [],
"maxPerRow": 4,
"name": "Active time series ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(vm_cache_entries{job=~\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"})",
"legend": "Active time series",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "查询耗费时间越少越好:\n* ' * ' -不支持的查询路径 \n* ' /write ' -写入 VictoriaMetrics \n* ' /metrics ' -查询 VictoriaMetrics 自身的系统指标 \n* ' /query ' -查询即时值 \n* ' /query_range ' -查询时间范围 \n* ' /series ' -匹配某个标签集 \n* ' /label/{}/values ' -查询标签值列表(主要是变量)",
"id": "d3a42a4a-864c-4431-adea-15dc911ba3da",
"layout": {
"h": 8,
"i": "d3a42a4a-864c-4431-adea-15dc911ba3da",
"isResizable": true,
"w": 12,
"x": 12,
"y": 21
},
"links": [],
"maxPerRow": 4,
"name": "Query duration 0.99 quantile ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(vm_request_duration_seconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.99\"}) by (path) \u003e 0",
"legend": "{{path}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "请求错误速率:\n* ' * ' -不支持的查询路径 \n* ' /write ' -写入 VictoriaMetrics \n* ' /metrics ' -查询 VictoriaMetrics 自身的系统指标 \n* ' /query ' -查询即时值 \n* ' /query_range ' -查询时间范围 \n* ' /series ' -匹配某个标签集 \n* ' /label/{}/values ' -查询标签值列表(主要是变量)",
"id": "3f6f09a7-9d40-431f-8585-ab0fe9e27d3d",
"layout": {
"h": 8,
"i": "3f6f09a7-9d40-431f-8585-ab0fe9e27d3d",
"isResizable": true,
"w": 12,
"x": 0,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "Requests error rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_http_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by (job, path) \u003e 0",
"legend": "{{path}} ({{job}})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "按级别显示记录消息的速率。",
"id": "4d5103a6-1c51-4d90-8a95-08b590e053e0",
"layout": {
"h": 8,
"i": "4d5103a6-1c51-4d90-8a95-08b590e053e0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 29
},
"links": [],
"maxPerRow": 4,
"name": "Logging rate",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_log_messages_total{job=~\"$job\",instance=~\"$instance\", level!=\"info\"}[5m])) by (job, level) \u003e 0",
"legend": "{{job}} - {{level}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "75f79ba0-cd99-4aa0-903e-8a3432b7590e",
"layout": {
"h": 1,
"i": "75f79ba0-cd99-4aa0-903e-8a3432b7590e",
"isResizable": false,
"w": 24,
"x": 0,
"y": 37
},
"name": "Resource usage ($job)",
"panels": [],
"type": "row",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "已用内存的百分比(常驻)。当内存使用率接近 100% 时,应用程序的性能可能会明显下降。",
"id": "ed0f0389-99c0-4f49-9460-ce2027c85256",
"layout": {
"h": 8,
"i": "ed0f0389-99c0-4f49-9460-ce2027c85256",
"isResizable": true,
"w": 12,
"x": 0,
"y": 38
},
"links": [],
"maxPerRow": 4,
"name": "RSS memory % usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "进程自身分配的内存所占的比例。当内存使用率达到 100% 时,进程有被 OOM-kill 的风险。",
"id": "0d98c801-be38-4d9d-a2aa-e47895c6a9cb",
"layout": {
"h": 8,
"i": "0d98c801-be38-4d9d-a2aa-e47895c6a9cb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 38
},
"links": [],
"maxPerRow": 4,
"name": "RSS anonymous memory % usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n max_over_time(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "每个模块的CPU使用率",
"id": "bb8b7ddc-fc10-4ee2-9d50-6d76209c22a1",
"layout": {
"h": 8,
"i": "bb8b7ddc-fc10-4ee2-9d50-6d76209c22a1",
"isResizable": true,
"w": 12,
"x": 0,
"y": 46
},
"links": [],
"maxPerRow": 4,
"name": "CPU ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[5m])\n /\n vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示从存储磁盘读取/写入的字节数。",
"id": "758b1ac7-23dc-4540-a60b-19dc63712d4b",
"layout": {
"h": 8,
"i": "758b1ac7-23dc-4540-a60b-19dc63712d4b",
"isResizable": true,
"w": 12,
"x": 12,
"y": 46
},
"links": [],
"maxPerRow": 4,
"name": "Disk writes/reads ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(process_io_storage_read_bytes_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job) \u003e 0",
"legend": "read {{job}}",
"refId": "A"
},
{
"expr": "sum(rate(process_io_storage_written_bytes_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job) \u003e 0",
"legend": "write {{job}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示打开的文件描述符与操作系统中设置的限制的百分比。 \n达到打开文件的限制可能导致各种问题,提前做好告警配置。\n\n在这里查看如何调整限制: https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a",
"id": "11bf58b2-3866-4861-b0f7-d97e30e5ded8",
"layout": {
"h": 8,
"i": "11bf58b2-3866-4861-b0f7-d97e30e5ded8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "Open FDs usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n max_over_time(process_open_fds{job=~\"$job\", instance=~\"$instance\"}[5m])\n /\n process_max_fds{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内TCP连接最大数",
"id": "c3f34986-6a80-4a98-aff2-6bfb07b1c199",
"layout": {
"h": 8,
"i": "c3f34986-6a80-4a98-aff2-6bfb07b1c199",
"isResizable": true,
"w": 12,
"x": 12,
"y": 54
},
"links": [],
"maxPerRow": 4,
"name": "TCP connections ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(max_over_time(vm_tcplistener_conns{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内Goroutines最大数",
"id": "ecf03c92-cf17-4f63-b1db-2e37e6170fc5",
"layout": {
"h": 8,
"i": "ecf03c92-cf17-4f63-b1db-2e37e6170fc5",
"isResizable": true,
"w": 12,
"x": 0,
"y": 62
},
"links": [],
"maxPerRow": 4,
"name": "Goroutines ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(max_over_time(go_goroutines{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "TCP5分钟内监听的accepts速率",
"id": "92132b50-f7a2-44b8-9fa0-059884b6875e",
"layout": {
"h": 8,
"i": "92132b50-f7a2-44b8-9fa0-059884b6875e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 62
},
"links": [],
"maxPerRow": 4,
"name": "TCP connections rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_tcplistener_accepts_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟最大线程数",
"id": "11a12db0-c519-4c12-a26c-458436262cc3",
"layout": {
"h": 8,
"i": "11a12db0-c519-4c12-a26c-458436262cc3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 70
},
"links": [],
"maxPerRow": 4,
"name": "Threads ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(max_over_time(process_num_threads{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job)",
"legend": "{{job}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "5d9e60b8-4efd-4d16-b429-f3670ebf0cd8",
"layout": {
"h": 1,
"i": "5d9e60b8-4efd-4d16-b429-f3670ebf0cd8",
"isResizable": false,
"w": 24,
"x": 0,
"y": 78
},
"name": "Troubleshooting",
"panels": [],
"type": "row",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示过去 24 小时内创建的新系列的比率和总数。\n\n高Churn率与数据库性能密切相关,可能会导致意外的 OOM 或缓慢的查询。建议始终关注此指标以避免意外的基数“爆炸”。\n\nChurn率越高,处理它所需的资源就越多。考虑保持尽可能低的流失率。\n\n要调查有关最核心TSDB的统计信息,请使用 api/v1/status/tsdb 处理程序。\n详情: https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#url-format\n\n推荐阅读:\n* https://www.robustperception.io/cardinality-is-key\n* https://valyala.medium.com/high-cardinality-tsdb-benchmarks-victoriametrics-vs-timescaledb-vs-influxdb-13e6ee64dd6b",
"id": "f416e311-611c-4414-8176-59b5b7e224d0",
"layout": {
"h": 8,
"i": "f416e311-611c-4414-8176-59b5b7e224d0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 79
},
"links": [],
"maxPerRow": 4,
"name": "Churn rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_new_timeseries_created_total{job=~\"$storage\", instance=~\"$instance\"}[5m]))",
"legend": "churn rate",
"refId": "A"
},
{
"expr": "sum(increase(vm_new_timeseries_created_total{job=~\"$storage\", instance=~\"$instance\"}[24h]))",
"legend": "new series over 24h",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "过去 5 分钟内慢速插入占总插入率的百分比。\n\n值越小越好。如果在较长时间内百分比仍然很高 (\u003e10%),则可能需要更多 RAM 来优化处理当前数量的活动时间序列。\n\n一般来说,VictoriaMetrics 每个活动时间序列需要 ~1KB 的 RAM,因此根据容量规划文档计算当前工作负载所需的 RAM 量应该很容易。但结果数字可能与实际数字相去甚远,因为所需的内存量取决于其他因素,例如每个时间序列的标签数量和标签值的长度。",
"id": "8d8fa792-0d30-4204-b518-91ab5eed2c8c",
"layout": {
"h": 8,
"i": "8d8fa792-0d30-4204-b518-91ab5eed2c8c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 79
},
"links": [],
"maxPerRow": 4,
"name": "Slow inserts",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "transparent",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 0.1
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n rate(vm_slow_row_inserts_total{job=~\"$storage\"}[5m]) \n / rate(vm_rows_added_to_storage_total{job=~\"$storage\"}[5m])\n)",
"legend": "slow inserts",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当指定addr位置的vmstorage向指定vminsert节点通信,表示当前处于只读模式,无法接受新数据时,该值大于0",
"id": "b02351fa-01af-41da-bdad-c031c8880798",
"layout": {
"h": 8,
"i": "b02351fa-01af-41da-bdad-c031c8880798",
"isResizable": true,
"w": 12,
"x": 0,
"y": 87
},
"links": [],
"maxPerRow": 4,
"name": "Storage in readonly status for vminsert ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(vm_rpc_vmstorage_is_read_only{job=~\"$insert\", instance=~\"$instance\"}) by(instance, addr) \u003e 0",
"legend": "{{instance}} =\u003e {{addr}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "按 search.logSlowQueryDuration 标志判定的慢查询速率,该标志默认为 5s。",
"id": "55f7842d-b27c-4cbe-b294-fb4dee1d9c40",
"layout": {
"h": 8,
"i": "55f7842d-b27c-4cbe-b294-fb4dee1d9c40",
"isResizable": true,
"w": 12,
"x": 12,
"y": 87
},
"links": [],
"maxPerRow": 4,
"name": "Slow queries rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_slow_queries_total{job=~\"$select\", instance=~\"$instance\"}[5m]))",
"legend": "slow queries rate",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当 vmstorage 跟不上合并部件时,合并辅助就会发生。这通常是 vmstorage 过载的状况。",
"id": "719dacf7-1352-40d5-a3fb-131cf2838afd",
"layout": {
"h": 8,
"i": "719dacf7-1352-40d5-a3fb-131cf2838afd",
"isResizable": true,
"w": 12,
"x": 0,
"y": 95
},
"links": [],
"maxPerRow": 4,
"name": "Assisted merges ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(vm_assisted_merges_total{job=~\"$storage\", instance=~\"$instance\"}[5m])) by(type) \u003e 0",
"legend": "__auto",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "VictoriaMetrics 使用 -maxLabelsPerTimeseries 命令行标志限制每个指标的标签数量。\n\n这可以防止抓取带有过多标签的指标。必须根据您的工作负载调整 maxLabelsPerTimeseries 的值。\n\n当超出限制时(图表 \u003e 0)- 多余的标签将被丢弃,这可能会导致意外的相同时间序列。",
"id": "14737fe6-c582-48bc-b4a5-cc4e09df728f",
"layout": {
"h": 8,
"i": "14737fe6-c582-48bc-b4a5-cc4e09df728f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 95
},
"links": [],
"maxPerRow": 4,
"name": "Labels limit exceeded ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(vm_metrics_with_dropped_labels_total{job=~\"$insert\", instance=~\"$instance\"}[5m]))",
"legend": "metrics with dropped labels",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "\n按类型显示已用缓存大小与允许大小的百分比。\n接近 100% 的值表示最大的潜在利用率。\n接近 0% 的值表明缓存未得到充分利用。",
"id": "24816924-fd34-4d5d-8806-30d129937bfe",
"layout": {
"h": 9,
"i": "24816924-fd34-4d5d-8806-30d129937bfe",
"isResizable": true,
"w": 12,
"x": 0,
"y": 103
},
"links": [],
"maxPerRow": 4,
"name": "Cache usage % by type ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n vm_cache_size_bytes{job=~\"$storage\", instance=~\"$instance\"} \n /\n vm_cache_size_max_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示缓存未命中率,越低越好。",
"id": "549d18f7-715c-445a-8291-d4e43aa6b099",
"layout": {
"h": 9,
"i": "549d18f7-715c-445a-8291-d4e43aa6b099",
"isResizable": true,
"w": 12,
"x": 12,
"y": 103
},
"links": [],
"maxPerRow": 4,
"name": "Cache miss ratio ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"max": 1,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n rate(vm_cache_misses_total{job=~\"$job\", instance=~\"$instance\"}[5m])\n /\n rate(vm_cache_requests_total{job=~\"$job\", instance=~\"$instance\"}[5m])\n) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "a9e81e32-ad2b-487d-adae-3ee852bbc7b5",
"layout": {
"h": 7,
"i": "a9e81e32-ad2b-487d-adae-3ee852bbc7b5",
"isResizable": true,
"w": 24,
"x": 0,
"y": 112
},
"links": [],
"maxPerRow": 4,
"name": "Non-default flags",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "sum(flag{is_set=\"true\", job=~\"$job\", instance=~\"$instance\"}) by(job, instance, name, value)",
"legend": "{{job}} --\u003e {{name}} --\u003e {{value}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"collapsed": false,
"id": "af7aae38-80a1-45cb-8fbc-f5ae6557cf9f",
"layout": {
"h": 1,
"i": "af7aae38-80a1-45cb-8fbc-f5ae6557cf9f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 119
},
"name": "Interconnection ($job)",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows number of pushed and sent rows. \n* `Pushed rows` - rows added to internal inserter buffers before send\n* `Sent rows` - successfully transmitted rows to storage nodes\n\nPlease note, it could be that `Sent \u003e Pushed` because of the replication factor.",
"id": "78eca416-f3f2-459d-8a9f-50427f1ac9fb",
"layout": {
"h": 9,
"i": "78eca416-f3f2-459d-8a9f-50427f1ac9fb",
"w": 12,
"x": 0,
"y": 21
},
"links": [],
"maxPerRow": 4,
"name": "Rows ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_rpc_rows_pushed_total{job=~\"$job\",instance=~\"$instance\"}[5m]))",
"legend": "Pushed",
"refId": "B"
},
{
"expr": "sum(rate(vm_rpc_rows_sent_total{job=~\"$job\",instance=~\"$instance\"}[5m]))",
"legend": "Sent",
"refId": "E"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Rate of RPC errors by type:\n* `Connection` - the number of connection errors to vmstorage node\n* `Dial` - the number of dial errors to vmstorage node.\n* `Handshake` - the number of handshake errors to vmstorage node\n* `Rerouted` - errors appeared during rerouting of rows from un-healthy storage node to a healthy one.",
"id": "ae7d8087-3715-44a1-80a0-4b23795e2274",
"layout": {
"h": 9,
"i": "ae7d8087-3715-44a1-80a0-4b23795e2274",
"w": 12,
"x": 12,
"y": 21
},
"links": [],
"maxPerRow": 4,
"name": "RPC errors ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_rpc_connection_errors_total{job=~\"$job\",instance=~\"$instance\"}[5m]))",
"legend": "Connection",
"refId": "A"
},
{
"expr": "sum(rate(vm_rpc_dial_errors_total{job=~\"$job\",instance=~\"$instance\"}[5m]))",
"legend": "Dial",
"refId": "B"
},
{
"expr": "sum(rate(vm_rpc_handshake_errors_total{job=~\"$job\",instance=~\"$instance\"}[5m]))",
"legend": "Handshake",
"refId": "E"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The number of rows rerouted to the vmstorage node from other nodes when they were unhealthy.",
"id": "f7831239-6ba7-4509-afe6-f0850d4e2df7",
"layout": {
"h": 8,
"i": "f7831239-6ba7-4509-afe6-f0850d4e2df7",
"w": 12,
"x": 0,
"y": 30
},
"links": [],
"maxPerRow": 4,
"name": "Rows ($instance) rerouted to ",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_rpc_rows_rerouted_to_here_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(addr) \u003e 0",
"legend": "__auto",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The number of rows rerouted from the vmstorage node to healthy nodes when the given node was unhealthy.",
"id": "5b11b5b8-9f45-4a83-80df-b0a5c285a4d3",
"layout": {
"h": 8,
"i": "5b11b5b8-9f45-4a83-80df-b0a5c285a4d3",
"w": 12,
"x": 12,
"y": 30
},
"links": [],
"maxPerRow": 4,
"name": "Rows ($instance) rerouted from",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_rpc_rows_rerouted_from_here_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(addr) \u003e 0",
"legend": "{{addr}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The number of rows or bytes that vminsert internal buffer contains at the moment.",
"id": "e8f68356-4bb3-46b5-b79d-d4de25ef16d2",
"layout": {
"h": 8,
"i": "e8f68356-4bb3-46b5-b79d-d4de25ef16d2",
"w": 12,
"x": 0,
"y": 38
},
"links": [],
"maxPerRow": 4,
"name": "Pending",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_rpc_buf_pending_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "bytes",
"refId": "A"
},
{
"expr": "sum(vm_rpc_rows_pending{job=~\"$job\", instance=~\"$instance\"})",
"legend": "rows",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Network usage by internal VictoriaMetrics RPC protocol",
"id": "0f34fea5-4a9c-411f-8659-853a0e85a350",
"layout": {
"h": 8,
"i": "0f34fea5-4a9c-411f-8659-853a0e85a350",
"w": 12,
"x": 12,
"y": 38
},
"links": [],
"maxPerRow": 4,
"name": "RPC network usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_tcpdialer_written_bytes_total{job=~\"$job\", instance=~\"$instance\"}[5m])) * 8",
"legend": "network usage",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"type": "row",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a9237812-bf90-4c00-9a26-4ce8a6ab41cd",
"layout": {
"h": 1,
"i": "a9237812-bf90-4c00-9a26-4ce8a6ab41cd",
"isResizable": false,
"w": 24,
"x": 0,
"y": 120
},
"name": "vmstorage ($instance)",
"panels": [],
"type": "row",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示每秒输入存储节点的数据点数。这个指标不会显示所有存储的数据点,因为其中一些数据点可能会因为错误的时间戳或解码错误而被丢弃。",
"id": "70a7c7b6-fc79-4154-8540-b85c00bf3f19",
"layout": {
"h": 8,
"i": "70a7c7b6-fc79-4154-8540-b85c00bf3f19",
"isResizable": true,
"w": 12,
"x": 0,
"y": 121
},
"links": [],
"maxPerRow": 4,
"name": "Ingestion rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_vminsert_metrics_read_total{job=~\"$storage\", instance=~\"$instance\"}[5m])) ",
"legend": "ingestion rate",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "根据以下参数,显示至少一个vmstorage节点的磁盘容量达到100%所需的大约时间: \n\n可用磁盘空间; \n\n数据行写入速率; \n\n去重率; \n\n压缩率。 \n\n使用此面板进行容量规划,以便估计耗尽磁盘空间所需的剩余时间。",
"id": "303e6719-7d0e-4dbc-a063-dc2e6adb15b8",
"layout": {
"h": 8,
"i": "303e6719-7d0e-4dbc-a063-dc2e6adb15b8",
"isResizable": true,
"w": 12,
"x": 12,
"y": 121
},
"links": [],
"maxPerRow": 4,
"name": "Storage full ETA ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "min(vm_free_disk_space_bytes{job=~\"$storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$storage\", instance=~\"$instance\"}[1d])\n - \n ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$storage\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n))",
"legend": "min ETA",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "storage组件CPU使用率",
"id": "6435eb33-8527-4f99-830b-298718893375",
"layout": {
"h": 7,
"i": "6435eb33-8527-4f99-830b-298718893375",
"isResizable": true,
"w": 12,
"x": 0,
"y": 129
},
"links": [],
"maxPerRow": 4,
"name": "CPU usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 0.9
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n rate(process_cpu_seconds_total{job=~\"$storage\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$storage\", instance=~\"$instance\"}\n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n rate(process_cpu_seconds_total{job=~\"$storage\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$storage\", instance=~\"$instance\"}\n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n rate(process_cpu_seconds_total{job=~\"$storage\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$storage\", instance=~\"$instance\"}\n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "已用内存的百分比(常驻)。当内存使用率接近 100% 时,应用程序的性能将显著下降。",
"id": "ab3bf331-6dde-4eff-b89b-29432c02f466",
"layout": {
"h": 7,
"i": "ab3bf331-6dde-4eff-b89b-29432c02f466",
"isResizable": true,
"w": 12,
"x": 12,
"y": 129
},
"links": [],
"maxPerRow": 4,
"name": "Memory usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 0.9
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$storage\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$storage\", instance=~\"$instance\"}\n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n max_over_time(process_resident_memory_bytes{job=~\"$storage\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$storage\", instance=~\"$instance\"}\n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n max_over_time(process_resident_memory_bytes{job=~\"$storage\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$storage\", instance=~\"$instance\"}\n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "\u003e 0表示vmstorage处于只读模式。",
"id": "0fb1c607-1dff-4906-bfde-a083be35719f",
"layout": {
"h": 8,
"i": "0fb1c607-1dff-4906-bfde-a083be35719f",
"isResizable": true,
"w": 12,
"x": 0,
"y": 136
},
"links": [],
"maxPerRow": 4,
"name": "Readonly mode",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "vm_storage_is_read_only{job=~\"$storage\", instance=~\"$instance\"} \u003e 0",
"legend": "{{ instance }}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示磁盘上正在进行的插入(不是API/写调用)的数量,其中: \n* ' max ' -等于cpu的数量; \n* ' current ' -当前忙于将行插入底层存储的例程数量。 \n \n每个成功的API /写调用都会在磁盘上刷新。“max”是内部限制,不能更改。它总是等于cpu的数量。 \n \n当“current”不断达到“max”时,这意味着存储过载,需要更多的CPU或更快的磁盘。",
"id": "57f9de7a-7fc7-453a-a3a5-d6e4a6b3cdad",
"layout": {
"h": 8,
"i": "57f9de7a-7fc7-453a-a3a5-d6e4a6b3cdad",
"isResizable": true,
"w": 12,
"x": 12,
"y": 136
},
"links": [],
"maxPerRow": 4,
"name": "Concurrent flushes on disk ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n max_over_time(vm_concurrent_addrows_current{job=~\"$storage\", \n instance=~\"$instance\"}[5m])\n)",
"legend": "current",
"refId": "A"
},
{
"expr": "min(vm_concurrent_addrows_capacity{job=~\"$storage\", instance=~\"$instance\"})",
"legend": "max",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "存储节点中正在进行的合并数。预计会有大量的[存储/小指标]。",
"id": "82b1a90d-3e80-4129-8047-4ccfbd8609af",
"layout": {
"h": 8,
"i": "82b1a90d-3e80-4129-8047-4ccfbd8609af",
"isResizable": true,
"w": 12,
"x": 0,
"y": 144
},
"links": [],
"maxPerRow": 4,
"name": "Active merges ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(max_over_time(vm_active_merges{job=~\"$storage\", instance=~\"$instance\"}[5m])) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "存储节点每秒合并的行数",
"id": "a4ea4f0a-94ad-4d1f-b1df-e9cc7218e5eb",
"layout": {
"h": 8,
"i": "a4ea4f0a-94ad-4d1f-b1df-e9cc7218e5eb",
"isResizable": true,
"w": 12,
"x": 12,
"y": 144
},
"links": [],
"maxPerRow": 4,
"name": "Merge speed",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_rows_merged_total{job=~\"$storage\", instance=~\"$instance\"}[5m])) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示已用磁盘空间的百分比。建议至少有 20% 的可用磁盘空间以获得最佳性能。",
"id": "e8ccf000-fedb-43a8-aefb-95fca44eb8f0",
"layout": {
"h": 8,
"i": "e8ccf000-fedb-43a8-aefb-95fca44eb8f0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 152
},
"links": [],
"maxPerRow": 4,
"name": "Disk space usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) /\n (\n sum(vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance) +\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n ) \n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "所有存储节点上 LSM 树的最大数据部分数。大量部件(硬限制为 512)是合并性能缓慢的证据 - 检查资源利用率。\n\nindexdb — 倒排索引\nstorage/small - 最近添加的部分数据被抓取到存储中(热数据)\nstorage/big - 小部分逐渐合并为大部分(冷数据)",
"id": "e7dcd30f-66d7-4af5-a138-28325c85ecb2",
"layout": {
"h": 8,
"i": "e7dcd30f-66d7-4af5-a138-28325c85ecb2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 152
},
"links": [],
"maxPerRow": 4,
"name": "LSM parts max by type ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(vm_parts{job=~\"$storage\", instance=~\"$instance\"}) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "按类型显示已用磁盘空间的百分比:datapoints 或 indexdb。通常,与数据点相比,indexdb 占用的空间要少得多。但是随着流失率的提高,indexdb 的大小可能会显着增长。\n\n% 的总和可以 \u003e 100%,因为面板显示每个作业和每个实例的最大 %。这意味着不同的实例在数据点和索引数据库大小之间可以有不同的比率。",
"id": "817bdcd4-b83f-429b-8439-68edb181cea0",
"layout": {
"h": 8,
"i": "817bdcd4-b83f-429b-8439-68edb181cea0",
"isResizable": true,
"w": 12,
"x": 0,
"y": 160
},
"links": [],
"maxPerRow": 4,
"name": "Disk space usage % by type ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type=~\"indexdb.*\"}) by(job, instance)\n / \n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n)",
"legend": "indexdb",
"refId": "A"
},
{
"expr": "max(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) by(job, instance)\n / \n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)\n)",
"legend": "datapoints",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*\u003cingestion_rate\u003e`, since VictoriaMetrics pushes pending data to persistent storage every second. The index datapoints value in general is much lower.",
"id": "71aec357-04e8-4180-b808-0ebb61aeaa0e",
"layout": {
"h": 8,
"i": "71aec357-04e8-4180-b808-0ebb61aeaa0e",
"isResizable": true,
"w": 12,
"x": 12,
"y": 160
},
"links": [],
"maxPerRow": 4,
"name": "Pending datapoints ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(max_over_time(vm_pending_rows{job=~\"$storage\", instance=~\"$instance\", type=\"storage\"}[5m]))",
"legend": "pending datapoints",
"refId": "A"
},
{
"expr": "sum(max_over_time(vm_pending_rows{job=~\"$storage\", instance=~\"$instance\", type=\"indexdb\"}[5m]))",
"legend": "pending index entries",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示一小时时间范围内由于损坏或超出保留时间戳而在插入时忽略了多少行,正常情况下为0;",
"id": "e172680e-5d73-4568-b32b-0b4ef0eebfd3",
"layout": {
"h": 8,
"i": "e172680e-5d73-4568-b32b-0b4ef0eebfd3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 168
},
"links": [],
"maxPerRow": 4,
"name": "Rows ignored for last 1h ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(vm_rows_ignored_total{job=~\"$storage\", instance=~\"$instance\"}[1h])) by (reason)",
"legend": "{{reason}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": false,
"id": "47fed7f1-260b-4bf2-8d5c-0b5cb58c542d",
"layout": {
"h": 1,
"i": "47fed7f1-260b-4bf2-8d5c-0b5cb58c542d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 176
},
"name": "vmselect ($instance)",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "vmselect 节点接受的请求率",
"id": "5214fb3d-7d6c-4796-a39d-008f86d91eda",
"layout": {
"h": 8,
"i": "5214fb3d-7d6c-4796-a39d-008f86d91eda",
"isResizable": true,
"w": 12,
"x": 0,
"y": 177
},
"links": [],
"maxPerRow": 4,
"name": "Requests rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_http_requests_total{job=~\"$select\", instance=~\"$instance\", path!~\"/favicon.ico|/metrics\"}[5m])) by (path) \u003e 0",
"legend": "{{path}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示跨实例的最大并发选择数。\n* `max` - 默认情况下等于 CPU 数量 * 2。可以配置 `search.maxConcurrentRequests` 标志\n* `current` - 当前忙于处理请求的 goroutines 数量\n\n当 current 不断达到 max 时,这意味着一个或多个 vmselect 节点过载,需要更多 CPU 或更好的负载平衡。如果 CPU 面板显示有可用资源 - 尝试增加 `search.maxConcurrentRequests`。",
"id": "b1a54478-360f-46d0-ac5d-b5957f3b9a4a",
"layout": {
"h": 8,
"i": "b1a54478-360f-46d0-ac5d-b5957f3b9a4a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 177
},
"links": [],
"maxPerRow": 4,
"name": "Concurrent selects ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(max_over_time(vm_concurrent_select_current{job=~\"$select\", instance=~\"$instance\"}[5m])) ",
"legend": "current",
"refId": "A"
},
{
"expr": "min(vm_concurrent_select_capacity{job=~\"$select\", instance=~\"$instance\"})",
"legend": "max",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "70fee9dd-61a1-4c1c-9fbd-101b8b61ffa5",
"layout": {
"h": 8,
"i": "70fee9dd-61a1-4c1c-9fbd-101b8b61ffa5",
"isResizable": true,
"w": 12,
"x": 0,
"y": 185
},
"links": [],
"maxPerRow": 4,
"name": "CPU usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 0.9
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "max(\n rate(process_cpu_seconds_total{job=~\"$select\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$select\", instance=~\"$instance\"}\n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n rate(process_cpu_seconds_total{job=~\"$select\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$select\", instance=~\"$instance\"}\n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n rate(process_cpu_seconds_total{job=~\"$select\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$select\", instance=~\"$instance\"}\n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "f1f98135-9133-49fb-b9ce-f401206ce005",
"layout": {
"h": 8,
"i": "f1f98135-9133-49fb-b9ce-f401206ce005",
"isResizable": true,
"w": 12,
"x": 12,
"y": 185
},
"links": [],
"maxPerRow": 4,
"name": "Memory usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 0.9
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$select\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$select\", instance=~\"$instance\"}\n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n max_over_time(process_resident_memory_bytes{job=~\"$select\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$select\", instance=~\"$instance\"}\n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n max_over_time(process_resident_memory_bytes{job=~\"$select\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$select\", instance=~\"$instance\"}\n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of the number of time series read per query.",
"id": "abc3cbf4-b299-4ebb-832f-c6c883976ad8",
"layout": {
"h": 8,
"i": "abc3cbf4-b299-4ebb-832f-c6c883976ad8",
"isResizable": true,
"w": 12,
"x": 0,
"y": 193
},
"links": [],
"maxPerRow": 4,
"name": "Series read per query ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(histogram_quantile(0.99, sum(rate(vm_series_read_per_query_bucket{job=~\"$select\", instance=~\"$instance\"}[5m])) by (instance, vmrange)))",
"legend": "series",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of raw datapoints read per queried time series.",
"id": "d6aa190b-7ba4-4853-b408-79ac7b9d7262",
"layout": {
"h": 8,
"i": "d6aa190b-7ba4-4853-b408-79ac7b9d7262",
"isResizable": true,
"w": 12,
"x": 12,
"y": 193
},
"links": [],
"maxPerRow": 4,
"name": "Datapoints read per series ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(histogram_quantile(0.99, sum(rate(vm_rows_read_per_series_bucket{job=~\"$select\", instance=~\"$instance\"}[5m])) by (instance, vmrange)))",
"legend": "datapoints",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of raw datapoints read per query.",
"id": "1b7e2b67-0c31-4801-a717-7ada0194d94e",
"layout": {
"h": 8,
"i": "1b7e2b67-0c31-4801-a717-7ada0194d94e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 201
},
"links": [],
"maxPerRow": 4,
"name": "Datapoints read per query ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(histogram_quantile(0.99, sum(rate(vm_rows_read_per_query_bucket{job=~\"$select\", instance=~\"$instance\"}[5m])) by (instance, vmrange)))",
"legend": "datapoints",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"lineInterpolation": "linear",
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of raw datapoints scanner per query.\n\nThis number can exceed number of DatapointsReadPerQuery if `step` query arg passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) is smaller than the lookbehind window set in square brackets of [rollup function](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, if `increase(some_metric[1h])` is executed with the `step=5m`, then the same raw samples on a hour time range are scanned `1h/5m=12` times. See [this article](https://valyala.medium.com/how-to-optimize-promql-and-metricsql-queries-85a1b75bf986) for details.",
"id": "28ab893f-efd9-4e40-80ea-fb50cec4e925",
"layout": {
"h": 8,
"i": "28ab893f-efd9-4e40-80ea-fb50cec4e925",
"isResizable": true,
"w": 12,
"x": 12,
"y": 201
},
"links": [],
"maxPerRow": 4,
"name": "Datapoints scanned per series ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69"
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(histogram_quantile(0.99, sum(rate(vm_rows_scanned_per_query_bucket{job=~\"$select\", instance=~\"$instance\"}[5m])) by (instance, vmrange)))",
"legend": "datapoints",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内网络使用率",
"id": "e10ec01f-8276-4e92-9040-38257a4732a8",
"layout": {
"h": 8,
"i": "e10ec01f-8276-4e92-9040-38257a4732a8",
"isResizable": true,
"w": 24,
"x": 0,
"y": 209
},
"links": [],
"maxPerRow": 4,
"name": "Network usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "bitsSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_tcplistener_read_bytes_total{job=~\"$select\", instance=~\"$instance\"}[5m])) * 8 \u003e 0",
"legend": "read",
"refId": "A"
},
{
"expr": "sum(rate(vm_tcplistener_written_bytes_total{job=~\"$select\", instance=~\"$instance\"}[5m])) * 8 \u003e 0",
"legend": "write ",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"type": "row",
"version": "2.0.0"
},
{
"collapsed": false,
"id": "b888b6ae-d2e5-4dcb-b649-c8364b29304a",
"layout": {
"h": 1,
"i": "b888b6ae-d2e5-4dcb-b649-c8364b29304a",
"isResizable": false,
"w": 24,
"x": 0,
"y": 177
},
"name": "vminsert ($instance)",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* - 不支持的查询路径\n\n/write - 插入虚拟机\n\n/metrics - 查询 VM 系统指标\n\n/query - 查询即时值\n\n/query_range - 在一段时间内查询\n\n/series - 匹配某个标签集\n\n/label/{}/values - 查询标签值列表(主要是变量)",
"id": "64b2044b-e481-4faa-bd02-3e2b7ee1acf3",
"layout": {
"h": 8,
"i": "64b2044b-e481-4faa-bd02-3e2b7ee1acf3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 178
},
"links": [],
"maxPerRow": 4,
"name": "Requests rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_http_requests_total{job=~\"$insert\", instance=~\"$instance\", path!~\"/favicon.ico|/metrics\"}[5m])) by (path) \u003e 0",
"legend": "{{path}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示正在进行的插入的最大数量。\n\nmax - 默认情况下等于 CPU 数量 * 4,可以配置 maxConcurrentInserts 标志;\n\ncurrent - 当前忙于处理请求的 goroutines 数量。\n\n-maxConcurrentInserts 限制在任何给定时间点可以主动处理的插入请求的数量。所有其他插入\n请求都排队等待 \n\n-insert.maxQueueDuration,希望它们有机会被处理。该队列主要用于吸收传入插入请求率的峰值。\n\n当命中不断达到最大值时,这意味着 vminsert 节点过载并且需要更多的 CPU 或更高的限制。",
"id": "d02921bc-1482-45c4-adf8-f25c7e7dafe2",
"layout": {
"h": 8,
"i": "d02921bc-1482-45c4-adf8-f25c7e7dafe2",
"isResizable": true,
"w": 12,
"x": 12,
"y": 178
},
"links": [],
"maxPerRow": 4,
"name": "Concurrent inserts ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(max_over_time(vm_concurrent_insert_current{job=~\"$insert\", instance=~\"$instance\"}[5m]))",
"legend": "current",
"refId": "A"
},
{
"expr": "min(vm_concurrent_insert_capacity{job=~\"$insert\", instance=~\"$instance\"})",
"legend": "max",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内insert组件CPU使用率",
"id": "97d5079f-c104-4e8f-b2ac-d66eeb288f32",
"layout": {
"h": 8,
"i": "97d5079f-c104-4e8f-b2ac-d66eeb288f32",
"isResizable": true,
"w": 12,
"x": 0,
"y": 186
},
"links": [],
"maxPerRow": 4,
"name": "CPU usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 0.9
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n rate(process_cpu_seconds_total{job=~\"$insert\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$insert\", instance=~\"$instance\"}\n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n rate(process_cpu_seconds_total{job=~\"$insert\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$insert\", instance=~\"$instance\"}\n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n rate(process_cpu_seconds_total{job=~\"$insert\", instance=~\"$instance\"}[5m])\n /\n process_cpu_cores_available{job=~\"$insert\", instance=~\"$instance\"}\n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟以内insert组件内存使用率",
"id": "6b7a3248-7307-4166-b2b8-9283a687e166",
"layout": {
"h": 8,
"i": "6b7a3248-7307-4166-b2b8-9283a687e166",
"isResizable": true,
"w": 12,
"x": 12,
"y": 186
},
"links": [],
"maxPerRow": 4,
"name": "Memory usage % ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 0.9
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$insert\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$insert\", instance=~\"$instance\"}\n)",
"legend": "max",
"refId": "A"
},
{
"expr": "min(\n max_over_time(process_resident_memory_bytes{job=~\"$insert\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$insert\", instance=~\"$instance\"}\n)",
"legend": "min",
"refId": "B"
},
{
"expr": "avg(\n max_over_time(process_resident_memory_bytes{job=~\"$insert\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$insert\", instance=~\"$instance\"}\n)",
"legend": "avg",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "normal"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内网络读取速率",
"id": "ebe4d8cf-96f4-4c8e-88a4-eebc26bbea4e",
"layout": {
"h": 8,
"i": "ebe4d8cf-96f4-4c8e-88a4-eebc26bbea4e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 194
},
"links": [],
"maxPerRow": 4,
"name": "Network usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_tcplistener_read_bytes_total{job=~\"$insert\", instance=~\"$instance\"}[5m])) * 8 \u003e 0",
"legend": "read",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "5分钟内插入最大行数",
"id": "7866fa29-5c62-4fe8-b20e-90070a102428",
"layout": {
"h": 8,
"i": "7866fa29-5c62-4fe8-b20e-90070a102428",
"isResizable": true,
"w": 12,
"x": 12,
"y": 194
},
"links": [],
"maxPerRow": 4,
"name": "Rows per insert ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(histogram_quantile(0.99, sum(increase(vm_rows_per_insert_bucket{job=~\"$job\", instance=~\"$instance\"}[5m])) by (instance, vmrange)))",
"legend": "max",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示 vminsert 和 vmstorage 组件之间连接的饱和度。如果达到 0.9 秒的阈值,则连接饱和度超过 90%,vminsert 将无法跟上。这通常意味着必须向集群添加更多 vminsert 或 vmstorage 节点,以增加 vminsert -\u003e vmstorage 链接的总数。",
"id": "d0a46b48-be53-4034-a2e3-488bd8e18203",
"layout": {
"h": 8,
"i": "d0a46b48-be53-4034-a2e3-488bd8e18203",
"isResizable": true,
"w": 12,
"x": 0,
"y": 202
},
"links": [],
"maxPerRow": 4,
"name": "Storage connection saturation ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "transparent",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 0.9
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(rate(vm_rpc_send_duration_seconds_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(addr)",
"legend": "{{addr}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "当 vminsert 无法访问 vmstorage 节点时显示,默认无数据。",
"id": "a86e2c44-fc5c-4607-90b4-09e02b742261",
"layout": {
"h": 8,
"i": "a86e2c44-fc5c-4607-90b4-09e02b742261",
"isResizable": true,
"w": 12,
"x": 12,
"y": 202
},
"links": [],
"maxPerRow": 4,
"name": "Storage reachability ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"decimals": 0,
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "vm_rpc_vmstorage_is_reachable{job=~\"$job\", instance=~\"$instance\"} != 1",
"legend": "{{instance}} =\u003e {{addr}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"type": "row",
"version": "2.0.0"
},
{
"collapsed": false,
"id": "8d53366b-dc71-4c15-9edd-b701ba54b289",
"layout": {
"h": 1,
"i": "8d53366b-dc71-4c15-9edd-b701ba54b289",
"isResizable": false,
"w": 24,
"x": 0,
"y": 178
},
"name": "Drilldown",
"panels": [
{
"custom": {
"content": "Drilldown row is used by other panels on the dashboard to show more detailed metrics per-instance.",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d9563990-aade-403a-afc4-d81913b5aafe",
"layout": {
"h": 2,
"i": "d9563990-aade-403a-afc4-d81913b5aafe",
"isResizable": true,
"w": 24,
"x": 0,
"y": 179
},
"links": [],
"maxPerRow": 4,
"options": {},
"targets": [],
"type": "text",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示已用内存(常驻)。当内存使用率接近 100% 时,应用程序的性能将逐渐下降。",
"id": "11dbb220-8224-4e61-a3a7-0718be6fc8ae",
"layout": {
"h": 8,
"i": "11dbb220-8224-4e61-a3a7-0718be6fc8ae",
"isResizable": true,
"w": 12,
"x": 0,
"y": 181
},
"links": [],
"maxPerRow": 4,
"name": "RSS memory usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[5m])",
"legend": "{{instance}} ({{job}})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "RSS share for memory allocated by the process itself. This share cannot be freed by the OS, so it must be taken into account by OOM killer.\n\n进程自身分配的内存在 RSS 中所占的部分。操作系统无法释放这部分内存,因此 OOM killer 必须将其考虑在内。",
"id": "759dddcc-0d30-408d-9778-e4e07cd65618",
"layout": {
"h": 8,
"i": "759dddcc-0d30-408d-9778-e4e07cd65618",
"isResizable": true,
"w": 12,
"x": 12,
"y": 181
},
"links": [],
"maxPerRow": 4,
"name": "RSS anonymous memory usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max_over_time(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"}[5m])",
"legend": "{{instance}} ({{job}})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "所有组件CPU使用率",
"id": "c61a4c5d-fea6-4dcc-89ee-1a100fa22bd2",
"layout": {
"h": 7,
"i": "c61a4c5d-fea6-4dcc-89ee-1a100fa22bd2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 189
},
"links": [],
"maxPerRow": 4,
"name": "CPU usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by(job, instance)",
"legend": "{{instance}} ({{job}})",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "根据以下参数显示达到 100% 磁盘容量所需的大约时间:\n\n* free disk space; 可用磁盘空间\n* row ingestion rate; 行抓取率\n* dedup rate; 去重率\n* compression. 压缩\n\n使用此面板进行容量规划,以估计磁盘空间用完的剩余时间。",
"id": "5467f4d9-6922-4501-95a1-4055e0d8dd0d",
"layout": {
"h": 7,
"i": "5467f4d9-6922-4501-95a1-4055e0d8dd0d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 189
},
"links": [],
"maxPerRow": 4,
"name": "Storage full ETA ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "vm_free_disk_space_bytes{job=~\"$storage\", instance=~\"$instance\"} \n/ \nignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$storage\", instance=~\"$instance\"}[1d])\n - \n ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$storage\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n / \n sum(vm_rows{job=~\"$storage\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n)",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "显示已用磁盘空间大小。建议至少保留 20% 的可用磁盘空间。",
"id": "8b7c719e-b9d7-4ebe-b120-345b5df41df1",
"layout": {
"h": 8,
"i": "8b7c719e-b9d7-4ebe-b120-345b5df41df1",
"isResizable": true,
"w": 12,
"x": 0,
"y": 196
},
"links": [],
"maxPerRow": 4,
"name": "Disk space usage ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) by(job, instance)",
"legend": "",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "按数据类型(indexdb / datapoints)划分的磁盘空间使用量",
"id": "5d423d45-09f3-4eb0-89f4-85f1944975b9",
"layout": {
"h": 8,
"i": "5d423d45-09f3-4eb0-89f4-85f1944975b9",
"isResizable": true,
"w": 12,
"x": 12,
"y": 196
},
"links": [],
"maxPerRow": 4,
"name": "Disk space usage by type ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type=~\"indexdb.*\"}) by(job, instance)",
"legend": "{{job}}:{{instance}} (indexdb)",
"refId": "A"
},
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) by(job, instance)",
"legend": "{{job}}:{{instance}} (datapoints)",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"type": "row",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"label": "",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{job=~\"^vm-(insert|select|storage).*\"}, job)",
"multi": true,
"name": "job",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{job=~\"$job\", version=~\"^vminsert.*\"}, job)",
"multi": false,
"name": "insert",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{job=~\"$job\", version=~\"^vmselect.*\"}, job)",
"multi": false,
"name": "select",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{job=~\"$job\", version=~\"^vmstorage.*\"}, job)",
"multi": false,
"name": "storage",
"reg": "",
"type": "query"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus"
},
"definition": "label_values(vm_app_version{job=~\"$job\"}, instance)",
"multi": true,
"name": "instance",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328447051000
}
================================================
FILE: integrations/VictoriaMetrics/dashboards/victoriametrics-single.json
================================================
{
"id": 0,
"group_id": 0,
"name": "VictoriaMetrics - Single",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "Single server Wiki",
"url": "https://docs.victoriametrics.com/"
},
{
"targetBlank": true,
"title": "Found a bug?",
"url": "https://github.com/VictoriaMetrics/VictoriaMetrics/issues"
},
{
"targetBlank": true,
"title": "New releases",
"url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases"
}
],
"panels": [
{
"collapsed": true,
"id": "665759e5-283d-40a7-8fe8-f58c6d235c39",
"layout": {
"h": 1,
"i": "665759e5-283d-40a7-8fe8-f58c6d235c39",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Stats",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "57c1976f-974e-4539-a6b9-749bd4b60953",
"layout": {
"h": 2,
"i": "57c1976f-974e-4539-a6b9-749bd4b60953",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Uptime",
"options": {
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#F2495C",
"type": "base",
"value": null
},
{
"color": "#73BF69",
"value": 1800
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "vm_app_uptime_seconds{job=~\"$job\", instance=~\"$instance\"}",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"graphMode": "none",
"textMode": "value",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "How many datapoints are in storage",
"id": "5b0ea4ef-be95-44c9-b192-0f0edc92bdf2",
"layout": {
"h": 2,
"i": "5b0ea4ef-be95-44c9-b192-0f0edc92bdf2",
"isResizable": true,
"w": 5,
"x": 4,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Total datapoints",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the datapoints ingestion rate.",
"id": "0669b69e-d4db-4874-b14c-05afd66489db",
"layout": {
"h": 2,
"i": "0669b69e-d4db-4874-b14c-05afd66489db",
"isResizable": true,
"w": 5,
"x": 9,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Ingestion rate",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_rows_inserted_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the rate of HTTP read requests.",
"id": "4f1923f0-a415-435e-98c9-c9b58225512c",
"layout": {
"h": 2,
"i": "4f1923f0-a415-435e-98c9-c9b58225512c",
"isResizable": true,
"w": 5,
"x": 14,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Read requests",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_http_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\".*(/write|/metrics)\"}[5m]))",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total number of available CPUs for VM process",
"id": "49dcab13-6410-4adf-b953-5c1678ce6583",
"layout": {
"h": 2,
"i": "49dcab13-6410-4adf-b953-5c1678ce6583",
"isResizable": true,
"w": 5,
"x": 19,
"y": 1
},
"links": [],
"maxPerRow": 4,
"name": "Available CPU",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"alignItems": "center",
"bgColor": "#FFFFFF",
"content": "\u003cdiv style=\"text-align: center;\"\u003e${version}\u003c/div\u003e",
"justifyContent": "center",
"textColor": "#000000",
"textSize": 12
},
"id": "9add6be9-59a1-496e-8357-5056df376e02",
"layout": {
"h": 2,
"i": "661c50c4-244a-4218-9412-bcfb253790d6",
"isResizable": true,
"w": 4,
"x": 0,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "version",
"type": "text",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee more details here https://docs.victoriametrics.com/FAQ.html#what-is-an-active-time-series",
"id": "558e366e-81f8-457d-8954-891361552120",
"layout": {
"h": 2,
"i": "558e366e-81f8-457d-8954-891361552120",
"isResizable": true,
"w": 5,
"x": 4,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Active series",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "vm_cache_entries{job=~\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total amount of used disk space",
"id": "cf10b65f-dce6-407a-b7c5-c609f3b3d011",
"layout": {
"h": 2,
"i": "cf10b65f-dce6-407a-b7c5-c609f3b3d011",
"isResizable": true,
"w": 5,
"x": 9,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Disk space usage",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Average disk usage per datapoint.",
"id": "0a5c466a-88ec-4b60-ac10-523151a85105",
"layout": {
"h": 2,
"i": "0a5c466a-88ec-4b60-ac10-523151a85105",
"isResizable": true,
"w": 5,
"x": 14,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Bytes per point",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) / sum(vm_rows{job=~\"$job\", instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"textMode": "value",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Total size of available memory for VM process",
"id": "7c43575f-fb3c-4c99-9a2b-1efe31ed8bda",
"layout": {
"h": 2,
"i": "7c43575f-fb3c-4c99-9a2b-1efe31ed8bda",
"isResizable": true,
"w": 5,
"x": 19,
"y": 3
},
"links": [],
"maxPerRow": 4,
"name": "Available memory",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
}
],
"style": "line"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "333a3f2f-ea3e-49b6-9ea6-706cf187383b",
"layout": {
"h": 1,
"i": "333a3f2f-ea3e-49b6-9ea6-706cf187383b",
"isResizable": false,
"w": 24,
"x": 0,
"y": 5
},
"name": "Overview",
"panels": [],
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "How many datapoints are inserted into storage per second",
"id": "6bd25ffa-76c9-4d3d-beb6-78e8a6490380",
"layout": {
"h": 8,
"i": "6bd25ffa-76c9-4d3d-beb6-78e8a6490380",
"isResizable": true,
"w": 12,
"x": 0,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Datapoints ingestion rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_rows_inserted_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by (type) \u003e 0",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"id": "88e743aa-3091-4f82-be96-1d6f264c3133",
"layout": {
"h": 8,
"i": "88e743aa-3091-4f82-be96-1d6f264c3133",
"isResizable": true,
"w": 12,
"x": 12,
"y": 6
},
"links": [],
"maxPerRow": 4,
"name": "Requests rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_http_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[5m])) by (path) \u003e 0",
"legend": "{{path}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"lineInterpolation": "linear",
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee more details here https://docs.victoriametrics.com/FAQ.html#what-is-an-active-time-series",
"id": "9aa5b5be-0ecd-4f8b-9fa6-43def64f4537",
"layout": {
"h": 8,
"i": "9aa5b5be-0ecd-4f8b-9fa6-43def64f4537",
"isResizable": true,
"w": 12,
"x": 0,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "Active time series ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "vm_cache_entries{job=~\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}",
"legend": "Active time series",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"id": "4f280522-25f4-4ed5-b6b8-094322b61e02",
"layout": {
"h": 8,
"i": "4f280522-25f4-4ed5-b6b8-094322b61e02",
"isResizable": true,
"w": 12,
"x": 12,
"y": 14
},
"links": [],
"maxPerRow": 4,
"name": "Query duration 0.99 quantile ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "max(vm_request_duration_seconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.99\"}) by (path) \u003e 0",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.1,
"gradientMode": "none",
"lineInterpolation": "linear",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"id": "1640bdab-1de2-4a8d-90e0-90d57b97dec2",
"layout": {
"h": 8,
"i": "1640bdab-1de2-4a8d-90e0-90d57b97dec2",
"isResizable": true,
"w": 12,
"x": 0,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Requests error rate ($instance)",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"min": 0,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
]
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(vm_http_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[5m])) by (path) \u003e 0",
"legend": "",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "3.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.",
"id": "394f20a6-6cf4-4f21-a150-22b9e23f5e8f",
"layout": {
"h": 8,
"i": "394f20a6-6cf4-4f21-a150-22b9e23f5e8f",
"isResizable": true,
"w": 12,
"x": 12,
"y": 22
},
"links": [],
"maxPerRow": 4,
"name": "Logging rate",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"type": "base",
"value": null
},
{
"color": "#F2495C",
"value": 80
}
],
"style": "off"
},
"tooltip": {
"mode": "multi"
},
"valueMappings": []
},
"targets": [
{
"expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[5m])) by (level, location) \u003e 0",
"legend": "{{level}}: {{location}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": false,
"id": "3b19bc17-c76f-4b18-abfe-e9561be56589",
"layout": {
"h": 1,
"i": "3b19bc17-c76f-4b18-abfe-e9561be56589",
"isResizable": false,
"w": 24,
"x": 0,
"y": 30
},
"name": "Resource usage",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Percentage of used memory (resident).\nThe application's performance will significantly degrade when memory usage is close to 100%.",
"id": "59a136e8-d464-4d02-bc7c-1178087bcf95",
"layout": {
"h": 4,
"i": "59a136e8-d464-4d02-bc7c-1178087bcf95",
"isResizable": true,
"w": 12,
"x": 0,
"y": 31
},
"maxPerRow": 4,
"name": "RSS memory % usage ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[5m])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(instance)",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "eaf92126-fcf4-4d28-9281-d8037deca2a0",
"layout": {
"h": 4,
"i": "15f065a6-0b6a-4df2-b98e-b1757771570d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 31
},
"maxPerRow": 4,
"name": "CPU % usage ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"min": 0,
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(\n rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"}\n) by(job)",
"instant": false,
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "268997ad-a0ce-4ebf-945d-51c9e3f0f460",
"layout": {
"h": 4,
"i": "bf29719b-f5bb-4b89-8e60-53bd739f4424",
"isResizable": true,
"w": 12,
"x": 0,
"y": 35
},
"maxPerRow": 4,
"name": "Memory usage ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesSI"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"}) + sum(vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "requested from system",
"refId": "A"
},
{
"expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"}) + sum(vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "heap inuse",
"refId": "B"
},
{
"expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "stack inuse",
"refId": "C"
},
{
"expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "resident",
"refId": "D"
},
{
"expr": "sum(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"})",
"legend": "resident anonymous",
"refId": "E"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "45707854-779f-4285-b0ad-957074e42351",
"layout": {
"h": 4,
"i": "778684b6-8d6b-4005-8af7-a5280b102ddc",
"isResizable": true,
"w": 12,
"x": 12,
"y": 35
},
"maxPerRow": 4,
"name": "CPU ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])",
"legend": "CPU cores used",
"refId": "A",
"step": 15
},
{
"expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}",
"legend": "Limit",
"refId": "B",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the percentage of open file descriptors compared to the limit set in the OS.",
"id": "a5ba85e4-5b91-4b79-bcd0-ced25f48c19e",
"layout": {
"h": 4,
"i": "10a8ee1b-18ce-4a8a-9c6a-8f4b265cdd16",
"isResizable": true,
"w": 12,
"x": 0,
"y": 39
},
"maxPerRow": 4,
"name": "Open FDs ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max_over_time(process_open_fds{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n/\nprocess_max_fds{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{job}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the number of bytes read/write from the storage layer.",
"id": "bbe4eed4-26d4-4dfd-8ab1-35310f7cf734",
"layout": {
"h": 4,
"i": "1c130355-908f-4878-8f3e-30320a458cce",
"isResizable": true,
"w": 12,
"x": 12,
"y": 39
},
"maxPerRow": 4,
"name": "Disk writes/reads ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(process_io_storage_read_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))",
"legend": "read",
"refId": "A",
"step": 15
},
{
"expr": "sum(rate(process_io_storage_written_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))",
"legend": "write",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "704d1639-8545-42a9-8288-07e3d19bbac9",
"layout": {
"h": 4,
"i": "32b5451e-a1e9-4473-9f65-81f383390013",
"isResizable": true,
"w": 12,
"x": 0,
"y": 43
},
"maxPerRow": 4,
"name": "Goroutines ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"})",
"legend": "gc duration",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "3e93d7d9-255d-425b-bdb6-9ae37b426954",
"layout": {
"h": 4,
"i": "b528dcd3-9c32-47f4-9355-bd97a7835ab3",
"isResizable": true,
"w": 12,
"x": 12,
"y": 43
},
"maxPerRow": 4,
"name": "TCP connections ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(vm_tcplistener_conns{job=~\"$job\", instance=~\"$instance\"})",
"legend": "connections",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "aeaabca1-a652-4df9-9160-f2f3b463e38b",
"layout": {
"h": 4,
"i": "9a071517-3444-405c-9c13-ad24d2b7e530",
"isResizable": true,
"w": 12,
"x": 0,
"y": 47
},
"maxPerRow": 4,
"name": "Threads ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(process_num_threads{job=~\"$job\", instance=~\"$instance\"})",
"legend": "threads",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "",
"id": "c8e7c73e-46b8-4b73-b4f7-80251a34e484",
"layout": {
"h": 4,
"i": "e251884d-4a78-4de7-abcb-8ee830daa9db",
"isResizable": true,
"w": 12,
"x": 12,
"y": 47
},
"maxPerRow": 4,
"name": "TCP connections rate ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(vm_tcplistener_accepts_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))",
"legend": "connections",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "5935fe5e-2b40-4b7c-81f5-ad2d3cc5b40c",
"layout": {
"h": 1,
"i": "5935fe5e-2b40-4b7c-81f5-ad2d3cc5b40c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 31
},
"name": "Storage",
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "How many datapoints are inserted into storage per second",
"id": "cf5673b0-819b-4632-a669-a74882a0466c",
"layout": {
"h": 4,
"i": "cf5673b0-819b-4632-a669-a74882a0466c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 4
},
"links": [],
"maxPerRow": 4,
"name": "Datapoints ingestion rate ($instance)",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(vm_rows_inserted_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (type) \u003e 0",
"legend": "{{type}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.",
"id": "6801e320-7b34-4dd5-bdf7-a2d3e2195275",
"layout": {
"h": 4,
"i": "6801e320-7b34-4dd5-bdf7-a2d3e2195275",
"isResizable": true,
"w": 12,
"x": 12,
"y": 4
},
"maxPerRow": 4,
"name": "Storage full ETA ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"} \n/ ignoring(path) (\n (\n rate(vm_rows_added_to_storage_total{job=~\"$job\", instance=~\"$instance\"}[1d]) \n - ignoring(type) rate(vm_deduplicated_samples_total{job=~\"$job\", instance=~\"$instance\", type=\"merge\"}[1d])\n ) * scalar(\n sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"}) \n / sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})\n )\n )",
"legend": "{{instance}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`",
"id": "977baf19-2816-455a-b3be-c14a18d458dc",
"layout": {
"h": 4,
"i": "977baf19-2816-455a-b3be-c14a18d458dc",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"maxPerRow": 4,
"name": "Disk space usage - datapoints ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})",
"legend": "Used (datapoints)",
"refId": "A"
},
{
"expr": "vm_free_disk_space_bytes{job=~\"$job\", instance=~\"$instance\"}",
"legend": "Free",
"refId": "B"
},
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\", type=~\"indexdb.*\"})",
"legend": "Used (index)",
"refId": "C"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*\u003cingestion_rate\u003e`, since VictoriaMetrics pushes pending data to persistent storage every second.",
"id": "d1830c5e-8876-46cc-a794-461e5fe2bb32",
"layout": {
"h": 4,
"i": "d1830c5e-8876-46cc-a794-461e5fe2bb32",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"maxPerRow": 4,
"name": "Pending datapoints ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_pending_rows{job=~\"$job\", instance=~\"$instance\", type=\"storage\"}",
"legend": "pending datapoints",
"refId": "A"
},
{
"expr": "vm_pending_rows{job=~\"$job\", instance=~\"$instance\", type=\"indexdb\"}",
"legend": "pending index entries",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.",
"id": "2524834e-0c2c-4a1e-93dd-cd07d687982c",
"layout": {
"h": 4,
"i": "2524834e-0c2c-4a1e-93dd-cd07d687982c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 12
},
"maxPerRow": 4,
"name": "Datapoints ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(vm_rows{job=~\"$job\", instance=~\"$instance\", type!~\"indexdb.*\"})",
"legend": "total datapoints",
"refId": "A",
"step": 15
},
{
"expr": "sum(vm_data_size_bytes{job=~\"$job\", instance=~\"$instance\"}) \n/ sum(vm_rows{job=~\"$job\", instance=~\"$instance\"})",
"legend": "bytes-per-datapoint",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)",
"id": "0bba80df-7685-4b0b-865a-fe5e21d2c411",
"layout": {
"h": 4,
"i": "0bba80df-7685-4b0b-865a-fe5e21d2c411",
"isResizable": true,
"w": 12,
"x": 12,
"y": 12
},
"maxPerRow": 4,
"name": "LSM parts ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(vm_parts{job=~\"$job\", instance=~\"$instance\"}) by (type)",
"legend": "{{type}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.",
"id": "ab7d0f4a-c8c3-43b1-83b4-7f97dae99c5c",
"layout": {
"h": 4,
"i": "ab7d0f4a-c8c3-43b1-83b4-7f97dae99c5c",
"isResizable": true,
"w": 12,
"x": 0,
"y": 16
},
"maxPerRow": 4,
"name": "Rows ignored for last 1h ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(increase(vm_rows_ignored_total{job=~\"$job\", instance=~\"$instance\"}[1h])) by (reason)",
"legend": "{{reason}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.",
"id": "8567004e-55eb-4979-9cbb-8e9f0840aa95",
"layout": {
"h": 4,
"i": "8567004e-55eb-4979-9cbb-8e9f0840aa95",
"isResizable": true,
"w": 12,
"x": 12,
"y": 16
},
"maxPerRow": 4,
"name": "Active merges ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(vm_active_merges{job=~\"$job\", instance=~\"$instance\"}) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows how many ongoing insertions (not API /write calls) on disk are taking place, where:\n* `max` - equal to number of CPUs;\n* `current` - current number of goroutines busy with inserting rows into underlying storage.\n\nEvery successful API /write call results into flush on disk. However, these two actions are separated and controlled via different concurrency limiters. The `max` on this panel can't be changed and always equal to number of CPUs. \n\nWhen `current` hits `max` constantly, it means storage is overloaded and requires more CPU.\n\n",
"id": "ff13cd62-6f6e-483a-ab33-68f3ebe261b3",
"layout": {
"h": 4,
"i": "ff13cd62-6f6e-483a-ab33-68f3ebe261b3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 20
},
"maxPerRow": 4,
"name": "Concurrent flushes on disk ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max_over_time(vm_concurrent_addrows_capacity{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])",
"legend": "max",
"refId": "A"
},
{
"expr": "sum(vm_concurrent_addrows_current{job=~\"$job\", instance=~\"$instance\"})",
"legend": "current",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The number of rows merged per second by storage nodes.",
"id": "ce84f2ec-6e15-4a5e-bd13-c1c35c6b850c",
"layout": {
"h": 4,
"i": "ce84f2ec-6e15-4a5e-bd13-c1c35c6b850c",
"isResizable": true,
"w": 12,
"x": 12,
"y": 20
},
"maxPerRow": 4,
"name": "Merge speed ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(vm_rows_merged_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(type)",
"legend": "{{type}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of series read per query.",
"id": "8ba3dad0-1882-4e95-acd5-e52bf3bd2bbb",
"layout": {
"h": 4,
"i": "8ba3dad0-1882-4e95-acd5-e52bf3bd2bbb",
"isResizable": true,
"w": 12,
"x": 0,
"y": 24
},
"maxPerRow": 4,
"name": "Series read per query ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(vm_series_read_per_query_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (vmrange))",
"legend": "{{instance}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of raw samples read per queried series.",
"id": "0ae6c17d-eb65-43c6-9cfc-79939f343b4a",
"layout": {
"h": 4,
"i": "0ae6c17d-eb65-43c6-9cfc-79939f343b4a",
"isResizable": true,
"w": 12,
"x": 12,
"y": 24
},
"maxPerRow": 4,
"name": "Datapoints read per series ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(vm_rows_read_per_series_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (vmrange))",
"legend": "{{label_name}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of raw datapoints read per query.",
"id": "61624d75-55f9-4e32-afa0-867eadcd1207",
"layout": {
"h": 4,
"i": "61624d75-55f9-4e32-afa0-867eadcd1207",
"isResizable": true,
"w": 12,
"x": 0,
"y": 28
},
"maxPerRow": 4,
"name": "Datapoints read per query ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(histogram_quantile(0.99, sum(rate(vm_rows_read_per_query_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (instance, vmrange)))",
"legend": "datapoints",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.5,
"gradientMode": "none",
"lineInterpolation": "smooth",
"lineWidth": 1,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "99th percentile of number of raw samples scanner per query.\n\nThis number can exceed number of RowsReadPerQuery if `step` query arg passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) is smaller than the lookbehind window set in square brackets of [rollup function](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, if `increase(some_metric[1h])` is executed with the `step=5m`, then the same raw samples on a hour time range are scanned `1h/5m=12` times. See [this article](https://valyala.medium.com/how-to-optimize-promql-and-metricsql-queries-85a1b75bf986) for details.",
"id": "a38d0ec9-1be3-4800-aa77-62d421635590",
"layout": {
"h": 4,
"i": "a38d0ec9-1be3-4800-aa77-62d421635590",
"isResizable": true,
"w": 12,
"x": 12,
"y": 28
},
"maxPerRow": 4,
"name": "Datapoints scanned per series ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(vm_rows_scanned_per_query_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (vmrange))",
"legend": "{{instance}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"type": "row"
},
{
"collapsed": false,
"id": "54be3612-b43a-4a5a-a5c6-58349c6fd671",
"layout": {
"h": 1,
"i": "54be3612-b43a-4a5a-a5c6-58349c6fd671",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "Troubleshooting",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the rate and total number of new series created over last 24h.\n\nHigh churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nThe higher churn rate is, the more resources required to handle it. Consider to keep the churn rate as low as possible.\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
"id": "1074205e-87cd-4988-b1e8-c76ba2922150",
"layout": {
"h": 4,
"i": "1074205e-87cd-4988-b1e8-c76ba2922150",
"isResizable": true,
"w": 12,
"x": 0,
"y": 33
},
"maxPerRow": 4,
"name": "Churn rate ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(vm_new_timeseries_created_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))",
"legend": "churn rate",
"refId": "A",
"step": 15
},
{
"expr": "sum(increase(vm_new_timeseries_created_total{job=~\"$job\", instance=~\"$instance\"}[24h]))",
"legend": "new series over 24h",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. If percentage remains high (\u003e10%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. \n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.",
"id": "774e409e-d199-4415-bb92-db5252d409da",
"layout": {
"h": 4,
"i": "ddfbc9f0-db96-4526-bfc2-4c30e8b59e5d",
"isResizable": true,
"w": 12,
"x": 12,
"y": 33
},
"maxPerRow": 4,
"name": "Slow inserts ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "max(\n rate(vm_slow_row_inserts_total{job=~\"$job\"}[$__rate_interval]) \n / rate(vm_rows_added_to_storage_total{job=~\"$job\"}[$__rate_interval])\n)",
"legend": "slow inserts percentage",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Merge assist happens when storage can't keep up with merging parts. This is usually a sign of overload for storage.",
"id": "c473e925-b948-47a6-9958-01130c07b155",
"layout": {
"h": 4,
"i": "7b9c3650-28e0-4c63-bec8-834231d460c1",
"isResizable": true,
"w": 12,
"x": 0,
"y": 37
},
"maxPerRow": 4,
"name": "Assisted merges ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(increase(vm_assisted_merges_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(type) \u003e 0",
"legend": "",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.",
"id": "acec0952-245a-4de6-b82d-0a2c7cd3165a",
"layout": {
"h": 4,
"i": "c10d17ce-1f57-401d-a72b-936c428d0bc1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 37
},
"maxPerRow": 4,
"name": "Slow queries rate ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(vm_slow_queries_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))",
"legend": "slow queries rate",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows the percentage of used cache size from the allowed size by type. \nValues close to 100% show the maximum potential utilization.\nValues close to 0% show that cache is underutilized.",
"id": "ae42a1cf-6390-427a-a189-5279b542cc42",
"layout": {
"h": 4,
"i": "648ab5b0-69ce-43c6-9e3d-3b9640d54dc3",
"isResizable": true,
"w": 12,
"x": 0,
"y": 41
},
"maxPerRow": 4,
"name": "Cache usage % by type ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"} / vm_cache_size_max_bytes{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{type}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "Shows cache miss ratio. Lower is better.",
"id": "5406caf4-941e-4a3b-8e8c-5072a05b9029",
"layout": {
"h": 4,
"i": "14ee2238-0d4e-43f4-87a0-fa31f3ab9ae0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 41
},
"maxPerRow": 4,
"name": "Cache miss ratio ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(\n rate(vm_cache_misses_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n rate(vm_cache_requests_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n) \u003e 0",
"legend": "{{type}}",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": true
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ef0b1897-c141-40f7-b68d-0f3492714b04",
"layout": {
"h": 4,
"i": "ef0b1897-c141-40f7-b68d-0f3492714b04",
"isResizable": true,
"w": 12,
"x": 0,
"y": 45
},
"maxPerRow": 4,
"name": "Non-default flags",
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"targets": [
{
"expr": "sum(flag{is_set=\"true\", job=~\"$job\", instance=~\"$instance\"}) by(job, instance, name, value)",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "table",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"description": "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n\nThis prevents from ingesting metrics with too many labels. The value of `maxLabelsPerTimeseries` must be adjusted for your workload.\n\nWhen limit is exceeded (graph is \u003e 0) - extra labels are dropped, which could result in unexpected identical time series.",
"id": "f0f122b3-180d-4673-b105-1a66a645cbaa",
"layout": {
"h": 4,
"i": "222ddf3b-d602-4646-a6e0-1ff430026823",
"isResizable": true,
"w": 12,
"x": 12,
"y": 45
},
"maxPerRow": 4,
"name": "Labels limit exceeded ($instance)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(increase(vm_metrics_with_dropped_labels_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))",
"legend": "limit exceeded",
"refId": "A",
"step": 15
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
"label": "job",
"multi": false,
"name": "job",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{job=~\"$job\"}, instance)",
"multi": false,
"name": "instance",
"reg": "",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values(vm_app_version{job=~\"$job\", instance=~\"$instance\"}, short_version)",
"hide": false,
"label": "version",
"multi": false,
"name": "version",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328456195000
}
================================================
FILE: integrations/VictoriaMetrics/markdown/README.md
================================================
# VictoriaMetrics
VictoriaMetrics 既可以单机部署,也可以集群方式部署。不管哪种部署方式,VictoriaMetrics 的进程都会暴露 `/metrics` 接口,通过这个接口暴露 Prometheus 协议的监控数据。
## 采集配置
采集配置位于 categraf 的 `conf/input.prometheus/prometheus.toml`。因为 VictoriaMetrics 暴露的是 Prometheus 协议的监控数据,所以使用 categraf 的 prometheus 插件即可采集。
```toml
# vmstorage
[[instances]]
urls = [
"http://127.0.0.1:8482/metrics"
]
labels = {service="vmstorage"}
# vmselect
[[instances]]
urls = [
"http://127.0.0.1:8481/metrics"
]
labels = {service="vmselect"}
# vminsert
[[instances]]
urls = [
"http://127.0.0.1:8480/metrics"
]
labels = {service="vminsert"}
```
================================================
FILE: integrations/Whois/collect/whois/whois.toml
================================================
# # collect interval
#interval = 3600
#[[instances]]
## Used to collect domain name information.
#domain = "baidu.com"
## append some labels for series
#labels = { region="n9e", product="test1" }
## interval = global.interval * interval_times
#interval_times = 1
#[[instances]]
## Used to collect domain name information.
#domain = "google.com"
## append some labels for series
#labels = { region="n9e", product="test2" }
## interval = global.interval * interval_times
#interval_times = 1
================================================
FILE: integrations/Whois/markdown/README.md
================================================
# whois
域名探测插件,用于探测域名的注册时间和到期时间,值为UTC0时间戳
## Configuration
最核心的配置就是 domain 配置,用于指定要探测的目标域名。
示例配置默认保持注释状态,注释状态下插件不启用;比如想要监控一个域名,取消注释并填写 domain 即可:
```toml
# [[instances]]
## Used to collect domain name information.
# domain = "baidu.com"
```
请注意这里配置的是域名不是URL
## 指标解释
whois_domain_createddate 域名创建时间戳
whois_domain_updateddate 域名更新时间戳
whois_domain_expirationdate 域名到期时间戳
## 注意事项
请不要将 interval 设置得过短,否则频繁请求容易导致 timeout;域名的注册和到期信息变化很慢,没有必要高频采集,请尽量拉长采集周期。
================================================
FILE: integrations/Windows/alerts/windows_by_categraf.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - expected to be written full in 4 hours - categraf [Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "predict_linear(disk_free{platform=~\"windows\"}[1h], 4*3600) \u003c 0",
"severity": 1
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328469800000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Hard disk - IO is a bit busy - categraf [Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "rate(diskio_io_time{platform=~\"windows\"}[1m])/10 \u003e 99",
"severity": 2
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328470735000
},
{
"id": 0,
"group_id": 0,
"cate": "host",
"datasource_ids": null,
"cluster": "",
"name": "Lost connection with monitoring target - categraf [Windows]",
"note": "",
"prod": "host",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 0,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"key": "all_hosts",
"op": "==",
"values": []
},
{
"key": "tags",
"op": "==",
"values": [
"platform=windows"
]
}
],
"triggers": [
{
"duration": 60,
"severity": 2,
"type": "target_miss"
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328471354000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Machine load - high memory, please pay attention - categraf[Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "mem_available_percent{platform=~\"windows\"} \u003c 25",
"severity": 3
},
{
"prom_ql": "mem_available_percent{platform=~\"windows\"} \u003c 15",
"severity": 2
},
{
"prom_ql": "mem_available_percent{platform=~\"windows\"} \u003c 5",
"severity": 1
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328471811000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the inbound direction - categraf[Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "increase(net_drop_in{platform=~\"windows\"}[1m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328472306000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss in the outbound direction - categraf [Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": false,
"queries": [
{
"prom_ql": "increase(net_drop_out{platform=~\"windows\"}[1m]) \u003e 0",
"severity": 3
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"email",
"dingtalk",
"wecom"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328472750000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "大于200G的盘,空间不足了[Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_free{platform=~\"windows\"}/1024/1024/1024 \u003c 20 and disk_total{platform=~\"windows\"}/1024/1024/1024 \u003e= 200",
"severity": 3
},
{
"prom_ql": "disk_free{platform=~\"windows\"}/1024/1024/1024 \u003c 10 and disk_total{platform=~\"windows\"}/1024/1024/1024 \u003e= 200",
"severity": 2
},
{
"prom_ql": "disk_free{platform=~\"windows\"}/1024/1024/1024 \u003c 2 and disk_total{platform=~\"windows\"}/1024/1024/1024 \u003e= 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328473502000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "小于200G的盘,空间不足了[Windows]",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
3,
2,
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"inhibit": true,
"queries": [
{
"prom_ql": "disk_used_percent{platform=~\"windows\"} \u003e 90 and disk_total{platform=~\"windows\"}/1024/1024/1024 \u003c 200",
"severity": 3
},
{
"prom_ql": "disk_used_percent{platform=~\"windows\"} \u003e 95 and disk_total{platform=~\"windows\"}/1024/1024/1024 \u003c 200",
"severity": 2
},
{
"prom_ql": "disk_used_percent{platform=~\"windows\"} \u003e 99 and disk_total{platform=~\"windows\"}/1024/1024/1024 \u003c 200",
"severity": 1
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328474123000
}
]
================================================
FILE: integrations/Windows/alerts/windows_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High CPU utilization - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * sum by (instance) (rate(windows_cpu_time_total{mode != 'idle'}[5m])) / count by (instance) (windows_cpu_core_frequency_mhz) \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighCPUUsage"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328475638000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "High memory usage rate - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * (windows_cs_physical_memory_bytes - windows_os_physical_memory_free_bytes) / windows_cs_physical_memory_bytes \u003e 80",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighPhysicalMemoryUsage"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328476276000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "machine reboot in the last 15 minutes - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 3,
"severities": [
3
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "time() - windows_system_system_up_time \u003c 900",
"severity": 3
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=UpTimeLessThan15Min"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328476801000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss issue in the inbound direction - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * rate(windows_net_packets_received_errors[5m]) / (rate(windows_net_packets_received_errors[5m]) + rate(windows_net_packets_received_total[5m])\u003e0) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighInboundErrorRate"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328477685000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "packet loss issue in the outbound direction - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * rate(windows_net_packets_outbound_errors[5m]) / (rate(windows_net_packets_outbound_errors[5m]) + rate(windows_net_packets_sent_total[5m])\u003e0) \u003e 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=HighOutboundErrorRate"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328478160000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "The hard disk is almost full - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"severities": [
1
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "100 * (windows_logical_disk_size_bytes - windows_logical_disk_free_bytes) / windows_logical_disk_size_bytes \u003e 90",
"severity": 1
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"alertname=LogicalDiskFull"
],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328478599000
}
]
================================================
FILE: integrations/Windows/dashboards/windows_by_categraf.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Windows Host by Categraf",
"ident": "",
"tags": "Categraf",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [
{
"targetBlank": true,
"title": "n9e",
"url": "https://n9e.github.io/"
},
{
"targetBlank": true,
"title": "author",
"url": "http://flashcat.cloud/"
}
],
"panels": [
{
"collapsed": true,
"id": "2b2de3d1-65c8-4c39-9bea-02b754e0d751",
"layout": {
"h": 1,
"i": "2b2de3d1-65c8-4c39-9bea-02b754e0d751",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "单机概况",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 30
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "deec579b-3090-4344-a9a6-c1455c4a8e50",
"layout": {
"h": 5,
"i": "deec579b-3090-4344-a9a6-c1455c4a8e50",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"name": "启动时长(单位:天)",
"options": {
"standardOptions": {
"decimals": 1,
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": []
},
"targets": [
{
"expr": "system_uptime{ident=~\"$ident\"}/3600/24",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 30
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "7a7bd5db-d12e-49f0-92a8-15958e99ee54",
"layout": {
"h": 5,
"i": "7a7bd5db-d12e-49f0-92a8-15958e99ee54",
"isResizable": true,
"w": 4,
"x": 4,
"y": 1
},
"name": "CPU使用率",
"options": {
"standardOptions": {
"decimals": 1,
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"valueMappings": [
{
"match": {
"from": 0,
"to": 50
},
"result": {
"color": "#129b22"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 100
},
"result": {
"color": "#f51919"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "100-cpu_usage_idle{ident=~\"$ident\",cpu=\"cpu-total\"}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "stat",
"version": "3.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 30
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8a814265-54ad-419c-8cb7-e1f84a242de0",
"layout": {
"h": 5,
"i": "8a814265-54ad-419c-8cb7-e1f84a242de0",
"isResizable": true,
"w": 4,
"x": 8,
"y": 1
},
"name": "内存使用率",
"options": {
"standardOptions": {
"decimals": 1,
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": 0,
"to": 50
},
"result": {
"color": "#129b22"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 100
},
"result": {
"color": "#f51919"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "mem_used_percent{ident=~\"$ident\"}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorDomainAuto": true,
"colorRange": [
"#83c898",
"#c2c2c2",
"#fc653f"
],
"reverseColorOrder": false,
"textMode": "valueAndName"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b3c5dd9d-e82a-4b15-8b23-c510e2bee152",
"layout": {
"h": 5,
"i": "b3c5dd9d-e82a-4b15-8b23-c510e2bee152",
"isResizable": true,
"w": 6,
"x": 12,
"y": 1
},
"maxPerRow": 4,
"name": "磁盘使用率",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "disk_used_percent{ident=~\"$ident\"}",
"instant": false,
"legend": "主机名:{{ident}} - 盘符:{{path}}",
"refId": "A",
"step": 60
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "hexbin",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "59afa167-434d-496c-a3ef-ceff6db7c1f6",
"layout": {
"h": 5,
"i": "59afa167-434d-496c-a3ef-ceff6db7c1f6",
"isResizable": true,
"w": 6,
"x": 18,
"y": 1
},
"maxPerRow": 4,
"name": "io_util",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(diskio_io_time{ident=~\"$ident\"}[1m])/10",
"legend": "{{ident}}-{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "10f34f8f-f94d-4a28-9551-16e6667e3833",
"layout": {
"h": 1,
"i": "10f34f8f-f94d-4a28-9551-16e6667e3833",
"isResizable": false,
"w": 24,
"x": 0,
"y": 6
},
"name": "CPU",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1559d880-7e26-4e42-9427-4e55fb6f67be",
"layout": {
"h": 7,
"i": "1559d880-7e26-4e42-9427-4e55fb6f67be",
"isResizable": true,
"w": 8,
"x": 0,
"y": 7
},
"maxPerRow": 4,
"name": "CPU空闲率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
},
{
"color": "#f20202",
"value": 10
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "cpu_usage_idle{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "043c26de-d19f-4fe8-a615-2b7c10ceb828",
"layout": {
"h": 7,
"i": "043c26de-d19f-4fe8-a615-2b7c10ceb828",
"isResizable": true,
"w": 8,
"x": 8,
"y": 7
},
"maxPerRow": 4,
"name": "CPU使用率详情",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "cpu_usage_guest{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_guest",
"refId": "A"
},
{
"expr": "cpu_usage_iowait{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_iowait",
"refId": "B"
},
{
"expr": "cpu_usage_user{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_user",
"refId": "C"
},
{
"expr": "cpu_usage_system{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_system",
"refId": "D"
},
{
"expr": "cpu_usage_irq{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_irq",
"refId": "E"
},
{
"expr": "cpu_usage_softirq{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_softirq",
"refId": "F"
},
{
"expr": "cpu_usage_nice{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_nice",
"refId": "G"
},
{
"expr": "cpu_usage_steal{ident=~\"$ident\",cpu=\"cpu-total\"}",
"legend": "{{ident}}-cpu_usage_steal",
"refId": "H"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "a420ce25-6968-47f8-8335-60cde70fd062",
"layout": {
"h": 7,
"i": "a420ce25-6968-47f8-8335-60cde70fd062",
"isResizable": true,
"w": 8,
"x": 16,
"y": 7
},
"name": "CPU负载",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "system_load15{ident=~\"$ident\"}",
"refId": "A"
},
{
"expr": "system_load1{ident=~\"$ident\"}",
"refId": "B"
},
{
"expr": "system_load5{ident=~\"$ident\"}",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "b7a3c99f-a796-4b76-89b5-cbddd566f91c",
"layout": {
"h": 1,
"i": "b7a3c99f-a796-4b76-89b5-cbddd566f91c",
"isResizable": false,
"w": 24,
"x": 0,
"y": 14
},
"name": "内存详情",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "44a07a85-0f3e-4c73-abfe-dcc3c5c7f30e",
"layout": {
"h": 6,
"i": "44a07a85-0f3e-4c73-abfe-dcc3c5c7f30e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 15
},
"maxPerRow": 4,
"name": "内存使用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "mem_used_percent{ident=~\"$ident\"}",
"legend": "platform: {{platform}} {{ident}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "28a1dac8-2ddc-408d-8e17-27fc7d7bc9ff",
"layout": {
"h": 6,
"i": "28a1dac8-2ddc-408d-8e17-27fc7d7bc9ff",
"isResizable": true,
"w": 12,
"x": 12,
"y": 15
},
"maxPerRow": 4,
"name": "内存当前可用率",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "mem_available_percent{ident=~\"$ident\"}",
"legend": "{{ident}} ",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "842a8c48-0e93-40bf-8f28-1b2f837e5c19",
"layout": {
"h": 1,
"i": "842a8c48-0e93-40bf-8f28-1b2f837e5c19",
"isResizable": false,
"w": 24,
"x": 0,
"y": 21
},
"name": "磁盘详情",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "bc894871-1c03-4d12-91be-6867f394a8a6",
"layout": {
"h": 7,
"i": "bc894871-1c03-4d12-91be-6867f394a8a6",
"isResizable": true,
"w": 6,
"x": 0,
"y": 22
},
"name": "磁盘空间",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": null,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "disk_free{ident=~\"$ident\"}",
"refId": "A"
},
{
"expr": "disk_total{ident=~\"$ident\"}",
"refId": "B"
},
{
"expr": "disk_used{ident=~\"$ident\"}",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "bbd1ebda-99f6-419c-90a5-5f84973976dd",
"layout": {
"h": 7,
"i": "bbd1ebda-99f6-419c-90a5-5f84973976dd",
"isResizable": true,
"w": 6,
"x": 6,
"y": 22
},
"name": "IO吞吐量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(diskio_read_bytes{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{name}}-read",
"refId": "A"
},
{
"expr": "rate(diskio_write_bytes{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{name}}-writes",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f645741e-c632-4685-b267-c7ad26b5c10e",
"layout": {
"h": 7,
"i": "f645741e-c632-4685-b267-c7ad26b5c10e",
"isResizable": true,
"w": 6,
"x": 12,
"y": 22
},
"name": "IOPS",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(diskio_reads{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{name}}-read",
"refId": "A"
},
{
"expr": "rate(diskio_writes{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{name}}-writes",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "d6b45598-54c6-4b36-a896-0a7529ac21f8",
"layout": {
"h": 7,
"i": "d6b45598-54c6-4b36-a896-0a7529ac21f8",
"isResizable": true,
"w": 6,
"x": 18,
"y": 22
},
"name": "iowait",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(diskio_write_time{ident=~\"$ident\"}[1m])/rate(diskio_writes{ident=~\"$ident\"}[1m])+rate(diskio_read_time{ident=~\"$ident\"}[1m])/rate(diskio_reads{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{name}}",
"refId": "A"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"collapsed": true,
"id": "307152d2-708c-4736-98cf-08b886cbf7f2",
"layout": {
"h": 1,
"i": "307152d2-708c-4736-98cf-08b886cbf7f2",
"isResizable": false,
"w": 24,
"x": 0,
"y": 29
},
"name": "网络详情",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f2ee5d32-737c-4095-b6b7-b15b778ffdb9",
"layout": {
"h": 7,
"i": "f2ee5d32-737c-4095-b6b7-b15b778ffdb9",
"isResizable": true,
"w": 6,
"x": 0,
"y": 30
},
"name": "网络流量",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bitsIEC"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(net_bytes_recv{ident=~\"$ident\"}[1m])*8",
"legend": "{{ident}}-{{interface}}-recv",
"refId": "A"
},
{
"expr": "rate(net_bytes_sent{ident=~\"$ident\"}[1m])*8",
"legend": "{{ident}}-{{interface}}-sent",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9113323a-98f5-4bff-a8ce-3b459e7e2190",
"layout": {
"h": 7,
"i": "9113323a-98f5-4bff-a8ce-3b459e7e2190",
"isResizable": true,
"w": 6,
"x": 6,
"y": 30
},
"name": "packets",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(net_packets_recv{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{interface}}-recv",
"refId": "A"
},
{
"expr": "rate(net_packets_sent{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{interface}}-sent",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "9634c41c-e124-4d7f-9406-0f86753e8d70",
"layout": {
"h": 7,
"i": "9634c41c-e124-4d7f-9406-0f86753e8d70",
"isResizable": true,
"w": 6,
"x": 12,
"y": 30
},
"name": "error",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(net_err_in{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{interface}}-in",
"refId": "A"
},
{
"expr": "rate(net_err_out{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{interface}}-out",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4123f4c1-bf8e-400e-b267-8d7f6a92691a",
"layout": {
"h": 7,
"i": "4123f4c1-bf8e-400e-b267-8d7f6a92691a",
"isResizable": true,
"w": 6,
"x": 18,
"y": 30
},
"name": "drop",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "rate(net_drop_in{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{interface}}-in",
"refId": "A"
},
{
"expr": "rate(net_drop_out{ident=~\"$ident\"}[1m])",
"legend": "{{ident}}-{{interface}}-out",
"refId": "B"
}
],
"transformations": [
{
"id": "organize",
"options": {}
}
],
"type": "timeseries",
"version": "3.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cfb80689-de7b-47fb-9155-052b796dd7f5",
"layout": {
"h": 7,
"i": "cfb80689-de7b-47fb-9155-052b796dd7f5",
"isResizable": true,
"w": 24,
"x": 0,
"y": 37
},
"name": "tcp",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "netstat_tcp_established{ident=~\"$ident\"}",
"refId": "A"
},
{
"expr": "netstat_tcp_listen{ident=~\"$ident\"}",
"refId": "B"
},
{
"expr": "netstat_tcp_time_wait{ident=~\"$ident\"}",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(disk_used_percent{device=~\".+:\"},ident)",
"multi": true,
"name": "ident",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328480192000
}
================================================
FILE: integrations/Windows/dashboards/windows_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Windows - exporter",
"ident": "",
"tags": "Windows Prometheus",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "13fc4558-3a83-4165-bf93-bc4eaea0f097",
"layout": {
"h": 1,
"i": "13fc4558-3a83-4165-bf93-bc4eaea0f097",
"w": 24,
"x": 0,
"y": 0
},
"name": "Basic Info",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "666cda14-4732-4f80-a024-675e2d244051",
"layout": {
"h": 3,
"i": "666cda14-4732-4f80-a024-675e2d244051",
"w": 6,
"x": 0,
"y": 1
},
"name": "Uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
}
},
"targets": [
{
"expr": "time() - windows_system_system_up_time{instance=~\"$instance\"}",
"legend": ""
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "164edb5c-d8f3-4b77-8af3-7907b24a1073",
"layout": {
"h": 3,
"i": "164edb5c-d8f3-4b77-8af3-7907b24a1073",
"w": 6,
"x": 6,
"y": 1
},
"name": "CPU Core Total",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "windows_cs_logical_processors{instance=~\"$instance\"}",
"legend": ""
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "e9ddf0d3-53e5-43a8-83c7-10aa9d0028ad",
"layout": {
"h": 3,
"i": "e9ddf0d3-53e5-43a8-83c7-10aa9d0028ad",
"w": 6,
"x": 12,
"y": 1
},
"name": "Memory Total",
"options": {
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
}
},
"targets": [
{
"expr": "windows_cs_physical_memory_bytes{instance=~\"$instance\"}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "288ae77f-aa29-427c-932a-d0445e7d749e",
"layout": {
"h": 3,
"i": "288ae77f-aa29-427c-932a-d0445e7d749e",
"w": 6,
"x": 18,
"y": 1
},
"name": "Process Total",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"to": 100
},
"result": {
"color": "#109d06"
},
"type": "range"
},
{
"match": {
"from": 100
},
"result": {
"color": "#d11010"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "windows_os_processes{instance=~\"$instance\"}"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "a0248950-a7c4-47f2-9e75-27666ef428cd",
"layout": {
"h": 1,
"i": "a0248950-a7c4-47f2-9e75-27666ef428cd",
"w": 24,
"x": 0,
"y": 4
},
"name": "CPU Memory Disk",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b1c60c40-94ba-4b76-a688-1532e26d3a52",
"layout": {
"h": 7,
"i": "b1c60c40-94ba-4b76-a688-1532e26d3a52",
"w": 6,
"x": 0,
"y": 5
},
"name": "Cpu Util",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 * sum by (instance) (rate(windows_cpu_time_total{instance=~\"$instance\",mode != 'idle'}[5m])) / count by (instance) (windows_cpu_core_frequency_mhz{instance=~\"$instance\"})",
"legend": "CPU Util"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "799d9f5c-450a-4db3-80d0-71a64c6d8d73",
"layout": {
"h": 7,
"i": "799d9f5c-450a-4db3-80d0-71a64c6d8d73",
"w": 6,
"x": 6,
"y": 5
},
"name": "Memory Util",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {
"steps": [
{
"color": "#e71313",
"value": 70
}
]
},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - (windows_os_physical_memory_free_bytes{instance=~\"$instance\"} / windows_cs_physical_memory_bytes{instance=~\"$instance\"})*100"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1dae2abc-d7eb-47b9-8280-fcc1810803cb",
"layout": {
"h": 7,
"i": "1dae2abc-d7eb-47b9-8280-fcc1810803cb",
"w": 6,
"x": 12,
"y": 5
},
"name": "Disk Util",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "100 - (windows_logical_disk_free_bytes{instance=~\"$instance\"} / windows_logical_disk_size_bytes{instance=~\"$instance\"})*100",
"legend": "{{volume}}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "fd93766b-1099-4791-ace1-2648a38a23fb",
"layout": {
"h": 7,
"i": "fd93766b-1099-4791-ace1-2648a38a23fb",
"w": 6,
"x": 18,
"y": 5
},
"name": "Disk Free",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 0,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "windows_logical_disk_free_bytes{instance=~\"$instance\"}",
"legend": "{{volume}} Free"
},
{
"expr": "windows_logical_disk_size_bytes{instance=~\"$instance\"}",
"legend": "{{volume}} Total"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "47cfd14a-7c12-4d42-aa98-c768633bb1b9",
"layout": {
"h": 1,
"i": "47cfd14a-7c12-4d42-aa98-c768633bb1b9",
"w": 24,
"x": 0,
"y": 12
},
"name": "Disk IO",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1105b0bf-476d-428c-b01b-c7fef29ee5c2",
"layout": {
"h": 7,
"i": "1105b0bf-476d-428c-b01b-c7fef29ee5c2",
"w": 12,
"x": 0,
"y": 13
},
"name": "Read/Write Bytes / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(windows_logical_disk_read_bytes_total{instance=~\"$instance\"}[5m])",
"legend": "{{volume}} Read"
},
{
"expr": "irate(windows_logical_disk_write_bytes_total{instance=~\"$instance\"}[5m])",
"legend": "{{volume}} Write"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "470503f0-e414-48ec-88bf-1d5c885960d0",
"layout": {
"h": 7,
"i": "470503f0-e414-48ec-88bf-1d5c885960d0",
"w": 12,
"x": 12,
"y": 13
},
"name": "Read/Write / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 2
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(windows_logical_disk_reads_total{instance=~\"$instance\"}[5m])",
"legend": "{{volume}} Read"
},
{
"expr": "irate(windows_logical_disk_writes_total{instance=~\"$instance\"}[5m])",
"legend": "{{volume}} Write"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "22ffcddd-74d1-4db3-bfa6-b5fecbf99c6e",
"layout": {
"h": 1,
"i": "22ffcddd-74d1-4db3-bfa6-b5fecbf99c6e",
"w": 24,
"x": 0,
"y": 20
},
"name": "Network",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "6a2d168f-c316-4e6f-b9b4-d91a0de6ea10",
"layout": {
"h": 7,
"i": "6a2d168f-c316-4e6f-b9b4-d91a0de6ea10",
"w": 8,
"x": 0,
"y": 21
},
"name": "Sent/Received bits / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1,
"util": "bitsIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(windows_net_bytes_sent_total{instance=~\"$instance\",nic!~'isatap.*|VPN.*'}[5m])*8",
"legend": "{{nic}} Sent"
},
{
"expr": "irate(windows_net_bytes_received_total{instance=~\"$instance\",nic!~'isatap.*|VPN.*'}[5m])*8",
"legend": "{{nic}} Received"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "befa8f81-2ae5-4b93-8883-057a9bff79a8",
"layout": {
"h": 7,
"i": "befa8f81-2ae5-4b93-8883-057a9bff79a8",
"w": 8,
"x": 8,
"y": 21
},
"name": "Network Util",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"decimals": 1
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "(irate(windows_net_bytes_total{instance=~\"$instance\",nic!~'isatap.*|VPN.*'}[5m]) * 8 / windows_net_current_bandwidth{instance=~\"$instance\",nic!~'isatap.*|VPN.*'}) * 100"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "f710ea1b-c8b3-4ca1-a8fc-4d2a8b21895d",
"layout": {
"h": 7,
"i": "f710ea1b-c8b3-4ca1-a8fc-4d2a8b21895d",
"w": 8,
"x": 16,
"y": 21
},
"name": "Packets / Second",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(windows_net_packets_outbound_discarded{instance=~\"$instance\", nic!~'isatap.*|VPN.*'}[5m]) + irate(windows_net_packets_outbound_errors{instance=~\"$instance\"}[5m])",
"legend": "outbound"
},
{
"expr": "irate(windows_net_packets_received_discarded{instance=~\"$instance\", nic!~'isatap.*|VPN.*'}[5m]) + irate(windows_net_packets_received_errors{instance=~\"$instance\"}[5m])",
"legend": "received"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "0a9d73c2-caff-4ae9-8159-2bc96dd847fb",
"layout": {
"h": 1,
"i": "0a9d73c2-caff-4ae9-8159-2bc96dd847fb",
"w": 24,
"x": 0,
"y": 28
},
"name": "System",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "7d995748-cf74-4ae7-9ad4-dab4eefd84f9",
"layout": {
"h": 7,
"i": "7d995748-cf74-4ae7-9ad4-dab4eefd84f9",
"w": 12,
"x": 0,
"y": 29
},
"name": "System Threads",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "windows_system_threads{instance=~\"$instance\"}"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4cd91349-03ea-47b0-8c13-80191fc80e02",
"layout": {
"h": 7,
"i": "4cd91349-03ea-47b0-8c13-80191fc80e02",
"w": 12,
"x": 12,
"y": 29
},
"name": "System exception dispatches",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "desc"
}
},
"targets": [
{
"expr": "irate(windows_system_exception_dispatches_total{instance=~\"$instance\"}[5m])"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(windows_system_system_up_time, instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328482559000
}
================================================
FILE: integrations/Windows/markdown/README.md
================================================
# Windows
categraf 不但支持 linux 监控数据采集,也支持 windows 监控数据采集,而且指标命名也是一样的,这样告警规则、仪表盘其实都可以复用。不需要对 windows 做额外处理。
## 安装
categraf 在 windows 下安装请参考这个 [文档](https://flashcat.cloud/docs/content/flashcat-monitor/categraf/2-installation/)。
## 仪表盘
linux、windows 仪表盘其实是可以复用的,只是两种操作系统个别指标不同。比如有些指标是 linux 特有的,有些指标是 windows 特有的。
================================================
FILE: integrations/XSKYApi/collect/xskyapi/xskyapi.toml
================================================
# # collect interval
# interval = 15
#
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## must be one of oss/gfs/eus
dss_type = "oss"
## URL of each server in the service's cluster
servers = [
#"http://x.x.x.x:xx"
]
## Set response_timeout (default 5 seconds)
response_timeout = "5s"
xms_auth_tokens = [
#"xxxxxxxxxxxxxxx"
]
================================================
FILE: integrations/XSKYApi/markdown/README.md
================================================
# XSKY Api
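Configuration for the `xskyapi` collector, which queries the servers of an XSKY cluster through the XSKY API.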
## Configurations
```toml
# # collect interval
# interval = 15
#
[[instances]]
# # append some labels for series
# labels = { region="cloud", product="n9e" }
# # interval = global.interval * interval_times
# interval_times = 1
## must be one of oss/gfs/eus
dss_type = "oss"
## URL of each server in the service's cluster
servers = [
#"http://x.x.x.x:xx"
]
## Set response_timeout (default 5 seconds)
response_timeout = "5s"
xms_auth_tokens = [
#"xxxxxxxxxxxxxxx"
]
```
================================================
FILE: integrations/ZooKeeper/alerts/zookeeper_by_exporter.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "More than 1 Zookeeper leader - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(zk_server_leader) \u003e 1",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328490414000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "There is no Zookeeper leader available - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "sum(zk_server_leader) == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328491085000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Zookeeper has crashed - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "zk_up == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328491571000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "Zookeeper instance is running abnormally - exporter",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"severities": [
2
],
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"algo_params": null,
"inhibit": false,
"prom_ql": "",
"queries": [
{
"prom_ql": "zk_ruok == 0",
"severity": 2
}
],
"severity": 0
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_days_of_weeks": [
[
"1",
"2",
"3",
"4",
"5",
"6",
"0"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": null,
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328492140000
}
]
================================================
FILE: integrations/ZooKeeper/collect/zookeeper/zookeeper.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# cluster_name = "dev-zk-cluster"
# addresses = "127.0.0.1:2181"
# timeout = 10
# important! use global unique string to specify instance
# labels = { instance="n9e-10.2.3.4:2181" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true
================================================
FILE: integrations/ZooKeeper/dashboards/zookeeper_by_exporter.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Zookeeper - exporter",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "2718a256-a74a-4661-ae74-fe21d765c8b4",
"layout": {
"h": 1,
"i": "2718a256-a74a-4661-ae74-fe21d765c8b4",
"w": 24,
"x": 0,
"y": 0
},
"name": "overview",
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 40
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "4474ec3c-360b-4b3c-ab16-978305ecc438",
"layout": {
"h": 3,
"i": "4474ec3c-360b-4b3c-ab16-978305ecc438",
"w": 6,
"x": 0,
"y": 1
},
"name": "up",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#3d950e",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#f01414",
"text": "DOWN"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "zk_up{job=\"$job\", instance=\"$instance\"}",
"legend": "up",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "5ef1653d-ca20-47b9-9604-f3a0dfffbdd6",
"layout": {
"h": 3,
"i": "5ef1653d-ca20-47b9-9604-f3a0dfffbdd6",
"w": 6,
"x": 6,
"y": 1
},
"name": "zk_znode_count",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "zk_znode_count{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "fef22c82-540a-4662-9913-26b6b38e8aa3",
"layout": {
"h": 3,
"i": "fef22c82-540a-4662-9913-26b6b38e8aa3",
"w": 6,
"x": 12,
"y": 1
},
"name": "zk_watch_count",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "zk_watch_count{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "value",
"textSize": {
"value": 50
}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "12510246-7469-4868-9dad-8d761f574ad3",
"layout": {
"h": 3,
"i": "12510246-7469-4868-9dad-8d761f574ad3",
"w": 6,
"x": 18,
"y": 1
},
"name": "zk_ephemerals_count",
"options": {
"standardOptions": {}
},
"targets": [
{
"expr": "zk_ephemerals_count{job=~\"$job\", instance=~\"$instance\"}",
"legend": "zk_ephemerals_count",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "0f7b7057-8446-49b8-ab45-beb9fb2a6af3",
"layout": {
"h": 7,
"i": "0f7b7057-8446-49b8-ab45-beb9fb2a6af3",
"w": 12,
"x": 0,
"y": 2
},
"name": "Packets",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "rate(zk_packets_sent{job=~\"$job\", instance=~\"$instance\"}[5m])",
"legend": "{{instance}}-sent",
"refId": "A"
},
{
"expr": "rate(zk_packets_received{job=~\"$job\", instance=~\"$instance\"}[5m])",
"legend": "{{instance}}-received",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "3e2d6853-4e2b-4b71-8601-fd2ececceb30",
"layout": {
"h": 7,
"i": "3e2d6853-4e2b-4b71-8601-fd2ececceb30",
"w": 6,
"x": 6,
"y": 4
},
"name": "alive_connections",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "zk_num_alive_connections{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "83205acd-35b8-404b-9883-cf3f656b022b",
"layout": {
"h": 7,
"i": "83205acd-35b8-404b-9883-cf3f656b022b",
"w": 6,
"x": 12,
"y": 4
},
"name": "file_descriptor",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "zk_open_file_descriptor_count{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}-open",
"refId": "A"
},
{
"expr": "zk_max_file_descriptor_count{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}-max",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ea71f66f-690f-4e4e-95bc-b835f0d6027e",
"layout": {
"h": 7,
"i": "ea71f66f-690f-4e4e-95bc-b835f0d6027e",
"w": 6,
"x": 18,
"y": 4
},
"name": "latency(ms)",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "zk_avg_latency{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}-avg",
"refId": "A"
},
{
"expr": "zk_min_latency{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}-min",
"refId": "B"
},
{
"expr": "zk_max_latency{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}-max",
"refId": "C"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "906d651b-234b-4e38-b90f-7ac31b267eb8",
"layout": {
"h": 7,
"i": "906d651b-234b-4e38-b90f-7ac31b267eb8",
"w": 6,
"x": 0,
"y": 4
},
"name": "outstanding_requests",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "zk_outstanding_requests{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "83bb38e0-0074-4a80-ae2a-ea242db0da7b",
"layout": {
"h": 7,
"i": "83bb38e0-0074-4a80-ae2a-ea242db0da7b",
"w": 12,
"x": 12,
"y": 2
},
"name": "approximate_data_size",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "zk_approximate_data_size{job=~\"$job\", instance=~\"$instance\"}",
"legend": "{{instance}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(zk_up,job)",
"name": "job",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(zk_up,instance)",
"name": "instance",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328493590000
}
================================================
FILE: integrations/ZooKeeper/markdown/README.md
================================================
# zookeeper
注意: `>=3.6.0` zookeeper 版本内置 [prometheus 的支持](https://zookeeper.apache.org/doc/current/zookeeperMonitor.html),即,如果 zookeeper 启用了 prometheus,Categraf 可使用 prometheus 插件从这个 metrics 接口拉取数据即可。就无需使用 zookeeper 这个插件来采集了。
## 说明
categraf zookeeper 采集插件移植于 [dabealu/zookeeper-exporter](https://github.com/dabealu/zookeeper-exporter),适用于 `<3.6.0` 版本的 zookeeper, 原理就是利用 Zookeeper 提供的四字命令(The Four Letter Words)获取监控信息。
需要注意的是,在 zookeeper v3.4.10 以后添加了四字命令白名单,需要在 zookeeper 的配置文件 `zoo.cfg` 中新增白名单配置:
```
4lw.commands.whitelist=mntr,ruok
```
## 配置
zookeeper 插件的配置在 `conf/input.zookeeper/zookeeper.toml`,集群中的多个实例地址请用空格分隔:
```toml
[[instances]]
cluster_name = "dev-zk-cluster"
addresses = "127.0.0.1:2181"
timeout = 10
```
如果要监控多个 zookeeper 集群,就增加 instances 即可:
```toml
[[instances]]
cluster_name = "dev-zk-cluster"
addresses = "127.0.0.1:2181"
timeout = 10
[[instances]]
cluster_name = "test-zk-cluster"
addresses = "127.0.0.1:2181 127.0.0.1:2182 127.0.0.1:2183"
timeout = 10
```
================================================
FILE: integrations/cAdvisor/collect/cadvisor/cadvisor.toml
================================================
# # collect interval
# interval = 15
[[instances]]
# url = "https://1.2.3.4:10250"
# type = "kubelet"
## url = "http://1.2.3.4:8080/metrics"
## type = "cadvisor"
# url_label_key = "instance"
# url_label_value = "{{.Host}}"
# bearer_token_string = "eyJlonglongxxxx.eyJlonglongyyyy.oQsXlonglongZZZ"
## bearer_token_file = "/path/to/token/file"
# ignore_label_keys = ["id","name", "container_label*"]
## choose_label_keys = ["id"]
# timeout = "3s"
# use_tls = true
## tls_min_version = "1.2"
## tls_ca = "/etc/categraf/ca.pem"
## tls_cert = "/etc/categraf/cert.pem"
## tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
## insecure_skip_verify = true
================================================
FILE: integrations/cAdvisor/dashboards/dashboard.json
================================================
{
"id": 0,
"group_id": 0,
"name": "cAdvisor",
"ident": "",
"tags": "",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"links": [],
"panels": [
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "eeb56afe-8a3e-46d6-8923-aeb3d0f124ea",
"layout": {
"h": 7,
"i": "eeb56afe-8a3e-46d6-8923-aeb3d0f124ea",
"isResizable": true,
"w": 24,
"x": 0,
"y": 0
},
"links": [],
"maxPerRow": 4,
"name": "CPU Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6690fff4-c159-40e5-b340-65a3ba85e37e",
"layout": {
"h": 8,
"i": "6690fff4-c159-40e5-b340-65a3ba85e37e",
"isResizable": true,
"w": 12,
"x": 0,
"y": 7
},
"links": [],
"maxPerRow": 4,
"name": "Memory Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "3c798af5-cfae-4962-9b70-85736df44bb1",
"layout": {
"h": 8,
"i": "3c798af5-cfae-4962-9b70-85736df44bb1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 7
},
"links": [],
"maxPerRow": 4,
"name": "Memory Cached",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "111835e1-cfb5-40db-bb52-1aca74cf1a00",
"layout": {
"h": 8,
"i": "111835e1-cfb5-40db-bb52-1aca74cf1a00",
"isResizable": true,
"w": 12,
"x": 0,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Received Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "linear",
"lineWidth": 2,
"stack": "off",
"version": "2.0.0"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "b8050f8f-aee7-4fa5-888d-b6025df14aa1",
"layout": {
"h": 8,
"i": "b8050f8f-aee7-4fa5-888d-b6025df14aa1",
"isResizable": true,
"w": 12,
"x": 12,
"y": 15
},
"links": [],
"maxPerRow": 4,
"name": "Sent Network Traffic",
"options": {
"legend": {
"displayMode": "hidden"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
"legend": "{{name}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values({__name__=~\"container.*\"},instance)",
"multi": false,
"name": "host",
"reg": "",
"type": "query"
},
{
"allOption": true,
"allValue": ".*",
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
"multi": false,
"name": "container",
"reg": "",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328497218000
}
================================================
FILE: integrations/cAdvisor/markdown/README.md
================================================
## cadvisor
cadvisor 采集插件, 采集cadvisor 数据,如果是通过kubelet采集,可以附加pod的label和annotation
## Configuration
```toml
# # collect interval
# interval = 15
[[instances]]
# 填写kubelet的ip和port
url = "https://1.2.3.4:10250/metrics/cadvisor"
# 如果path为空, 会自动补齐为/metrics/cadvisor
# url = "https://1.2.3.4:10250"
# 如果是通过kubelet采集,可以附加pod的label和annotation
type = "kubelet"
# 直接采集cadvisor , type 设置为cadvisor
#url = "http://1.2.3.4:8080/metrics"
#type = "cadvisor"
# url_label_key 和 url_label_value 用法参见下面说明
url_label_key = "instance"
url_label_value = "{{.Host}}"
# # 认证的token 或者token file
#bearer_token_string = "eyJhblonglongXXX.eyJplonglongYYY.oQsXlonglongZ-Z-Z"
bearer_token_file = "/path/to/token/file"
# 需要忽略的label key
ignore_label_keys = ["id","name", "container_label*"]
# 只采集哪些label key, 建议保持为空,采集所有的label。 优先级高于ignore_label_keys。
#choose_label_keys = ["*"]
timeout = "3s"
# # Optional TLS Config
# # 想跳过自签证书,use_tls 记得要配置为true
use_tls = true
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
insecure_skip_verify = true
```
## url_label_key 和 url_label_value 用法
```toml
# 从URL中提取Host部分,放到instance label中
# 假设 url =https://1.2.3.4:10250/metrics/cadvisor
# 最终附加的label为 instance=1.2.3.4:10250
url_label_key = "instance"
url_label_value = "{{.Host}}"
```
如果 scheme 部分和 path 部分都想取,可以这么写:
```toml
url_label_value = "{{.Scheme}}://{{.Host}}{{.Path}}"
```
相关变量是用这个方法生成的,供大家参考:
```go
// GenerateLabel builds the (key, value) label pair for the URL u.
//
// It returns ul.LabelKey together with either the full URL string (when
// ul.LabelValue is empty) or the output of executing ul.LabelValueTpl against
// the components of u. A non-nil error is returned only when the template
// fails to execute, in which case both strings are empty.
func (ul *UrlLabel) GenerateLabel(u *url.URL) (string, string, error) {
// No custom label value configured: fall back to the complete URL.
if ul.LabelValue == "" {
return ul.LabelKey, u.String(), nil
}
// Variables made available to the template, e.g. {{.Host}} or {{.Path}}.
dict := map[string]string{
"Scheme": u.Scheme,
"Host": u.Host,
"Hostname": u.Hostname(),
"Port": u.Port(),
"Path": u.Path,
"Query": u.RawQuery,
"Fragment": u.Fragment,
}
// Render the template into an in-memory buffer.
// NOTE(review): LabelValueTpl is presumably the parsed form of LabelValue — confirm against the plugin source.
var buffer bytes.Buffer
err := ul.LabelValueTpl.Execute(&buffer, dict)
if err != nil {
return "", "", err
}
return ul.LabelKey, buffer.String(), nil
}
```
以 `http://1.2.3.4:8080/search?q=keyword#results` 为例, 变量及其值如下:
|variable|value|
|---|---|
|{{.Scheme}}|http|
|{{.Host}} |1.2.3.4:8080|
|{{.Hostname}}|1.2.3.4|
|{{.Port}}|8080|
|{{.Path}}|/search|
|{{.Query}}|q=keyword|
|{{.Fragment}}|results|
================================================
FILE: integrations/cAdvisor/metrics/exporter-base.json
================================================
[
{
"id": 0,
"uuid": 1717556328499238000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 利用率(system)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_cpu_system_seconds_total{image!=\"\", image!~\".*pause.*\"}[3m]) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 利用率(system)",
"note": ""
},
{
"lang": "en_US",
"name": "Container CPU utilization (system)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328501571000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 利用率(user)",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_cpu_user_seconds_total{image!=\"\", image!~\".*pause.*\"}[3m]) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 利用率(user)",
"note": ""
},
{
"lang": "en_US",
"name": "Container CPU utilization (user)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328503579000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 利用率(整体,值不会大于 100)",
"unit": "percent",
"note": "只有设置了 limit 的容器才能计算此利用率",
"lang": "zh_CN",
"expression": "sum(\n irate(container_cpu_usage_seconds_total{image!=\"\", image!~\".*pause.*\"}[3m])\n) by (pod,namespace,container,image)\n/\nsum(\n container_spec_cpu_quota/container_spec_cpu_period\n) by (pod,namespace,container,image)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 利用率(整体,值不会大于 100)",
"note": "只有设置了 limit 的容器才能计算此利用率"
},
{
"lang": "en_US",
"name": "Container CPU utilization (overall, the value will not be greater than 100)",
"note": "Only containers with limit set can calculate this utilization"
}
]
},
{
"id": 0,
"uuid": 1717556328505581000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 利用率(整体,值可能大于 100)",
"unit": "percent",
"note": "如果是 200% 表示占用了 2 个核",
"lang": "zh_CN",
"expression": "irate(container_cpu_usage_seconds_total{image!=\"\", image!~\".*pause.*\"}[3m]) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 利用率(整体,值可能大于 100)",
"note": "如果是 200% 表示占用了 2 个核"
},
{
"lang": "en_US",
"name": "Container CPU utilization (overall, value may be greater than 100)",
"note": "If 200%, it means that 2 cores are occupied"
}
]
},
{
"id": 0,
"uuid": 1717556328507566000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 每秒有多少 period",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_cpu_cfs_periods_total{}[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 每秒有多少 period",
"note": ""
},
{
"lang": "en_US",
"name": "How many periods does the container CPU have per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328509502000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 每秒被 throttle 的 period 量",
"unit": "none",
"note": "如果容器限制了 CPU,而 app 所需算法过多, 会被抑制使用,container_cpu_cfs_throttled_periods_total 统计总共有多少个 period 被抑制了,如果近期发生抑制是需要关注的,一些延迟敏感的 app 受影响尤为明显。出现被抑制的情况,大概率是需要升配了。",
"lang": "zh_CN",
"expression": "irate(container_cpu_cfs_throttled_periods_total{}[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 每秒被 throttle 的 period 量",
"note": "如果容器限制了 CPU,而 app 所需算法过多, 会被抑制使用,container_cpu_cfs_throttled_periods_total 统计总共有多少个 period 被抑制了,如果近期发生抑制是需要关注的,一些延迟敏感的 app 受影响尤为明显。出现被抑制的情况,大概率是需要升配了。"
},
{
"lang": "en_US",
"name": "Number of periods per second in which the container CPU is throttled",
"note": "If the container has a CPU limit and the app needs more CPU than the limit allows, it will be throttled. container_cpu_cfs_throttled_periods_total counts how many periods have been throttled in total. Recent throttling deserves attention; latency-sensitive apps are particularly affected. If throttling occurs, the container most likely needs more CPU resources."
}
]
},
{
"id": 0,
"uuid": 1717556328511466000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 CPU 被 throttle 的比例",
"unit": "percent",
"note": "这个值大于 0 就要注意",
"lang": "zh_CN",
"expression": "irate(container_cpu_cfs_throttled_periods_total{}[3m]) / irate(container_cpu_cfs_periods_total{}[3m]) * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 CPU 被 throttle 的比例",
"note": "这个值大于 0 就要注意"
},
{
"lang": "en_US",
"name": "The proportion of container CPU being throttled",
"note": "If this value is greater than 0, pay attention"
}
]
},
{
"id": 0,
"uuid": 1717556328513368000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 filesystem 使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "container_fs_usage_bytes / container_fs_limit_bytes * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 filesystem 使用率",
"note": ""
},
{
"lang": "en_US",
"name": "Container filesystem usage rate",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328515663000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 filesystem 使用量",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_fs_usage_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 filesystem 使用量",
"note": ""
},
{
"lang": "en_US",
"name": "Container filesystem usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328517712000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 filesystem 当前 IO 次数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "container_fs_io_current",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 filesystem 当前 IO 次数",
"note": ""
},
{
"lang": "en_US",
"name": "Container filesystem Current IO times",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328519514000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 filesystem 总量",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_fs_limit_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 filesystem 总量",
"note": ""
},
{
"lang": "en_US",
"name": "Container filesystem Total",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328521362000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 inode free 量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "container_fs_inodes_free",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 inode free 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container inode free amount",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328523734000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 inode total 量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "container_fs_inodes_total",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 inode total 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container inode total",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328525590000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 inode 使用率",
"unit": "percent",
"note": "",
"lang": "zh_CN",
"expression": "100 - container_fs_inodes_free / container_fs_inodes_total * 100",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 inode 使用率",
"note": ""
},
{
"lang": "en_US",
"name": "Container inode usage",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328527508000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 IO 每秒写入 byte 量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(container_fs_writes_bytes_total[3m])) by (namespace, pod)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 IO 每秒写入 byte 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container IO writes bytes per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328529476000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 IO 每秒读取 byte 量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(container_fs_reads_bytes_total[3m])) by (namespace, pod)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 IO 每秒读取 byte 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container IO reads bytes per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328531514000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory cache 量",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_memory_cache{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory cache 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container memory cache amount",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328533806000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用率(Usage)",
"unit": "percent",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异",
"lang": "zh_CN",
"expression": "100 * container_memory_usage_bytes/container_spec_memory_limit_bytes\nand\ncontainer_spec_memory_limit_bytes != 0",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用率(Usage)",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异"
},
{
"lang": "en_US",
"name": "Container memory Usage (Usage)",
"note": "If there is a lot of file IO and therefore a large container_memory_cache, the values of container_memory_usage_bytes and container_memory_working_set_bytes will differ"
}
]
},
{
"id": 0,
"uuid": 1717556328536123000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用率(Working Set)",
"unit": "percent",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异",
"lang": "zh_CN",
"expression": "100 * container_memory_working_set_bytes/container_spec_memory_limit_bytes\nand\ncontainer_spec_memory_limit_bytes != 0",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用率(Working Set)",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异"
},
{
"lang": "en_US",
"name": "Container memory usage rate (Working Set)",
"note": "If there is a lot of file IO and therefore a large container_memory_cache, the values of container_memory_usage_bytes and container_memory_working_set_bytes will differ"
}
]
},
{
"id": 0,
"uuid": 1717556328538777000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用量(mapped_file)",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_memory_mapped_file{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用量(mapped_file)",
"note": ""
},
{
"lang": "en_US",
"name": "Container memory usage (mapped_file)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328540899000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用量(RSS)",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_memory_rss{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用量(RSS)",
"note": ""
},
{
"lang": "en_US",
"name": "Container memory usage (RSS)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328543073000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用量(Swap)",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_memory_swap{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用量(Swap)",
"note": ""
},
{
"lang": "en_US",
"name": "Container memory usage (Swap)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328545308000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用量(Usage)",
"unit": "bytesIEC",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异",
"lang": "zh_CN",
"expression": "container_memory_usage_bytes{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用量(Usage)",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异"
},
{
"lang": "en_US",
"name": "Container memory Usage",
"note": "If there is heavy file I/O and a large amount of container_memory_cache, the values of container_memory_usage_bytes and container_memory_working_set_bytes will differ"
}
]
},
{
"id": 0,
"uuid": 1717556328547364000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 使用量(Working Set)",
"unit": "bytesIEC",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异",
"lang": "zh_CN",
"expression": "container_memory_working_set_bytes{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 使用量(Working Set)",
"note": "如果有大量文件 IO,有大量 container_memory_cache,container_memory_usage_bytes 和 container_memory_working_set_bytes 的大小会有差异"
},
{
"lang": "en_US",
"name": "Container memory usage (Working Set)",
"note": "If there is heavy file I/O and a large amount of container_memory_cache, the values of container_memory_usage_bytes and container_memory_working_set_bytes will differ"
}
]
},
{
"id": 0,
"uuid": 1717556328549264000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 分配失败次数(每秒)",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "rate(container_memory_failures_total{}[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 分配失败次数(每秒)",
"note": ""
},
{
"lang": "en_US",
"name": "Container memory allocation failures (per second)",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328551296000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 memory 限制量",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_spec_memory_limit_bytes{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 memory 限制量",
"note": ""
},
{
"lang": "en_US",
"name": "Container memory limit",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328553284000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒发送 bit 量",
"unit": "bitsSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(container_network_transmit_bytes_total[3m])) by (namespace, pod) * 8",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒发送 bit 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container net sends bits per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328555450000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒发送 byte 量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(container_network_transmit_bytes_total[3m])) by (namespace, pod)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒发送 byte 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container net sends bytes per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328557652000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒发送数据包数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_network_transmit_packets_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒发送数据包数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of packets sent per second by container net",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328559896000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒发送时 drop 包数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_network_transmit_packets_dropped_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒发送时 drop 包数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of dropped packets sent by container net per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328563019000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒发送错包数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_network_transmit_errors_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒发送错包数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of error packets sent by container net per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328565014000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒接收 bit 量",
"unit": "bitsSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(container_network_receive_bytes_total[3m])) by (namespace, pod) * 8",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒接收 bit 量",
"note": ""
},
{
"lang": "en_US",
"name": "The amount of bits received by the container net per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328566851000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒接收 byte 量",
"unit": "bytesSecIEC",
"note": "",
"lang": "zh_CN",
"expression": "sum(irate(container_network_receive_bytes_total[3m])) by (namespace, pod)",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒接收 byte 量",
"note": ""
},
{
"lang": "en_US",
"name": "Container net receives bytes per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328568786000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒接收数据包数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_network_receive_packets_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒接收数据包数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of packets received per second by container net",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328570812000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒接收时 drop 包数量",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_network_receive_packets_dropped_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒接收时 drop 包数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of dropped packets received by container net per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328572814000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器 net 每秒接收错包数",
"unit": "sishort",
"note": "",
"lang": "zh_CN",
"expression": "irate(container_network_receive_errors_total[3m])",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器 net 每秒接收错包数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of error packets received by container net per second",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328574925000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器允许运行的最大线程数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "container_threads_max{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器允许运行的最大线程数",
"note": ""
},
{
"lang": "en_US",
"name": "The maximum number of threads the container is allowed to run",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328576878000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器内 1 号进程 soft ulimit 值",
"unit": "none",
"note": "容器内1号进程的软 ulimit 值。如果为-1,则无限制。",
"lang": "zh_CN",
"expression": "container_ulimits_soft{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器内 1 号进程 soft ulimit 值",
"note": "容器内1号进程的软 ulimit 值。如果为-1,则无限制。"
},
{
"lang": "en_US",
"name": "Process No. 1 soft ulimit value in container",
"note": "Soft ulimit value of process No. 1 inside the container. If it is -1, there is no limit."
}
]
},
{
"id": 0,
"uuid": 1717556328578812000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器已经运行的时间",
"unit": "seconds",
"note": "",
"lang": "zh_CN",
"expression": "container_start_time_seconds{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器已经运行的时间",
"note": ""
},
{
"lang": "en_US",
"name": "How long the container has been running",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328580653000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器当前打开套接字数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "container_sockets{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器当前打开套接字数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of currently open sockets in the container",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328582431000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器当前打开文件句柄数量",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "container_file_descriptors{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器当前打开文件句柄数量",
"note": ""
},
{
"lang": "en_US",
"name": "Number of currently open file handles in the container",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328584349000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器当前运行的线程数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "container_threads{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器当前运行的线程数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of threads currently running in the container",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328586195000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器当前运行的进程数",
"unit": "none",
"note": "",
"lang": "zh_CN",
"expression": "container_processes{image!=\"\", image!~\".*pause.*\"}",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器当前运行的进程数",
"note": ""
},
{
"lang": "en_US",
"name": "Number of processes currently running in the container",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328588204000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器总 GPU 加速卡可用内存量",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_accelerator_memory_total_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器总 GPU 加速卡可用内存量",
"note": ""
},
{
"lang": "en_US",
"name": "Container Total GPU Accelerator Available Memory",
"note": ""
}
]
},
{
"id": 0,
"uuid": 1717556328590070000,
"collector": "Exporter",
"typ": "cAdvisor",
"name": "容器正在使用的 GPU 加速卡内存量",
"unit": "bytesIEC",
"note": "",
"lang": "zh_CN",
"expression": "container_accelerator_memory_used_bytes",
"created_at": 0,
"created_by": "",
"updated_at": 0,
"updated_by": "",
"translation": [
{
"lang": "zh_CN",
"name": "容器正在使用的 GPU 加速卡内存量",
"note": ""
},
{
"lang": "en_US",
"name": "The amount of GPU accelerator card memory the container is using",
"note": ""
}
]
}
]
================================================
FILE: integrations/vSphere/alerts/alerts.json
================================================
[
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ESXi实例CPU使用率大于70%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "(sum(vsphere_host_cpu_usage_average{}) by(esxhostname)/count(vsphere_host_cpu_usage_average{}) by(esxhostname)) \u003e 70",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [
"dingtalk"
],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328595844000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "ESXi实例内存使用率大于70%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "(sum(vsphere_host_mem_usage_average{}) by(esxhostname)/count(vsphere_host_mem_usage_average{}) by(esxhostname)) \u003e 70",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328596304000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "VM虚拟机CPU使用率大于70%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "(sum(vsphere_vm_cpu_usage_average{}) by(vmname)/count(vsphere_vm_cpu_usage_average{}) by(vmname))\u003e70",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328596707000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "VM虚拟机内存使用率大于70%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "(sum(vsphere_vm_mem_usage_average{}) by(vmname)/count(vsphere_vm_mem_usage_average{}) by(vmname))\u003e70",
"severity": 2
}
]
},
"prom_eval_interval": 30,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328597163000
},
{
"id": 0,
"group_id": 0,
"cate": "prometheus",
"datasource_ids": [
0
],
"cluster": "",
"name": "VSphere磁盘使用率大于70%",
"note": "",
"prod": "metric",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 0,
"severities": [
2
],
"disabled": 0,
"prom_for_duration": 15,
"prom_ql": "",
"rule_config": {
"queries": [
{
"prom_ql": "sum(vsphere_datastore_disk_used_latest{vcenter=~\".+\"}/vsphere_datastore_disk_capacity_latest{vcenter=~\".+\"}) by(source)*100 \u003e 70",
"severity": 2
}
]
},
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_stimes": [
"00:00"
],
"enable_etime": "23:59",
"enable_etimes": [
"23:59"
],
"enable_days_of_week": [
"0",
"1",
"2",
"3",
"4",
"5",
"6"
],
"enable_days_of_weeks": [
[
"0",
"1",
"2",
"3",
"4",
"5",
"6"
]
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_groups_obj": null,
"notify_groups": null,
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [],
"annotations": {},
"extra_config": null,
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"uuid": 1717556328597565000
}
]
================================================
FILE: integrations/vSphere/collect/vsphere/vsphere.toml
================================================
# # collect interval
# interval = 15
# Read metrics from one or many vCenters
[[instances]]
## vCenter URLs to be monitored. These three lines must be uncommented
## and edited for the plugin to work.
#vcenter = "https://vcenter.local/sdk"
#username = "user@corp.local"
#password = "secret"
## VMs
## Typical VM metrics (if omitted or empty, all metrics are collected)
# vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected)
# vm_exclude = [] # Inventory paths to exclude
vm_metric_include = [
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.run.summation",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.wait.summation",
"mem.active.average",
"mem.granted.average",
"mem.latency.average",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.usage.average",
"power.power.average",
"virtualDisk.numberReadAveraged.average",
"virtualDisk.numberWriteAveraged.average",
"virtualDisk.read.average",
"virtualDisk.readOIO.latest",
"virtualDisk.throughput.usage.average",
"virtualDisk.totalReadLatency.average",
"virtualDisk.totalWriteLatency.average",
"virtualDisk.write.average",
"virtualDisk.writeOIO.latest",
"sys.uptime.latest",
]
# vm_metric_exclude = [] ## Nothing is excluded by default
# vm_instances = true ## true by default
## Hosts
## Typical host metrics (if omitted or empty, all metrics are collected)
# host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected)
# host_exclude = [] # Inventory paths to exclude
host_metric_include = [
"cpu.coreUtilization.average",
"cpu.costop.summation",
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.swapwait.summation",
"cpu.usage.average",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.utilization.average",
"cpu.wait.summation",
"disk.deviceReadLatency.average",
"disk.deviceWriteLatency.average",
"disk.kernelReadLatency.average",
"disk.kernelWriteLatency.average",
"disk.numberReadAveraged.average",
"disk.numberWriteAveraged.average",
"disk.read.average",
"disk.totalReadLatency.average",
"disk.totalWriteLatency.average",
"disk.write.average",
"mem.active.average",
"mem.latency.average",
"mem.state.latest",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.totalCapacity.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.errorsRx.summation",
"net.errorsTx.summation",
"net.usage.average",
"power.power.average",
"storageAdapter.numberReadAveraged.average",
"storageAdapter.numberWriteAveraged.average",
"storageAdapter.read.average",
"storageAdapter.write.average",
"sys.uptime.latest",
]
# host_instances = true ## true by default
# host_include = [] ## Nothing included by default
# host_exclude = [] ## Nothing excluded by default
# host_metric_include = [] ## Nothing included by default
# host_metric_exclude = [] ## Nothing excluded by default
## Clusters
# cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
# cluster_exclude = [] # Inventory paths to exclude
# cluster_metric_include = [] ## if omitted or empty, all metrics are collected
# cluster_metric_exclude = [] ## Nothing excluded by default
# cluster_instances = false ## false by default
## Resource Pools
# resoucepool_include = [ "/*/host/**"] # Inventory path to resource pools to collect (by default all are collected)
# resoucepool_exclude = [] # Inventory paths to exclude
# resoucepool_metric_include = [] ## if omitted or empty, all metrics are collected
# resoucepool_metric_exclude = [] ## Nothing excluded by default
# resoucepool_instances = false ## false by default
## Datastores
# datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected)
# datastore_exclude = [] # Inventory paths to exclude
# datastore_metric_include = [] ## if omitted or empty, all metrics are collected
# datastore_metric_exclude = [] ## Nothing excluded by default
# datastore_instances = false ## false by default
## Datacenters
# datacenter_include = [ "/*/host/**"] # Inventory path to datacenters to collect (by default all are collected)
# datacenter_exclude = [] # Inventory paths to exclude
# datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
# datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
## Collect IP addresses? Valid values are "ipv4" and "ipv6"
# ip_addresses = ["ipv6", "ipv4" ]
## When set to true, all samples are sent as integers. This makes the output
## data types backwards compatible with Telegraf 1.9 or lower. Normally all
## samples from vCenter, with the exception of percentages, are integer
## values, but under some conditions, some averaging takes place internally in
## the plugin. Setting this flag to "false" will send values as floats to
## preserve the full precision when averaging takes place.
# use_int_samples = true
## Custom attributes from vCenter can be very useful for queries in order to slice the
## metrics along different dimensions and for forming ad-hoc relationships. They are disabled
## by default, since they can add a considerable amount of tags to the resulting metrics. To
## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
## to select the attributes you want to include.
# custom_attribute_include = []
# custom_attribute_exclude = ["*"]
## The number of vSphere 5 minute metric collection cycles to look back for non-realtime metrics. In
## some versions (6.7, 7.0 and possibly more), certain metrics, such as cluster metrics, may be reported
## with a significant delay (>30min). If this happens, try increasing this number. Please note that increasing
## it too much may cause performance issues.
# metric_lookback = 3
## number of objects to retrieve per query for realtime resources (vms and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
## number of go routines to use for collection and discovery of objects and metrics
# collect_concurrency = 1
# discover_concurrency = 1
## the interval before (re)discovering objects subject to metrics collection (default: 300s)
# object_discovery_interval = "300s"
## timeout applies to every API request made to vCenter
# timeout = "60s"
## Optional SSL Config
# use_tls = false
# tls_ca = "/path/to/cafile"
# tls_cert = "/path/to/certfile"
# tls_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
## The Historical Interval value must match EXACTLY the interval in the daily
## "Interval Duration" found on the vCenter server under Configure > General > Statistics > Statistic intervals
# historical_interval = "5m"
================================================
FILE: integrations/vSphere/dashboards/vmware_by_vsphere-monitor.json
================================================
{
"id": 0,
"group_id": 0,
"name": "VMware by vSphere-monitor",
"ident": "",
"tags": "VMware vSphere-monitor",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"panels": [
{
"collapsed": true,
"id": "8d8b9e77-73dd-4ccf-bb4b-82c725f6be60",
"layout": {
"h": 1,
"i": "8d8b9e77-73dd-4ccf-bb4b-82c725f6be60",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "Vcenter Status",
"panels": [],
"type": "row"
},
{
"custom": {
"alignItems": "center",
"bgColor": "#1d78a1",
"content": "```VMware Vcenter```",
"justifyContent": "center",
"textColor": "#ff9919",
"textSize": 22
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "",
"id": "597e7302-ea67-4690-a8d0-0ef4db233825",
"layout": {
"h": 3,
"i": "597e7302-ea67-4690-a8d0-0ef4db233825",
"isResizable": true,
"w": 4,
"x": 0,
"y": 1
},
"links": [
{
"targetBlank": true,
"title": "Vsphere-monitor",
"url": "https://github.com/freedomkk-qfeng/vsphere-monitor"
}
],
"name": "",
"type": "text",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有Datacenter数量",
"id": "65429acb-0817-4e45-a396-157ad73a77a2",
"layout": {
"h": 3,
"i": "62db6d95-be53-4123-bef1-6e5bd0adbfea",
"isResizable": true,
"w": 4,
"x": 4,
"y": 1
},
"name": "Datacenter summary",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#4ca3d9"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(max(esxi_alive{ident=\"$vcenter\"}) by (datacenter))",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有Datastore存储数量",
"id": "20001d28-d0f6-400d-b0ac-340160dd8bdc",
"layout": {
"h": 3,
"i": "ecae770c-5e50-40e7-99dd-88dbd64174f1",
"isResizable": true,
"w": 4,
"x": 8,
"y": 1
},
"name": "Datastore summary",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#ff9919"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(datastore_capacity{ident=\"$vcenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有ESXI主机的数量",
"id": "40fb3098-f205-432b-b38e-5defb98b1da0",
"layout": {
"h": 3,
"i": "ce2ce101-ee12-429a-8968-e1a6fe0e0f13",
"isResizable": true,
"w": 4,
"x": 12,
"y": 1
},
"name": "ESXI summary",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e49d4e"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(esxi_alive{ident=\"$vcenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "开启的虚拟机数量",
"id": "906403b1-4bc3-41e3-a5a4-3563ad335724",
"layout": {
"h": 3,
"i": "66911f96-94d4-41b4-9604-ef3c4268f3bf",
"isResizable": true,
"w": 4,
"x": 16,
"y": 1
},
"name": "VM Power ON",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#3fc453"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(vm_power{ident=\"$vcenter\"}==1)",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "first",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "关闭的虚拟机数量",
"id": "8b452644-5cf2-45fb-8bb9-eb9bf1dcfd43",
"layout": {
"h": 3,
"i": "4ca2f3fd-02ff-4e17-9d16-91f5897726bb",
"isResizable": true,
"w": 4,
"x": 20,
"y": 1
},
"links": [],
"name": "VM Power OFF",
"options": {
"standardOptions": {
"min": null
},
"valueMappings": [
{
"match": {
"from": 1,
"specialValue": "null",
"to": null
},
"result": {
"color": "#ce4f52",
"text": ""
},
"type": "range"
},
{
"match": {
"special": 0
},
"result": {
"color": "#3fc453"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(vm_power{ident=\"$vcenter\"} != 1) or vector(0)",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Datastore总空间",
"id": "aeacca5e-063d-4abf-9b05-80b8ca33ea02",
"layout": {
"h": 3,
"i": "3287ae88-1552-44c6-b5c2-168064d1e1e2",
"isResizable": true,
"w": 4,
"x": 0,
"y": 4
},
"name": "Datastore Total",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#9470ff"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(datastore_capacity{ident=\"$vcenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Datastore总使用率",
"id": "4c51137d-9403-473a-b0d7-bda0410546e3",
"layout": {
"h": 3,
"i": "8a6b9c64-f05f-4128-9c4e-9227d955192c",
"isResizable": true,
"w": 4,
"x": 4,
"y": 4
},
"name": "Datastore Percent",
"options": {
"standardOptions": {
"util": "percentUnit"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 0.6
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 0.6,
"to": 0.8
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 0.8
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "(sum(datastore_capacity{ident=\"$vcenter\"})-sum(datastore_free{ident=\"$vcenter\"})) / sum(datastore_capacity{ident=\"$vcenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "seriesToRows",
"showHeader": false,
"sortColumn": "value",
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI列表",
"id": "2fc78fbf-9238-42d8-8934-b5c5c0ea72c4",
"layout": {
"h": 9,
"i": "8dace014-0c83-4be0-8234-4a779e0d656a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 4
},
"name": "ESXI List",
"options": {
"standardOptions": {},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"value": "A"
},
"properties": {
"valueMappings": [
{
"match": {
"special": 1
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#ce4f52",
"text": "Down"
},
"type": "special"
}
]
}
}
],
"targets": [
{
"expr": "esxi_alive{ident=\"$vcenter\"} ",
"legend": "{{ident}}(esxi-{{host}})",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#2c9d3d",
"calc": "lastNotNull",
"serieWidth": 40,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "2027eb16-0b3f-493d-b5fc-7627d9427377",
"layout": {
"h": 9,
"i": "2027eb16-0b3f-493d-b5fc-7627d9427377",
"isResizable": true,
"w": 8,
"x": 16,
"y": 4
},
"name": "虚拟机内存使用率 top10",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 50
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 75
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 75,
"to": null
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "topk(10,(100 - vm_memory_freePercent{ident=\"$vcenter\"}))",
"legend": "{{vm}}(Vcenter={{ident}})",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI内存总量",
"id": "c06e345f-f174-48a1-9cf6-cca48ef61289",
"layout": {
"h": 3,
"i": "b8014e1d-a5c9-4a06-ac04-195a32ccedf1",
"isResizable": true,
"w": 4,
"x": 0,
"y": 7
},
"name": "Vcenter MEM Total",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#9470ff"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(esxi_memory_capacity{ident=\"$vcenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI内存总使用率",
"id": "ba9c54ef-d538-4063-b917-398c231db726",
"layout": {
"h": 3,
"i": "7a104eb0-fddb-4f0f-a826-8796dfac3eda",
"isResizable": true,
"w": 4,
"x": 4,
"y": 7
},
"name": "Vcenter MEM Percent",
"options": {
"standardOptions": {
"util": "percentUnit"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 0.6
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 0.6,
"to": 0.8
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 0.8
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(esxi_memory_usage{ident=\"$vcenter\"}) / sum(esxi_memory_capacity{ident=\"$vcenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI存活数量",
"id": "9ea6f568-c803-4890-aff1-4074a4d6309f",
"layout": {
"h": 3,
"i": "3a3545c2-3df6-4ec2-baa1-39ccd99beda8",
"isResizable": true,
"w": 4,
"x": 0,
"y": 10
},
"name": "ESXI UP",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(esxi_alive{ident=\"$vcenter\"} ==1)",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI宕机数量",
"id": "6bc0cb3a-71fb-4306-a150-8462a0e4c053",
"layout": {
"h": 3,
"i": "21d15ac6-9a68-46f2-87f6-d298f9b0d596",
"isResizable": true,
"w": 4,
"x": 4,
"y": 10
},
"name": "ESXI Down",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#ce4f52"
},
"type": "range"
},
{
"match": {
"special": 0
},
"result": {
"color": "#2c9d3d"
},
"type": "special"
}
]
},
"targets": [
{
"expr": "count(esxi_alive{ident=\"$vcenter\"} !=1) or vector(0)",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "35b7dc01-0e58-4b01-9f84-e466cb817879",
"layout": {
"h": 1,
"i": "35b7dc01-0e58-4b01-9f84-e466cb817879",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "Datacenter Status",
"panels": [],
"type": "row"
},
{
"custom": {
"baseColor": "#2c9d3d",
"calc": "lastNotNull",
"serieWidth": 40,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ae61d444-e137-4467-ae4a-541976a0cb75",
"layout": {
"h": 9,
"i": "53b66c57-64da-431e-833b-b128654ddee6",
"isResizable": true,
"w": 8,
"x": 0,
"y": 14
},
"name": "Datacenter-Esxi CPU Usage top10",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 50
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 75
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 75,
"to": null
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "topk(10,(esxi_cpu_usage{ident=\"$vcenter\",datacenter=~\"$datacenter\"}))",
"legend": "{{host}}(Vcenter={{ident}})",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#2c9d3d",
"calc": "lastNotNull",
"serieWidth": 40,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "bffb1e95-6ee8-47d8-9f65-ef584c335d8b",
"layout": {
"h": 9,
"i": "5c71bbfe-7b70-4dbd-84ae-5c4b2318d0fb",
"isResizable": true,
"w": 8,
"x": 8,
"y": 14
},
"name": "Datacenter-Esxi Mem Usage top10",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 50
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 75
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 75,
"to": null
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "topk(10,(100 - esxi_memory_freePercent{ident=\"$vcenter\"}))",
"legend": "{{host}}(Vcenter={{ident}})",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#2c9d3d",
"calc": "lastNotNull",
"serieWidth": 40,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b36c69c2-6cb0-4b89-a4a0-7225140c271f",
"layout": {
"h": 9,
"i": "9f192b9b-dca8-4d0b-acd4-38e89277ed4e",
"isResizable": true,
"w": 8,
"x": 16,
"y": 14
},
"name": "Datacenter-Esxi Datastore Usage top10",
"options": {
"standardOptions": {
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 50
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 50,
"to": 75
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 75,
"to": null
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "topk(10,(100 - datastore_freePercent{ident=\"$vcenter\",datacenter=~\"$datacenter\"}))",
"legend": "{{datastore}}(Vcenter={{ident}})",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有ESXI主机的数量",
"id": "acc6adf8-ad83-4f59-b8c9-f9937e10c20e",
"layout": {
"h": 3,
"i": "0ee44882-4d66-426e-8bc0-003ae2b4bdfa",
"isResizable": true,
"w": 4,
"x": 0,
"y": 23
},
"name": "Datacenter-ESXI summary",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#e49d4e"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(esxi_alive{ident=\"$vcenter\",datacenter=~\"$datacenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "所有Datastore存储数量",
"id": "813109ee-42bd-4150-88e4-4e3e16d0bae7",
"layout": {
"h": 3,
"i": "bcc780a0-bfb9-4e90-87ed-8c2057255582",
"isResizable": true,
"w": 4,
"x": 4,
"y": 23
},
"name": "Datacenter-Datastore summary",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#ff9919"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "count(datastore_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Datastore总空间",
"id": "d672145f-1b8e-40b5-9ae9-a72bd76d98fe",
"layout": {
"h": 3,
"i": "9ea70640-54a9-4065-a272-d7adfb434867",
"isResizable": true,
"w": 4,
"x": 8,
"y": 23
},
"name": "Datacenter-Datastore Total",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#9470ff"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(datastore_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "Datastore总使用率",
"id": "40c1c5ef-69e4-4054-93cf-3f3c29a14b95",
"layout": {
"h": 3,
"i": "c84978c3-b1f3-4a21-bd85-edb9abc95780",
"isResizable": true,
"w": 4,
"x": 12,
"y": 23
},
"name": "Datacenter-Datastore Percent",
"options": {
"standardOptions": {
"util": "percentUnit"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 0.6
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 0.6,
"to": 0.8
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 0.8
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "(sum(datastore_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\"})-sum(datastore_free{ident=\"$vcenter\",datacenter=~\"$datacenter\"})) / sum(datastore_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI内存总量",
"id": "e07d16cc-a7b1-4f26-b900-aa7a4c515954",
"layout": {
"h": 3,
"i": "3f3ab532-dfcf-4261-8f77-e068ed7d9cc7",
"isResizable": true,
"w": 4,
"x": 16,
"y": 23
},
"name": "Datacenter-MEM Total",
"options": {
"standardOptions": {
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#9470ff"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(esxi_memory_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {}
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"description": "ESXI内存总使用率",
"id": "22a23007-9c6f-4c30-b684-381e5ff88c5b",
"layout": {
"h": 3,
"i": "9ebe01f2-f97d-4021-bc01-035afcbc66dc",
"isResizable": true,
"w": 4,
"x": 20,
"y": 23
},
"name": "Datacenter-MEM Percent",
"options": {
"standardOptions": {
"util": "percentUnit"
},
"valueMappings": [
{
"match": {
"from": null,
"to": 0.6
},
"result": {
"color": "#2c9d3d"
},
"type": "range"
},
{
"match": {
"from": 0.6,
"to": 0.8
},
"result": {
"color": "#ffae39"
},
"type": "range"
},
{
"match": {
"from": 0.8
},
"result": {
"color": "#ce4f52"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "sum(esxi_memory_usage{ident=\"$vcenter\",datacenter=~\"$datacenter\"}) / sum(esxi_memory_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\"})",
"legend": "",
"refId": "A",
"time": {
"end": "now",
"start": "now-30m"
}
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "ecb15e88-9f9b-4e58-b56e-280a77c30c12",
"layout": {
"h": 6,
"i": "1e978dc6-6a89-45c1-8c1a-4be48842830c",
"isResizable": true,
"w": 8,
"x": 0,
"y": 26
},
"name": "Datacenter-CPU Usage Trend",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percent"
},
"thresholds": {
"steps": [
{
"color": "#ce4f52",
"value": 80
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(esxi_cpu_usage{ident=\"$vcenter\"}) by (datacenter)",
"legend": "{{datacenter}}",
"refId": "A",
"time": {
"end": "now",
"start": "now-7d"
}
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "0b70e6aa-bdff-488c-b60e-586baa8b5626",
"layout": {
"h": 6,
"i": "4de6e60a-8b25-4e79-9989-6d9e5877d39a",
"isResizable": true,
"w": 8,
"x": 8,
"y": 26
},
"name": "Datacenter-MEM Usage Trend",
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "sum(esxi_memory_usage{ident=\"$vcenter\"}) by (datacenter) / sum(esxi_memory_capacity{ident=\"$vcenter\"}) by(datacenter)",
"legend": "{{datacenter}}",
"refId": "A",
"time": {
"end": "now",
"start": "now-7d"
}
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1b4adc0a-19d8-463d-a47c-0051cf7113f6",
"layout": {
"h": 6,
"i": "7074b7a0-88c5-45e1-9d50-c3f48a4b6d54",
"isResizable": true,
"w": 8,
"x": 16,
"y": 26
},
"name": "Datacenter-Datastore Usage Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"util": "percentUnit"
},
"thresholds": {
"steps": []
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(datastore_capacity{ident=\"$vcenter\"}) by (datacenter) - sum(datastore_free{ident=\"$vcenter\"}) by (datacenter)) / sum(datastore_capacity{ident=\"$vcenter\"}) by (datacenter)",
"legend": "{{datacenter}}",
"refId": "A",
"time": {
"end": "now",
"start": "now-7d"
}
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "3acc2ed5-a94a-4f6e-ba20-cf74af9ce773",
"layout": {
"h": 1,
"i": "3acc2ed5-a94a-4f6e-ba20-cf74af9ce773",
"isResizable": false,
"w": 24,
"x": 0,
"y": 32
},
"name": "ESXI Status",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": false
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "e46c5596-5f99-4cd5-8761-dfa861df318c",
"layout": {
"h": 7,
"i": "b648687d-bc38-40c7-af77-e910de8d7090",
"isResizable": true,
"w": 4,
"x": 0,
"y": 33
},
"name": "ESXI-UP/Down",
"options": {
"standardOptions": {
"decimals": 0,
"util": "none"
},
"valueMappings": [
{
"match": {
"from": null,
"special": 1,
"to": 1
},
"result": {
"color": "#2c9d3d",
"text": "UP"
},
"type": "special"
},
{
"match": {
"special": 0
},
"result": {
"color": "#ce4f52",
"text": "Down"
},
"type": "special"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "esxi_alive{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": false,
"sortColumn": "value",
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "64683543-70ea-447b-8667-c66642f66d39",
"layout": {
"h": 7,
"i": "64683543-70ea-447b-8667-c66642f66d39",
"isResizable": true,
"w": 4,
"x": 4,
"y": 33
},
"name": "ESXI-Uptime(天)",
"options": {
"standardOptions": {
"decimals": 0,
"util": "none"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#ff9919"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "esxi_uptime{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"} /60 /60 /24",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": false
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "6ec3bd61-cf30-45c3-9118-28c6dfd3c2bd",
"layout": {
"h": 7,
"i": "50431a0f-65f2-4eaf-87a6-7ffe3a77fec7",
"isResizable": true,
"w": 4,
"x": 8,
"y": 33
},
"name": "ESXI-MEM Total",
"options": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "#70a0ff"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "esxi_memory_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": false,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "46a4eea7-6577-430d-8fb2-2a371c1c8c64",
"layout": {
"h": 7,
"i": "e6344a4e-cd2a-4db5-ad16-4fb69d321e9d",
"isResizable": true,
"w": 4,
"x": 12,
"y": 33
},
"name": "ESXI-CPU Usage",
"options": {
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"valueMappings": [
{
"match": {
"from": 0.1
},
"result": {
"color": "#ff8286"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "esxi_cpu_usage{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A",
"step": 15,
"time": {
"end": "now",
"start": "now-12h"
}
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": false,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "e631e9e9-7d25-4bab-916b-e11da4bcb6b4",
"layout": {
"h": 7,
"i": "e6673ca5-5f7e-4ec2-bc68-8745b4bac2a8",
"isResizable": true,
"w": 4,
"x": 16,
"y": 33
},
"name": "ESXI-NET_IN",
"options": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 0.1
},
"result": {
"color": "#9470ff"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "esxi_net_if_in{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A",
"step": 15,
"time": {
"end": "now",
"start": "now-12h"
}
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colorMode": "value",
"displayMode": "seriesToRows",
"showHeader": false,
"sortOrder": "ascend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "cbe91b45-d4de-4218-9f07-28848fb05202",
"layout": {
"h": 7,
"i": "62a2fcb0-584d-4e8b-a62e-de41352c43b8",
"isResizable": true,
"w": 4,
"x": 20,
"y": 33
},
"name": "ESXI-NET_OUT",
"options": {
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"valueMappings": [
{
"match": {
"from": 0.1
},
"result": {
"color": "#ecd245"
},
"type": "range"
}
]
},
"overrides": [
{}
],
"targets": [
{
"expr": "esxi_net_if_out{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A",
"step": 15,
"time": {
"end": "now",
"start": "now-12h"
}
}
],
"type": "table",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "b3265d25-c15d-41cb-8c8b-25417e655caa",
"layout": {
"h": 6,
"i": "b3265d25-c15d-41cb-8c8b-25417e655caa",
"isResizable": true,
"w": 12,
"x": 0,
"y": 40
},
"name": "ESXI-CPU Usage Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "esxi_cpu_usage{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "25fd6cd6-be54-420f-b3ee-002a9579eb1d",
"layout": {
"h": 6,
"i": "e7042001-d660-4781-af81-861720fca815",
"isResizable": true,
"w": 12,
"x": 12,
"y": 40
},
"name": "ESXI-MEM Usage Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "esxi_memory_usage{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"} / esxi_memory_capacity{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "868fca2d-df6e-487c-8c8d-e307d82c28ce",
"layout": {
"h": 7,
"i": "593f9476-adfe-48ad-8506-3b39fcaabd47",
"isResizable": true,
"w": 12,
"x": 0,
"y": 46
},
"name": "ESXI-NET_IN Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "esxi_net_if_in{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "aee98a26-c7dd-43d6-9195-72bdf1fc0b47",
"layout": {
"h": 7,
"i": "1f9fc9b6-5064-42b9-9aa6-8d741a36d5d0",
"isResizable": true,
"w": 12,
"x": 12,
"y": 46
},
"name": "ESXI-NET_OUT Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "esxi_net_if_out{ident=\"$vcenter\",datacenter=~\"$datacenter\",host=~\"$esxi\"}",
"legend": "{{host}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "e5ed126a-2d17-4195-94c4-214864a9755d",
"layout": {
"h": 1,
"i": "e5ed126a-2d17-4195-94c4-214864a9755d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 53
},
"name": "VM Status",
"type": "row"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "182d718e-83af-449b-9492-7af701a16569",
"layout": {
"h": 5,
"i": "182d718e-83af-449b-9492-7af701a16569",
"isResizable": true,
"w": 4,
"x": 0,
"y": 54
},
"name": "VM-IO_Read",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_datastore_io_read_bytes{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "43fe023c-8c3a-47f5-9c69-225189622d36",
"layout": {
"h": 5,
"i": "394d59f4-939c-4459-b875-381aac49d566",
"isResizable": true,
"w": 4,
"x": 4,
"y": 54
},
"name": "VM-IO_Write",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_datastore_io_write_bytes{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "d8a40ecc-d125-4272-b60f-543e6018ca05",
"layout": {
"h": 5,
"i": "7ba230b1-9d71-4321-8729-3b046fdda035",
"isResizable": true,
"w": 4,
"x": 8,
"y": 54
},
"name": "VM-IO_Read_Numbers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_datastore_io_read_numbers{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1a48a82c-19b4-4b73-8093-b2c9c87ded20",
"layout": {
"h": 5,
"i": "563c4bed-0ac7-491a-835c-70c638edd5c8",
"isResizable": true,
"w": 4,
"x": 12,
"y": 54
},
"name": "VM-IO_Write_Numbers",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_datastore_io_write_numbers{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "15147aa2-3c7d-490b-b53b-4c3e3f9731d5",
"layout": {
"h": 5,
"i": "9f0098bf-7f82-41ad-9749-272e6126e020",
"isResizable": true,
"w": 4,
"x": 16,
"y": 54
},
"name": "VM-IO_Read_Latency",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_datastore_io_read_latency{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1870bf34-d019-484f-a462-411629ca78fe",
"layout": {
"h": 5,
"i": "c89db128-f5e0-4141-a89b-c0dfc567f7e2",
"isResizable": true,
"w": 4,
"x": 20,
"y": 54
},
"name": "VM-IO_Write_Latency",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_datastore_io_write_latency{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "bars",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "e2751b2b-3b1f-469e-abd6-f6e843efd16a",
"layout": {
"h": 6,
"i": "0516c59f-70f9-4d65-9cdb-06c0932c1956",
"isResizable": true,
"w": 12,
"x": 0,
"y": 59
},
"name": "VM-CPU Usage Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "percent"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_cpu_usage{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "1af92e8e-4163-475b-a3e4-ccdca9a74cdf",
"layout": {
"h": 6,
"i": "b1bc6cda-5d1e-4cd5-8e98-d3cca0a8d6aa",
"isResizable": true,
"w": 12,
"x": 12,
"y": 59
},
"name": "VM-MEM Usage Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "percentUnit"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_memory_usage{ident=\"$vcenter\",vm=~\"$vm\"} / vm_memory_capacity{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "380ea87f-9b4f-41dc-9117-579295a1a5de",
"layout": {
"h": 7,
"i": "3c04d89f-d880-495b-954a-708eb3c8cf02",
"isResizable": true,
"w": 12,
"x": 0,
"y": 65
},
"name": "VM-NET_IN Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_net_if_in{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${prom}",
"id": "8d2bf7e4-eb86-4d24-96da-e45b1eef68f5",
"layout": {
"h": 7,
"i": "66f5afc9-ea2f-444c-9170-098c684e52ee",
"isResizable": true,
"w": 12,
"x": 12,
"y": 65
},
"name": "ESXI-NET_OUT Trend",
"options": {
"legend": {
"displayMode": "list"
},
"standardOptions": {
"decimals": 2,
"util": "bytesIEC"
},
"thresholds": {},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "vm_net_if_out{ident=\"$vcenter\",vm=~\"$vm\"}",
"legend": "{{vm}}",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "prom",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(esxi_alive,ident)",
"multi": false,
"name": "vcenter",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(esxi_alive{ident=\"$vcenter\"},datacenter)",
"multi": true,
"name": "datacenter",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(esxi_alive{ident=\"$vcenter\",datacenter=~\"$datacenter\"},host)",
"multi": true,
"name": "esxi",
"type": "query"
},
{
"allOption": true,
"datasource": {
"cate": "prometheus",
"value": "${prom}"
},
"definition": "label_values(vm_power{ident=\"$vcenter\"},vm)",
"multi": true,
"name": "vm",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328599672000
}
================================================
FILE: integrations/vSphere/dashboards/vsphere.json
================================================
{
"id": 0,
"group_id": 0,
"name": "Vsphere",
"ident": "",
"tags": "Categraf Nightingale VMware-Sphere",
"create_at": 0,
"create_by": "",
"update_at": 0,
"update_by": "",
"configs": {
"datasourceValue": "Default",
"panels": [
{
"collapsed": true,
"id": "0a149fdd-5c4c-4d09-857c-b16ca3e60f1f",
"layout": {
"h": 1,
"i": "0a149fdd-5c4c-4d09-857c-b16ca3e60f1f",
"isResizable": false,
"w": 24,
"x": 0,
"y": 0
},
"name": "vSphere Overview",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 5,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {
"title": 12,
"value": 36
},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "89ae4416-32f6-4cbd-97dc-12aa11c1363e",
"layout": {
"h": 2,
"i": "89ae4416-32f6-4cbd-97dc-12aa11c1363e",
"isResizable": true,
"w": 24,
"x": 0,
"y": 1
},
"name": "",
"options": {
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#9470ff",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "count(count(vsphere_host_cpu_usage_average) by(vcenter))",
"legend": "vCenter Summary",
"refId": "A"
},
{
"expr": "count(count(vsphere_host_cpu_usage_average) by(clustername))",
"legend": "cluster Summary",
"refId": "B"
},
{
"expr": "count(count(vsphere_host_cpu_usage_average) by(esxhostname))",
"legend": "ESXi Summary",
"refId": "C"
},
{
"expr": "count(count(vsphere_vm_cpu_used_summation) by(vmname))",
"legend": "VM Summary",
"refId": "D"
},
{
"expr": "count(count(vsphere_datastore_disk_used_latest) by(dsname))",
"legend": "Datastore Summary",
"refId": "E"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#9470ff",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6e2b4ee6-1f73-413a-a174-840490177541",
"layout": {
"h": 5,
"i": "6e2b4ee6-1f73-413a-a174-840490177541",
"isResizable": true,
"w": 12,
"x": 0,
"y": 3
},
"name": "EXSI cpu使用率排名",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 60,
"to": 100
},
"result": {
"color": "#ff656b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "(sum(vsphere_host_cpu_usage_average{}) by(esxhostname)/count(vsphere_host_cpu_usage_average{}) by(esxhostname))",
"legend": "{{esxhostname}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#9470ff",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "ff88cb4a-c396-415c-ad8d-bbb3af289427",
"layout": {
"h": 5,
"i": "905a1562-f7dc-4f3e-bbf6-b1697db9d489",
"isResizable": true,
"w": 12,
"x": 12,
"y": 3
},
"name": "EXSI mem使用率排名",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 60,
"to": 100
},
"result": {
"color": "#ff656b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "(sum(vsphere_host_mem_usage_average{}) by(esxhostname)/count(vsphere_host_mem_usage_average{}) by(esxhostname))",
"legend": "{{esxhostname}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#9470ff",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "d651d6eb-5e30-408c-ba7f-1823d960251a",
"layout": {
"h": 5,
"i": "70cc38e7-2ded-48c2-bb75-fe1d71d4a770",
"isResizable": true,
"w": 12,
"x": 0,
"y": 8
},
"name": "VM cpu使用率排名",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 60,
"to": 100
},
"result": {
"color": "#ff656b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "(sum(vsphere_vm_cpu_usage_average{}) by(vmname)/count(vsphere_vm_cpu_usage_average{}) by(vmname))",
"legend": "{{vmname}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"custom": {
"baseColor": "#9470ff",
"calc": "lastNotNull",
"serieWidth": 20,
"sortOrder": "desc"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "cdd8f673-750e-408e-80ab-3990c16b1da5",
"layout": {
"h": 5,
"i": "f6e55bcf-6d39-425e-8c6d-efc6b8350501",
"isResizable": true,
"w": 12,
"x": 12,
"y": 8
},
"name": "VM mem使用率排名",
"options": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 60,
"to": 100
},
"result": {
"color": "#ff656b"
},
"type": "range"
}
]
},
"targets": [
{
"expr": "(sum(vsphere_vm_mem_usage_average{}) by(vmname)/count(vsphere_vm_mem_usage_average{}) by(vmname))",
"legend": "{{vmname}}",
"refId": "A"
}
],
"type": "barGauge",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "5329376c-2084-4f4d-b5f4-372cd702b643",
"layout": {
"h": 1,
"i": "5329376c-2084-4f4d-b5f4-372cd702b643",
"isResizable": false,
"w": 24,
"x": 0,
"y": 13
},
"name": "cluster Status",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 3,
"colorMode": "value",
"graphMode": "none",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "046163e4-031e-44e8-b592-7dc606496922",
"layout": {
"h": 5,
"i": "046163e4-031e-44e8-b592-7dc606496922",
"isResizable": true,
"w": 2,
"x": 0,
"y": 14
},
"maxPerRow": 4,
"name": "uptime",
"options": {
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "(sum(vsphere_host_sys_uptime_latest{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername)/count(vsphere_host_sys_uptime_latest{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername))/86500",
"legend": "",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8aca78d4-2869-44c4-a702-ad46f5c89443",
"layout": {
"h": 5,
"i": "af0d1102-cf29-45b9-b647-1bc0b605ac04",
"isResizable": true,
"w": 4,
"x": 2,
"y": 14
},
"maxPerRow": 4,
"name": "cluster CPU Usage %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_host_cpu_usage_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername)/count(vsphere_host_cpu_usage_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername))",
"legend": "{{clustername}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "8309ec72-51cb-4b9d-ba7e-e22acfd6e461",
"layout": {
"h": 5,
"i": "69332b5d-8841-4572-92f5-6ad237fb6ad5",
"isResizable": true,
"w": 4,
"x": 6,
"y": 14
},
"maxPerRow": 4,
"name": "cluster RAM Usage in %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_host_mem_usage_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername)/count(vsphere_host_mem_usage_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername))",
"legend": "{{clustername}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "7b43af1c-0814-42eb-ba31-14ed6eb07dc9",
"layout": {
"h": 5,
"i": "7dad6d09-2d17-41aa-845f-30f916ee344d",
"isResizable": true,
"w": 5,
"x": 10,
"y": 14
},
"maxPerRow": 4,
"name": "cluster Network Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(vsphere_host_net_bytesRx_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername))/1000 ",
"legend": "{{clustername}}-net_bytesRx",
"refId": "B"
},
{
"expr": "(sum(vsphere_host_net_bytesTx_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(clustername))/1000",
"legend": "{{clustername}}-net_bytesTx",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"aggrDimension": "source",
"calc": "lastNotNull",
"colorMode": "background",
"displayMode": "labelValuesToRows",
"showHeader": true,
"sortColumn": "source",
"sortOrder": "descend"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6dffc761-7805-47c5-b82d-cf34dd7b8b11",
"layout": {
"h": 5,
"i": "6dffc761-7805-47c5-b82d-cf34dd7b8b11",
"isResizable": true,
"w": 9,
"x": 15,
"y": 14
},
"maxPerRow": 4,
"options": {
"standardOptions": {
"util": "none"
},
"valueMappings": []
},
"overrides": [
{
"matcher": {
"value": "C"
},
"properties": {
"standardOptions": {
"util": "none"
},
"valueMappings": [
{
"match": {
"from": 1,
"to": 30
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"from": 30,
"special": 30,
"to": 50
},
"result": {
"color": "rgba(185, 159, 0, 1)"
},
"type": "range"
},
{
"match": {
"from": 50,
"special": 90
},
"result": {
"color": "rgba(255, 101, 107, 1)"
},
"type": "range"
}
]
}
},
{
"matcher": {
"value": "A"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
}
]
},
"type": "special"
},
{
"matcher": {
"value": "B"
},
"properties": {
"standardOptions": {},
"valueMappings": [
{
"match": {
"from": 1,
"special": 1,
"to": 350
},
"result": {
"color": "rgba(63, 196, 83, 1)"
},
"type": "range"
},
{
"match": {
"from": 350,
"special": 300
},
"result": {
"color": "rgba(185, 159, 0, 1)"
},
"type": "range"
}
]
},
"type": "special"
}
],
"targets": [
{
"expr": "sum(vsphere_datastore_disk_capacity_latest{vcenter=\"$vcenter\"}) by(source) / 1024 / 1024",
"legend": "总量(GB)",
"refId": "A"
},
{
"expr": "sum(vsphere_datastore_disk_used_latest{vcenter=\"$vcenter\"}) by(source) / 1024 / 1024",
"legend": "使用量(GB)",
"refId": "B"
},
{
"expr": "sum(vsphere_datastore_disk_used_latest{vcenter=\"$vcenter\"}/vsphere_datastore_disk_capacity_latest{vcenter=\"$vcenter\"}) by(source)*100",
"legend": "使用率(%)",
"refId": "C"
}
],
"type": "table",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "fe54e096-8e11-406b-98f6-d2c5d76d9d8d",
"layout": {
"h": 1,
"i": "fe54e096-8e11-406b-98f6-d2c5d76d9d8d",
"isResizable": false,
"w": 24,
"x": 0,
"y": 19
},
"name": "Exsi status",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "4cca929b-8a04-4c0c-924f-240ad5cf08d9",
"layout": {
"h": 5,
"i": "4cca929b-8a04-4c0c-924f-240ad5cf08d9",
"isResizable": true,
"w": 2,
"x": 0,
"y": 20
},
"name": "uptime\n",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "vsphere_host_sys_uptime_latest{esxhostname=\"$esxi\"}",
"legend": "{{esxhostname}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c097de32-5d94-4b32-8f93-0ac8cfe32657",
"layout": {
"h": 5,
"i": "7f2edbdb-d890-4799-89b0-fad87ebf1c22",
"isResizable": true,
"w": 3,
"x": 2,
"y": 20
},
"name": "Host CPU Ready Time",
"options": {
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "(sum(vsphere_host_cpu_ready_summation{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname)/count(vsphere_host_cpu_ready_summation{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname))",
"legend": "cpu usage",
"refId": "B"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "25ed7e6c-1d20-4d3f-a2d3-de1e9bb2fb17",
"layout": {
"h": 5,
"i": "9e349e57-b55e-462b-b63f-faed76213544",
"isResizable": true,
"w": 5,
"x": 5,
"y": 20
},
"name": "Host CPU Usage %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_host_cpu_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname)/count(vsphere_host_cpu_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname))",
"legend": "{{esxhostname}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2f77a4cc-0950-4fe5-86d7-0d6ebfed3d0c",
"layout": {
"h": 5,
"i": "01d79496-7b09-44c1-8e5e-8430509295f3",
"isResizable": true,
"w": 5,
"x": 10,
"y": 20
},
"name": "Host RAM Usage in %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_host_mem_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname)/count(vsphere_host_mem_usage_average{vcenter=\"$vcenter\",esxhostname=\"$esxi\"}) by(esxhostname))",
"legend": "{{esxhostname}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "af8a3cca-ad6e-4650-838f-c38c1cf2c7fa",
"layout": {
"h": 5,
"i": "446ea897-c338-40f7-a146-2cda7bbca311",
"isResizable": true,
"w": 9,
"x": 15,
"y": 20
},
"maxPerRow": 4,
"name": "cluster Network Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_host_net_bytesRx_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(esxhostname))/1000",
"legend": "{{esxhostname}}-net_bytesRx",
"refId": "B"
},
{
"expr": "(sum(vsphere_host_net_bytesTx_average{clustername=\"$cluster\",vcenter=\"$vcenter\"}) by(esxhostname))/1000",
"legend": "{{esxhostname}}-net_bytesTx",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"collapsed": true,
"id": "d2f2839c-11d2-470f-85a8-da9e81e72ad3",
"layout": {
"h": 1,
"i": "d2f2839c-11d2-470f-85a8-da9e81e72ad3",
"isResizable": false,
"w": 24,
"x": 0,
"y": 25
},
"name": "VMs status",
"panels": [],
"type": "row"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "772d5173-cde1-4e3f-a72d-864f737e07b4",
"layout": {
"h": 5,
"i": "7a9fe621-aca3-4a32-aae0-c3f3cf951ba3",
"isResizable": true,
"w": 2,
"x": 0,
"y": 26
},
"name": "uptime",
"options": {
"standardOptions": {
"util": "humantimeSeconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "vsphere_vm_sys_uptime_latest{vmname=\"$vmname\"}",
"legend": "{{vmname}}",
"refId": "A"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"calc": "lastNotNull",
"colSpan": 1,
"colorMode": "value",
"textMode": "valueAndName",
"textSize": {},
"valueField": "Value"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "6a28ae90-8a63-45a1-9ceb-ab660f0a0d75",
"layout": {
"h": 5,
"i": "f1ba46e5-558d-483e-bc9b-5cffff7343a8",
"isResizable": true,
"w": 3,
"x": 2,
"y": 26
},
"name": "VM CPU Ready Time",
"options": {
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
}
},
"targets": [
{
"expr": "(sum(vsphere_vm_cpu_ready_summation{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname)/count(vsphere_vm_cpu_ready_summation{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname))",
"legend": "cpu usage",
"refId": "B"
}
],
"type": "stat",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "0cd4f9df-81c6-4a31-a15e-060a3b0c7e65",
"layout": {
"h": 5,
"i": "96826cc9-cbe0-4e4e-9144-90d9052904cd",
"isResizable": true,
"w": 5,
"x": 5,
"y": 26
},
"name": "VM CPU Usage %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_vm_cpu_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname)/count(vsphere_vm_cpu_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname))",
"legend": "{{vmname}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "2beeb800-bacd-49ec-b44a-4b19d6497808",
"layout": {
"h": 5,
"i": "7651144f-7b05-4a21-a595-2816f108b23d",
"isResizable": true,
"w": 5,
"x": 10,
"y": 26
},
"name": "VM RAM Usage in %",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"expr": "(sum(vsphere_vm_mem_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname)/count(vsphere_vm_mem_usage_average{vcenter=\"$vcenter\",vmname=\"$vmname\"}) by(vmname))",
"legend": "{{vmname}}",
"refId": "B"
}
],
"type": "timeseries",
"version": "2.0.0"
},
{
"custom": {
"drawStyle": "lines",
"fillOpacity": 0.3,
"gradientMode": "opacity",
"lineInterpolation": "smooth",
"lineWidth": 2,
"scaleDistribution": {
"type": "linear"
},
"spanNulls": false,
"stack": "off"
},
"datasourceCate": "prometheus",
"datasourceValue": "${datasource}",
"id": "c2804d43-f9b0-4a74-a3fe-fb1f8b5f4a01",
"layout": {
"h": 5,
"i": "06122a9b-53b1-4ef9-93be-2419197600c8",
"isResizable": true,
"w": 9,
"x": 15,
"y": 26
},
"name": "VMNetwork Usage",
"options": {
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "none"
},
"thresholds": {
"steps": [
{
"color": "#634CD9",
"type": "base",
"value": null
}
]
},
"tooltip": {
"mode": "all",
"sort": "none"
}
},
"targets": [
{
"expr": "(sum(vsphere_vm_net_bytesRx_average{vmname=\"$vmname\",vcenter=\"$vcenter\"}) by(vmname))/1000",
"legend": "{{vmname}}-net_bytesRx",
"refId": "B"
},
{
"expr": "(sum(vsphere_vm_net_bytesTx_average{vmname=\"$vmname\",vcenter=\"$vcenter\"}) by(vmname))/1000",
"legend": "{{vmname}}-net_bytesTx",
"refId": "A"
}
],
"type": "timeseries",
"version": "2.0.0"
}
],
"var": [
{
"definition": "prometheus",
"name": "datasource",
"type": "datasource"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "vsphere_host_cpu_usage_average",
"multi": false,
"name": "vcenter",
"reg": "/.*vcenter=\"(.*?)\".*/",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "vsphere_host_cpu_usage_average{vcenter=\"$vcenter\"}",
"name": "cluster",
"reg": "/.*clustername=\"(.*?)\".*/",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "vsphere_host_cpu_usage_average{vcenter=\"$vcenter\"}",
"name": "esxi",
"reg": "/.*esxhostname=\"(.*?)\".*/",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "vsphere_vm_sys_uptime_latest{vcenter=\"$vcenter\"}",
"name": "vmname",
"reg": "/.*vmname=\"(.*?)\".*/",
"type": "query"
},
{
"datasource": {
"cate": "prometheus",
"value": "${datasource}"
},
"definition": "vsphere_datastore_disk_provisioned_latest{vcenter=\"$vcenter\"}",
"name": "datastore",
"reg": "/.*dsname=\"(.*?)\".*/",
"type": "query"
}
],
"version": "3.0.0"
},
"public": 0,
"public_cate": 0,
"bgids": null,
"built_in": 0,
"hide": 0,
"uuid": 1717556328609733000
}
================================================
FILE: integrations/vSphere/markdown/README.md
================================================
# VMware vSphere
使用 [categraf](https://github.com/flashcatcloud/categraf) 中的 [inputs.vsphere](https://github.com/flashcatcloud/categraf/tree/main/inputs/vsphere) 插件采集 VMware 指标数据。
VMware vSphere的两个核心组件: ESXi Server & vCenter Server。要监控 vSphere,需要部署 vCenter。
- ESXi Server 是 Hypervisor,在其中创建和运行虚拟机和虚拟设备。
- vCenter Server 是用于管理网络中连接的多个 ESXi 主机和主机资源池的服务。
博客参考:[夜莺监控之 Categraf 监控 VMware vSphere](https://unixsre.com/posts/n9e-monitor-vsphere/)
## 采集配置
Categraf 中的 `conf/input.vsphere/vsphere.toml`。
监控数据的获取,其实就是通过 vCenter 的 API 获取,所以需要配置 vCenter 的地址、用户名和密码。配置文件里默认示例是 administrator 账号,权限较大,仅做测试使用,建议做好权限控制,可以在 vCenter 中自己建用户跟角色。
```toml
[[instances]]
labels = { instance="192.168.11.111", clustername="Datacenter" }
## vCenter URLs to be monitored. These three lines must be uncommented
## and edited for the plugin to work.
## FQDN URLs to be monitored. These three lines must be uncommented
vcenter = "https://vcenter.unixsre.com/sdk"
username = "administrator@vcenter.unixsre.com"
password = "111111119@abcdef"
## VMs
## Typical VM metrics (if omitted or empty, all metrics are collected)
# vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected)
# vm_exclude = [] # Inventory paths to exclude
vm_metric_include = [
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.run.summation",
"cpu.usage.average",
"cpu.used.summation",
"cpu.wait.summation",
"mem.active.average",
"mem.granted.average",
"mem.latency.average",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.usage.average",
"power.power.average",
"virtualDisk.numberReadAveraged.average",
"virtualDisk.numberWriteAveraged.average",
"virtualDisk.read.average",
"virtualDisk.readOIO.latest",
"virtualDisk.throughput.usage.average",
"virtualDisk.totalReadLatency.average",
"virtualDisk.totalWriteLatency.average",
"virtualDisk.write.average",
"virtualDisk.writeOIO.latest",
"sys.uptime.latest",
]
# vm_metric_exclude = [] ## Nothing is excluded by default
# vm_instances = true ## true by default
## Hosts
## Typical host metrics (if omitted or empty, all metrics are collected)
# host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected)
# host_exclude = [] # Inventory paths to exclude
host_metric_include = [
"cpu.coreUtilization.average",
"cpu.costop.summation",
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.swapwait.summation",
"cpu.usage.average",
"cpu.used.summation",
"cpu.utilization.average",
"cpu.wait.summation",
"disk.deviceReadLatency.average",
"disk.deviceWriteLatency.average",
"disk.kernelReadLatency.average",
"disk.kernelWriteLatency.average",
"disk.numberReadAveraged.average",
"disk.numberWriteAveraged.average",
"disk.read.average",
"disk.totalReadLatency.average",
"disk.totalWriteLatency.average",
"disk.write.average",
"mem.active.average",
"mem.latency.average",
"mem.state.latest",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.totalCapacity.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.errorsRx.summation",
"net.errorsTx.summation",
"net.usage.average",
"power.power.average",
"storageAdapter.numberReadAveraged.average",
"storageAdapter.numberWriteAveraged.average",
"storageAdapter.read.average",
"storageAdapter.write.average",
"sys.uptime.latest",
]
# host_instances = true ## true by default
# host_include = [] ## Nothing included by default
# host_exclude = [] ## Nothing excluded by default
# host_metric_include = [] ## Nothing included by default
# host_metric_exclude = [] ## Nothing excluded by default
## Clusters
# cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
# cluster_exclude = [] # Inventory paths to exclude
# cluster_metric_include = [] ## if omitted or empty, all metrics are collected
# cluster_metric_exclude = [] ## Nothing excluded by default
# cluster_instances = false ## false by default
## Resource Pools
# resoucepool_include = [ "/*/host/**"] # Inventory path to datastores to collect (by default all are collected)
# resoucepool_exclude = [] # Inventory paths to exclude
# resoucepool_metric_include = [] ## if omitted or empty, all metrics are collected
# resoucepool_metric_exclude = [] ## Nothing excluded by default
# resoucepool_instances = false ## false by default
## Datastores
# datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected)
# datastore_exclude = [] # Inventory paths to exclude
# datastore_metric_include = [] ## if omitted or empty, all metrics are collected
# datastore_metric_exclude = [] ## Nothing excluded by default
# datastore_instances = false ## false by default
## Datacenters
# datacenter_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
# datacenter_exclude = [] # Inventory paths to exclude
# datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
# datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
## Collect IP addresses? Valid values are "ipv4" and "ipv6"
# ip_addresses = ["ipv6", "ipv4" ]
## When set to true, all samples are sent as integers. This makes the output
## data types backwards compatible with Telegraf 1.9 or lower. Normally all
## samples from vCenter, with the exception of percentages, are integer
## values, but under some conditions, some averaging takes place internally in
## the plugin. Setting this flag to "false" will send values as floats to
## preserve the full precision when averaging takes place.
# use_int_samples = true
## Custom attributes from vCenter can be very useful for queries in order to slice the
## metrics along different dimension and for forming ad-hoc relationships. They are disabled
## by default, since they can add a considerable amount of tags to the resulting metrics. To
## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
## to select the attributes you want to include.
## By default, since they can add a considerable amount of tags to the resulting metrics. To
## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
## to select the attributes you want to include.
# custom_attribute_include = []
# custom_attribute_exclude = ["*"]
## The number of vSphere 5 minute metric collection cycles to look back for non-realtime metrics. In
## some versions (6.7, 7.0 and possible more), certain metrics, such as cluster metrics, may be reported
## with a significant delay (>30min). If this happens, try increasing this number. Please note that increasing
## it too much may cause performance issues.
# metric_lookback = 3
## number of objects to retrieve per query for realtime resources (vms and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
## number of go routines to use for collection and discovery of objects and metrics
# collect_concurrency = 1
# discover_concurrency = 1
## the interval before (re)discovering objects subject to metrics collection (default: 300s)
# object_discovery_interval = "300s"
## timeout applies to any of the api request made to vcenter
# timeout = "60s"
## Optional SSL Config
use_tls = true
# tls_ca = "/path/to/cafile"
# tls_cert = "/path/to/certfile"
# tls_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
insecure_skip_verify = true
## The Historical Interval value must match EXACTLY the interval in the daily
# "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals
# historical_interval = "5m"
```
================================================
FILE: memsto/alert_mute_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// AlertMuteCacheType is an in-memory cache of alert mutes grouped by business
// group id. statTotal and statLastUpdated remember the statistics recorded by
// the last Set (or -1 after Reset) and are what StatChanged compares against.
type AlertMuteCacheType struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// Embedded lock taken by the accessors around reads and writes of mutes.
sync.RWMutex
mutes map[int64][]*models.AlertMute // key: busi_group_id
}
// NewAlertMuteCache creates an empty cache with both stat markers set to -1
// and then calls SyncAlertMutes, which runs the first sync and starts the
// background refresh loop, before returning the cache.
func NewAlertMuteCache(ctx *ctx.Context, stats *Stats) *AlertMuteCacheType {
	cache := new(AlertMuteCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.mutes = map[int64][]*models.AlertMute{}

	cache.SyncAlertMutes()
	return cache
}
// Reset drops every cached mute and sets both stat markers back to -1, all
// under the write lock.
func (amc *AlertMuteCacheType) Reset() {
	empty := make(map[int64][]*models.AlertMute)

	amc.Lock()
	defer amc.Unlock()

	amc.statTotal, amc.statLastUpdated = -1, -1
	amc.mutes = empty
}
// StatChanged reports whether total or lastUpdated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (amc *AlertMuteCacheType) StatChanged(total, lastUpdated int64) bool {
	return amc.statTotal != total || amc.statLastUpdated != lastUpdated
}
// Set replaces the whole mute map with ms and records the statistics that
// produced it. Only the map swap happens under the write lock.
func (amc *AlertMuteCacheType) Set(ms map[int64][]*models.AlertMute, total, lastUpdated int64) {
	amc.Lock()
	amc.mutes = ms
	amc.Unlock()

	// only one goroutine used, so no need lock
	amc.statTotal, amc.statLastUpdated = total, lastUpdated
}
// Gets returns the mutes cached for business group bgid and whether the group
// has an entry at all. The returned slice is the cached one, not a copy.
func (amc *AlertMuteCacheType) Gets(bgid int64) ([]*models.AlertMute, bool) {
	amc.RLock()
	list, found := amc.mutes[bgid]
	amc.RUnlock()

	return list, found
}
// GetAllStructs returns a copy of the cache in which every mute is stored by
// value (dereferenced) rather than by pointer. Groups whose list is empty do
// not get a key in the result.
func (amc *AlertMuteCacheType) GetAllStructs() map[int64][]models.AlertMute {
	amc.RLock()
	defer amc.RUnlock()

	out := make(map[int64][]models.AlertMute)
	for groupID, mutes := range amc.mutes {
		for _, mute := range mutes {
			out[groupID] = append(out[groupID], *mute)
		}
	}
	return out
}
// SyncAlertMutes runs one synchronous sync; on failure it prints the error and
// calls exit(1). On success it starts the periodic refresh in a new goroutine.
func (amc *AlertMuteCacheType) SyncAlertMutes() {
	if err := amc.syncAlertMutes(); err != nil {
		fmt.Println("failed to sync alert mutes:", err)
		exit(1)
	}

	go amc.loopSyncAlertMutes()
}
// loopSyncAlertMutes re-syncs the cache forever, sleeping 9 seconds before
// each attempt. Failures are logged as warnings and the loop continues.
func (amc *AlertMuteCacheType) loopSyncAlertMutes() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := amc.syncAlertMutes()
		if err != nil {
			logger.Warning("failed to sync alert mutes:", err)
		}
	}
}
// syncAlertMutes reloads the cache when the alert-mute statistics changed.
//
// It first fetches the statistics; if they equal the cached stat markers it
// zeroes the sync gauges, records "not changed" and returns nil. Otherwise it
// loads all mutes, keeps the ones whose Parse succeeds, groups them by
// GroupId and swaps them in via Set. Every outcome is recorded with
// dumper.PutSyncRecord. A non-nil error is returned only when one of the two
// queries fails.
func (amc *AlertMuteCacheType) syncAlertMutes() error {
	start := time.Now()

	stat, err := models.AlertMuteStatistics(amc.ctx)
	if err != nil {
		dumper.PutSyncRecord("alert_mutes", start.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec AlertMuteStatistics")
	}

	if !amc.StatChanged(stat.Total, stat.LastUpdated) {
		amc.stats.GaugeCronDuration.WithLabelValues("sync_alert_mutes").Set(0)
		amc.stats.GaugeSyncNumber.WithLabelValues("sync_alert_mutes").Set(0)
		dumper.PutSyncRecord("alert_mutes", start.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.AlertMuteGetsAll(amc.ctx)
	if err != nil {
		dumper.PutSyncRecord("alert_mutes", start.Unix(), -1, -1, "failed to query records: "+err.Error())
		// Name the function that actually failed (the old text referred to
		// AlertMuteGetsByCluster, which is not what is called here).
		return errors.WithMessage(err, "failed to exec AlertMuteGetsAll")
	}

	oks := make(map[int64][]*models.AlertMute)
	for _, mute := range lst {
		if err := mute.Parse(); err != nil {
			// Unparsable mutes are skipped; include the cause so the skip can be diagnosed.
			logger.Warningf("failed to parse alert_mute, id: %d, err: %v", mute.Id, err)
			continue
		}
		oks[mute.GroupId] = append(oks[mute.GroupId], mute)
	}

	amc.Set(oks, stat.Total, stat.LastUpdated)

	ms := time.Since(start).Milliseconds()
	amc.stats.GaugeCronDuration.WithLabelValues("sync_alert_mutes").Set(float64(ms))
	// The reported count is the number of rows loaded, including skipped ones.
	amc.stats.GaugeSyncNumber.WithLabelValues("sync_alert_mutes").Set(float64(len(lst)))
	dumper.PutSyncRecord("alert_mutes", start.Unix(), ms, len(lst), "success")

	return nil
}
================================================
FILE: memsto/alert_rule_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// AlertRuleCacheType is an in-memory cache of alert rules indexed by rule id.
// statTotal and statLastUpdated remember the statistics recorded by the last
// Set (or -1 after Reset) and are what StatChanged compares against.
type AlertRuleCacheType struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// Embedded lock taken by the accessors around reads and writes of rules.
sync.RWMutex
rules map[int64]*models.AlertRule // key: rule id
}
// NewAlertRuleCache creates an empty cache with both stat markers set to -1
// and then calls SyncAlertRules, which runs the first sync and starts the
// background refresh loop, before returning the cache.
func NewAlertRuleCache(ctx *ctx.Context, stats *Stats) *AlertRuleCacheType {
	cache := new(AlertRuleCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.rules = map[int64]*models.AlertRule{}

	cache.SyncAlertRules()
	return cache
}
// Reset drops every cached rule and sets both stat markers back to -1, all
// under the write lock.
func (arc *AlertRuleCacheType) Reset() {
	empty := make(map[int64]*models.AlertRule)

	arc.Lock()
	defer arc.Unlock()

	arc.statTotal, arc.statLastUpdated = -1, -1
	arc.rules = empty
}
// StatChanged reports whether total or lastUpdated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (arc *AlertRuleCacheType) StatChanged(total, lastUpdated int64) bool {
	return arc.statTotal != total || arc.statLastUpdated != lastUpdated
}
// Set replaces the whole rule map with m and records the statistics that
// produced it. Only the map swap happens under the write lock.
func (arc *AlertRuleCacheType) Set(m map[int64]*models.AlertRule, total, lastUpdated int64) {
	arc.Lock()
	arc.rules = m
	arc.Unlock()

	// only one goroutine used, so no need lock
	arc.statTotal, arc.statLastUpdated = total, lastUpdated
}
// Get returns the cached rule with the given id, or nil when the id is not
// in the cache.
func (arc *AlertRuleCacheType) Get(ruleId int64) *models.AlertRule {
	arc.RLock()
	rule := arc.rules[ruleId]
	arc.RUnlock()

	return rule
}
// GetRuleIds returns the ids of all cached rules. The order follows map
// iteration and is therefore unspecified.
func (arc *AlertRuleCacheType) GetRuleIds() []int64 {
	arc.RLock()
	defer arc.RUnlock()

	ids := make([]int64, 0, len(arc.rules))
	for id := range arc.rules {
		ids = append(ids, id)
	}
	return ids
}
// SyncAlertRules runs one synchronous sync; on failure it prints the error and
// calls exit(1). On success it starts the periodic refresh in a new goroutine.
func (arc *AlertRuleCacheType) SyncAlertRules() {
	if err := arc.syncAlertRules(); err != nil {
		fmt.Println("failed to sync alert rules:", err)
		exit(1)
	}

	go arc.loopSyncAlertRules()
}
// loopSyncAlertRules re-syncs the cache forever, sleeping 9 seconds before
// each attempt. Failures are logged as warnings and the loop continues.
func (arc *AlertRuleCacheType) loopSyncAlertRules() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := arc.syncAlertRules()
		if err != nil {
			logger.Warning("failed to sync alert rules:", err)
		}
	}
}
// syncAlertRules reloads the cache when the alert-rule statistics changed.
//
// It first fetches the statistics; if they equal the cached stat markers it
// zeroes the sync gauges, records "not changed" and returns nil. Otherwise it
// loads all rules, indexes them by Id and swaps them in via Set. Every outcome
// is recorded with dumper.PutSyncRecord. A non-nil error is returned only when
// one of the two queries fails.
func (arc *AlertRuleCacheType) syncAlertRules() error {
	start := time.Now()

	stat, err := models.AlertRuleStatistics(arc.ctx)
	if err != nil {
		dumper.PutSyncRecord("alert_rules", start.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec AlertRuleStatistics")
	}

	if !arc.StatChanged(stat.Total, stat.LastUpdated) {
		arc.stats.GaugeCronDuration.WithLabelValues("sync_alert_rules").Set(0)
		arc.stats.GaugeSyncNumber.WithLabelValues("sync_alert_rules").Set(0)
		dumper.PutSyncRecord("alert_rules", start.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.AlertRuleGetsAll(arc.ctx)
	if err != nil {
		dumper.PutSyncRecord("alert_rules", start.Unix(), -1, -1, "failed to query records: "+err.Error())
		// Name the function that actually failed (the old text referred to
		// AlertRuleGetsByCluster, which is not what is called here).
		return errors.WithMessage(err, "failed to exec AlertRuleGetsAll")
	}

	m := make(map[int64]*models.AlertRule, len(lst))
	for _, rule := range lst {
		m[rule.Id] = rule
	}

	arc.Set(m, stat.Total, stat.LastUpdated)

	ms := time.Since(start).Milliseconds()
	arc.stats.GaugeCronDuration.WithLabelValues("sync_alert_rules").Set(float64(ms))
	arc.stats.GaugeSyncNumber.WithLabelValues("sync_alert_rules").Set(float64(len(m)))
	dumper.PutSyncRecord("alert_rules", start.Unix(), ms, len(m), "success")

	return nil
}
================================================
FILE: memsto/alert_subscribe_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// AlertSubscribeCacheType is an in-memory cache of alert subscriptions.
// statTotal and statLastUpdated remember the statistics recorded by the last
// Set (or -1 after Reset) and are what StatChanged compares against.
type AlertSubscribeCacheType struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// Embedded lock taken by the accessors around reads and writes of subs.
sync.RWMutex
// key: rule id; syncAlertSubscribes files subscriptions without any rule id under key 0.
subs map[int64][]*models.AlertSubscribe
}
// NewAlertSubscribeCache creates an empty cache with both stat markers set to
// -1 and then calls SyncAlertSubscribes, which runs the first sync and starts
// the background refresh loop, before returning the cache.
func NewAlertSubscribeCache(ctx *ctx.Context, stats *Stats) *AlertSubscribeCacheType {
	cache := new(AlertSubscribeCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.subs = map[int64][]*models.AlertSubscribe{}

	cache.SyncAlertSubscribes()
	return cache
}
// Reset drops every cached subscription and sets both stat markers back to
// -1, all under the write lock.
func (c *AlertSubscribeCacheType) Reset() {
	empty := make(map[int64][]*models.AlertSubscribe)

	c.Lock()
	defer c.Unlock()

	c.statTotal, c.statLastUpdated = -1, -1
	c.subs = empty
}
// StatChanged reports whether total or lastUpdated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (c *AlertSubscribeCacheType) StatChanged(total, lastUpdated int64) bool {
	return c.statTotal != total || c.statLastUpdated != lastUpdated
}
// Set replaces the whole subscription map with m and records the statistics
// that produced it. Only the map swap happens under the write lock.
func (c *AlertSubscribeCacheType) Set(m map[int64][]*models.AlertSubscribe, total, lastUpdated int64) {
	c.Lock()
	c.subs = m
	c.Unlock()

	// only one goroutine used, so no need lock
	c.statTotal, c.statLastUpdated = total, lastUpdated
}
// Get returns the subscriptions cached under ruleId and whether that key
// exists. The returned slice is the cached one, not a copy.
func (c *AlertSubscribeCacheType) Get(ruleId int64) ([]*models.AlertSubscribe, bool) {
	c.RLock()
	list, found := c.subs[ruleId]
	c.RUnlock()

	return list, found
}
// GetAll concatenates the subscription lists of every key into one slice.
// The result is nil when the cache holds nothing, and a subscription that is
// stored under several keys appears once per key. Order is unspecified.
func (c *AlertSubscribeCacheType) GetAll() []*models.AlertSubscribe {
	c.RLock()
	defer c.RUnlock()

	var all []*models.AlertSubscribe
	for ruleID := range c.subs {
		all = append(all, c.subs[ruleID]...)
	}
	return all
}
// GetStructs returns value copies of the subscriptions cached under ruleId.
// The result is always non-nil; it is empty when ruleId is not cached.
func (c *AlertSubscribeCacheType) GetStructs(ruleId int64) []models.AlertSubscribe {
	c.RLock()
	defer c.RUnlock()

	// A missing key yields a nil list, which produces the same empty,
	// non-nil result as an explicit "not found" branch.
	cached := c.subs[ruleId]
	out := make([]models.AlertSubscribe, 0, len(cached))
	for _, sub := range cached {
		out = append(out, *sub)
	}
	return out
}
// SyncAlertSubscribes runs one synchronous sync; on failure it prints the
// error and calls exit(1). On success it starts the periodic refresh in a new
// goroutine.
func (c *AlertSubscribeCacheType) SyncAlertSubscribes() {
	if err := c.syncAlertSubscribes(); err != nil {
		fmt.Println("failed to sync alert subscribes:", err)
		exit(1)
	}

	go c.loopSyncAlertSubscribes()
}
// loopSyncAlertSubscribes re-syncs the cache forever, sleeping 9 seconds
// before each attempt. Failures are logged as warnings and the loop continues.
func (c *AlertSubscribeCacheType) loopSyncAlertSubscribes() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := c.syncAlertSubscribes()
		if err != nil {
			logger.Warning("failed to sync alert subscribes:", err)
		}
	}
}
// syncAlertSubscribes reloads the cache when the subscription statistics
// changed.
//
// It first fetches the statistics; if they equal the cached stat markers it
// zeroes the sync gauges, records "not changed" and returns nil. Otherwise it
// loads all subscriptions, skips disabled ones and ones that fail Parse,
// DB2FE or FillDatasourceIds, and files every remaining subscription under
// each of its RuleIds (or under key 0 when it has none). Every outcome is
// recorded with dumper.PutSyncRecord. A non-nil error is returned only when
// one of the two queries fails.
func (c *AlertSubscribeCacheType) syncAlertSubscribes() error {
	start := time.Now()

	stat, err := models.AlertSubscribeStatistics(c.ctx)
	if err != nil {
		dumper.PutSyncRecord("alert_subscribes", start.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec AlertSubscribeStatistics")
	}

	if !c.StatChanged(stat.Total, stat.LastUpdated) {
		c.stats.GaugeCronDuration.WithLabelValues("sync_alert_subscribes").Set(0)
		c.stats.GaugeSyncNumber.WithLabelValues("sync_alert_subscribes").Set(0)
		dumper.PutSyncRecord("alert_subscribes", start.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.AlertSubscribeGetsAll(c.ctx)
	if err != nil {
		dumper.PutSyncRecord("alert_subscribes", start.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec AlertSubscribeGetsAll")
	}

	subs := make(map[int64][]*models.AlertSubscribe)
	for _, sub := range lst {
		if sub.Disabled == 1 {
			continue
		}

		// Each preparation step below skips the subscription on failure; the
		// cause is logged so a silently missing subscription can be diagnosed.
		if err := sub.Parse(); err != nil {
			logger.Warningf("failed to parse alert subscribe, id: %d, err: %v", sub.Id, err)
			continue
		}

		if err := sub.DB2FE(); err != nil {
			logger.Warningf("failed to db2fe alert subscribe, id: %d, err: %v", sub.Id, err)
			continue
		}

		if err := sub.FillDatasourceIds(c.ctx); err != nil {
			logger.Warningf("failed to fill datasource ids, id: %d, err: %v", sub.Id, err)
			continue
		}

		sub.CompatibleWithOldRuleId()

		// To cache the subscription rule without id, the default id is 0
		if len(sub.RuleIds) == 0 && sub.RuleId == 0 {
			sub.RuleIds = append(sub.RuleIds, 0)
		}

		for _, rid := range sub.RuleIds {
			subs[rid] = append(subs[rid], sub)
		}
	}

	c.Set(subs, stat.Total, stat.LastUpdated)

	ms := time.Since(start).Milliseconds()
	c.stats.GaugeCronDuration.WithLabelValues("sync_alert_subscribes").Set(float64(ms))
	// The reported count is the number of rows loaded, including skipped ones.
	c.stats.GaugeSyncNumber.WithLabelValues("sync_alert_subscribes").Set(float64(len(lst)))
	dumper.PutSyncRecord("alert_subscribes", start.Unix(), ms, len(lst), "success")

	return nil
}
================================================
FILE: memsto/busi_group_cache.go
================================================
package memsto
import (
"log"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// BusiGroupCacheType is an in-memory cache of business groups indexed by id.
// statTotal and statLastUpdated remember the statistics recorded by the last
// Set and are what StatChanged compares against.
type BusiGroupCacheType struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// Embedded lock taken by the accessors around reads and writes of ugs.
sync.RWMutex
ugs map[int64]*models.BusiGroup // key: id
}
// NewBusiGroupCache creates an empty cache with both stat markers set to -1
// and then calls SyncBusiGroups, which runs the first sync and starts the
// background refresh loop, before returning the cache.
func NewBusiGroupCache(ctx *ctx.Context, stats *Stats) *BusiGroupCacheType {
	cache := new(BusiGroupCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ugs = map[int64]*models.BusiGroup{}
	cache.ctx = ctx
	cache.stats = stats

	cache.SyncBusiGroups()
	return cache
}
// StatChanged reports whether total or lastUpdated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (c *BusiGroupCacheType) StatChanged(total, lastUpdated int64) bool {
	return c.statTotal != total || c.statLastUpdated != lastUpdated
}
// Set replaces the whole business-group map with ugs and records the
// statistics that produced it. Only the map swap happens under the write lock.
func (c *BusiGroupCacheType) Set(ugs map[int64]*models.BusiGroup, total, lastUpdated int64) {
	c.Lock()
	c.ugs = ugs
	c.Unlock()

	// only one goroutine used, so no need lock
	c.statTotal, c.statLastUpdated = total, lastUpdated
}
// GetByBusiGroupId returns the cached business group with the given id, or
// nil when the id is not in the cache.
func (c *BusiGroupCacheType) GetByBusiGroupId(id int64) *models.BusiGroup {
	c.RLock()
	group := c.ugs[id]
	c.RUnlock()

	return group
}
// GetNamesByBusiGroupIds returns the names of the cached business groups for
// the given ids, in the order of ids. Ids that are not cached are skipped, so
// the result may be shorter than ids; it is never nil.
func (c *BusiGroupCacheType) GetNamesByBusiGroupIds(ids []int64) []string {
	c.RLock()
	defer c.RUnlock()

	names := make([]string, 0, len(ids))
	for i := range ids {
		group, found := c.ugs[ids[i]]
		if !found {
			continue
		}
		names = append(names, group.Name)
	}
	return names
}
// SyncBusiGroups runs one synchronous sync and terminates the process through
// log.Fatalln if it fails. On success it starts the periodic refresh in a new
// goroutine.
func (c *BusiGroupCacheType) SyncBusiGroups() {
	if err := c.syncBusiGroups(); err != nil {
		log.Fatalln("failed to sync busi groups:", err)
	}

	go c.loopSyncBusiGroups()
}
// loopSyncBusiGroups re-syncs the cache forever, sleeping 9 seconds before
// each attempt. Failures are logged as warnings and the loop continues.
func (c *BusiGroupCacheType) loopSyncBusiGroups() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := c.syncBusiGroups()
		if err != nil {
			logger.Warning("failed to sync busi groups:", err)
		}
	}
}
// syncBusiGroups reloads the cache when the business-group statistics changed.
//
// If the fetched statistics equal the cached stat markers it zeroes the sync
// gauges, records "not changed" and returns nil. Otherwise it loads the group
// map and swaps it in via Set. Every outcome is recorded with
// dumper.PutSyncRecord; an error is returned only when a query fails.
func (c *BusiGroupCacheType) syncBusiGroups() error {
	const (
		record = "busi_groups"
		job    = "sync_busi_groups"
	)
	begin := time.Now()

	st, err := models.BusiGroupStatistics(c.ctx)
	if err != nil {
		dumper.PutSyncRecord(record, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to call BusiGroupStatistics")
	}

	unchanged := !c.StatChanged(st.Total, st.LastUpdated)
	if unchanged {
		c.stats.GaugeCronDuration.WithLabelValues(job).Set(0)
		c.stats.GaugeSyncNumber.WithLabelValues(job).Set(0)
		dumper.PutSyncRecord(record, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	groups, err := models.BusiGroupGetMap(c.ctx)
	if err != nil {
		dumper.PutSyncRecord(record, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to call BusiGroupGetMap")
	}

	c.Set(groups, st.Total, st.LastUpdated)

	elapsed := time.Since(begin).Milliseconds()
	count := len(groups)
	c.stats.GaugeCronDuration.WithLabelValues(job).Set(float64(elapsed))
	c.stats.GaugeSyncNumber.WithLabelValues(job).Set(float64(count))
	dumper.PutSyncRecord(record, begin.Unix(), elapsed, count, "success")

	return nil
}
// GetNameByBusiGroupId returns the name of the cached business group with the
// given id, or "" when the lookup yields nil (for example an unknown id).
func (c *BusiGroupCacheType) GetNameByBusiGroupId(id int64) string {
	c.RLock()
	defer c.RUnlock()

	if group := c.ugs[id]; group != nil {
		return group.Name
	}
	return ""
}
================================================
FILE: memsto/config_cache.go
================================================
package memsto
import (
"log"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// ConfigCache is an in-memory cache of the decrypted user-variable map.
// statTotal and statLastUpdated remember the statistics recorded by the last
// Set and are what statChanged compares against. privateKey and passWord are
// handed to models.ConfigUserVariableGetDecryptMap on every reload.
type ConfigCache struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
privateKey []byte
passWord string
// mu is held by Set, Get and GetLastUpdateTime.
mu sync.RWMutex
userVariableMap map[string]string
}
// NewConfigCache creates an empty cache with both stat markers set to -1 and
// then calls initSyncConfigs, which runs the first sync and starts the
// background refresh loop, before returning the cache.
func NewConfigCache(ctx *ctx.Context, status *Stats, privateKey []byte, passWord string) *ConfigCache {
	cache := new(ConfigCache)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = status
	cache.privateKey = privateKey
	cache.passWord = passWord
	cache.userVariableMap = map[string]string{}

	cache.initSyncConfigs()
	return cache
}
// initSyncConfigs runs one synchronous sync and terminates the process through
// log.Fatalln if it fails. On success it starts the periodic refresh in a new
// goroutine.
func (c *ConfigCache) initSyncConfigs() {
	if err := c.syncConfigs(); err != nil {
		log.Fatalln("failed to sync configs:", err)
	}

	go c.loopSyncConfigs()
}
// loopSyncConfigs re-syncs the cache forever, sleeping 9 seconds before each
// attempt. Failures are logged as warnings and the loop continues.
func (c *ConfigCache) loopSyncConfigs() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := c.syncConfigs()
		if err != nil {
			logger.Warning("failed to sync configs:", err)
		}
	}
}
// syncConfigs reloads the decrypted user-variable map when the user-variable
// statistics changed.
//
// If the fetched statistics equal the cached stat markers it zeroes the sync
// gauges, records "not changed" and returns nil. Otherwise it loads the
// decrypted map using the configured private key and password and swaps it in
// via Set. Every outcome is recorded with dumper.PutSyncRecord; an error is
// returned only when a query fails.
func (c *ConfigCache) syncConfigs() error {
	start := time.Now()

	stat, err := models.ConfigsUserVariableStatistics(c.ctx)
	if err != nil {
		dumper.PutSyncRecord("user_variables", start.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		// Name the failing call, as the other branch of this function does.
		return errors.WithMessage(err, "failed to call ConfigsUserVariableStatistics")
	}

	if !c.statChanged(stat.Total, stat.LastUpdated) {
		c.stats.GaugeCronDuration.WithLabelValues("sync_user_variables").Set(0)
		c.stats.GaugeSyncNumber.WithLabelValues("sync_user_variables").Set(0)
		dumper.PutSyncRecord("user_variables", start.Unix(), -1, -1, "not changed")
		return nil
	}

	decryptMap, decryptErr := models.ConfigUserVariableGetDecryptMap(c.ctx, c.privateKey, c.passWord)
	if decryptErr != nil {
		dumper.PutSyncRecord("user_variables", start.Unix(), -1, -1, "failed to query records: "+decryptErr.Error())
		return errors.WithMessage(decryptErr, "failed to call ConfigUserVariableGetDecryptMap")
	}

	c.Set(decryptMap, stat.Total, stat.LastUpdated)

	ms := time.Since(start).Milliseconds()
	c.stats.GaugeCronDuration.WithLabelValues("sync_user_variables").Set(float64(ms))
	c.stats.GaugeSyncNumber.WithLabelValues("sync_user_variables").Set(float64(len(decryptMap)))
	dumper.PutSyncRecord("user_variables", start.Unix(), ms, len(decryptMap), "success")

	return nil
}
// statChanged reports whether total or updated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (c *ConfigCache) statChanged(total int64, updated int64) bool {
	return c.statTotal != total || c.statLastUpdated != updated
}
// Set replaces the cached user-variable map with decryptMap and records the
// statistics that produced it, all under the write lock. The map is stored
// as-is, not copied.
func (c *ConfigCache) Set(decryptMap map[string]string, total int64, updated int64) {
	c.mu.Lock()
	c.userVariableMap = decryptMap
	c.statTotal, c.statLastUpdated = total, updated
	c.mu.Unlock()
}
// Get returns a copy of the cached user-variable map, so callers may modify
// the result without affecting the cache. The result is never nil.
func (c *ConfigCache) Get() map[string]string {
	c.mu.RLock()
	defer c.mu.RUnlock()

	snapshot := make(map[string]string, len(c.userVariableMap))
	for name := range c.userVariableMap {
		snapshot[name] = c.userVariableMap[name]
	}
	return snapshot
}
// GetLastUpdateTime returns the lastUpdated statistic stored by the most
// recent Set (-1 if Set has not run yet).
func (c *ConfigCache) GetLastUpdateTime() int64 {
	c.mu.RLock()
	last := c.statLastUpdated
	c.mu.RUnlock()

	return last
}
================================================
FILE: memsto/config_cval_cache.go
================================================
package memsto
import (
"encoding/json"
"log"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// CvalCache is an in-memory cache of config values keyed by their Ckey.
// statTotal and statLastUpdated remember the statistics recorded by the last
// Set and are what statChanged compares against.
type CvalCache struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// mu is held by Set, Get, GetLastUpdateTime and GetSiteInfo.
mu sync.RWMutex
cvals map[string]string
}
// NewCvalCache creates an empty cache with both stat markers set to -1 and
// then calls initSyncConfigs, which runs the first sync and starts the
// background refresh loop, before returning the cache.
func NewCvalCache(ctx *ctx.Context, stats *Stats) *CvalCache {
	cache := new(CvalCache)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.cvals = map[string]string{}

	cache.initSyncConfigs()
	return cache
}
// initSyncConfigs runs one synchronous sync and terminates the process through
// log.Fatalln if it fails. It then refreshes the phone encryption cache and
// starts the periodic refresh in a new goroutine.
func (c *CvalCache) initSyncConfigs() {
	if err := c.syncConfigs(); err != nil {
		log.Fatalln("failed to sync configs:", err)
	}

	// A failed phone-encryption refresh is only logged; startup continues.
	if err := models.RefreshPhoneEncryptionCache(c.ctx); err != nil {
		logger.Errorf("failed to refresh phone encryption cache: %v", err)
	}

	go c.loopSyncConfigs()
}
// loopSyncConfigs re-syncs the cache forever, sleeping 9 seconds before each
// attempt. Failures are logged as warnings and the loop continues.
func (c *CvalCache) loopSyncConfigs() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := c.syncConfigs()
		if err != nil {
			logger.Warning("failed to sync configs:", err)
		}
	}
}
// syncConfigs reloads the config values when the cval statistics changed.
//
// If the fetched statistics equal the cached stat markers it zeroes the sync
// gauges, records "not changed" and returns nil. Otherwise it loads all
// configs and stores them via Set. Every outcome is recorded with
// dumper.PutSyncRecord; an error is returned only when a query fails.
func (c *CvalCache) syncConfigs() error {
	start := time.Now()

	stat, err := models.ConfigCvalStatistics(c.ctx)
	if err != nil {
		dumper.PutSyncRecord("cvals", start.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to call ConfigCvalStatistics")
	}

	if !c.statChanged(stat.Total, stat.LastUpdated) {
		c.stats.GaugeCronDuration.WithLabelValues("sync_cvals").Set(0)
		c.stats.GaugeSyncNumber.WithLabelValues("sync_cvals").Set(0)
		dumper.PutSyncRecord("cvals", start.Unix(), -1, -1, "not changed")
		return nil
	}

	cvals, err := models.ConfigsGetAll(c.ctx)
	if err != nil {
		dumper.PutSyncRecord("cvals", start.Unix(), -1, -1, "failed to query records: "+err.Error())
		// Name the function that actually failed (the old text said ConfigsGet).
		return errors.WithMessage(err, "failed to call ConfigsGetAll")
	}

	c.Set(cvals, stat.Total, stat.LastUpdated)

	ms := time.Since(start).Milliseconds()
	c.stats.GaugeCronDuration.WithLabelValues("sync_cvals").Set(float64(ms))
	// The reported count is the size of the cache after the merge done by
	// Set, not the number of rows just loaded.
	c.stats.GaugeSyncNumber.WithLabelValues("sync_cvals").Set(float64(len(c.cvals)))
	dumper.PutSyncRecord("cvals", start.Unix(), ms, len(c.cvals), "success")

	return nil
}
// statChanged reports whether total or updated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (c *CvalCache) statChanged(total int64, updated int64) bool {
	return c.statTotal != total || c.statLastUpdated != updated
}
// Set records the given statistics and writes every config's Ckey/Cval pair
// into the cache, all under the write lock. Existing keys are overwritten;
// keys absent from cvals are left in place.
// NOTE(review): because nothing is removed here, a key deleted at the source
// presumably stays cached until restart — confirm this is intended.
func (c *CvalCache) Set(cvals []*models.Configs, total int64, updated int64) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.statTotal, c.statLastUpdated = total, updated

	for i := range cvals {
		c.cvals[cvals[i].Ckey] = cvals[i].Cval
	}
}
// Get returns the cached value for ckey, or "" when the key is not cached.
func (c *CvalCache) Get(ckey string) string {
	c.mu.RLock()
	val := c.cvals[ckey]
	c.mu.RUnlock()

	return val
}
// GetLastUpdateTime returns the lastUpdated statistic stored by the most
// recent Set (-1 if Set has not run yet).
func (c *CvalCache) GetLastUpdateTime() int64 {
	c.mu.RLock()
	last := c.statLastUpdated
	c.mu.RUnlock()

	return last
}
// SiteInfo is the JSON shape that GetSiteInfo decodes from the "site_info"
// config value. Fields missing from the JSON keep their zero value.
type SiteInfo struct {
PrintBodyPaths []string `json:"print_body_paths"`
PrintAccessLog bool `json:"print_access_log"`
SiteUrl string `json:"site_url"`
ReportHostNIC bool `json:"report_host_nic"`
}
// GetSiteInfo decodes the cached "site_info" config value into a SiteInfo.
//
// It always returns a non-nil pointer. When the value is missing or empty the
// result is a zero SiteInfo; when the JSON is malformed the error is logged
// and whatever was decoded so far is returned.
func (c *CvalCache) GetSiteInfo() *SiteInfo {
	si := SiteInfo{}

	// c.Get acquires c.mu.RLock itself. This method must not hold the read
	// lock around that call: sync.RWMutex forbids recursive read locking,
	// because a Set blocked in Lock between the two RLock calls would stop
	// the inner RLock from being granted and deadlock both goroutines.
	siteInfoStr := c.Get("site_info")
	if siteInfoStr == "" {
		return &si
	}

	if err := json.Unmarshal([]byte(siteInfoStr), &si); err != nil {
		logger.Errorf("Failed to unmarshal site info: %v", err)
	}

	return &si
}
// PrintBodyPaths returns the PrintBodyPaths list of the current site info as
// a set. The result is never nil.
func (c *CvalCache) PrintBodyPaths() map[string]struct{} {
	paths := c.GetSiteInfo().PrintBodyPaths

	set := make(map[string]struct{}, len(paths))
	for i := range paths {
		set[paths[i]] = struct{}{}
	}
	return set
}
// PrintAccessLog returns the PrintAccessLog flag of the current site info.
func (c *CvalCache) PrintAccessLog() bool {
	info := c.GetSiteInfo()
	return info.PrintAccessLog
}
================================================
FILE: memsto/datasource_cache.go
================================================
package memsto
import (
"log"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/gin-gonic/gin"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// DatasourceCacheType is an in-memory cache of datasources with two secondary
// indexes by plugin type. statTotal and statLastUpdated remember the
// statistics recorded by the last Set and are what StatChanged compares
// against.
type DatasourceCacheType struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// DatasourceCheckHook and DatasourceFilter are exported hook fields;
// NewDatasourceCache installs defaults that return false and the unmodified
// input respectively. This file does not call them.
DatasourceCheckHook func(*gin.Context) bool
DatasourceFilter func([]*models.Datasource, *models.User) []*models.Datasource
// Embedded lock taken around reads and writes of the three maps below.
sync.RWMutex
ds map[int64]*models.Datasource // key: id value: datasource
CateToIDs map[string]map[int64]*models.Datasource // key1: cate key2: id value: datasource
CateToNames map[string]map[string]int64 // key1: cate key2: name value: id
}
// NewDatasourceCache creates an empty cache with both stat markers set to -1,
// installs the default hooks (a check hook that returns false and a filter
// that returns its input unchanged) and then calls SyncDatasources, which runs
// the first sync and starts the background refresh loop.
func NewDatasourceCache(ctx *ctx.Context, stats *Stats) *DatasourceCacheType {
	cache := new(DatasourceCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.ds = map[int64]*models.Datasource{}
	cache.CateToIDs = map[string]map[int64]*models.Datasource{}
	cache.CateToNames = map[string]map[string]int64{}
	cache.DatasourceCheckHook = func(ctx *gin.Context) bool {
		return false
	}
	cache.DatasourceFilter = func(ds []*models.Datasource, user *models.User) []*models.Datasource {
		return ds
	}

	cache.SyncDatasources()
	return cache
}
// GetIDsByDsCateAndQueries resolves datasourceQueries to datasource ids by
// passing the by-id and by-name indexes of the given cate to
// models.GetDatasourceIDsByDatasourceQueries. The exclusive lock is held for
// the duration of the call.
// NOTE(review): a read lock would presumably suffice if the models helper
// only reads the two maps — confirm before relaxing.
func (d *DatasourceCacheType) GetIDsByDsCateAndQueries(cate string, datasourceQueries []models.DatasourceQuery) []int64 {
	d.Lock()
	defer d.Unlock()

	byID, byName := d.CateToIDs[cate], d.CateToNames[cate]
	return models.GetDatasourceIDsByDatasourceQueries(datasourceQueries, byID, byName)
}
// StatChanged reports whether total or lastUpdated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (d *DatasourceCacheType) StatChanged(total, lastUpdated int64) bool {
	return d.statTotal != total || d.statLastUpdated != lastUpdated
}
// Set replaces the cached datasources with ds and rebuilds both per-plugin-type
// indexes (id -> datasource and name -> id) from it. The indexes are built
// before the write lock is taken; the three maps are then swapped together
// under the lock, and the statistics are recorded afterwards.
func (d *DatasourceCacheType) Set(ds map[int64]*models.Datasource, total, lastUpdated int64) {
	byCateID := make(map[string]map[int64]*models.Datasource)
	byCateName := make(map[string]map[string]int64)

	for _, item := range ds {
		cate := item.PluginType

		idIndex, ok := byCateID[cate]
		if !ok {
			idIndex = make(map[int64]*models.Datasource)
			byCateID[cate] = idIndex
		}
		idIndex[item.Id] = item

		nameIndex, ok := byCateName[cate]
		if !ok {
			nameIndex = make(map[string]int64)
			byCateName[cate] = nameIndex
		}
		nameIndex[item.Name] = item.Id
	}

	d.Lock()
	d.CateToIDs = byCateID
	d.ds = ds
	d.CateToNames = byCateName
	d.Unlock()

	// only one goroutine used, so no need lock
	d.statTotal, d.statLastUpdated = total, lastUpdated
}
// GetById returns the cached datasource with the given id, or nil when the id
// is not in the cache.
func (d *DatasourceCacheType) GetById(id int64) *models.Datasource {
	d.RLock()
	item := d.ds[id]
	d.RUnlock()

	return item
}
// SyncDatasources runs one synchronous sync and terminates the process through
// log.Fatalln if it fails. On success it starts the periodic refresh in a new
// goroutine.
func (d *DatasourceCacheType) SyncDatasources() {
	if err := d.syncDatasources(); err != nil {
		log.Fatalln("failed to sync datasources:", err)
	}

	go d.loopSyncDatasources()
}
// loopSyncDatasources re-syncs the cache forever, sleeping 9 seconds before
// each attempt. Failures are logged as warnings and the loop continues.
func (d *DatasourceCacheType) loopSyncDatasources() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := d.syncDatasources()
		if err != nil {
			logger.Warning("failed to sync datasources:", err)
		}
	}
}
// syncDatasources reloads the cache when the datasource statistics changed.
//
// If the fetched statistics equal the cached stat markers it zeroes the sync
// gauges, records "not changed" and returns nil. Otherwise it loads the
// datasource map and stores it via Set. Every outcome is recorded with
// dumper.PutSyncRecord; an error is returned only when a query fails.
func (d *DatasourceCacheType) syncDatasources() error {
	const (
		record = "datasources"
		job    = "sync_datasources"
	)
	begin := time.Now()

	st, err := models.DatasourceStatistics(d.ctx)
	if err != nil {
		dumper.PutSyncRecord(record, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to call DatasourceStatistics")
	}

	unchanged := !d.StatChanged(st.Total, st.LastUpdated)
	if unchanged {
		d.stats.GaugeCronDuration.WithLabelValues(job).Set(0)
		d.stats.GaugeSyncNumber.WithLabelValues(job).Set(0)
		dumper.PutSyncRecord(record, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	loaded, err := models.DatasourceGetMap(d.ctx)
	if err != nil {
		dumper.PutSyncRecord(record, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to call DatasourceGetMap")
	}

	d.Set(loaded, st.Total, st.LastUpdated)

	elapsed := time.Since(begin).Milliseconds()
	count := len(loaded)
	d.stats.GaugeCronDuration.WithLabelValues(job).Set(float64(elapsed))
	d.stats.GaugeSyncNumber.WithLabelValues(job).Set(float64(count))
	dumper.PutSyncRecord(record, begin.Unix(), elapsed, count, "success")

	return nil
}
================================================
FILE: memsto/drop_ident.go
================================================
package memsto
import (
"sync"
"time"
)
// Item is one cache entry: a counter and the Unix timestamp (seconds) that
// deleteExpired compares against the current time.
type Item struct {
	Count int
	Ts    int64
}

// IdentCountCacheType is a lock-protected map from ident to Item.
type IdentCountCacheType struct {
	sync.RWMutex
	idents map[string]Item
}

// NewIdentCountCache returns an empty cache and starts CronDeleteExpired in a
// background goroutine that runs for the life of the process.
func NewIdentCountCache() *IdentCountCacheType {
	cache := &IdentCountCacheType{idents: map[string]Item{}}
	go cache.CronDeleteExpired()
	return cache
}

// Set stores count and ts for ident, replacing any existing entry.
func (c *IdentCountCacheType) Set(ident string, count int, ts int64) {
	c.Lock()
	defer c.Unlock()

	c.idents[ident] = Item{Count: count, Ts: ts}
}

// Increment adds num to ident's counter (starting from 0 for a new ident) and
// stamps the entry with the current Unix time.
func (c *IdentCountCacheType) Increment(ident string, num int) {
	stamp := time.Now().Unix()

	c.Lock()
	defer c.Unlock()

	// A missing key yields the zero Item, so the same update covers both the
	// "exists" and "new" cases.
	entry := c.idents[ident]
	entry.Count += num
	entry.Ts = stamp
	c.idents[ident] = entry
}

// Exists reports whether ident currently has an entry.
func (c *IdentCountCacheType) Exists(ident string) bool {
	c.RLock()
	defer c.RUnlock()

	_, found := c.idents[ident]
	return found
}

// Get returns ident's counter, or 0 when ident has no entry.
func (c *IdentCountCacheType) Get(ident string) int {
	c.RLock()
	defer c.RUnlock()

	return c.idents[ident].Count
}

// GetsAndFlush returns every entry and leaves the cache empty. The returned
// map is not shared with the cache afterwards.
func (c *IdentCountCacheType) GetsAndFlush() map[string]Item {
	fresh := make(map[string]Item)

	c.Lock()
	drained := c.idents
	c.idents = fresh
	c.Unlock()

	return drained
}

// CronDeleteExpired calls deleteExpired once a minute, forever.
func (c *IdentCountCacheType) CronDeleteExpired() {
	for {
		time.Sleep(time.Minute)
		c.deleteExpired()
	}
}

// deleteExpired removes every entry whose Ts is more than 120 seconds older
// than the current Unix time.
func (c *IdentCountCacheType) deleteExpired() {
	c.Lock()
	defer c.Unlock()

	cutoff := time.Now().Unix() - 120
	for ident, entry := range c.idents {
		if entry.Ts < cutoff {
			delete(c.idents, ident)
		}
	}
}
================================================
FILE: memsto/es_index_pattern.go
================================================
package memsto
import (
"log"
"sync"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// EsIndexPatternCacheType is an in-memory cache of Elasticsearch index
// patterns. Unlike the stat-based caches it keeps no statistics markers.
type EsIndexPatternCacheType struct {
ctx *ctx.Context
// Embedded lock taken by the accessors around reads and writes of indexPattern.
sync.RWMutex
indexPattern map[int64]*models.EsIndexPattern // key: index pattern id
}
// NewEsIndexPatternCacheType creates an empty cache and then calls
// SyncEsIndexPattern, which runs the first sync and starts the background
// refresh loop, before returning the cache.
func NewEsIndexPatternCacheType(ctx *ctx.Context) *EsIndexPatternCacheType {
	cache := new(EsIndexPatternCacheType)
	cache.ctx = ctx
	cache.indexPattern = map[int64]*models.EsIndexPattern{}

	cache.SyncEsIndexPattern()
	return cache
}
// Reset drops every cached index pattern.
func (p *EsIndexPatternCacheType) Reset() {
	empty := make(map[int64]*models.EsIndexPattern)

	p.Lock()
	p.indexPattern = empty
	p.Unlock()
}
// Set replaces the whole index-pattern map with m under the write lock. The
// map is stored as-is, not copied.
func (p *EsIndexPatternCacheType) Set(m map[int64]*models.EsIndexPattern) {
	p.Lock()
	defer p.Unlock()

	p.indexPattern = m
}
// Get returns the cached index pattern with the given id and whether it is
// present in the cache.
func (p *EsIndexPatternCacheType) Get(id int64) (*models.EsIndexPattern, bool) {
	p.RLock()
	pattern, found := p.indexPattern[id]
	p.RUnlock()

	return pattern, found
}
// SyncEsIndexPattern runs one synchronous sync and terminates the process
// through log.Fatalln if it fails. On success it starts the periodic refresh
// in a new goroutine.
func (p *EsIndexPatternCacheType) SyncEsIndexPattern() {
	if err := p.syncEsIndexPattern(); err != nil {
		// The message names what this cache syncs; the previous text
		// ("targets") pointed at a different cache.
		log.Fatalln("failed to sync es index patterns:", err)
	}

	go p.loopSyncEsIndexPattern()
}
// loopSyncEsIndexPattern re-syncs the cache forever, sleeping 9 seconds before
// each attempt. Failures are logged as warnings and the loop continues.
func (p *EsIndexPatternCacheType) loopSyncEsIndexPattern() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		if err := p.syncEsIndexPattern(); err != nil {
			// The message names what this cache syncs; the previous text
			// ("host alert rule targets") pointed at a different cache.
			logger.Warning("failed to sync es index patterns:", err)
		}
	}
}
// syncEsIndexPattern loads the index patterns via models.EsIndexPatternGets
// (called with an empty second argument), indexes them by Id and swaps them in
// via Set. The query error, if any, is returned unchanged.
func (p *EsIndexPatternCacheType) syncEsIndexPattern() error {
	patterns, err := models.EsIndexPatternGets(p.ctx, "")
	if err != nil {
		return err
	}

	byID := make(map[int64]*models.EsIndexPattern, len(patterns))
	// The loop variable is deliberately not named p so that it does not
	// shadow the receiver.
	for _, pattern := range patterns {
		byID[pattern.Id] = pattern
	}

	p.Set(byID)
	return nil
}
================================================
FILE: memsto/event_processor_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// EventProcessorCacheType is an in-memory cache of event pipelines indexed by
// pipeline id. statTotal and statLastUpdated remember the statistics recorded
// by the last Set (or -1 after Reset) and are what StatChanged compares
// against.
type EventProcessorCacheType struct {
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// Embedded lock taken by the accessors around reads and writes of eventPipelines.
sync.RWMutex
eventPipelines map[int64]*models.EventPipeline // key: pipeline id
}
// NewEventProcessorCache creates an empty cache with both stat markers set to
// -1 and then calls SyncEventProcessors, which runs the first sync and starts
// the background refresh loop, before returning the cache.
func NewEventProcessorCache(ctx *ctx.Context, stats *Stats) *EventProcessorCacheType {
	cache := new(EventProcessorCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.eventPipelines = map[int64]*models.EventPipeline{}

	cache.SyncEventProcessors()
	return cache
}
// Reset drops every cached pipeline and sets both stat markers back to -1,
// all under the write lock.
func (epc *EventProcessorCacheType) Reset() {
	empty := make(map[int64]*models.EventPipeline)

	epc.Lock()
	defer epc.Unlock()

	epc.statTotal, epc.statLastUpdated = -1, -1
	epc.eventPipelines = empty
}
// StatChanged reports whether total or lastUpdated differs from the values
// currently stored in the cache. The stat fields are read without locking.
func (epc *EventProcessorCacheType) StatChanged(total, lastUpdated int64) bool {
	return epc.statTotal != total || epc.statLastUpdated != lastUpdated
}
// Set replaces the whole pipeline map with m and records the statistics that
// produced it. Only the map swap happens under the write lock.
func (epc *EventProcessorCacheType) Set(m map[int64]*models.EventPipeline, total, lastUpdated int64) {
	epc.Lock()
	epc.eventPipelines = m
	epc.Unlock()

	// only one goroutine used, so no need lock
	epc.statTotal, epc.statLastUpdated = total, lastUpdated
}
// Get returns the cached pipeline with the given id, or nil when the id is
// not in the cache.
func (epc *EventProcessorCacheType) Get(processorId int64) *models.EventPipeline {
	epc.RLock()
	pipeline := epc.eventPipelines[processorId]
	epc.RUnlock()

	return pipeline
}
// GetProcessorIds returns the ids of all cached pipelines. The order follows
// map iteration and is therefore unspecified.
func (epc *EventProcessorCacheType) GetProcessorIds() []int64 {
	epc.RLock()
	defer epc.RUnlock()

	ids := make([]int64, 0, len(epc.eventPipelines))
	for id := range epc.eventPipelines {
		ids = append(ids, id)
	}
	return ids
}
// SyncEventProcessors runs one synchronous sync; on failure it prints the
// error and calls exit(1). On success it starts the periodic refresh in a new
// goroutine.
func (epc *EventProcessorCacheType) SyncEventProcessors() {
	if err := epc.syncEventProcessors(); err != nil {
		fmt.Println("failed to sync event processors:", err)
		exit(1)
	}

	go epc.loopSyncEventProcessors()
}
// loopSyncEventProcessors re-syncs the cache forever, sleeping 9 seconds
// before each attempt. Failures are logged as warnings and the loop continues.
func (epc *EventProcessorCacheType) loopSyncEventProcessors() {
	const interval = 9000 * time.Millisecond

	for {
		time.Sleep(interval)

		err := epc.syncEventProcessors()
		if err != nil {
			logger.Warning("failed to sync event processors:", err)
		}
	}
}
// syncEventProcessors refreshes the cache from the data store.
//
// It first fetches the pipeline statistics; when they match the stored
// markers it zeroes both gauges, records "not changed" and returns nil.
// Otherwise it lists all pipelines, installs them via Set, and publishes the
// elapsed milliseconds and the pipeline count to the gauges and the sync
// record. Query failures are recorded and returned wrapped.
func (epc *EventProcessorCacheType) syncEventProcessors() error {
	const (
		recordName = "event_processors"
		metricName = "sync_event_processors"
	)

	begin := time.Now()

	stat, err := models.EventPipelineStatistics(epc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec StatisticsGet for EventPipeline")
	}

	if unchanged := !epc.StatChanged(stat.Total, stat.LastUpdated); unchanged {
		epc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		epc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.ListEventPipelines(epc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec ListEventPipelines")
	}

	byID := make(map[int64]*models.EventPipeline)
	for _, pipeline := range lst {
		byID[pipeline.ID] = pipeline
	}
	epc.Set(byID, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	epc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	epc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(len(byID)))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, len(byID), "success")
	return nil
}
================================================
FILE: memsto/host_alert_rule_targets.go
================================================
package memsto
import (
"log"
"sync"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// TargetsOfAlertRuleCacheType caches, for one engine, the result of
// models.GetTargetsOfHostAlertRule.
type TargetsOfAlertRuleCacheType struct {
	// Both markers are set to -1 by the constructor and Reset, and to 0 by
	// every successful syncTargets.
	statTotal, statLastUpdated int64

	ctx        *ctx.Context
	stats      *Stats
	engineName string

	sync.RWMutex
	// targets is looked up by Get first with an engine name and then with a
	// rule id. The string values are presumably target idents — confirm
	// against models.GetTargetsOfHostAlertRule.
	targets map[string]map[int64][]string
}
// NewTargetOfAlertRuleCache creates an empty cache bound to engineName, runs
// SyncTargets (initial load plus background refresh) and returns the cache.
func NewTargetOfAlertRuleCache(ctx *ctx.Context, engineName string, stats *Stats) *TargetsOfAlertRuleCacheType {
	cache := new(TargetsOfAlertRuleCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.engineName = engineName
	cache.stats = stats
	cache.targets = map[string]map[int64][]string{}

	cache.SyncTargets()
	return cache
}
// Reset empties the cached targets and sets both statistics markers to -1
// while holding the write lock.
func (tc *TargetsOfAlertRuleCacheType) Reset() {
	tc.Lock()
	defer tc.Unlock()

	tc.targets = make(map[string]map[int64][]string)
	tc.statTotal, tc.statLastUpdated = -1, -1
}
// Set installs m as the cached targets under the write lock and then stores
// total and lastUpdated as the statistics markers.
func (tc *TargetsOfAlertRuleCacheType) Set(m map[string]map[int64][]string, total, lastUpdated int64) {
	func() {
		tc.Lock()
		defer tc.Unlock()
		tc.targets = m
	}()

	// Written without the lock: the original author notes that only one
	// goroutine updates these markers.
	tc.statTotal, tc.statLastUpdated = total, lastUpdated
}
// Get returns the strings cached for rule rid under engineName. The boolean
// is false when either the engine name or the rule id is not present.
func (tc *TargetsOfAlertRuleCacheType) Get(engineName string, rid int64) ([]string, bool) {
	tc.RLock()
	defer tc.RUnlock()

	if byRule, ok := tc.targets[engineName]; ok {
		found, exists := byRule[rid]
		return found, exists
	}
	return nil, false
}
// SyncTargets performs one synchronous load and aborts the process through
// log.Fatalln if it fails; on success it starts the periodic refresh loop in
// a new goroutine.
func (tc *TargetsOfAlertRuleCacheType) SyncTargets() {
	if err := tc.syncTargets(); err != nil {
		log.Fatalln("failed to sync targets:", err)
	}

	go tc.loopSyncTargets()
}
// loopSyncTargets never returns: every nine seconds it re-runs syncTargets
// and logs a warning when that fails.
func (tc *TargetsOfAlertRuleCacheType) loopSyncTargets() {
	const interval = 9 * time.Second

	for {
		time.Sleep(interval)

		err := tc.syncTargets()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync host alert rule targets:", err)
	}
}
// syncTargets reloads the targets for this cache's engine name, debug-logs
// the total and every top-level entry, and installs the result with both
// statistics markers set to 0. A query error is returned unchanged.
func (tc *TargetsOfAlertRuleCacheType) syncTargets() error {
	loaded, err := models.GetTargetsOfHostAlertRule(tc.ctx, tc.engineName)
	if err != nil {
		return err
	}

	logger.Debugf("get_targets_of_alert_rule total: %d engine_name:%s", len(loaded), tc.engineName)
	for key, value := range loaded {
		logger.Debugf("get_targets_of_alert_rule key:%s value:%v", key, value)
	}

	tc.Set(loaded, 0, 0)
	return nil
}
================================================
FILE: memsto/memsto.go
================================================
package memsto
import (
"os"
"github.com/toolkits/pkg/logger"
)
// exit closes the logger and then terminates the process with the given
// status code.
// TODO: improve how process exit is handled.
func exit(code int) {
logger.Close()
os.Exit(code)
}
================================================
FILE: memsto/message_template_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// MessageTemplateCacheType is an in-memory cache of message templates indexed
// by template id, plus the statistics markers recorded by the last Set call.
type MessageTemplateCacheType struct {
	// Markers compared by StatChanged; the constructor and Reset set both to -1.
	statTotal, statLastUpdated int64

	ctx   *ctx.Context
	stats *Stats

	// The embedded lock is taken around accesses to templates.
	sync.RWMutex
	templates map[int64]*models.MessageTemplate // key: template id
}
// NewMessageTemplateCache creates an empty cache with both statistics markers
// at -1, runs SyncMessageTemplates (initial load plus background refresh) and
// returns the cache.
func NewMessageTemplateCache(ctx *ctx.Context, stats *Stats) *MessageTemplateCacheType {
	cache := new(MessageTemplateCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.templates = map[int64]*models.MessageTemplate{}

	cache.SyncMessageTemplates()
	return cache
}
// Reset replaces the cached templates with an empty map and sets both
// statistics markers back to -1 while holding the write lock.
func (mtc *MessageTemplateCacheType) Reset() {
	mtc.Lock()
	defer mtc.Unlock()

	mtc.templates = make(map[int64]*models.MessageTemplate)
	mtc.statTotal, mtc.statLastUpdated = -1, -1
}
// StatChanged reports whether total or lastUpdated differs from the markers
// stored by the last Set call. It reads the markers without taking the lock.
func (mtc *MessageTemplateCacheType) StatChanged(total, lastUpdated int64) bool {
	return mtc.statTotal != total || mtc.statLastUpdated != lastUpdated
}
// Set swaps in m as the cached template map under the write lock and then
// records total and lastUpdated as the new statistics markers.
func (mtc *MessageTemplateCacheType) Set(m map[int64]*models.MessageTemplate, total, lastUpdated int64) {
	func() {
		mtc.Lock()
		defer mtc.Unlock()
		mtc.templates = m
	}()

	// Written without the lock: the original author notes that only one
	// goroutine updates these markers.
	mtc.statTotal, mtc.statLastUpdated = total, lastUpdated
}
// Get returns the cached template stored under templateId, or nil when the
// id is not present.
func (mtc *MessageTemplateCacheType) Get(templateId int64) *models.MessageTemplate {
	mtc.RLock()
	tpl := mtc.templates[templateId]
	mtc.RUnlock()
	return tpl
}
// GetTemplateIds returns the ids of all cached templates. The order follows
// map iteration and is therefore unspecified.
func (mtc *MessageTemplateCacheType) GetTemplateIds() []int64 {
	mtc.RLock()
	defer mtc.RUnlock()

	ids := make([]int64, len(mtc.templates))
	next := 0
	for id := range mtc.templates {
		ids[next] = id
		next++
	}
	return ids
}
// SyncMessageTemplates performs one synchronous load. If that load fails it
// prints the error and terminates the process through exit(1); otherwise it
// starts the periodic refresh loop in a new goroutine.
func (mtc *MessageTemplateCacheType) SyncMessageTemplates() {
	if err := mtc.syncMessageTemplates(); err != nil {
		fmt.Println("failed to sync message templates:", err)
		exit(1)
	}

	go mtc.loopSyncMessageTemplates()
}
// loopSyncMessageTemplates never returns: it sleeps nine seconds, re-runs the
// synchronisation, logs a warning on failure and repeats.
func (mtc *MessageTemplateCacheType) loopSyncMessageTemplates() {
	const interval = 9 * time.Second

	for {
		time.Sleep(interval)

		err := mtc.syncMessageTemplates()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync message templates:", err)
	}
}
// syncMessageTemplates refreshes the cache from the data store.
//
// It first fetches the template statistics; when they match the stored
// markers it zeroes both gauges, records "not changed" and returns nil.
// Otherwise it loads all templates, installs them via Set, and publishes the
// elapsed milliseconds and the template count to the gauges and the sync
// record. Query failures are recorded and returned wrapped.
func (mtc *MessageTemplateCacheType) syncMessageTemplates() error {
	const (
		recordName = "message_templates"
		metricName = "sync_message_templates"
	)

	begin := time.Now()

	stat, err := models.MessageTemplateStatistics(mtc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec MessageTemplateStatistics")
	}

	if unchanged := !mtc.StatChanged(stat.Total, stat.LastUpdated); unchanged {
		mtc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		mtc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.MessageTemplateGetsAll(mtc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec MessageTemplateGetsAll")
	}

	byID := make(map[int64]*models.MessageTemplate)
	for _, tpl := range lst {
		byID[tpl.ID] = tpl
	}
	mtc.Set(byID, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	mtc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	mtc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(len(byID)))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, len(byID), "success")
	return nil
}
================================================
FILE: memsto/notify_channel_cache.go
================================================
package memsto
import (
"crypto/tls"
"encoding/json"
"fmt"
"net/http"
"strings"
"sync"
"time"
"gopkg.in/gomail.v2"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/container/list"
"github.com/toolkits/pkg/logger"
)
// NotifyTask describes one notification to be sent. Tasks are queued per
// channel by EnqueueNotifyTask and handled by processNotifyTask.
type NotifyTask struct {
// Events are the alert events handed to SendHTTP; the first one's Hash is
// used in log lines.
Events []*models.AlertCurEvent
NotifyRuleId int64
// NotifyChannel is the channel configuration used to send the task; its ID
// selects the queue and the HTTP client.
NotifyChannel *models.NotifyChannelConfig
TplContent map[string]interface{}
CustomParams map[string]string
// Sendtos lists the recipients; depending on the channel configuration they
// are sent in one request or one request each.
Sendtos []string
}
// NotifyRecordFunc is the callback type used to record the outcome of a
// notification attempt: the events involved, the notify rule id, the channel
// name, the target string, the response text and the send error (if any).
type NotifyRecordFunc func(ctx *ctx.Context, events []*models.AlertCurEvent, notifyRuleId int64, channelName, target, resp string, err error)
// NotifyChannelCacheType caches notify channel configurations together with
// the per-channel runtime resources created for them (HTTP clients, task
// queues with their consumers, and SMTP sender goroutines).
type NotifyChannelCacheType struct {
	// Markers compared by StatChanged; the constructor sets both to -1.
	statTotal, statLastUpdated int64

	ctx   *ctx.Context
	stats *Stats

	sync.RWMutex
	channels      map[int64]*models.NotifyChannelConfig // key: channel id
	channelsQueue map[int64]*list.SafeListLimited
	httpClient    map[int64]*http.Client
	smtpCh        map[int64]chan *models.EmailContext
	smtpQuitCh    map[int64]chan struct{}

	// Quit channels that stop the queue consumers of a channel.
	queueQuitCh map[int64]chan struct{}

	// Callback invoked to record the outcome of each notification.
	notifyRecordFunc NotifyRecordFunc
}
// NewNotifyChannelCache creates a cache with every per-channel map
// initialised and both statistics markers at -1, runs SyncNotifyChannels
// (initial load plus background refresh) and returns the cache.
func NewNotifyChannelCache(ctx *ctx.Context, stats *Stats) *NotifyChannelCacheType {
	cache := new(NotifyChannelCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats

	cache.channels = map[int64]*models.NotifyChannelConfig{}
	cache.channelsQueue = map[int64]*list.SafeListLimited{}
	cache.queueQuitCh = map[int64]chan struct{}{}
	cache.httpClient = map[int64]*http.Client{}
	cache.smtpCh = map[int64]chan *models.EmailContext{}
	cache.smtpQuitCh = map[int64]chan struct{}{}

	cache.SyncNotifyChannels()
	return cache
}
// SetNotifyRecordFunc installs the callback used to record notification
// outcomes. The field is assigned without taking the lock.
func (ncc *NotifyChannelCacheType) SetNotifyRecordFunc(fn NotifyRecordFunc) {
ncc.notifyRecordFunc = fn
}
// StatChanged reports whether total or lastUpdated differs from the markers
// stored by the last Set call. It reads the markers without taking the lock.
func (ncc *NotifyChannelCacheType) StatChanged(total, lastUpdated int64) bool {
	return ncc.statTotal != total || ncc.statLastUpdated != lastUpdated
}
// Set reconciles the cache with m while holding the write lock: channels
// missing from m are torn down first, then new or changed channels are
// (re)created, and finally the statistics markers are updated.
func (ncc *NotifyChannelCacheType) Set(m map[int64]*models.NotifyChannelConfig, total, lastUpdated int64) {
	ncc.Lock()
	defer ncc.Unlock()

	// Step 1: drop channels that no longer exist.
	ncc.removeDeletedChannels(m)

	// Step 2: add new channels and refresh changed ones.
	ncc.addOrUpdateChannels(m)

	ncc.statTotal, ncc.statLastUpdated = total, lastUpdated
}
// removeDeletedChannels tears down every cached channel whose id is absent
// from newChannels: its queue consumers and SMTP sender are signalled to
// stop, and its queue, HTTP client and configuration are removed. The caller
// (Set) holds the write lock.
func (ncc *NotifyChannelCacheType) removeDeletedChannels(newChannels map[int64]*models.NotifyChannelConfig) {
	for id := range ncc.channels {
		if _, kept := newChannels[id]; kept {
			continue
		}

		logger.Infof("removing deleted channel %d", id)

		// Closing the quit channel stops the queue consumers.
		if quit, ok := ncc.queueQuitCh[id]; ok {
			close(quit)
			delete(ncc.queueQuitCh, id)
		}
		delete(ncc.channelsQueue, id)
		delete(ncc.httpClient, id)

		// Closing the quit channel stops the SMTP sender.
		if quit, ok := ncc.smtpQuitCh[id]; ok {
			close(quit)
			delete(ncc.smtpQuitCh, id)
			delete(ncc.smtpCh, id)
		}

		delete(ncc.channels, id)
	}
}
// addOrUpdateChannels stores every channel from newChannels that is new or
// whose configuration changed, and creates the resources its request type
// needs. Unchanged channels are skipped; changed ones have their queue and
// SMTP resources stopped first. The caller (Set) holds the write lock.
func (ncc *NotifyChannelCacheType) addOrUpdateChannels(newChannels map[int64]*models.NotifyChannelConfig) {
	for chID, newChannel := range newChannels {
		if oldChannel, exists := ncc.channels[chID]; exists {
			if !ncc.channelConfigChanged(oldChannel, newChannel) {
				logger.Debugf("channel %d config not changed", chID)
				continue
			}
			logger.Infof("updating channel %d (new: %t)", chID, !exists)
			ncc.stopChannelResources(chID)
		}

		ncc.channels[chID] = newChannel
		reqCfg := newChannel.RequestConfig

		switch newChannel.RequestType {
		case "http", "flashduty", "pagerduty":
			// All three types get an HTTP client when an HTTP config exists.
			if reqCfg != nil && reqCfg.HTTPRequestConfig != nil {
				if cli, err := models.GetHTTPClient(newChannel); err != nil {
					logger.Warningf("failed to create HTTP client for channel %d: %v", chID, err)
				} else {
					if ncc.httpClient == nil {
						ncc.httpClient = make(map[int64]*http.Client)
					}
					ncc.httpClient[chID] = cli
				}
			}

			// Only plain "http" channels get a queue and consumers.
			if newChannel.RequestType == "http" {
				ncc.startHttpChannel(chID, newChannel)
			}

		case "smtp":
			if reqCfg == nil || reqCfg.SMTPRequestConfig == nil {
				break
			}

			// Start the sender goroutine, then register its channels.
			mailCh := make(chan *models.EmailContext)
			quitCh := make(chan struct{})
			go ncc.startEmailSender(chID, reqCfg.SMTPRequestConfig, mailCh, quitCh)

			if ncc.smtpCh == nil {
				ncc.smtpCh = make(map[int64]chan *models.EmailContext)
			}
			if ncc.smtpQuitCh == nil {
				ncc.smtpQuitCh = make(map[int64]chan struct{})
			}
			ncc.smtpCh[chID] = mailCh
			ncc.smtpQuitCh[chID] = quitCh
		}
	}
}
// channelConfigChanged reports whether the channel must be rebuilt: true when
// either argument is nil or when the two UpdateAt values differ.
func (ncc *NotifyChannelCacheType) channelConfigChanged(oldChannel, newChannel *models.NotifyChannelConfig) bool {
	if oldChannel == nil || newChannel == nil {
		return true
	}
	// UpdateAt is the only field compared.
	return oldChannel.UpdateAt != newChannel.UpdateAt
}
// stopChannelResources signals the queue consumers and the SMTP sender of
// channel chID to stop and forgets their queue and channels. The HTTP client
// and the channel configuration are left untouched.
func (ncc *NotifyChannelCacheType) stopChannelResources(chID int64) {
	// Queue consumers.
	if quit, ok := ncc.queueQuitCh[chID]; ok {
		close(quit)
		delete(ncc.queueQuitCh, chID)
		delete(ncc.channelsQueue, chID)
	}

	// SMTP sender.
	if quit, ok := ncc.smtpQuitCh[chID]; ok {
		close(quit)
		delete(ncc.smtpQuitCh, chID)
		delete(ncc.smtpCh, chID)
	}
}
// startHttpChannel creates the task queue for HTTP channel chID and starts
// its consumer goroutines.
//
// It logs a warning and does nothing when the channel has no HTTP request
// configuration. The queue is limited to 100000 entries. The number of
// consumers comes from the configured Concurrency; a value below 1 would
// leave the queue without any consumer (tasks would be accepted but never
// sent), so it falls back to a single consumer.
func (ncc *NotifyChannelCacheType) startHttpChannel(chID int64, channel *models.NotifyChannelConfig) {
	if channel.RequestConfig == nil || channel.RequestConfig.HTTPRequestConfig == nil {
		logger.Warningf("notify channel %+v http request config not found", channel)
		return
	}

	// Create the queue.
	queue := list.NewSafeListLimited(100000)
	ncc.channelsQueue[chID] = queue

	// One quit channel is shared by all consumers of this channel.
	quitCh := make(chan struct{})
	ncc.queueQuitCh[chID] = quitCh

	concurrency := channel.RequestConfig.HTTPRequestConfig.Concurrency
	if concurrency < 1 {
		logger.Warningf("notify channel %d has concurrency %d, falling back to 1 consumer", chID, concurrency)
		concurrency = 1
	}

	for i := 0; i < concurrency; i++ {
		go ncc.startNotifyConsumer(chID, queue, quitCh)
	}

	logger.Debugf("started %d notify consumers for channel %d", concurrency, chID)
}
// startNotifyConsumer runs one consumer loop for channel channelID. It
// returns once quitCh is closed; otherwise it pops tasks from the back of
// queue and hands them to processNotifyTask, sleeping 100ms whenever the
// queue is empty.
func (ncc *NotifyChannelCacheType) startNotifyConsumer(channelID int64, queue *list.SafeListLimited, quitCh chan struct{}) {
	logger.Debugf("starting notify consumer for channel %d", channelID)

	for {
		// Non-blocking check of the stop signal.
		select {
		case <-quitCh:
			logger.Debugf("notify consumer for channel %d stopped", channelID)
			return
		default:
		}

		item := queue.PopBack()
		if item == nil {
			// Nothing queued: back off briefly before polling again.
			time.Sleep(100 * time.Millisecond)
			continue
		}

		notifyTask, isTask := item.(*NotifyTask)
		if !isTask {
			logger.Errorf("invalid task type in queue for channel %d", channelID)
			continue
		}

		ncc.processNotifyTask(notifyTask)
	}
}
// processNotifyTask sends one queued task. Only channels whose RequestType is
// "http" are handled here; any other type is ignored.
//
// When the task has no recipients, or the channel's HTTP request config
// references "$sendtos" (see needBatchContacts), all recipients go out in a
// single request. Otherwise one request is sent per recipient. Every attempt
// is logged and, when a record callback is installed, reported through it.
func (ncc *NotifyChannelCacheType) processNotifyTask(task *NotifyTask) {
	httpClient := ncc.GetHttpClient(task.NotifyChannel.ID)
	logger.Debugf("processNotifyTask: task: %+v", task)

	// Only http is processed from the queue.
	if task.NotifyChannel.RequestType != "http" {
		return
	}

	// The first event's hash is used purely for logging; guard against an
	// empty event list so that logging can never panic.
	var eventHash interface{} = ""
	if len(task.Events) > 0 && task.Events[0] != nil {
		eventHash = task.Events[0].Hash
	}

	// send performs one HTTP delivery to sendtos, then logs and records it.
	// logTarget is what appears as userInfo in the log line.
	send := func(sendtos []string, logTarget interface{}) {
		start := time.Now()
		resp, err := task.NotifyChannel.SendHTTP(task.Events, task.TplContent, task.CustomParams, sendtos, httpClient)
		resp = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), resp)
		logger.Infof("http_sender notify_id: %d, channel_name: %v, event:%s, tplContent:%v, customParams:%v, userInfo:%+v, respBody: %v, err: %v",
			task.NotifyRuleId, task.NotifyChannel.Name, eventHash, task.TplContent, task.CustomParams, logTarget, resp, err)

		if ncc.notifyRecordFunc != nil {
			ncc.notifyRecordFunc(ncc.ctx, task.Events, task.NotifyRuleId, task.NotifyChannel.Name, ncc.getSendTarget(task.CustomParams, sendtos), resp, err)
		}
	}

	if len(task.Sendtos) == 0 || ncc.needBatchContacts(task.NotifyChannel.RequestConfig.HTTPRequestConfig) {
		send(task.Sendtos, task.Sendtos)
		return
	}

	for i := range task.Sendtos {
		send([]string{task.Sendtos[i]}, task.Sendtos[i])
	}
}
// needBatchContacts reports whether the JSON encoding of requestConfig
// contains the text "$sendtos". A nil config yields false. The marshalling
// error is ignored.
func (ncc *NotifyChannelCacheType) needBatchContacts(requestConfig *models.HTTPRequestConfig) bool {
	if requestConfig == nil {
		return false
	}

	encoded, _ := json.Marshal(requestConfig)
	serialized := string(encoded)
	return strings.Contains(serialized, "$sendtos")
}
// getSendTarget builds the target string used in notification records.
//
// Without custom params it is the comma-joined sendtos. With custom params it
// is the comma-joined param values, where every value longer than four runes
// has its last four runes replaced by "****". Values follow map iteration
// order, which is unspecified.
func (ncc *NotifyChannelCacheType) getSendTarget(customParams map[string]string, sendtos []string) string {
	if len(customParams) == 0 {
		return strings.Join(sendtos, ",")
	}

	masked := make([]string, 0, len(customParams))
	for _, v := range customParams {
		if r := []rune(v); len(r) > 4 {
			v = string(r[:len(r)-4]) + "****"
		}
		masked = append(masked, v)
	}
	return strings.Join(masked, ",")
}
// Get returns the cached configuration of channel channelId, or nil when the
// id is not present.
func (ncc *NotifyChannelCacheType) Get(channelId int64) *models.NotifyChannelConfig {
	ncc.RLock()
	cfg := ncc.channels[channelId]
	ncc.RUnlock()
	return cfg
}
// GetHttpClient returns the HTTP client created for channel channelId, or nil
// when none is stored.
func (ncc *NotifyChannelCacheType) GetHttpClient(channelId int64) *http.Client {
	ncc.RLock()
	cli := ncc.httpClient[channelId]
	ncc.RUnlock()
	return cli
}
// GetSmtpClient returns the mail channel feeding the SMTP sender of channel
// channelId, or nil when none is stored.
func (ncc *NotifyChannelCacheType) GetSmtpClient(channelId int64) chan *models.EmailContext {
	ncc.RLock()
	mailCh := ncc.smtpCh[channelId]
	ncc.RUnlock()
	return mailCh
}
// GetChannelIds returns the ids of all cached channels. The order follows map
// iteration and is therefore unspecified.
func (ncc *NotifyChannelCacheType) GetChannelIds() []int64 {
	ncc.RLock()
	defer ncc.RUnlock()

	ids := make([]int64, len(ncc.channels))
	next := 0
	for id := range ncc.channels {
		ids[next] = id
		next++
	}
	return ids
}
// EnqueueNotifyTask pushes task onto the front of the queue belonging to the
// task's channel. It returns false, after logging, when the channel has no
// queue or when the push is rejected.
func (ncc *NotifyChannelCacheType) EnqueueNotifyTask(task *NotifyTask) bool {
	ncc.RLock()
	chID := task.NotifyChannel.ID
	queue := ncc.channelsQueue[chID]
	ncc.RUnlock()

	if queue == nil {
		logger.Errorf("no queue found for channel %d", chID)
		return false
	}

	if queue.PushFront(task) {
		return true
	}

	logger.Warningf("failed to enqueue notify task for channel %d, queue is full", chID)
	return false
}
// SyncNotifyChannels performs one synchronous load, printing the error if it
// fails, and in either case starts the periodic refresh loop in a new
// goroutine. A failed initial load does not terminate the process.
func (ncc *NotifyChannelCacheType) SyncNotifyChannels() {
	if err := ncc.syncNotifyChannels(); err != nil {
		fmt.Println("failed to sync notify channels:", err)
	}

	go ncc.loopSyncNotifyChannels()
}
// loopSyncNotifyChannels never returns: it sleeps nine seconds, re-runs the
// synchronisation, logs a warning on failure and repeats.
func (ncc *NotifyChannelCacheType) loopSyncNotifyChannels() {
	const interval = 9 * time.Second

	for {
		time.Sleep(interval)

		err := ncc.syncNotifyChannels()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync notify channels:", err)
	}
}
// syncNotifyChannels refreshes the cache from the data store.
//
// It first fetches the channel statistics; when they match the stored markers
// it zeroes both gauges, records "not changed" and returns nil. Otherwise it
// loads all channels and passes them to Set, which reconciles per-channel
// resources incrementally, then publishes the elapsed milliseconds and the
// channel count. Query failures are recorded and returned wrapped.
func (ncc *NotifyChannelCacheType) syncNotifyChannels() error {
	const (
		recordName = "notify_channels"
		metricName = "sync_notify_channels"
	)

	begin := time.Now()

	stat, err := models.NotifyChannelStatistics(ncc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec NotifyChannelStatistics")
	}

	if unchanged := !ncc.StatChanged(stat.Total, stat.LastUpdated); unchanged {
		ncc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		ncc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.NotifyChannelGetsAll(ncc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec NotifyChannelGetsAll")
	}

	byID := make(map[int64]*models.NotifyChannelConfig)
	for _, channel := range lst {
		byID[channel.ID] = channel
	}

	// Incremental update: only the configurations are passed in; Set creates
	// or tears down resources as needed.
	ncc.Set(byID, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	ncc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	ncc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(len(byID)))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, len(byID), "success")
	return nil
}
// startEmailSender is the sender goroutine of one SMTP channel. It receives
// mails from ch and delivers them over a lazily opened SMTP connection until
// quitCh is closed or ch is closed.
//
// Behaviour:
//   - An empty Host or zero Port logs a warning and returns immediately.
//     NOTE(review): the caller registers ch before this check runs, so
//     producers may still send to a channel nobody reads — confirm.
//   - The connection is dialled on the first mail and closed again after
//     conf.Batch mails or after 30 idle seconds.
//   - A failed send closes the connection, redials and retries once.
//   - Each mail is reported through notifyRecordFunc when one is installed;
//     the resp argument is always "success" and failures travel in err.
//   - When dialling is abandoned because quitCh was closed, the pending mail
//     is offered back to ch without blocking. This goroutine is the receiver
//     of the unbuffered ch, so a blocking send here could never complete and
//     would leak the goroutine; if nobody can take the mail it is dropped
//     with a warning.
func (ncc *NotifyChannelCacheType) startEmailSender(chID int64, smtp *models.SMTPRequestConfig, ch chan *models.EmailContext, quitCh chan struct{}) {
	conf := smtp
	if conf.Host == "" || conf.Port == 0 {
		logger.Warning("SMTP configurations invalid")
		return
	}
	logger.Debugf("start email sender... conf.Host:%+v,conf.Port:%+v", conf.Host, conf.Port)

	d := gomail.NewDialer(conf.Host, conf.Port, conf.Username, conf.Password)
	if conf.InsecureSkipVerify {
		d.TLSConfig = &tls.Config{InsecureSkipVerify: true}
	}

	// putBack returns m to ch if a receiver (or buffer slot) is available and
	// otherwise drops it, so that this goroutine can always exit.
	putBack := func(m *models.EmailContext) {
		select {
		case ch <- m:
		default:
			logger.Warningf("email_sender: channel %d stopped while dialing smtp, pending mail dropped", chID)
		}
	}

	var s gomail.SendCloser
	var open bool
	var size int
	for {
		select {
		case <-quitCh:
			return

		case m, ok := <-ch:
			if !ok {
				return
			}

			if !open {
				s = ncc.dialSmtp(quitCh, d)
				if s == nil {
					// Dialling was abandoned because the sender is being
					// stopped: hand the mail back if possible and exit.
					putBack(m)
					return
				}
				open = true
			}

			var err error
			if err = gomail.Send(s, m.Mail); err != nil {
				logger.Errorf("email_sender: failed to send: %s", err)

				// Close, redial and retry once.
				if err := s.Close(); err != nil {
					logger.Warningf("email_sender: failed to close smtp connection: %s", err)
				}

				s = ncc.dialSmtp(quitCh, d)
				if s == nil {
					// Same as above: stopping while redialling.
					putBack(m)
					return
				}
				open = true

				if err = gomail.Send(s, m.Mail); err != nil {
					logger.Errorf("email_sender: failed to retry send: %s", err)
				}
			} else {
				logger.Infof("email_sender: result=succ subject=%v to=%v",
					m.Mail.GetHeader("Subject"), m.Mail.GetHeader("To"))
			}

			// Record the notification outcome.
			if ncc.notifyRecordFunc != nil {
				target := strings.Join(m.Mail.GetHeader("To"), ",")
				ncc.notifyRecordFunc(ncc.ctx, m.Events, m.NotifyRuleId, "Email", target, "success", err)
			}

			size++

			// Recycle the connection after conf.Batch mails.
			if size >= conf.Batch {
				if err := s.Close(); err != nil {
					logger.Warningf("email_sender: failed to close smtp connection: %s", err)
				}
				open = false
				size = 0
			}

		// Close the connection to the SMTP server if no email was sent in
		// the last 30 seconds.
		case <-time.After(30 * time.Second):
			if open {
				if err := s.Close(); err != nil {
					logger.Warningf("email_sender: failed to close smtp connection: %s", err)
				}
				open = false
			}
		}
	}
}
// dialSmtp dials d until it succeeds, waiting one second after each failed
// attempt. Before every attempt it checks quitCh; once that is closed it
// gives up and returns nil, so callers must treat nil as "sender stopping".
func (ncc *NotifyChannelCacheType) dialSmtp(quitCh chan struct{}, d *gomail.Dialer) gomail.SendCloser {
	for {
		// Non-blocking check of the stop signal.
		select {
		case <-quitCh:
			return nil
		default:
		}

		s, err := d.Dial()
		if err == nil {
			return s
		}

		logger.Errorf("email_sender: failed to dial smtp: %s", err)
		time.Sleep(time.Second)
	}
}
================================================
FILE: memsto/notify_config.go
================================================
package memsto
import (
"crypto/tls"
"encoding/json"
"net/http"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/BurntSushi/toml"
"github.com/toolkits/pkg/logger"
)
// NotifyConfigCacheType caches the notification settings loaded by
// syncNotifyConfigs: the webhook list, the SMTP configuration and the notify
// script.
type NotifyConfigCacheType struct {
ctx *ctx.Context
// ConfigCache supplies the variable map used to expand the SMTP config text.
ConfigCache *ConfigCache
// webhooks is keyed by webhook URL.
webhooks map[string]*models.Webhook
smtp aconf.SMTPConfig
script models.NotifyScript
// Embedded lock: syncNotifyConfigs takes it for writing, the getters for
// reading.
sync.RWMutex
}
// DefaultSMTP is a default SMTP configuration text in key = value form.
// NOTE(review): it is not referenced in this part of the file; presumably it
// is TOML like the stored SMTP config parsed in syncNotifyConfigs — confirm.
const DefaultSMTP = `
Host = ""
Port = 994
User = "username"
Pass = "password"
From = "username@163.com"
InsecureSkipVerify = true
Batch = 5
`
// DefaultIbex is a default Ibex connection configuration text in key = value
// form (address, basic-auth credentials and a timeout).
// NOTE(review): it is not referenced in this part of the file; the timeout
// unit is not visible here — confirm against the consumer.
const DefaultIbex = `
Address = "http://127.0.0.1:10090"
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
Timeout = 3000
`
// NewNotifyConfigCache creates a cache with an empty webhook map, runs
// SyncNotifyConfigs (initial load plus background refresh) and returns it.
func NewNotifyConfigCache(ctx *ctx.Context, configCache *ConfigCache) *NotifyConfigCacheType {
	cache := new(NotifyConfigCacheType)
	cache.ctx = ctx
	cache.ConfigCache = configCache
	cache.webhooks = map[string]*models.Webhook{}

	cache.SyncNotifyConfigs()
	return cache
}
// SyncNotifyConfigs performs one synchronous load, logging an error if it
// fails, and in either case starts the periodic refresh loop in a new
// goroutine.
func (w *NotifyConfigCacheType) SyncNotifyConfigs() {
	if err := w.syncNotifyConfigs(); err != nil {
		logger.Error("failed to sync webhooks:", err)
	}

	go w.loopSyncNotifyConfigs()
}
// loopSyncNotifyConfigs never returns: it sleeps nine seconds, re-runs the
// synchronisation, logs a warning on failure and repeats.
func (w *NotifyConfigCacheType) loopSyncNotifyConfigs() {
	const interval = 9 * time.Second

	for {
		time.Sleep(interval)

		err := w.syncNotifyConfigs()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync webhooks:", err)
	}
}
// syncNotifyConfigs reloads the three notification settings — webhooks, SMTP
// configuration and notify script — from the configs store, holding the write
// lock for the whole run.
//
// A failed ConfigsGet is recorded and returned, aborting the remaining
// sections. A stored value that cannot be parsed is recorded and logged but
// does not abort: the function continues with the next section and returns
// nil. For webhooks, a parse failure leaves the previously cached webhooks
// in place instead of clearing them. A "success" sync record is written only
// for sections that actually loaded without a parse error.
//
// Webhook entries get defaults for zero Batch (1000), Timeout (10),
// RetryCount (10) and RetryInterval (10), and an HTTP client when they have
// none. The cached map is updated in place and keyed by URL.
func (w *NotifyConfigCacheType) syncNotifyConfigs() error {
	start := time.Now()
	userVariableMap := w.ConfigCache.Get()

	w.RWMutex.Lock()
	defer w.RWMutex.Unlock()

	// ---- webhooks ----
	cval, err := models.ConfigsGet(w.ctx, models.WEBHOOKKEY)
	if err != nil {
		dumper.PutSyncRecord("webhooks", start.Unix(), -1, -1, "failed to query configs.webhook: "+err.Error())
		return err
	}

	webhooksOK := true
	if strings.TrimSpace(cval) != "" {
		var webhooks []*models.Webhook
		if err = json.Unmarshal([]byte(cval), &webhooks); err != nil {
			// Keep the last good webhooks rather than wiping the cache on a
			// malformed stored value.
			webhooksOK = false
			dumper.PutSyncRecord("webhooks", start.Unix(), -1, -1, "failed to unmarshal configs.webhook: "+err.Error())
			logger.Errorf("failed to unmarshal webhooks:%s error:%v", cval, err)
		} else {
			newWebhooks := make(map[string]*models.Webhook, len(webhooks))
			for _, wh := range webhooks {
				if wh == nil {
					// A JSON null element carries no webhook.
					continue
				}

				if wh.Batch == 0 {
					wh.Batch = 1000
				}
				if wh.Timeout == 0 {
					wh.Timeout = 10
				}
				if wh.RetryCount == 0 {
					wh.RetryCount = 10
				}
				if wh.RetryInterval == 0 {
					wh.RetryInterval = 10
				}

				if wh.Client == nil {
					transport := &http.Transport{
						TLSClientConfig: &tls.Config{InsecureSkipVerify: wh.SkipVerify},
					}
					if poster.UseProxy(wh.Url) {
						transport.Proxy = http.ProxyFromEnvironment
					}
					wh.Client = &http.Client{
						Timeout:   time.Second * time.Duration(wh.Timeout),
						Transport: transport,
					}
				}

				newWebhooks[wh.Url] = wh
			}

			// Install every freshly loaded webhook, then drop stale URLs.
			for url, wh := range newWebhooks {
				w.webhooks[url] = wh
			}
			for url := range w.webhooks {
				if _, has := newWebhooks[url]; !has {
					delete(w.webhooks, url)
				}
			}
		}
	}

	if webhooksOK {
		dumper.PutSyncRecord("webhooks", start.Unix(), time.Since(start).Milliseconds(), len(w.webhooks), "success, webhooks:\n"+cval)
	}

	// ---- smtp ----
	start = time.Now()
	cval, err = models.ConfigsGet(w.ctx, models.SMTP)
	if err != nil {
		dumper.PutSyncRecord("smtp", start.Unix(), -1, -1, "failed to query configs.smtp_config: "+err.Error())
		return err
	}

	// Expand user variables in the stored text before parsing it.
	cval = tplx.ReplaceTemplateUseText(models.SMTP, cval, userVariableMap)

	smtpOK := true
	if strings.TrimSpace(cval) != "" {
		if err = toml.Unmarshal([]byte(cval), &w.smtp); err != nil {
			smtpOK = false
			dumper.PutSyncRecord("smtp", start.Unix(), -1, -1, "failed to unmarshal configs.smtp_config: "+err.Error())
			logger.Errorf("failed to unmarshal smtp:%s error:%v", cval, err)
		}
	}

	if smtpOK {
		dumper.PutSyncRecord("smtp", start.Unix(), time.Since(start).Milliseconds(), 1, "success, smtp_config:\n"+cval)
	}

	// ---- notify script ----
	start = time.Now()
	cval, err = models.ConfigsGet(w.ctx, models.NOTIFYSCRIPT)
	if err != nil {
		dumper.PutSyncRecord("notify_script", start.Unix(), -1, -1, "failed to query configs.notify_script: "+err.Error())
		return err
	}

	scriptOK := true
	if strings.TrimSpace(cval) != "" {
		if err = json.Unmarshal([]byte(cval), &w.script); err != nil {
			scriptOK = false
			dumper.PutSyncRecord("notify_script", start.Unix(), -1, -1, "failed to unmarshal configs.notify_script: "+err.Error())
			logger.Errorf("failed to unmarshal notify script:%s error:%v", cval, err)
		}
	}

	if scriptOK {
		dumper.PutSyncRecord("notify_script", start.Unix(), time.Since(start).Milliseconds(), 1, "success, notify_script:\n"+cval)
	}

	return nil
}
// GetWebhooks returns a snapshot of the cached webhooks keyed by URL.
//
// The internal map is modified in place by syncNotifyConfigs, so handing it
// out directly would let callers iterate it after the read lock is released
// while a sync writes to it. The returned map is a shallow copy: the map is
// private to the caller, the *models.Webhook values are shared.
func (w *NotifyConfigCacheType) GetWebhooks() map[string]*models.Webhook {
	w.RWMutex.RLock()
	defer w.RWMutex.RUnlock()

	snapshot := make(map[string]*models.Webhook, len(w.webhooks))
	for url, wh := range w.webhooks {
		snapshot[url] = wh
	}
	return snapshot
}
// GetSMTP returns a copy of the cached SMTP configuration, read under the
// read lock.
func (w *NotifyConfigCacheType) GetSMTP() aconf.SMTPConfig {
	w.RWMutex.RLock()
	cfg := w.smtp
	w.RWMutex.RUnlock()
	return cfg
}
// GetNotifyScript returns a copy of the cached notify script, with Timeout
// defaulted to 10 when it is zero.
//
// The default is applied to the returned copy only. Writing it back into the
// cache would modify shared state while holding just the read lock, which
// races with other concurrent readers.
func (w *NotifyConfigCacheType) GetNotifyScript() models.NotifyScript {
	w.RWMutex.RLock()
	script := w.script
	w.RWMutex.RUnlock()

	if script.Timeout == 0 {
		script.Timeout = 10
	}
	return script
}
================================================
FILE: memsto/notify_rule_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// NotifyRuleCacheType is an in-memory cache of notify rules indexed by rule
// id, plus the statistics markers recorded by the last Set call.
type NotifyRuleCacheType struct {
	// Markers compared by StatChanged; the constructor and Reset set both to -1.
	statTotal, statLastUpdated int64

	ctx   *ctx.Context
	stats *Stats

	// The embedded lock is taken around accesses to rules.
	sync.RWMutex
	rules map[int64]*models.NotifyRule // key: rule id
}
// NewNotifyRuleCache creates an empty cache with both statistics markers at
// -1, runs SyncNotifyRules (initial load plus background refresh) and returns
// the cache.
func NewNotifyRuleCache(ctx *ctx.Context, stats *Stats) *NotifyRuleCacheType {
	cache := new(NotifyRuleCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.rules = map[int64]*models.NotifyRule{}

	cache.SyncNotifyRules()
	return cache
}
// Reset replaces the cached rules with an empty map and sets both statistics
// markers back to -1 while holding the write lock.
func (nrc *NotifyRuleCacheType) Reset() {
	nrc.Lock()
	defer nrc.Unlock()

	nrc.rules = make(map[int64]*models.NotifyRule)
	nrc.statTotal, nrc.statLastUpdated = -1, -1
}
// StatChanged reports whether total or lastUpdated differs from the markers
// stored by the last Set call. It reads the markers without taking the lock.
func (nrc *NotifyRuleCacheType) StatChanged(total, lastUpdated int64) bool {
	return nrc.statTotal != total || nrc.statLastUpdated != lastUpdated
}
// Set swaps in m as the cached rule map under the write lock and then records
// total and lastUpdated as the new statistics markers.
func (nrc *NotifyRuleCacheType) Set(m map[int64]*models.NotifyRule, total, lastUpdated int64) {
	func() {
		nrc.Lock()
		defer nrc.Unlock()
		nrc.rules = m
	}()

	// Written without the lock: the original author notes that only one
	// goroutine updates these markers.
	nrc.statTotal, nrc.statLastUpdated = total, lastUpdated
}
// Get returns the cached notify rule stored under ruleId, or nil when the id
// is not present.
func (nrc *NotifyRuleCacheType) Get(ruleId int64) *models.NotifyRule {
	nrc.RLock()
	rule := nrc.rules[ruleId]
	nrc.RUnlock()
	return rule
}
// GetRuleIds returns the ids of all cached notify rules. The order follows
// map iteration and is therefore unspecified.
func (nrc *NotifyRuleCacheType) GetRuleIds() []int64 {
	nrc.RLock()
	defer nrc.RUnlock()

	ids := make([]int64, len(nrc.rules))
	next := 0
	for id := range nrc.rules {
		ids[next] = id
		next++
	}
	return ids
}
// SyncNotifyRules performs one synchronous load. If that load fails it prints
// the error and terminates the process through exit(1); otherwise it starts
// the periodic refresh loop in a new goroutine.
func (nrc *NotifyRuleCacheType) SyncNotifyRules() {
	if err := nrc.syncNotifyRules(); err != nil {
		fmt.Println("failed to sync notify rules:", err)
		exit(1)
	}

	go nrc.loopSyncNotifyRules()
}
// loopSyncNotifyRules never returns: it sleeps nine seconds, re-runs the
// synchronisation, logs a warning on failure and repeats.
func (nrc *NotifyRuleCacheType) loopSyncNotifyRules() {
	const interval = 9 * time.Second

	for {
		time.Sleep(interval)

		err := nrc.syncNotifyRules()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync notify rules:", err)
	}
}
// syncNotifyRules refreshes the cache from the data store.
//
// It first fetches the rule statistics; when they match the stored markers it
// zeroes both gauges, records "not changed" and returns nil. Otherwise it
// loads all rules, installs them via Set, and publishes the elapsed
// milliseconds and the rule count to the gauges and the sync record. Query
// failures are recorded and returned wrapped.
func (nrc *NotifyRuleCacheType) syncNotifyRules() error {
	const (
		recordName = "notify_rules"
		metricName = "sync_notify_rules"
	)

	begin := time.Now()

	stat, err := models.NotifyRuleStatistics(nrc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec NotifyRuleStatistics")
	}

	if unchanged := !nrc.StatChanged(stat.Total, stat.LastUpdated); unchanged {
		nrc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		nrc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.NotifyRuleGetsAll(nrc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec NotifyRuleGetsAll")
	}

	byID := make(map[int64]*models.NotifyRule)
	for _, rule := range lst {
		byID[rule.ID] = rule
	}
	nrc.Set(byID, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	nrc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	nrc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(len(byID)))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, len(byID), "success")
	return nil
}
================================================
FILE: memsto/recording_rule_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// RecordingRuleCacheType is an in-memory cache of recording rules indexed by
// rule id, plus the statistics markers recorded by the last Set call.
type RecordingRuleCacheType struct {
	// Markers compared by StatChanged; the constructor and Reset set both to -1.
	statTotal, statLastUpdated int64

	ctx   *ctx.Context
	stats *Stats

	// The embedded lock is taken around accesses to rules.
	sync.RWMutex
	rules map[int64]*models.RecordingRule // key: rule id
}
// NewRecordingRuleCache creates an empty cache with both statistics markers
// at -1, runs SyncRecordingRules (initial load plus background refresh) and
// returns the cache.
func NewRecordingRuleCache(ctx *ctx.Context, stats *Stats) *RecordingRuleCacheType {
	cache := new(RecordingRuleCacheType)
	cache.statTotal = -1
	cache.statLastUpdated = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.rules = map[int64]*models.RecordingRule{}

	cache.SyncRecordingRules()
	return cache
}
// Reset replaces the cached rules with an empty map and sets both statistics
// markers back to -1 while holding the write lock.
func (rrc *RecordingRuleCacheType) Reset() {
	rrc.Lock()
	defer rrc.Unlock()

	rrc.rules = make(map[int64]*models.RecordingRule)
	rrc.statTotal, rrc.statLastUpdated = -1, -1
}
// StatChanged reports whether total or lastUpdated differs from the markers
// stored by the last Set call. It reads the markers without taking the lock.
func (rrc *RecordingRuleCacheType) StatChanged(total, lastUpdated int64) bool {
	return rrc.statTotal != total || rrc.statLastUpdated != lastUpdated
}
// Set swaps in m as the cached rule map under the write lock and then records
// total and lastUpdated as the new statistics markers.
func (rrc *RecordingRuleCacheType) Set(m map[int64]*models.RecordingRule, total, lastUpdated int64) {
	func() {
		rrc.Lock()
		defer rrc.Unlock()
		rrc.rules = m
	}()

	// Written without the lock: the original author notes that only one
	// goroutine updates these markers.
	rrc.statTotal, rrc.statLastUpdated = total, lastUpdated
}
// Get returns the cached rule stored under ruleId, or nil when the map has no
// such entry. The lookup happens under the read lock.
func (rrc *RecordingRuleCacheType) Get(ruleId int64) *models.RecordingRule {
	rrc.RLock()
	rule := rrc.rules[ruleId]
	rrc.RUnlock()
	return rule
}
// GetRuleIds returns the ids of all cached rules in map iteration order.
func (rrc *RecordingRuleCacheType) GetRuleIds() []int64 {
	rrc.RLock()
	defer rrc.RUnlock()

	ids := make([]int64, len(rrc.rules))
	next := 0
	for id := range rrc.rules {
		ids[next] = id
		next++
	}
	return ids
}
// SyncRecordingRules runs one synchronous sync; on failure it prints the
// error and calls exit(1). On success it starts the periodic sync goroutine.
func (rrc *RecordingRuleCacheType) SyncRecordingRules() {
	if err := rrc.syncRecordingRules(); err != nil {
		fmt.Println("failed to sync recording rules:", err)
		exit(1)
	}

	go rrc.loopSyncRecordingRules()
}
// loopSyncRecordingRules sleeps 9 seconds and then syncs, forever. A failed
// sync is logged as a warning and the loop continues.
func (rrc *RecordingRuleCacheType) loopSyncRecordingRules() {
	const interval = 9000 * time.Millisecond
	for {
		time.Sleep(interval)
		err := rrc.syncRecordingRules()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync recording rules:", err)
	}
}
// syncRecordingRules performs one sync pass. It fetches the statistics; when
// they match the cached ones it records "not changed" and zeroes the gauges.
// Otherwise it reloads all rules via RecordingRuleGetsByCluster, replaces the
// cache, and records the duration and rule count.
func (rrc *RecordingRuleCacheType) syncRecordingRules() error {
	const (
		recordName = "recording_rules"
		metricName = "sync_recording_rules"
	)
	begin := time.Now()

	stat, err := models.RecordingRuleStatistics(rrc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec RecordingRuleStatistics")
	}

	unchanged := !rrc.StatChanged(stat.Total, stat.LastUpdated)
	if unchanged {
		rrc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		rrc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.RecordingRuleGetsByCluster(rrc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec RecordingRuleGetsByCluster")
	}

	// Index the loaded rules by id before publishing them.
	m := make(map[int64]*models.RecordingRule)
	for _, rule := range lst {
		m[rule.Id] = rule
	}
	rrc.Set(m, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	count := len(m)
	rrc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	rrc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(count))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, count, "success")
	return nil
}
================================================
FILE: memsto/stat.go
================================================
package memsto
import "github.com/prometheus/client_golang/prometheus"
// Stats bundles the gauge vectors that the cache sync routines update.
type Stats struct {
// GaugeCronDuration receives the duration of a sync pass in milliseconds
// (0 when nothing changed), labelled by the sync routine's name.
GaugeCronDuration *prometheus.GaugeVec
// GaugeSyncNumber receives the number of items loaded by a sync pass
// (0 when nothing changed), labelled by the sync routine's name.
GaugeSyncNumber *prometheus.GaugeVec
}
// NewSyncStats creates the n9e_cron duration and sync_number gauge vectors
// (both labelled by "name"), registers them with the default Prometheus
// registry via MustRegister, and returns them wrapped in a Stats.
func NewSyncStats() *Stats {
	newCronGauge := func(name, help string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{
			Namespace: "n9e",
			Subsystem: "cron",
			Name:      name,
			Help:      help,
		}
		return prometheus.NewGaugeVec(opts, []string{"name"})
	}

	stats := &Stats{
		GaugeCronDuration: newCronGauge("duration", "Cron method use duration, unit: ms."),
		GaugeSyncNumber:   newCronGauge("sync_number", "Cron sync number."),
	}

	prometheus.MustRegister(stats.GaugeCronDuration, stats.GaugeSyncNumber)
	return stats
}
================================================
FILE: memsto/target_cache.go
================================================
package memsto
import (
"context"
"encoding/json"
"log"
"math"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/storage"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// TargetCacheType is an in-memory cache of targets keyed by ident, with a
// secondary index from host IP to idents. Intended uses noted by the author:
// 1. append note to alert_event
// 2. append tags to series
type TargetCacheType struct {
// statTotal and statLastUpdated hold the statistics given to the latest Set call.
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// redis may be nil; the Redis-backed refresh and lookup helpers return early in that case.
redis storage.Redis
metaSyncCycle int // counter that controls how often HostMeta is refreshed
sync.RWMutex
targets map[string]*models.Target // key: ident
targetsIndex map[string][]string // key: ip, value: ident list
}
// NewTargetCache allocates an empty target cache bound to ctx, stats and
// redis, runs SyncTargets on it, and returns it.
func NewTargetCache(ctx *ctx.Context, stats *Stats, redis storage.Redis) *TargetCacheType {
	cache := new(TargetCacheType)
	// Both statistics start at -1.
	cache.statTotal, cache.statLastUpdated = -1, -1
	cache.ctx = ctx
	cache.stats = stats
	cache.redis = redis
	cache.targets = map[string]*models.Target{}
	cache.targetsIndex = map[string][]string{}

	cache.SyncTargets()
	return cache
}
// Reset clears both the target map and the IP index and puts the statistics
// back to -1, all under the write lock.
func (tc *TargetCacheType) Reset() {
	emptyTargets := make(map[string]*models.Target)
	emptyIndex := make(map[string][]string)

	tc.Lock()
	tc.statTotal, tc.statLastUpdated = -1, -1
	tc.targets = emptyTargets
	tc.targetsIndex = emptyIndex
	tc.Unlock()
}
// StatChanged reports whether total or lastUpdated differs from the values
// stored by the last Set.
func (tc *TargetCacheType) StatChanged(total, lastUpdated int64) bool {
	return tc.statTotal != total || tc.statLastUpdated != lastUpdated
}
// Set builds the HostIp -> idents index for m outside the lock, then swaps
// both the target map and the index in under the write lock and stores the
// new statistics.
func (tc *TargetCacheType) Set(m map[string]*models.Target, total, lastUpdated int64) {
	byIP := make(map[string][]string, len(m))
	for ident, target := range m {
		ip := target.HostIp
		byIP[ip] = append(byIP[ip], ident)
	}

	func() {
		tc.Lock()
		defer tc.Unlock()
		tc.targets = m
		tc.targetsIndex = byIP
	}()

	// only one goroutine used, so no need lock
	tc.statTotal, tc.statLastUpdated = total, lastUpdated
}
// Get looks up the target stored under ident and reports whether it exists.
func (tc *TargetCacheType) Get(ident string) (*models.Target, bool) {
	tc.RLock()
	target, found := tc.targets[ident]
	tc.RUnlock()
	return target, found
}
// GetByIp returns the cached targets indexed under ip. The boolean is false
// when the IP is not indexed or none of its idents resolve to a target.
func (tc *TargetCacheType) GetByIp(ip string) ([]*models.Target, bool) {
	tc.RLock()
	defer tc.RUnlock()

	idents, indexed := tc.targetsIndex[ip]
	if !indexed {
		return nil, false
	}

	matched := make([]*models.Target, 0, len(idents))
	for i := range idents {
		target, found := tc.targets[idents[i]]
		if !found {
			continue
		}
		matched = append(matched, target)
	}
	return matched, len(matched) != 0
}
// GetAll returns a new slice holding every cached target pointer, in map
// iteration order.
func (tc *TargetCacheType) GetAll() []*models.Target {
	tc.RLock()
	defer tc.RUnlock()

	all := make([]*models.Target, len(tc.targets))
	next := 0
	for ident := range tc.targets {
		all[next] = tc.targets[ident]
		next++
	}
	return all
}
// GetAllBeatTime returns a map from ident to BeatTime covering every cached
// target.
func (tc *TargetCacheType) GetAllBeatTime() map[string]int64 {
	tc.RLock()
	defer tc.RUnlock()

	result := make(map[string]int64, len(tc.targets))
	for ident := range tc.targets {
		result[ident] = tc.targets[ident].BeatTime
	}
	return result
}
// refreshBeatTime refreshes the BeatTime of every cached target from Redis.
// It does nothing when redis is nil, the cache is empty, or Redis returns no
// beat times.
func (tc *TargetCacheType) refreshBeatTime() {
	if tc.redis == nil {
		return
	}

	// Snapshot the idents first so Redis is not queried while holding the lock.
	snapshot := func() []string {
		tc.RLock()
		defer tc.RUnlock()
		idents := make([]string, 0, len(tc.targets))
		for ident := range tc.targets {
			idents = append(idents, ident)
		}
		return idents
	}
	idents := snapshot()
	if len(idents) == 0 {
		return
	}

	beatTimes := models.FetchBeatTimesFromRedis(tc.redis, idents)
	if len(beatTimes) == 0 {
		return
	}

	tc.Lock()
	for ident, ts := range beatTimes {
		target, ok := tc.targets[ident]
		if !ok {
			continue
		}
		target.BeatTime = ts
	}
	tc.Unlock()
}
// refreshHostMetas refreshes the HostMeta of every cached target from Redis
// (the original note lists CpuUtil, MemUtil, CpuNum, etc.). It does nothing
// when redis is nil, the cache is empty, or no metas come back.
func (tc *TargetCacheType) refreshHostMetas() {
	if tc.redis == nil {
		return
	}

	// Snapshot the targets first so Redis is not queried while holding the lock.
	snapshot := func() []*models.Target {
		tc.RLock()
		defer tc.RUnlock()
		list := make([]*models.Target, 0, len(tc.targets))
		for _, t := range tc.targets {
			list = append(list, t)
		}
		return list
	}
	targets := snapshot()
	if len(targets) == 0 {
		return
	}

	metaMap := tc.GetHostMetas(targets)
	if len(metaMap) == 0 {
		return
	}

	tc.Lock()
	for ident, target := range tc.targets {
		meta, ok := metaMap[ident]
		if !ok {
			continue
		}
		target.FillMeta(meta)
	}
	tc.Unlock()
}
// Gets returns the cached targets for the given idents, in input order,
// skipping idents that are not cached. The result is nil when none match.
func (tc *TargetCacheType) Gets(idents []string) []*models.Target {
	tc.RLock()
	defer tc.RUnlock()

	var found []*models.Target
	for i := range idents {
		target, ok := tc.targets[idents[i]]
		if !ok {
			continue
		}
		found = append(found, target)
	}
	return found
}
// GetOffsetHost returns ident -> Offset for the given targets whose cached
// copy has an absolute Offset greater than offset. A target is skipped when
// it is not cached, its CpuNum is <= 0, or its BeatTime is more than 120
// before now.
func (tc *TargetCacheType) GetOffsetHost(targets []*models.Target, now, offset int64) map[string]int64 {
	tc.RLock()
	defer tc.RUnlock()

	result := make(map[string]int64)
	for _, requested := range targets {
		cached, ok := tc.targets[requested.Ident]
		switch {
		case !ok:
			// not cached
		case cached.CpuNum <= 0:
			// means this target is not collect by categraf, do not check offset
		case now-cached.BeatTime > 120:
			// means this target is not a active host, do not check offset
		case int64(math.Abs(float64(cached.Offset))) > offset:
			result[cached.Ident] = cached.Offset
		}
	}
	return result
}
// SyncTargets runs one synchronous sync and terminates the process through
// log.Fatalln if it fails; otherwise it starts the periodic sync goroutine.
func (tc *TargetCacheType) SyncTargets() {
	if err := tc.syncTargets(); err != nil {
		log.Fatalln("failed to sync targets:", err)
	}

	go tc.loopSyncTargets()
}
// loopSyncTargets sleeps 9 seconds and then syncs, forever. A failed sync is
// logged as a warning and the loop continues.
func (tc *TargetCacheType) loopSyncTargets() {
	const interval = 9000 * time.Millisecond
	for {
		time.Sleep(interval)
		err := tc.syncTargets()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync targets:", err)
	}
}
// syncTargets performs one sync pass. When the statistics are unchanged it
// only refreshes beat times (and, every sixth such pass, host metas) from
// Redis. Otherwise it reloads all targets, fills in host meta and beat time,
// and replaces the cache.
func (tc *TargetCacheType) syncTargets() error {
	const (
		recordName = "targets"
		metricName = "sync_targets"
	)
	begin := time.Now()

	stat, err := models.TargetStatistics(tc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to call TargetStatistics")
	}

	unchanged := !tc.StatChanged(stat.Total, stat.LastUpdated)
	if unchanged {
		tc.refreshBeatTime()

		// HostMeta (CPU/memory, etc.) changes less often than the heartbeat, so it
		// is refreshed once every 6 cycles (about 54 seconds) to reduce Redis load.
		tc.metaSyncCycle++
		if tc.metaSyncCycle >= 6 {
			tc.metaSyncCycle = 0
			tc.refreshHostMetas()
		}

		tc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		tc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.TargetGetsAll(tc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to call TargetGetsAll")
	}

	m := make(map[string]*models.Target)

	// Lookups in an empty meta map simply miss, so no emptiness guard is needed.
	metaMap := tc.GetHostMetas(lst)
	for _, target := range lst {
		if meta, ok := metaMap[target.Ident]; ok {
			target.FillMeta(meta)
		}
	}

	// Fetch heartbeat times from Redis in bulk and fill BeatTime.
	models.FillTargetsBeatTime(tc.redis, lst)

	for idx := range lst {
		m[lst[idx].Ident] = lst[idx]
	}

	tc.Set(m, stat.Total, stat.LastUpdated)
	tc.metaSyncCycle = 0 // the full sync already included meta, so restart the counter

	ms := time.Since(begin).Milliseconds()
	count := len(lst)
	tc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	tc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(count))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, count, "success")
	return nil
}
// GetHostUpdateTime returns ident -> beat time for the given idents. Values
// come from Redis; an ident missing there falls back to the cached target's
// BeatTime when one is cached. With a nil redis it returns an empty map.
func (tc *TargetCacheType) GetHostUpdateTime(targets []string) map[string]int64 {
	if tc.redis == nil {
		return map[string]int64{}
	}

	beatTimes := models.FetchBeatTimesFromRedis(tc.redis, targets)
	for idx := range targets {
		ident := targets[idx]
		if _, known := beatTimes[ident]; known {
			continue
		}
		// if not exists, get from cache
		if target, cached := tc.Get(ident); cached {
			beatTimes[ident] = target.BeatTime
		}
	}
	return beatTimes
}
// GetHostMetas fetches the host metas of the given targets from Redis and
// returns them keyed by the decoded meta's Hostname.
//
// Keys are built with models.WrapIdent(target.Ident) and fetched in batches
// of 100. Missing (nil) values are skipped. Values that fail to decode are
// logged and skipped, for every batch including the last one. With a nil
// redis the result is an empty, non-nil map.
func (tc *TargetCacheType) GetHostMetas(targets []*models.Target) map[string]*models.HostMeta {
	metaMap := make(map[string]*models.HostMeta)
	if tc.redis == nil {
		return metaMap
	}

	const batchSize = 100

	// fetch loads one batch of keys and merges every decodable meta into metaMap.
	fetch := func(keys []string) {
		if len(keys) == 0 {
			// Nothing left over: avoid a pointless Redis round trip.
			return
		}
		for _, value := range storage.MGet(context.Background(), tc.redis, keys) {
			if value == nil {
				continue
			}
			meta := new(models.HostMeta)
			if err := json.Unmarshal(value, meta); err != nil {
				logger.Errorf("failed to unmarshal host meta: %s value:%v", err, value)
				continue
			}
			metaMap[meta.Hostname] = meta
		}
	}

	keys := make([]string, 0, batchSize)
	for _, target := range targets {
		keys = append(keys, models.WrapIdent(target.Ident))
		if len(keys) == batchSize {
			fetch(keys)
			keys = keys[:0]
		}
	}
	fetch(keys)

	return metaMap
}
================================================
FILE: memsto/task_tpl_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// TaskTplCache is an in-memory cache of task templates keyed by template id,
// plus the statistics of the last load used to detect changes.
type TaskTplCache struct {
// statTotal and statLastUpdated hold the statistics given to the latest Set call.
statTotal int64
statLastUpdated int64
ctx *ctx.Context
tpls map[int64]*models.TaskTpl
// The embedded RWMutex is taken by Set and Get around accesses to tpls.
sync.RWMutex
}
// NewTaskTplCache allocates an empty task-template cache bound to ctx, runs
// SyncTaskTpl on it, and returns it.
func NewTaskTplCache(ctx *ctx.Context) *TaskTplCache {
	cache := new(TaskTplCache)
	// Both statistics start at -1.
	cache.statTotal, cache.statLastUpdated = -1, -1
	cache.ctx = ctx
	cache.tpls = map[int64]*models.TaskTpl{}

	cache.SyncTaskTpl()
	return cache
}
// Set swaps in tpls as the cached template map under the write lock and then
// stores total and lastUpdated as the current statistics.
func (ttc *TaskTplCache) Set(tpls map[int64]*models.TaskTpl, total, lastUpdated int64) {
	func() {
		ttc.Lock()
		defer ttc.Unlock()
		ttc.tpls = tpls
	}()

	ttc.statTotal, ttc.statLastUpdated = total, lastUpdated
}
// Get returns the cached task template stored under id, or nil when the map
// has no such entry.
//
// The lookup only reads the map, so it takes the shared read lock rather
// than the exclusive write lock; concurrent readers no longer serialize.
func (ttc *TaskTplCache) Get(id int64) *models.TaskTpl {
	ttc.RLock()
	defer ttc.RUnlock()
	return ttc.tpls[id]
}
// SyncTaskTpl runs one synchronous sync; on failure it prints the error and
// calls exit(1). On success it starts the periodic sync goroutine.
func (ttc *TaskTplCache) SyncTaskTpl() {
	err := ttc.syncTaskTpl()
	if err != nil {
		fmt.Println("failed to sync task tpls:", err)
		exit(1)
	}

	go ttc.loopSyncTaskTpl()
}
// syncTaskTpl performs one sync pass. It fetches the statistics and records
// "not changed" when they match the cached ones. Otherwise it reloads all
// task templates, replaces the cache, and records the duration and count.
func (ttc *TaskTplCache) syncTaskTpl() error {
	const recordName = "task_tpls"
	begin := time.Now()

	stat, err := models.TaskTplStatistics(ttc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec TaskTplStatistics")
	}

	unchanged := !ttc.StatChange(stat.Total, stat.LastUpdated)
	if unchanged {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.TaskTplGetAll(ttc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec TaskTplGetAll")
	}

	// Index the loaded templates by id before publishing them.
	m := make(map[int64]*models.TaskTpl, len(lst))
	for idx := range lst {
		m[lst[idx].Id] = lst[idx]
	}
	ttc.Set(m, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, len(m), "success")
	return nil
}
// loopSyncTaskTpl sleeps 9 seconds and then syncs, forever. A failed sync is
// logged as a warning and the loop continues.
func (ttc *TaskTplCache) loopSyncTaskTpl() {
	const interval = 9 * time.Second
	for {
		time.Sleep(interval)
		err := ttc.syncTaskTpl()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync task tpl:", err)
	}
}
// StatChange reports whether total or lastUpdated differs from the values
// stored by the last Set.
func (ttc *TaskTplCache) StatChange(total int64, lastUpdated int64) bool {
	return ttc.statTotal != total || ttc.statLastUpdated != lastUpdated
}
================================================
FILE: memsto/user_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/flashduty"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// UserCacheType is an in-memory cache of users keyed by user id, plus the
// user and user-variable statistics of the last load used to detect changes.
type UserCacheType struct {
// statTotal and statLastUpdated hold the user statistics given to the latest Set call.
statTotal int64
statLastUpdated int64
// configsTotal and configsLastUpdated hold the user-variable statistics given to the latest Set call.
configsTotal int64
configsLastUpdated int64
ctx *ctx.Context
stats *Stats
// The embedded RWMutex is taken by the accessors around reads and writes of users.
sync.RWMutex
users map[int64]*models.User // key: id
}
// NewUserCache allocates an empty user cache bound to ctx and stats, runs
// SyncUsers on it, and returns it.
func NewUserCache(ctx *ctx.Context, stats *Stats) *UserCacheType {
	cache := new(UserCacheType)
	// The user statistics start at -1; the configs statistics keep their zero value.
	cache.statTotal, cache.statLastUpdated = -1, -1
	cache.ctx = ctx
	cache.stats = stats
	cache.users = map[int64]*models.User{}

	cache.SyncUsers()
	return cache
}
// StatChanged reports whether any of the four statistics differs from the
// values stored by the last Set.
func (uc *UserCacheType) StatChanged(total, lastUpdated, configsTotal, configsLastUpdated int64) bool {
	usersSame := uc.statTotal == total && uc.statLastUpdated == lastUpdated
	configsSame := uc.configsTotal == configsTotal && uc.configsLastUpdated == configsLastUpdated
	return !usersSame || !configsSame
}
// Set swaps in m as the cached user map under the write lock and then stores
// the four statistics.
func (uc *UserCacheType) Set(m map[int64]*models.User, total, lastUpdated, configsTotal, configsLastUpdated int64) {
	func() {
		uc.Lock()
		defer uc.Unlock()
		uc.users = m
	}()

	// only one goroutine used, so no need lock
	uc.statTotal, uc.statLastUpdated = total, lastUpdated
	uc.configsTotal, uc.configsLastUpdated = configsTotal, configsLastUpdated
}
// GetByUserId returns the cached user stored under id, or nil when the map
// has no such entry.
func (uc *UserCacheType) GetByUserId(id int64) *models.User {
	uc.RLock()
	user := uc.users[id]
	uc.RUnlock()
	return user
}
// GetByUsername scans the cached users and returns the first one found whose
// Username equals name, or nil when there is none.
func (uc *UserCacheType) GetByUsername(name string) *models.User {
	uc.RLock()
	defer uc.RUnlock()

	var match *models.User
	for _, user := range uc.users {
		if user.Username == name {
			match = user
			break
		}
	}
	return match
}
// GetByUserIds returns the cached users for ids in input order, skipping ids
// that are not cached (or map to nil) and repeated ids. The result is never
// nil.
func (uc *UserCacheType) GetByUserIds(ids []int64) []*models.User {
	seen := make(map[int64]struct{})
	users := []*models.User{}

	uc.RLock()
	defer uc.RUnlock()

	for _, id := range ids {
		user := uc.users[id]
		if user == nil {
			continue
		}
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}
		users = append(users, user)
	}
	return users
}
// GetMaintainerUsers returns every cached user whose Maintainer field equals
// 1. The result is never nil.
func (uc *UserCacheType) GetMaintainerUsers() []*models.User {
	maintainers := []*models.User{}

	uc.RLock()
	defer uc.RUnlock()

	for _, user := range uc.users {
		if user.Maintainer != 1 {
			continue
		}
		maintainers = append(maintainers, user)
	}
	return maintainers
}
// SyncUsers runs one synchronous sync; on failure it prints the error and
// calls exit(1). On success it starts the periodic sync goroutine and the
// last-active-time flush goroutine.
func (uc *UserCacheType) SyncUsers() {
	if err := uc.syncUsers(); err != nil {
		fmt.Println("failed to sync users:", err)
		exit(1)
	}

	go uc.loopSyncUsers()
	go uc.loopUpdateLastActiveTime()
}
// loopSyncUsers sleeps 9 seconds and then syncs, forever. A failed sync is
// logged as a warning and the loop continues.
func (uc *UserCacheType) loopSyncUsers() {
	const interval = 9000 * time.Millisecond
	for {
		time.Sleep(interval)
		err := uc.syncUsers()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync users:", err)
	}
}
// syncUsers performs one sync pass. It fetches the user and user-variable
// statistics; when both match the cached ones it records "not changed" and
// zeroes the gauges. Otherwise it reloads all users and replaces the cache.
// When flashduty.NeedSyncUser reports true it also pushes the loaded users to
// flashduty in a background goroutine. Finally it records duration and count.
func (uc *UserCacheType) syncUsers() error {
	const (
		recordName = "users"
		metricName = "sync_users"
	)
	begin := time.Now()

	stat, err := models.UserStatistics(uc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserStatistics")
	}

	configsStat, err := models.ConfigsUserVariableStatistics(uc.ctx)
	if err != nil {
		dumper.PutSyncRecord("user_variables", begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec ConfigsUserVariableStatistics")
	}

	unchanged := !uc.StatChanged(stat.Total, stat.LastUpdated, configsStat.Total, configsStat.LastUpdated)
	if unchanged {
		uc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		uc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.UserGetAll(uc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserGetAll")
	}

	// Index the loaded users by id before publishing them.
	m := make(map[int64]*models.User)
	for idx := range lst {
		m[lst[idx].Id] = lst[idx]
	}
	uc.Set(m, stat.Total, stat.LastUpdated, configsStat.Total, configsStat.LastUpdated)

	if flashduty.NeedSyncUser(uc.ctx) {
		// The flashduty push runs in the background; its failure is only logged and recorded.
		go func() {
			if err := flashduty.SyncUsersChange(uc.ctx, lst); err != nil {
				logger.Warning("failed to sync users to flashduty:", err)
				dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to sync to flashduty: "+err.Error())
			}
		}()
	}

	ms := time.Since(begin).Milliseconds()
	count := len(m)
	uc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	uc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(count))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, count, "success")
	return nil
}
// SetLastActiveTime stores lastActiveTime on the cached user with the given
// id, under the write lock. Unknown ids are ignored.
func (uc *UserCacheType) SetLastActiveTime(userId int64, lastActiveTime int64) {
	uc.Lock()
	defer uc.Unlock()

	user, ok := uc.users[userId]
	if !ok {
		return
	}
	user.LastActiveTime = lastActiveTime
}
// loopUpdateLastActiveTime calls UpdateUsersLastActiveTime every five
// minutes, logging failures as warnings. A panic is recovered and logged by
// the deferred handler, after which this function returns.
func (uc *UserCacheType) loopUpdateLastActiveTime() {
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		logger.Errorf("panic to loopUpdateLastActiveTime(), err: %v", r)
	}()

	// Sync every five minutes
	const interval = 5 * time.Minute
	for {
		time.Sleep(interval)
		err := uc.UpdateUsersLastActiveTime()
		if err == nil {
			continue
		}
		logger.Warningf("failed to update users' last active time: %v", err)
	}
}
// UpdateUsersLastActiveTime loads all users from the database and, for each
// one whose cached LastActiveTime is newer than the stored one, writes the
// cached value back. It stops at the first failed write and returns that
// error.
func (uc *UserCacheType) UpdateUsersLastActiveTime() error {
	// read the full list of users from the database
	dbUsers, err := models.UserGetAll(uc.ctx)
	if err != nil {
		return errors.WithMessage(err, "failed to get all users from database")
	}

	for idx := range dbUsers {
		cached := uc.GetByUserId(dbUsers[idx].Id)
		// Skip users that are not cached or whose database value is already current.
		if cached == nil || dbUsers[idx].LastActiveTime >= cached.LastActiveTime {
			continue
		}

		err = models.UpdateUserLastActiveTime(uc.ctx, cached.Id, cached.LastActiveTime)
		if err == nil {
			continue
		}
		logger.Warningf("failed to update last active time for user %d: %v", cached.Id, err)
		return err
	}
	return nil
}
================================================
FILE: memsto/user_group_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// UserGroupCacheType is an in-memory cache of user groups keyed by group id,
// plus the statistics of the last load used to detect changes.
type UserGroupCacheType struct {
// statTotal and statLastUpdated hold the statistics given to the latest Set call.
statTotal int64
statLastUpdated int64
ctx *ctx.Context
stats *Stats
// The embedded RWMutex is taken by the accessors around reads and writes of ugs.
sync.RWMutex
ugs map[int64]*models.UserGroup // key: id
}
// NewUserGroupCache allocates an empty user-group cache bound to ctx and
// stats, runs SyncUserGroups on it, and returns it.
func NewUserGroupCache(ctx *ctx.Context, stats *Stats) *UserGroupCacheType {
	cache := new(UserGroupCacheType)
	// Both statistics start at -1.
	cache.statTotal, cache.statLastUpdated = -1, -1
	cache.ctx = ctx
	cache.stats = stats
	cache.ugs = map[int64]*models.UserGroup{}

	cache.SyncUserGroups()
	return cache
}
// StatChanged reports whether total or lastUpdated differs from the values
// stored by the last Set.
func (ugc *UserGroupCacheType) StatChanged(total, lastUpdated int64) bool {
	return ugc.statTotal != total || ugc.statLastUpdated != lastUpdated
}
// Set swaps in ugs as the cached group map under the write lock and then
// stores total and lastUpdated as the current statistics.
func (ugc *UserGroupCacheType) Set(ugs map[int64]*models.UserGroup, total, lastUpdated int64) {
	func() {
		ugc.Lock()
		defer ugc.Unlock()
		ugc.ugs = ugs
	}()

	// only one goroutine used, so no need lock
	ugc.statTotal, ugc.statLastUpdated = total, lastUpdated
}
// GetByUserGroupId returns the cached group stored under id, or nil when the
// map has no such entry.
func (ugc *UserGroupCacheType) GetByUserGroupId(id int64) *models.UserGroup {
	ugc.RLock()
	group := ugc.ugs[id]
	ugc.RUnlock()
	return group
}
// GetByUserGroupIds returns the cached groups for ids in input order,
// skipping ids that are not cached (or map to nil) and repeated ids. The
// result is never nil.
func (ugc *UserGroupCacheType) GetByUserGroupIds(ids []int64) []*models.UserGroup {
	seen := make(map[int64]struct{})
	groups := []*models.UserGroup{}

	ugc.RLock()
	defer ugc.RUnlock()

	for _, id := range ids {
		group := ugc.ugs[id]
		if group == nil {
			continue
		}
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}
		groups = append(groups, group)
	}
	return groups
}
// SyncUserGroups runs one synchronous sync; on failure it prints the error
// and calls exit(1). On success it starts the periodic sync goroutine.
func (ugc *UserGroupCacheType) SyncUserGroups() {
	if err := ugc.syncUserGroups(); err != nil {
		fmt.Println("failed to sync user groups:", err)
		exit(1)
	}

	go ugc.loopSyncUserGroups()
}
// loopSyncUserGroups sleeps 9 seconds and then syncs, forever. A failed sync
// is logged as a warning and the loop continues.
func (ugc *UserGroupCacheType) loopSyncUserGroups() {
	const interval = 9000 * time.Millisecond
	for {
		time.Sleep(interval)
		err := ugc.syncUserGroups()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync user groups:", err)
	}
}
// syncUserGroups performs one sync pass. It fetches the statistics; when they
// match the cached ones it records "not changed" and zeroes the gauges.
// Otherwise it reloads all groups, attaches member user ids to each group,
// replaces the cache, and records the duration and group count.
func (ugc *UserGroupCacheType) syncUserGroups() error {
	const (
		recordName = "user_groups"
		metricName = "sync_user_groups"
	)
	begin := time.Now()

	stat, err := models.UserGroupStatistics(ugc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserGroupStatistics")
	}

	unchanged := !ugc.StatChanged(stat.Total, stat.LastUpdated)
	if unchanged {
		ugc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		ugc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.UserGroupGetAll(ugc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserGroupGetAll")
	}

	// Index the loaded groups by id.
	m := make(map[int64]*models.UserGroup)
	for idx := range lst {
		m[lst[idx].Id] = lst[idx]
	}

	// fill user ids
	members, err := models.UserGroupMemberGetAll(ugc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query members: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserGroupMemberGetAll")
	}
	for idx := range members {
		// A missing key yields nil, so one nil check covers both unknown and nil groups.
		if ug := m[members[idx].GroupId]; ug != nil {
			ug.UserIds = append(ug.UserIds, members[idx].UserId)
		}
	}

	ugc.Set(m, stat.Total, stat.LastUpdated)

	ms := time.Since(begin).Milliseconds()
	count := len(m)
	ugc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	ugc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(count))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, count, "success")
	return nil
}
================================================
FILE: memsto/user_token_cache.go
================================================
package memsto
import (
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// UserTokenCacheType is an in-memory cache mapping token strings to users,
// plus a record of when each token was last looked up.
type UserTokenCacheType struct {
// statTotal holds the token count given to the latest Set call.
statTotal int64
ctx *ctx.Context
stats *Stats
// The embedded RWMutex is taken around accesses to tokens and tokensLastUsed.
sync.RWMutex
// tokens maps a token string to the user resolved for it during sync.
tokens map[string]*models.User
// tokensLastUsed maps a token string to the Unix time of its latest GetByToken call.
tokensLastUsed map[string]int64
}
// NewUserTokenCache allocates an empty token cache bound to ctx and stats,
// runs SyncUserTokens on it, and returns it.
func NewUserTokenCache(ctx *ctx.Context, stats *Stats) *UserTokenCacheType {
	cache := new(UserTokenCacheType)
	// The token count starts at -1.
	cache.statTotal = -1
	cache.ctx = ctx
	cache.stats = stats
	cache.tokens = map[string]*models.User{}
	cache.tokensLastUsed = map[string]int64{}

	cache.SyncUserTokens()
	return cache
}
// StatChanged reports whether total differs from the count stored by the
// last Set.
func (utc *UserTokenCacheType) StatChanged(total int64) bool {
	return utc.statTotal != total
}
// Set swaps in tokenUsers as the cached token map under the write lock and
// then stores total as the current token count.
func (utc *UserTokenCacheType) Set(tokenUsers map[string]*models.User, total int64) {
	func() {
		utc.Lock()
		defer utc.Unlock()
		utc.tokens = tokenUsers
	}()

	utc.statTotal = total
}
// GetByToken returns the user associated with token, or nil when the token is
// not cached.
//
// The last-used timestamp is recorded only for tokens that resolve to a
// user. Recording it for every looked-up string would let arbitrary invalid
// tokens grow tokensLastUsed and later be passed to the database update in
// updateUserTokenLastUsedTime.
//
// The write lock is required because a successful lookup mutates
// tokensLastUsed.
func (utc *UserTokenCacheType) GetByToken(token string) *models.User {
	utc.Lock()
	defer utc.Unlock()

	user := utc.tokens[token]
	if user == nil {
		return nil
	}

	utc.tokensLastUsed[token] = time.Now().Unix()
	return user
}
// SyncUserTokens runs one synchronous sync; on failure it prints the error
// and calls exit(1). On success it starts the periodic sync goroutine and the
// last-used-time flush goroutine.
func (utc *UserTokenCacheType) SyncUserTokens() {
	if err := utc.syncUserTokens(); err != nil {
		fmt.Println("failed to sync user tokens:", err)
		exit(1)
	}

	go utc.loopSyncUserTokens()
	go utc.loopUpdateUserTokenLastUsedTime()
}
// loopUpdateUserTokenLastUsedTime sleeps 10 minutes and then flushes the
// recorded last-used times, forever. A failed flush is logged as a warning.
func (utc *UserTokenCacheType) loopUpdateUserTokenLastUsedTime() {
	const interval = 10 * time.Minute
	for {
		time.Sleep(interval)
		err := utc.updateUserTokenLastUsedTime()
		if err == nil {
			continue
		}
		logger.Warning("failed to update user token last used time:", err)
	}
}
// loopSyncUserTokens sleeps 9 seconds and then syncs, forever. A failed sync
// is logged as a warning and the loop continues.
func (utc *UserTokenCacheType) loopSyncUserTokens() {
	const interval = 9000 * time.Millisecond
	for {
		time.Sleep(interval)
		err := utc.syncUserTokens()
		if err == nil {
			continue
		}
		logger.Warning("failed to sync user tokens:", err)
	}
}
// updateUserTokenLastUsedTime writes the recorded last-used time of each
// recently used token to the database. Entries with a zero timestamp are
// skipped; entries idle for more than 1800 seconds are deleted from the
// record instead of being written. Individual write failures are logged, and
// the function always returns nil.
func (utc *UserTokenCacheType) updateUserTokenLastUsedTime() error {
	const maxIdleSeconds = 1800

	pending := make(map[string]int64)
	now := time.Now().Unix()

	// Collect the entries to write under the lock; the database calls happen after it is released.
	utc.Lock()
	for token, usedAt := range utc.tokensLastUsed {
		switch {
		case usedAt == 0:
			// nothing recorded for this token
		case now-usedAt > maxIdleSeconds:
			// The token has not been used for 30 minutes; stop updating the database for it.
			delete(utc.tokensLastUsed, token)
		default:
			pending[token] = usedAt
		}
	}
	utc.Unlock()

	for token, usedAt := range pending {
		if err := models.UserTokenUpdateLastUsedTime(utc.ctx, token, usedAt); err != nil {
			logger.Warning("failed to update user token last used time:", err)
		}
	}
	return nil
}
// syncUserTokens performs one sync pass. It fetches the token count; when it
// matches the cached one it records "not changed" and zeroes the gauges.
// Otherwise it reloads all tokens and all users, maps each token to the user
// with the same Username (tokens without such a user are dropped), replaces
// the cache, and records the duration and the number of mapped tokens.
func (utc *UserTokenCacheType) syncUserTokens() error {
	const (
		recordName = "user_tokens"
		metricName = "sync_user_tokens"
	)
	begin := time.Now()

	total, err := models.UserTokenTotal(utc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query statistics: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserTokenStatistics")
	}

	unchanged := !utc.StatChanged(total)
	if unchanged {
		utc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(0)
		utc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(0)
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "not changed")
		return nil
	}

	lst, err := models.UserTokenGetAll(utc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserTokenGetAll")
	}

	users, err := models.UserGetAll(utc.ctx)
	if err != nil {
		dumper.PutSyncRecord(recordName, begin.Unix(), -1, -1, "failed to query records: "+err.Error())
		return errors.WithMessage(err, "failed to exec UserGetAll")
	}

	// Index users by username so each token can be resolved with one lookup.
	byUsername := make(map[string]*models.User)
	for idx := range users {
		byUsername[users[idx].Username] = users[idx]
	}

	tokenUsers := make(map[string]*models.User)
	for idx := range lst {
		if user, ok := byUsername[lst[idx].Username]; ok {
			tokenUsers[lst[idx].Token] = user
		}
	}

	utc.Set(tokenUsers, total)

	ms := time.Since(begin).Milliseconds()
	count := len(tokenUsers)
	utc.stats.GaugeCronDuration.WithLabelValues(metricName).Set(float64(ms))
	utc.stats.GaugeSyncNumber.WithLabelValues(metricName).Set(float64(count))
	dumper.PutSyncRecord(recordName, begin.Unix(), ms, count, "success")
	return nil
}
================================================
FILE: models/alert_aggr_view.go
================================================
package models
import (
"errors"
"fmt"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/slice"
)
// AlertAggrView stores an aggregation rule used when browsing the alert
// aggregation view.
type AlertAggrView struct {
Id int64 `json:"id" gorm:"primaryKey"`
Name string `json:"name"`
// Rule is either a template containing "{{" or a "::"-separated list of
// "field:<name>" / "tagkey:<name>" pairs; see Verify.
Rule string `json:"rule"`
// Add sets Cate to 1; AlertAggrViewGets also returns rows whose cate is 0
// regardless of creator.
Cate int `json:"cate"`
CreateAt int64 `json:"create_at"`
CreateBy int64 `json:"create_by"`
UpdateAt int64 `json:"update_at"`
}
// TableName reports the database table name used for AlertAggrView.
func (v *AlertAggrView) TableName() string {
	const table = "alert_aggr_view"
	return table
}
// Verify trims Name and Rule in place and validates them.
//
// Both must be non-blank. A rule containing "{{" is accepted as is. Any other
// rule must be a "::"-separated list of "<kind>:<name>" pairs where kind is
// "field" or "tagkey"; for "field" the name must be one of the supported
// field names.
func (v *AlertAggrView) Verify() error {
	v.Name = strings.TrimSpace(v.Name)
	if v.Name == "" {
		return errors.New("name is blank")
	}

	v.Rule = strings.TrimSpace(v.Rule)
	if v.Rule == "" {
		return errors.New("rule is blank")
	}

	// Template-style rules are not checked any further.
	if strings.Contains(v.Rule, "{{") {
		return nil
	}

	supportedFields := []string{
		"cluster",
		"group_id",
		"group_name",
		"rule_id",
		"rule_name",
		"severity",
		"runbook_url",
		"target_ident",
		"target_note",
	}

	for _, segment := range strings.Split(v.Rule, "::") {
		parts := strings.Split(segment, ":")
		if len(parts) != 2 {
			return errors.New("rule invalid")
		}

		kind, name := parts[0], parts[1]
		switch kind {
		case "field":
			// Only a limited set of fields is supported.
			if !slice.ContainsString(supportedFields, name) {
				return fmt.Errorf("unsupported field: %s", name)
			}
		case "tagkey":
			// any tag key is accepted
		default:
			return errors.New("rule invalid")
		}
	}
	return nil
}
// Add verifies v, stamps CreateAt and UpdateAt with the current Unix time,
// sets Cate to 1, and inserts the row.
func (v *AlertAggrView) Add(ctx *ctx.Context) error {
	err := v.Verify()
	if err != nil {
		return err
	}

	ts := time.Now().Unix()
	v.CreateAt, v.UpdateAt = ts, ts
	v.Cate = 1

	return Insert(ctx, v)
}
// Update verifies v, refreshes UpdateAt, and writes the name, rule, cate,
// update_at and create_by columns for this row.
func (v *AlertAggrView) Update(ctx *ctx.Context) error {
	err := v.Verify()
	if err != nil {
		return err
	}

	v.UpdateAt = time.Now().Unix()

	tx := DB(ctx).Model(v).Select("name", "rule", "cate", "update_at", "create_by")
	return tx.Updates(v).Error
}
// AlertAggrViewDel deletes the aggregation views with the given ids. It is a
// no-op when ids is empty.
//
// An optional creator id may be passed as the first createBy value; when
// present, only rows created by that user are deleted (safe delete). Only the
// first value is used. It is bound as a single scalar, because binding the
// whole variadic slice to "create_by = ?" makes the comparison operand a list.
func AlertAggrViewDel(ctx *ctx.Context, ids []int64, createBy ...interface{}) error {
	if len(ids) == 0 {
		return nil
	}

	if len(createBy) > 0 {
		return DB(ctx).Where("id in ? and create_by = ?", ids, createBy[0]).Delete(new(AlertAggrView)).Error
	}

	return DB(ctx).Where("id in ?", ids).Delete(new(AlertAggrView)).Error
}
// AlertAggrViewGets returns the views created by createBy together with all
// views whose cate is 0, ordered by Cate ascending and then Name ascending.
func AlertAggrViewGets(ctx *ctx.Context, createBy interface{}) ([]AlertAggrView, error) {
	var lst []AlertAggrView
	err := DB(ctx).Where("create_by = ? or cate = 0", createBy).Find(&lst).Error

	if err == nil && len(lst) > 1 {
		sort.Slice(lst, func(i, j int) bool {
			if lst[i].Cate != lst[j].Cate {
				return lst[i].Cate < lst[j].Cate
			}
			return lst[i].Name < lst[j].Name
		})
	}

	return lst, err
}
// AlertAggrViewGet returns the first view matching the given where clause and
// arguments, or (nil, nil) when no row matches.
func AlertAggrViewGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertAggrView, error) {
	var matches []*AlertAggrView
	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}

	if len(matches) > 0 {
		return matches[0], nil
	}
	return nil, nil
}
// GetAlertAggrViewByViewID loads the view with the given id. Unlike
// AlertAggrViewGet, a missing row is reported as an error.
func GetAlertAggrViewByViewID(ctx *ctx.Context, viewID int64) (*AlertAggrView, error) {
	view, err := AlertAggrViewGet(ctx, "id = ?", viewID)
	switch {
	case err != nil:
		return nil, err
	case view == nil:
		return nil, errors.New("alert aggr view not found")
	default:
		return view, nil
	}
}
================================================
FILE: models/alert_cur_event.go
================================================
package models
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
"reflect"
"strconv"
"strings"
"text/template"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/ccfos/nightingale/v6/pkg/unit"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/toolkits/pkg/logger"
)
// AlertCurEvent is the row model of the alert_cur_event table (see TableName).
// Several values exist twice: a string column stored in the database
// (json:"-") and a decoded companion field tagged gorm:"-" that is exposed
// through JSON. DB2FE, FE2DB and DB2Mem convert between the two forms.
type AlertCurEvent struct {
Id int64 `json:"id" gorm:"primaryKey"`
Cate string `json:"cate"`
Cluster string `json:"cluster"`
DatasourceId int64 `json:"datasource_id"`
GroupId int64 `json:"group_id"` // busi group id
GroupName string `json:"group_name"` // busi group name
Hash string `json:"hash"` // rule_id + vector_key
RuleId int64 `json:"rule_id"`
RuleName string `json:"rule_name"`
RuleNote string `json:"rule_note"`
RuleProd string `json:"rule_prod"`
RuleAlgo string `json:"rule_algo"`
Severity int `json:"severity"`
PromForDuration int `json:"prom_for_duration"`
PromQl string `json:"prom_ql"`
RuleConfig string `json:"-" gorm:"rule_config"` // rule config
RuleConfigJson interface{} `json:"rule_config" gorm:"-"` // rule config for fe
PromEvalInterval int `json:"prom_eval_interval"`
Callbacks string `json:"-"` // for db
CallbacksJSON []string `json:"callbacks" gorm:"-"` // for fe
RunbookUrl string `json:"runbook_url"`
NotifyRecovered int `json:"notify_recovered"`
NotifyChannels string `json:"-"` // for db
NotifyChannelsJSON []string `json:"notify_channels,omitempty" gorm:"-"` // for fe
NotifyGroups string `json:"-"` // for db
NotifyGroupsJSON []string `json:"notify_groups,omitempty" gorm:"-"` // for fe
NotifyGroupsObj []*UserGroup `json:"notify_groups_obj,omitempty" gorm:"-"` // for fe
TargetIdent string `json:"target_ident"`
TargetNote string `json:"target_note"`
TriggerTime int64 `json:"trigger_time"`
TriggerValue string `json:"trigger_value"`
TriggerValues string `json:"trigger_values" gorm:"-"`
TriggerValuesJson EventTriggerValues `json:"trigger_values_json" gorm:"-"`
Tags string `json:"-"` // for db
TagsJSON []string `json:"tags" gorm:"-"` // for fe
TagsMap map[string]string `json:"tags_map" gorm:"-"` // for internal usage
OriginalTags string `json:"-"` // for db
OriginalTagsJSON []string `json:"original_tags" gorm:"-"` // for fe
Annotations string `json:"-"` // for db, JSON-encoded form of AnnotationsJSON
AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
IsRecovered bool `json:"is_recovered" gorm:"-"` // for notify.py
NotifyUsersObj []*User `json:"notify_users_obj,omitempty" gorm:"-"` // for notify.py
LastEvalTime int64 `json:"last_eval_time" gorm:"-"` // for notify.py; time of the last evaluation
LastSentTime int64 `json:"last_sent_time" gorm:"-"` // time the last notification was sent
FirstEvalTime int64 `json:"first_eval_time" gorm:"-"` // time the anomaly was first detected
NotifyCurNumber int `json:"notify_cur_number"` // notify: current number
FirstTriggerTime int64 `json:"first_trigger_time"` // first trigger time of a run of consecutive alerts
ExtraConfig interface{} `json:"extra_config" gorm:"-"`
Status int `json:"status" gorm:"-"`
Claimant string `json:"claimant" gorm:"-"`
SubRuleId int64 `json:"sub_rule_id" gorm:"-"`
ExtraInfo []string `json:"extra_info" gorm:"-"`
Target *Target `json:"target" gorm:"-"`
RecoverConfig RecoverConfig `json:"recover_config" gorm:"-"`
RuleHash string `json:"rule_hash" gorm:"-"`
ExtraInfoMap []map[string]string `json:"extra_info_map" gorm:"-"`
NotifyRuleIds []int64 `json:"notify_rule_ids" gorm:"serializer:json"`
NotifyRuleId int64 `json:"notify_rule_id" gorm:"-"`
NotifyRuleName string `json:"notify_rule_name" gorm:"-"`
NotifyVersion int `json:"notify_version" gorm:"-"` // 0: old, 1: new
NotifyRules []*EventNotifyRule `json:"notify_rules" gorm:"-"`
RecoverTime int64 `json:"recover_time" gorm:"-"`
}
// EventNotifyRule is the id/name pair of a notify rule as serialized in the
// notify_rules JSON field of an event.
type EventNotifyRule struct {
Id int64 `json:"id"`
Name string `json:"name"`
}
// SetTagsMap rebuilds TagsMap from TagsJSON. Each entry is trimmed and split
// at its first "="; blank entries and entries without "=" are skipped. Any
// previous content of TagsMap is discarded.
func (e *AlertCurEvent) SetTagsMap() {
	tagsMap := make(map[string]string)
	for _, raw := range e.TagsJSON {
		entry := strings.TrimSpace(raw)
		if entry == "" {
			continue
		}
		key, value, found := strings.Cut(entry, "=")
		if !found {
			continue
		}
		tagsMap[key] = value
	}
	e.TagsMap = tagsMap
}
// JsonTagsAndValue walks the struct fields by reflection and returns a map
// from each field's JSON name to its value rendered as a string.
//
// Fields without a json tag and fields tagged `json:"-"` are omitted. When the
// tag carries only options (e.g. `json:",omitempty"`) the Go field name is used
// as the key. Strings are used verbatim, signed integers use %d, floats %f and
// booleans %v; every other type is JSON-encoded, falling back to %v if
// encoding fails.
func (e *AlertCurEvent) JsonTagsAndValue() map[string]string {
	structVal := reflect.ValueOf(e).Elem()
	structType := structVal.Type()
	result := make(map[string]string)

	for idx := 0; idx < structType.NumField(); idx++ {
		sf := structType.Field(idx)

		jsonTag := sf.Tag.Get("json")
		if jsonTag == "" {
			continue
		}

		// The name is everything before the first comma; options follow it.
		name, _, _ := strings.Cut(jsonTag, ",")
		if name == "-" {
			continue
		}
		if name == "" {
			name = sf.Name
		}

		var text string
		switch val := structVal.Field(idx).Interface().(type) {
		case string:
			text = val
		case int, int8, int16, int32, int64:
			text = fmt.Sprintf("%d", val)
		case float32, float64:
			text = fmt.Sprintf("%f", val)
		case bool:
			text = fmt.Sprintf("%v", val)
		default:
			// Slices, maps, structs and interfaces are rendered as JSON.
			encoded, err := json.Marshal(val)
			if err != nil {
				text = fmt.Sprintf("%v", val)
			} else {
				text = string(encoded)
			}
		}

		result[name] = text
	}
	return result
}
// EventTriggerValues holds formatted trigger values keyed by string,
// serialized under "values_with_unit".
type EventTriggerValues struct {
ValuesWithUnit map[string]unit.FormattedValue `json:"values_with_unit"`
}
// TableName returns the name of the database table backing AlertCurEvent.
func (e *AlertCurEvent) TableName() string {
	const table = "alert_cur_event"
	return table
}
// Add stores the event through the package-level Insert helper and returns
// whatever error that helper reports.
func (e *AlertCurEvent) Add(ctx *ctx.Context) error {
	if err := Insert(ctx, e); err != nil {
		return err
	}
	return nil
}
// AggrRule is one parsed segment of an aggregation rule string such as
// "field:group_name::tagkey:ident". Type is "field" or "tagkey" and Value is
// the field name or tag key that follows the colon (see parseAggrRules).
type AggrRule struct {
Type string
Value string
}
// ParseRule renders the Go template stored in the named event field, using the
// event itself as template data.
//
// For "annotations", e.Annotations is decoded into AnnotationsJSON (a decode
// failure is logged and replaced by a single "error" entry holding the raw
// text), every value is rendered as its own template with $labels and $value
// predefined and a "query" function bound to the event's datasource, and the
// result is re-encoded into e.Annotations. Template failures are written into
// the affected annotation value; this branch always returns nil.
//
// For any other field, $labels, $value and $annotations are predefined and a
// parse or execute error is returned. Only "rule_name" and "rule_note" have
// their rendered output stored back on the event. A field whose current value
// is blank is left untouched.
func (e *AlertCurEvent) ParseRule(field string) error {
	raw := strings.TrimSpace(e.GetField(field))
	if raw == "" {
		return nil
	}

	if field != "annotations" {
		const preamble = "{{$labels := .TagsMap}}" +
			"{{$value := .TriggerValue}}" +
			"{{$annotations := .AnnotationsJSON}}"

		tpl, err := template.New(fmt.Sprint(e.RuleId)).Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(preamble + raw)
		if err != nil {
			return err
		}

		var rendered bytes.Buffer
		if err = tpl.Execute(&rendered, e); err != nil {
			return err
		}

		switch field {
		case "rule_name":
			e.RuleName = rendered.String()
		case "rule_note":
			e.RuleNote = rendered.String()
		}
		return nil
	}

	if err := json.Unmarshal([]byte(e.Annotations), &e.AnnotationsJSON); err != nil {
		logger.Warningf("ruleid:%d failed to parse annotations: %v", e.RuleId, err)
		e.AnnotationsJSON = make(map[string]string)
		e.AnnotationsJSON["error"] = e.Annotations
	}

	const annotationPreamble = "{{$labels := .TagsMap}}" +
		"{{$value := .TriggerValue}}"

	for key, tplText := range e.AnnotationsJSON {
		// Each annotation gets its own function map whose "query" defaults to
		// the event's datasource unless an explicit id is passed.
		funcs := tplx.NewTemplateFuncMap()
		funcs["query"] = func(promql string, param ...int64) tplx.QueryResult {
			dsID := e.DatasourceId
			if len(param) > 0 {
				dsID = param[0]
			}
			return tplx.ConvertToQueryResult(tplx.Query(dsID, promql))
		}

		tpl, err := template.New(fmt.Sprint(e.RuleId)).Funcs(funcs).Parse(annotationPreamble + tplText)
		if err != nil {
			e.AnnotationsJSON[key] = fmt.Sprintf("failed to parse annotations: %v", err)
			continue
		}

		var rendered bytes.Buffer
		if err = tpl.Execute(&rendered, e); err != nil {
			e.AnnotationsJSON[key] = fmt.Sprintf("failed to parse annotations: %v", err)
			continue
		}
		e.AnnotationsJSON[key] = rendered.String()
	}

	encoded, err := json.Marshal(e.AnnotationsJSON)
	if err != nil {
		e.AnnotationsJSON = make(map[string]string)
		e.AnnotationsJSON["error"] = fmt.Sprintf("failed to parse annotations: %v", err)
		return nil
	}
	e.Annotations = string(encoded)
	return nil
}
// ParseURL renders url as a Go template with the event as data and $labels,
// $value and $annotations predefined. This is best-effort: a blank url, a
// template parse error or an execution error all yield the original url
// unchanged, and the returned error is always nil.
func (e *AlertCurEvent) ParseURL(url string) (string, error) {
	trimmed := strings.TrimSpace(url)
	if trimmed == "" {
		return url, nil
	}

	const preamble = "{{$labels := .TagsMap}}" +
		"{{$value := .TriggerValue}}" +
		"{{$annotations := .AnnotationsJSON}}"

	tpl, err := template.New("callbackUrl" + fmt.Sprint(e.RuleId)).Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(preamble + trimmed)
	if err != nil {
		return url, nil
	}

	var out bytes.Buffer
	if execErr := tpl.Execute(&out, e); execErr != nil {
		return url, nil
	}
	return out.String(), nil
}
// parseAggrRules splits an aggregation rule such as
// "field:group_name::field:severity::tagkey:ident" into its segments.
//
// Segments are separated by "::" and each must have the form "<type>:<value>"
// where type is "field" or "tagkey". Invalid input is reported through
// ginx.Bomb with HTTP 400; this function relies on Bomb not returning
// (presumably it panics — confirm against pkg/ginx).
func parseAggrRules(rule string) []*AggrRule {
	// strings.Split never returns an empty slice for a non-empty separator, so
	// emptiness has to be checked on the input itself.
	if strings.TrimSpace(rule) == "" {
		ginx.Bomb(http.StatusBadRequest, "rule empty")
	}

	segments := strings.Split(rule, "::")
	rules := make([]*AggrRule, len(segments))
	for i, segment := range segments {
		pair := strings.Split(segment, ":")
		if len(pair) != 2 {
			ginx.Bomb(http.StatusBadRequest, "rule invalid")
		}

		if pair[0] != "field" && pair[0] != "tagkey" {
			ginx.Bomb(http.StatusBadRequest, "rule invalid")
		}

		rules[i] = &AggrRule{
			Type:  pair[0],
			Value: pair[1],
		}
	}

	return rules
}
// GenCardTitle builds the title of an aggregation card for this event.
//
// A rule containing "{{" is treated as a Go template executed with the event
// as data; parse or execution failures are returned as the title text itself.
// Otherwise the rule is parsed with parseAggrRules, each segment is resolved
// through GetField ("field") or GetTagValue ("tagkey"), empty results become
// "Others", and the parts are joined with "::". The returned error is always
// nil.
func (e *AlertCurEvent) GenCardTitle(rule string) (string, error) {
	if !strings.Contains(rule, "{{") {
		// Legacy "type:value::type:value" configuration.
		parsed := parseAggrRules(rule)
		parts := make([]string, 0, len(parsed))
		for _, r := range parsed {
			var val string
			switch r.Type {
			case "field":
				val = e.GetField(r.Value)
			case "tagkey":
				val = e.GetTagValue(r.Value)
			}
			if val == "" {
				val = "Others"
			}
			parts = append(parts, val)
		}
		return strings.Join(parts, "::"), nil
	}

	// New-style configuration: format the title with a Go template.
	tmpl, err := template.New("card_title").Parse(rule)
	if err != nil {
		return fmt.Sprintf("failed to parse card title: %v", err), nil
	}

	var title bytes.Buffer
	if execErr := tmpl.Execute(&title, e); execErr != nil {
		return fmt.Sprintf("failed to execute card title: %v", execErr), nil
	}
	return title.String(), nil
}
// GetTagValue returns the value of the first "key=value" entry in TagsJSON
// whose key is exactly tagkey, or "" when there is none.
//
// Entries are trimmed before matching, in line with SetTagsMap. The key must
// sit at the start of the entry: searching for "key=" anywhere in the entry
// would also hit keys that merely end in tagkey (e.g. "target_ident" for
// "ident") or values that contain "key=".
func (e *AlertCurEvent) GetTagValue(tagkey string) string {
	prefix := tagkey + "="
	for _, tag := range e.TagsJSON {
		pair := strings.TrimSpace(tag)
		if strings.HasPrefix(pair, prefix) {
			return pair[len(prefix):]
		}
	}
	return ""
}
// GetField returns the string form of the event field selected by its
// snake_case name. Numeric fields (group_id, rule_id, severity) are formatted
// in decimal. Unknown names yield "".
func (e *AlertCurEvent) GetField(field string) string {
	var value string

	switch field {
	// numeric fields
	case "group_id":
		value = fmt.Sprint(e.GroupId)
	case "rule_id":
		value = fmt.Sprint(e.RuleId)
	case "severity":
		value = fmt.Sprint(e.Severity)

	// string fields
	case "cluster":
		value = e.Cluster
	case "group_name":
		value = e.GroupName
	case "rule_name":
		value = e.RuleName
	case "rule_note":
		value = e.RuleNote
	case "runbook_url":
		value = e.RunbookUrl
	case "target_ident":
		value = e.TargetIdent
	case "target_note":
		value = e.TargetNote
	case "callbacks":
		value = e.Callbacks
	case "annotations":
		value = e.Annotations
	}

	return value
}
// ToHis builds the AlertHisEvent counterpart of this event by copying the
// shared fields. When the event is recovered, IsRecovered is set to 1 and
// RecoverTime to LastEvalTime; otherwise both stay 0. The ctx argument is not
// used.
func (e *AlertCurEvent) ToHis(ctx *ctx.Context) *AlertHisEvent {
	his := &AlertHisEvent{
		Cate:             e.Cate,
		Cluster:          e.Cluster,
		DatasourceId:     e.DatasourceId,
		GroupId:          e.GroupId,
		GroupName:        e.GroupName,
		Hash:             e.Hash,
		RuleId:           e.RuleId,
		RuleName:         e.RuleName,
		RuleProd:         e.RuleProd,
		RuleAlgo:         e.RuleAlgo,
		RuleNote:         e.RuleNote,
		Severity:         e.Severity,
		PromForDuration:  e.PromForDuration,
		PromQl:           e.PromQl,
		PromEvalInterval: e.PromEvalInterval,
		RuleConfig:       e.RuleConfig,
		RuleConfigJson:   e.RuleConfigJson,
		Callbacks:        e.Callbacks,
		RunbookUrl:       e.RunbookUrl,
		NotifyRecovered:  e.NotifyRecovered,
		NotifyChannels:   e.NotifyChannels,
		NotifyGroups:     e.NotifyGroups,
		Annotations:      e.Annotations,
		AnnotationsJSON:  e.AnnotationsJSON,
		TargetIdent:      e.TargetIdent,
		TargetNote:       e.TargetNote,
		TriggerTime:      e.TriggerTime,
		TriggerValue:     e.TriggerValue,
		Tags:             e.Tags,
		OriginalTags:     e.OriginalTags,
		LastEvalTime:     e.LastEvalTime,
		NotifyCurNumber:  e.NotifyCurNumber,
		FirstTriggerTime: e.FirstTriggerTime,
		NotifyRuleIds:    e.NotifyRuleIds,
	}

	if e.IsRecovered {
		his.IsRecovered = 1
		his.RecoverTime = e.LastEvalTime
	}
	return his
}
// DB2FE derives the frontend-facing fields from the stored string columns:
// the space-separated lists (notify channels, notify groups, callbacks), the
// ",,"-separated tag lists, the JSON-encoded annotations and rule config, and
// TagsMap built from TagsJSON.
//
// Every derived field is populated even when a JSON column fails to decode,
// because callers in this package ignore the returned error; a failure in
// Annotations must not leave TagsMap or RuleConfigJson unset. The first decode
// error (annotations before rule config) is returned.
func (e *AlertCurEvent) DB2FE() error {
	e.NotifyChannelsJSON = strings.Fields(e.NotifyChannels)
	e.NotifyGroupsJSON = strings.Fields(e.NotifyGroups)
	e.CallbacksJSON = strings.Fields(e.Callbacks)
	e.TagsJSON = strings.Split(e.Tags, ",,")
	e.OriginalTagsJSON = strings.Split(e.OriginalTags, ",,")

	// Building the tag map cannot fail, so do it before any JSON decoding.
	e.TagsMap = make(map[string]string)
	for _, raw := range e.TagsJSON {
		pair := strings.TrimSpace(raw)
		if pair == "" {
			continue
		}

		arr := strings.SplitN(pair, "=", 2)
		if len(arr) != 2 {
			continue
		}

		e.TagsMap[arr[0]] = arr[1]
	}

	annotationsErr := json.Unmarshal([]byte(e.Annotations), &e.AnnotationsJSON)
	ruleConfigErr := json.Unmarshal([]byte(e.RuleConfig), &e.RuleConfigJson)

	if annotationsErr != nil {
		return annotationsErr
	}
	return ruleConfigErr
}
// FE2DB is the inverse of DB2FE: it serializes the frontend-facing fields back
// into the stored string columns. Lists are joined with a space, tag lists
// with ",,", and annotations and rule config are JSON-encoded. Encoding errors
// are ignored.
func (e *AlertCurEvent) FE2DB() {
	const (
		listSep = " "
		tagSep  = ",,"
	)

	e.NotifyChannels = strings.Join(e.NotifyChannelsJSON, listSep)
	e.NotifyGroups = strings.Join(e.NotifyGroupsJSON, listSep)
	e.Callbacks = strings.Join(e.CallbacksJSON, listSep)

	e.Tags = strings.Join(e.TagsJSON, tagSep)
	e.OriginalTags = strings.Join(e.OriginalTagsJSON, tagSep)

	annotations, _ := json.Marshal(e.AnnotationsJSON)
	ruleConfig, _ := json.Marshal(e.RuleConfigJson)
	e.Annotations = string(annotations)
	e.RuleConfig = string(ruleConfig)
}
// FillTagsMap replaces TagsMap with the key/value pairs found in TagsJSON.
// Entries are trimmed, then split at the first "="; blank entries and entries
// without "=" are ignored.
func (e *AlertCurEvent) FillTagsMap() {
	e.TagsMap = make(map[string]string)
	for _, raw := range e.TagsJSON {
		pair := strings.TrimSpace(raw)
		if pair == "" {
			continue
		}

		eq := strings.IndexByte(pair, '=')
		if eq < 0 {
			continue
		}

		e.TagsMap[pair[:eq]] = pair[eq+1:]
	}
}
// DB2Mem prepares an event loaded from the database for in-memory use: it
// clears IsRecovered, splits the stored list columns, rebuilds TagsJSON and
// TagsMap from Tags, and backfills FirstTriggerTime.
func (e *AlertCurEvent) DB2Mem() {
	e.IsRecovered = false

	e.NotifyGroupsJSON = strings.Fields(e.NotifyGroups)
	e.CallbacksJSON = strings.Fields(e.Callbacks)
	e.NotifyChannelsJSON = strings.Fields(e.NotifyChannels)
	e.TagsJSON = strings.Split(e.Tags, ",,")

	tagsMap := make(map[string]string)
	for _, raw := range e.TagsJSON {
		entry := strings.TrimSpace(raw)
		if entry == "" {
			continue
		}
		key, value, found := strings.Cut(entry, "=")
		if !found {
			continue
		}
		tagsMap[key] = value
	}
	e.TagsMap = tagsMap

	// Older rows may have FirstTriggerTime stored as 0; fall back to
	// TriggerTime in that case.
	if e.FirstTriggerTime == 0 {
		e.FirstTriggerTime = e.TriggerTime
	}
}
// OverrideGlobalWebhook decodes RuleConfig and reports its
// OverrideGlobalWebhook flag. A rule config that cannot be decoded is logged
// and treated as false.
func (e *AlertCurEvent) OverrideGlobalWebhook() bool {
	var parsed RuleConfig
	err := json.Unmarshal([]byte(e.RuleConfig), &parsed)
	if err == nil {
		return parsed.OverrideGlobalWebhook
	}

	logger.Warningf("failed to unmarshal rule config: %v", err)
	return false
}
// FillRuleConfigTplName decodes ruleConfig and fills TplName on each of its
// task templates with the Title of the template fetched by TplId.
//
// It returns (config, true) on success. It returns (nil, false) when the
// config cannot be decoded, has no task templates, or any referenced template
// fails to load or does not exist; those failures are logged as warnings.
func FillRuleConfigTplName(ctx *ctx.Context, ruleConfig string) (interface{}, bool) {
	var config RuleConfig
	if err := json.Unmarshal([]byte(ruleConfig), &config); err != nil {
		logger.Warningf("failed to unmarshal rule config: %v", err)
		return nil, false
	}

	if len(config.TaskTpls) == 0 {
		return nil, false
	}

	for i := range config.TaskTpls {
		tplID := config.TaskTpls[i].TplId

		tpl, err := TaskTplGetById(ctx, tplID)
		switch {
		case err != nil:
			logger.Warningf("failed to get task tpl by id:%d, %v", tplID, err)
			return nil, false
		case tpl == nil:
			logger.Warningf("task tpl not found by id:%d", tplID)
			return nil, false
		}

		config.TaskTpls[i].TplName = tpl.Title
	}
	return config, true
}
// FillNotifyGroups resolves the ids in NotifyGroupsJSON to user groups and
// appends them to NotifyGroupsObj (used by the web UI).
//
// cache is consulted before the database and updated with every group fetched,
// so it must be non-nil. Ids that are not valid integers and groups that no
// longer exist are skipped. With no ids at all, NotifyGroupsObj is set to an
// empty, non-nil slice. A lookup error aborts and is returned.
func (e *AlertCurEvent) FillNotifyGroups(ctx *ctx.Context, cache map[int64]*UserGroup) error {
	if len(e.NotifyGroupsJSON) == 0 {
		e.NotifyGroupsObj = []*UserGroup{}
		return nil
	}

	for _, rawID := range e.NotifyGroupsJSON {
		groupID, parseErr := strconv.ParseInt(rawID, 10, 64)
		if parseErr != nil {
			continue
		}

		if cached, ok := cache[groupID]; ok {
			e.NotifyGroupsObj = append(e.NotifyGroupsObj, cached)
			continue
		}

		fetched, err := UserGroupGetById(ctx, groupID)
		if err != nil {
			return err
		}
		if fetched == nil {
			// The group has probably been deleted; nothing to attach.
			continue
		}

		e.NotifyGroupsObj = append(e.NotifyGroupsObj, fetched)
		cache[groupID] = fetched
	}
	return nil
}
// AlertCurEventTotal counts the current alert events matching the filters.
//
// A filter is applied only when its argument is set: the trigger_time window
// needs both stime and etime to be non-zero, slice filters need at least one
// element, and ruleId must be positive. query is split on whitespace and every
// word must match rule_name or tags as a substring.
func AlertCurEventTotal(ctx *ctx.Context, prods []string, bgids []int64, stime, etime int64,
	severity []int64, dsIds []int64, cates []string, ruleId int64, query string, eventIds []int64) (int64, error) {
	tx := DB(ctx).Model(&AlertCurEvent{})

	if !(stime == 0 || etime == 0) {
		tx = tx.Where("trigger_time between ? and ?", stime, etime)
	}
	if len(prods) > 0 {
		tx = tx.Where("rule_prod in ?", prods)
	}
	if len(bgids) > 0 {
		tx = tx.Where("group_id in ?", bgids)
	}
	if len(severity) > 0 {
		tx = tx.Where("severity in ?", severity)
	}
	if len(dsIds) > 0 {
		tx = tx.Where("datasource_id in ?", dsIds)
	}
	if len(cates) > 0 {
		tx = tx.Where("cate in ?", cates)
	}
	if ruleId > 0 {
		tx = tx.Where("rule_id = ?", ruleId)
	}
	if len(eventIds) > 0 {
		tx = tx.Where("id in ?", eventIds)
	}

	// An empty query yields no words, so no keyword condition is added.
	for _, keyword := range strings.Fields(query) {
		pattern := "%" + keyword + "%"
		tx = tx.Where("rule_name like ? or tags like ?", pattern, pattern)
	}

	return Count(tx)
}
// AlertCurEventsGet returns one page of current alert events matching the
// filters, newest trigger_time first, with their frontend fields populated via
// DB2FE (whose error is ignored).
//
// Filters behave as in AlertCurEventTotal: the time window needs both bounds
// non-zero, slice filters need at least one element, ruleId must be positive,
// and each whitespace-separated word of query must match rule_name or tags.
func AlertCurEventsGet(ctx *ctx.Context, prods []string, bgids []int64, stime, etime int64,
	severity []int64, dsIds []int64, cates []string, ruleId int64, query string, limit, offset int, eventIds []int64) (
	[]AlertCurEvent, error) {
	tx := DB(ctx).Model(&AlertCurEvent{})

	if !(stime == 0 || etime == 0) {
		tx = tx.Where("trigger_time between ? and ?", stime, etime)
	}
	if len(prods) > 0 {
		tx = tx.Where("rule_prod in ?", prods)
	}
	if len(bgids) > 0 {
		tx = tx.Where("group_id in ?", bgids)
	}
	if len(severity) > 0 {
		tx = tx.Where("severity in ?", severity)
	}
	if len(dsIds) > 0 {
		tx = tx.Where("datasource_id in ?", dsIds)
	}
	if len(cates) > 0 {
		tx = tx.Where("cate in ?", cates)
	}
	if ruleId > 0 {
		tx = tx.Where("rule_id = ?", ruleId)
	}
	if len(eventIds) > 0 {
		tx = tx.Where("id in ?", eventIds)
	}

	for _, keyword := range strings.Fields(query) {
		pattern := "%" + keyword + "%"
		tx = tx.Where("rule_name like ? or tags like ?", pattern, pattern)
	}

	var events []AlertCurEvent
	err := tx.Order("trigger_time desc").Limit(limit).Offset(offset).Find(&events).Error
	if err != nil {
		return events, err
	}

	for i := range events {
		events[i].DB2FE()
	}
	return events, nil
}
// AlertCurEventCountByRuleId returns the number of current events per rule id
// whose trigger_time lies between stime and etime. It returns nil if the query
// fails (the error is logged).
//
// NOTE(review): rids only sizes the result map; the query is not restricted to
// those ids, so the map can contain other rules — confirm this is intended.
func AlertCurEventCountByRuleId(ctx *ctx.Context, rids []int64, stime, etime int64) map[int64]int64 {
	type Row struct {
		RuleId int64
		Cnt    int64
	}

	var rows []Row
	tx := DB(ctx).Model(&AlertCurEvent{}).
		Select("rule_id, count(*) as cnt").
		Where("trigger_time between ? and ?", stime, etime).
		Group("rule_id").
		Find(&rows)
	if tx.Error != nil {
		logger.Errorf("Failed to count group by rule_id: %v", tx.Error)
		return nil
	}

	counts := make(map[int64]int64, len(rids))
	for i := range rows {
		counts[rows[i].RuleId] = rows[i].Cnt
	}
	return counts
}
// AlertCurEventDel deletes the current events with the given ids. An empty
// ids slice is a no-op.
func AlertCurEventDel(ctx *ctx.Context, ids []int64) error {
	if len(ids) > 0 {
		return DB(ctx).Where("id in ?", ids).Delete(&AlertCurEvent{}).Error
	}
	return nil
}
// AlertCurEventDelByHash deletes the current events carrying the given hash.
// When ctx.IsCenter is true the rows are deleted from the local database;
// otherwise the request is sent to the alert-cur-events-del-by-hash API
// through poster.
func AlertCurEventDelByHash(ctx *ctx.Context, hash string) error {
	if ctx.IsCenter {
		return DB(ctx).Where("hash = ?", hash).Delete(&AlertCurEvent{}).Error
	}

	_, err := poster.GetByUrls[string](ctx, "/v1/n9e/alert-cur-events-del-by-hash?hash="+hash)
	return err
}
// AlertCurEventExists reports whether any current event matches the given
// where clause and arguments.
func AlertCurEventExists(ctx *ctx.Context, where string, args ...interface{}) (bool, error) {
	scope := DB(ctx).Model(&AlertCurEvent{}).Where(where, args...)
	return Exists(scope)
}
// AlertCurEventGet returns the first current event matching the where clause,
// with its frontend fields and notify groups filled in. It returns (nil, nil)
// when nothing matches. Errors from DB2FE and FillNotifyGroups are ignored.
func AlertCurEventGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertCurEvent, error) {
	var events []*AlertCurEvent
	if err := DB(ctx).Where(where, args...).Find(&events).Error; err != nil {
		return nil, err
	}
	if len(events) == 0 {
		return nil, nil
	}

	first := events[0]
	first.DB2FE()
	first.FillNotifyGroups(ctx, make(map[int64]*UserGroup))
	return first, nil
}
// AlertCurEventGetById fetches a single current event by primary key through
// AlertCurEventGet; it returns (nil, nil) when the id does not exist.
func AlertCurEventGetById(ctx *ctx.Context, id int64) (*AlertCurEvent, error) {
	event, err := AlertCurEventGet(ctx, "id=?", id)
	return event, err
}
// AlertNumber is the scan target of the per-group count query in AlertNumbers:
// one business group id and the number of current events in it.
type AlertNumber struct {
GroupId int64
GroupCount int64
}
// AlertNumbers returns, for the given business group ids, the number of
// current alert events per group (used by the busi_group list page). Groups
// without events are absent from the map. An empty bgids yields an empty map
// without touching the database.
func AlertNumbers(ctx *ctx.Context, bgids []int64) (map[int64]int64, error) {
	counts := make(map[int64]int64)
	if len(bgids) == 0 {
		return counts, nil
	}

	var rows []AlertNumber
	tx := DB(ctx).Model(&AlertCurEvent{}).
		Select("group_id", "count(*) as group_count").
		Where("group_id in ?", bgids).
		Group("group_id").
		Find(&rows)
	if tx.Error != nil {
		return nil, tx.Error
	}

	for _, row := range rows {
		counts[row.GroupId] = row.GroupCount
	}
	return counts, nil
}
// AlertCurEventGetByIds loads the current events with the given ids, newest
// trigger_time first, and populates their frontend fields via DB2FE. An empty
// ids slice returns a nil slice without querying.
func AlertCurEventGetByIds(ctx *ctx.Context, ids []int64) ([]*AlertCurEvent, error) {
	var events []*AlertCurEvent
	if len(ids) == 0 {
		return events, nil
	}

	err := DB(ctx).Model(&AlertCurEvent{}).Where("id in ?", ids).Order("trigger_time desc").Find(&events).Error
	if err != nil {
		return events, err
	}

	for _, ev := range events {
		ev.DB2FE()
	}
	return events, nil
}
// AlertCurEventGetByRuleIdAndDsId returns the current events of one rule on
// one datasource.
//
// When ctx.IsCenter is true the rows are read from the local database and
// converted with DB2FE. Otherwise they are fetched from the
// alert-cur-events-get-by-rid API through poster and converted with FE2DB so
// the stored string columns are populated.
func AlertCurEventGetByRuleIdAndDsId(ctx *ctx.Context, ruleId int64, datasourceId int64) ([]*AlertCurEvent, error) {
	if ctx.IsCenter {
		var events []*AlertCurEvent
		err := DB(ctx).Where("rule_id=? and datasource_id = ?", ruleId, datasourceId).Find(&events).Error
		if err != nil {
			return events, err
		}
		for _, ev := range events {
			ev.DB2FE()
		}
		return events, nil
	}

	path := "/v1/n9e/alert-cur-events-get-by-rid?rid=" + strconv.FormatInt(ruleId, 10) +
		"&dsid=" + strconv.FormatInt(datasourceId, 10)
	events, err := poster.GetByUrls[[]*AlertCurEvent](ctx, path)
	if err != nil {
		return events, err
	}
	for _, ev := range events {
		ev.FE2DB()
	}
	return events, nil
}
// AlertCurEventGetMap returns, per rule id, the set of hashes of its current
// events. Only the rule_id and hash columns are read. A non-empty cluster
// restricts the query to rows whose datasource_id equals it.
func AlertCurEventGetMap(ctx *ctx.Context, cluster string) (map[int64]map[string]struct{}, error) {
	tx := DB(ctx).Model(&AlertCurEvent{})
	if cluster != "" {
		tx = tx.Where("datasource_id = ?", cluster)
	}

	var events []*AlertCurEvent
	if err := tx.Select("rule_id", "hash").Find(&events).Error; err != nil {
		return nil, err
	}

	hashesByRule := make(map[int64]map[string]struct{})
	for _, ev := range events {
		hashes, ok := hashesByRule[ev.RuleId]
		if !ok {
			hashes = make(map[string]struct{})
			hashesByRule[ev.RuleId] = hashes
		}
		hashes[ev.Hash] = struct{}{}
	}
	return hashesByRule, nil
}
// UpdateFieldsMap writes the given column/value pairs to this event's row and
// returns the error reported by the update.
func (e *AlertCurEvent) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
	result := DB(ctx).Model(e).Updates(fields)
	return result.Error
}
// AlertCurEventUpgradeToV6 migrates current events triggered within the last
// 30 days to the v6 layout. For every event whose Cluster has an entry in dsm
// it sets datasource_id, rebuilds rule_config from the event's PromQl and
// Severity, and defaults an empty rule_prod to METRIC and an empty cate to
// PROMETHEUS. A failed row update is logged and skipped; only the initial
// query error is returned.
func AlertCurEventUpgradeToV6(ctx *ctx.Context, dsm map[string]Datasource) error {
	since := time.Now().Unix() - 3600*24*30

	var events []*AlertCurEvent
	if err := DB(ctx).Where("trigger_time > ?", since).Find(&events).Error; err != nil {
		return err
	}

	for _, ev := range events {
		ds, ok := dsm[ev.Cluster]
		if !ok {
			continue
		}
		ev.DatasourceId = ds.Id

		encoded, _ := json.Marshal(PromRuleConfig{
			Queries: []PromQuery{
				{
					PromQl:   ev.PromQl,
					Severity: ev.Severity,
				},
			},
		})
		ev.RuleConfig = string(encoded)

		if ev.RuleProd == "" {
			ev.RuleProd = METRIC
		}
		if ev.Cate == "" {
			ev.Cate = PROMETHEUS
		}

		updates := map[string]interface{}{
			"datasource_id": ev.DatasourceId,
			"rule_config":   ev.RuleConfig,
			"rule_prod":     ev.RuleProd,
			"cate":          ev.Cate,
		}
		if err := ev.UpdateFieldsMap(ctx, updates); err != nil {
			logger.Errorf("update alert rule:%d datasource ids failed, %v", ev.Id, err)
		}
	}
	return nil
}
// AlertCurEventGetsFromAlertMute loads from the database the current events
// of the mute's business group, newest id first. The result is narrowed to the
// mute's severities when any are set, and to its datasource ids when any are
// set and IsAllDatasource reports false for them.
func AlertCurEventGetsFromAlertMute(ctx *ctx.Context, alertMute *AlertMute) ([]*AlertCurEvent, error) {
	query := DB(ctx).Where("group_id = ?", alertMute.GroupId)

	if severities := alertMute.SeveritiesJson; len(severities) != 0 {
		query = query.Where("severity IN (?)", severities)
	}

	if dsIDs := alertMute.DatasourceIdsJson; len(dsIDs) != 0 && !IsAllDatasource(dsIDs) {
		query = query.Where("datasource_id IN (?)", dsIDs)
	}

	var events []*AlertCurEvent
	err := query.Order("id desc").Find(&events).Error
	return events, err
}
// AlertCurEventStatistics gathers alert counters relative to stime and returns
// them in a map with these keys:
//
//   - "total":        all rows in the current-event table
//   - "total_24_ago": current events triggered more than 24 hours before stime
//   - "total_today":  history events with is_recovered = 0 triggered since
//     midnight of stime's day
//   - "total_week":   history events with is_recovered = 0 triggered since
//     Monday 00:00 of stime's week
//
// Day and week boundaries are computed in stime's location. A failed count is
// only logged at debug level; the key is still set to whatever Count returned.
func AlertCurEventStatistics(ctx *ctx.Context, stime time.Time) map[string]interface{} {
	stime24HoursAgoUnix := stime.Add(-24 * time.Hour).Unix()

	// Beginning of today.
	stimeMidnightUnix := time.Date(stime.Year(), stime.Month(), stime.Day(), 0, 0, 0, 0, stime.Location()).Unix()

	// Monday of the current week at 00:00. Weekday() is 0 for Sunday, so
	// (weekday - Monday(1) + 7) % 7 is the number of days elapsed since Monday.
	// time.Date normalizes a day value that falls before the 1st.
	daysToMonday := (int(stime.Weekday()) - 1 + 7) % 7
	stimeOneWeekAgoUnix := time.Date(stime.Year(), stime.Month(), stime.Day()-daysToMonday, 0, 0, 0, 0, stime.Location()).Unix()

	var err error
	res := make(map[string]interface{})

	res["total"], err = Count(DB(ctx).Model(&AlertCurEvent{}))
	if err != nil {
		logger.Debugf("count alert current rule failed(total), %v", err)
	}

	res["total_24_ago"], err = Count(DB(ctx).Model(&AlertCurEvent{}).Where("trigger_time < ?", stime24HoursAgoUnix))
	if err != nil {
		logger.Debugf("count alert current rule failed(total_24ago), %v", err)
	}

	res["total_today"], err = Count(DB(ctx).Model(&AlertHisEvent{}).Where("trigger_time >= ? and is_recovered = ? ", stimeMidnightUnix, 0))
	if err != nil {
		logger.Debugf("count alert his rule failed(total_today), %v", err)
	}

	res["total_week"], err = Count(DB(ctx).Model(&AlertHisEvent{}).Where("trigger_time >= ? and is_recovered = ? ", stimeOneWeekAgoUnix, 0))
	if err != nil {
		// Label the failure with the counter that actually failed.
		logger.Debugf("count alert his rule failed(total_week), %v", err)
	}

	return res
}
// DeepCopy returns a copy of the event that can be modified without affecting
// the original's slices, maps or pointed-to objects.
//
// Scalar fields come from the initial struct copy. Every pointer, slice and
// map field handled below is re-allocated; nil fields stay nil and nil
// elements inside pointer slices stay nil. The objects behind NotifyGroupsObj,
// NotifyUsersObj, Target and NotifyRules are copied one level deep.
// RuleConfigJson and ExtraConfig are interface values of unknown dynamic type
// and are shared with the original; fields not mentioned here (for example
// TriggerValuesJson and RecoverConfig) keep whatever the struct copy gives.
func (e *AlertCurEvent) DeepCopy() *AlertCurEvent {
	eventCopy := *e

	// Pointer fields.
	if e.NotifyGroupsObj != nil {
		eventCopy.NotifyGroupsObj = make([]*UserGroup, len(e.NotifyGroupsObj))
		for i, group := range e.NotifyGroupsObj {
			if group != nil {
				groupCopy := *group
				eventCopy.NotifyGroupsObj[i] = &groupCopy
			}
		}
	}

	if e.NotifyUsersObj != nil {
		eventCopy.NotifyUsersObj = make([]*User, len(e.NotifyUsersObj))
		for i, user := range e.NotifyUsersObj {
			if user != nil {
				userCopy := *user
				eventCopy.NotifyUsersObj[i] = &userCopy
			}
		}
	}

	if e.Target != nil {
		targetCopy := *e.Target
		eventCopy.Target = &targetCopy
	}

	// NotifyRules holds pointers too; without this the copy would share its
	// rule objects with the original.
	if e.NotifyRules != nil {
		eventCopy.NotifyRules = make([]*EventNotifyRule, len(e.NotifyRules))
		for i, rule := range e.NotifyRules {
			if rule != nil {
				ruleCopy := *rule
				eventCopy.NotifyRules[i] = &ruleCopy
			}
		}
	}

	// Slice and map fields.
	if e.CallbacksJSON != nil {
		eventCopy.CallbacksJSON = make([]string, len(e.CallbacksJSON))
		copy(eventCopy.CallbacksJSON, e.CallbacksJSON)
	}

	if e.NotifyChannelsJSON != nil {
		eventCopy.NotifyChannelsJSON = make([]string, len(e.NotifyChannelsJSON))
		copy(eventCopy.NotifyChannelsJSON, e.NotifyChannelsJSON)
	}

	if e.NotifyGroupsJSON != nil {
		eventCopy.NotifyGroupsJSON = make([]string, len(e.NotifyGroupsJSON))
		copy(eventCopy.NotifyGroupsJSON, e.NotifyGroupsJSON)
	}

	if e.TagsJSON != nil {
		eventCopy.TagsJSON = make([]string, len(e.TagsJSON))
		copy(eventCopy.TagsJSON, e.TagsJSON)
	}

	if e.TagsMap != nil {
		eventCopy.TagsMap = make(map[string]string, len(e.TagsMap))
		for k, v := range e.TagsMap {
			eventCopy.TagsMap[k] = v
		}
	}

	if e.OriginalTagsJSON != nil {
		eventCopy.OriginalTagsJSON = make([]string, len(e.OriginalTagsJSON))
		copy(eventCopy.OriginalTagsJSON, e.OriginalTagsJSON)
	}

	if e.AnnotationsJSON != nil {
		eventCopy.AnnotationsJSON = make(map[string]string, len(e.AnnotationsJSON))
		for k, v := range e.AnnotationsJSON {
			eventCopy.AnnotationsJSON[k] = v
		}
	}

	if e.ExtraInfo != nil {
		eventCopy.ExtraInfo = make([]string, len(e.ExtraInfo))
		copy(eventCopy.ExtraInfo, e.ExtraInfo)
	}

	if e.ExtraInfoMap != nil {
		eventCopy.ExtraInfoMap = make([]map[string]string, len(e.ExtraInfoMap))
		for i, m := range e.ExtraInfoMap {
			if m != nil {
				eventCopy.ExtraInfoMap[i] = make(map[string]string, len(m))
				for k, v := range m {
					eventCopy.ExtraInfoMap[i][k] = v
				}
			}
		}
	}

	if e.NotifyRuleIds != nil {
		eventCopy.NotifyRuleIds = make([]int64, len(e.NotifyRuleIds))
		copy(eventCopy.NotifyRuleIds, e.NotifyRuleIds)
	}

	// Interface values: the dynamic type is unknown here, so they are shared.
	eventCopy.RuleConfigJson = e.RuleConfigJson
	eventCopy.ExtraConfig = e.ExtraConfig

	return &eventCopy
}
================================================
FILE: models/alert_his_event.go
================================================
package models
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// AlertHisEvent is the row model of the alert_his_event table (see TableName).
// Like AlertCurEvent it pairs stored string columns (json:"-") with decoded
// companion fields tagged gorm:"-" that are exposed through JSON; DB2FE fills
// the decoded fields.
type AlertHisEvent struct {
Id int64 `json:"id" gorm:"primaryKey"`
Cate string `json:"cate"`
IsRecovered int `json:"is_recovered"`
DatasourceId int64 `json:"datasource_id"`
Cluster string `json:"cluster"`
GroupId int64 `json:"group_id"`
GroupName string `json:"group_name"` // busi group name
Hash string `json:"hash"`
RuleId int64 `json:"rule_id"`
RuleName string `json:"rule_name"`
RuleNote string `json:"rule_note"`
RuleProd string `json:"rule_prod"`
RuleAlgo string `json:"rule_algo"`
Severity int `json:"severity"`
PromForDuration int `json:"prom_for_duration"`
PromQl string `json:"prom_ql"`
RuleConfig string `json:"-" gorm:"rule_config"` // rule config
RuleConfigJson interface{} `json:"rule_config" gorm:"-"` // rule config for fe
PromEvalInterval int `json:"prom_eval_interval"`
Callbacks string `json:"-"`
CallbacksJSON []string `json:"callbacks" gorm:"-"`
RunbookUrl string `json:"runbook_url"`
NotifyRecovered int `json:"notify_recovered"`
NotifyChannels string `json:"-"`
NotifyChannelsJSON []string `json:"notify_channels" gorm:"-"`
NotifyGroups string `json:"-"`
NotifyGroupsJSON []string `json:"notify_groups" gorm:"-"`
NotifyGroupsObj []UserGroup `json:"notify_groups_obj" gorm:"-"`
TargetIdent string `json:"target_ident"`
TargetNote string `json:"target_note"`
TriggerTime int64 `json:"trigger_time"`
TriggerValue string `json:"trigger_value"`
RecoverTime int64 `json:"recover_time"`
LastEvalTime int64 `json:"last_eval_time"`
Tags string `json:"-"`
TagsJSON []string `json:"tags" gorm:"-"`
OriginalTags string `json:"-"` // for db
OriginalTagsJSON []string `json:"original_tags" gorm:"-"` // for fe
Annotations string `json:"-"`
AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
NotifyCurNumber int `json:"notify_cur_number"` // notify: current number
FirstTriggerTime int64 `json:"first_trigger_time"` // first trigger time of a run of consecutive alerts
ExtraConfig interface{} `json:"extra_config" gorm:"-"`
NotifyRuleIds []int64 `json:"notify_rule_ids" gorm:"serializer:json"`
NotifyVersion int `json:"notify_version" gorm:"-"`
NotifyRules []*EventNotifyRule `json:"notify_rules" gorm:"-"`
}
// TableName returns the name of the database table backing AlertHisEvent.
func (e *AlertHisEvent) TableName() string {
	const table = "alert_his_event"
	return table
}
// Add stores the history event through the package-level Insert helper and
// returns whatever error that helper reports.
func (e *AlertHisEvent) Add(ctx *ctx.Context) error {
	if err := Insert(ctx, e); err != nil {
		return err
	}
	return nil
}
// DB2FE derives the frontend-facing fields from the stored string columns.
// Space-separated lists are split on whitespace and tag lists on ",,".
// Non-empty annotations are JSON-decoded; if that fails, AnnotationsJSON
// becomes a single "error" entry holding the raw text. The rule config is
// decoded best-effort and a decode error is ignored.
func (e *AlertHisEvent) DB2FE() {
	const tagSep = ",,"

	e.NotifyChannelsJSON = strings.Fields(e.NotifyChannels)
	e.NotifyGroupsJSON = strings.Fields(e.NotifyGroups)
	e.CallbacksJSON = strings.Fields(e.Callbacks)
	e.TagsJSON = strings.Split(e.Tags, tagSep)
	e.OriginalTagsJSON = strings.Split(e.OriginalTags, tagSep)

	if e.Annotations != "" {
		if err := json.Unmarshal([]byte(e.Annotations), &e.AnnotationsJSON); err != nil {
			e.AnnotationsJSON = map[string]string{"error": e.Annotations}
		}
	}

	_ = json.Unmarshal([]byte(e.RuleConfig), &e.RuleConfigJson)
}
// FillNotifyGroups resolves the ids in NotifyGroupsJSON to user groups and
// appends copies of them to NotifyGroupsObj.
//
// cache is consulted before the database and updated with every group fetched,
// so it must be non-nil. Ids that are not valid integers and groups that no
// longer exist are skipped. With no ids at all, NotifyGroupsObj is set to an
// empty, non-nil slice. A lookup error aborts and is returned.
func (e *AlertHisEvent) FillNotifyGroups(ctx *ctx.Context, cache map[int64]*UserGroup) error {
	if len(e.NotifyGroupsJSON) == 0 {
		e.NotifyGroupsObj = []UserGroup{}
		return nil
	}

	for _, rawID := range e.NotifyGroupsJSON {
		groupID, parseErr := strconv.ParseInt(rawID, 10, 64)
		if parseErr != nil {
			continue
		}

		if cached, ok := cache[groupID]; ok {
			e.NotifyGroupsObj = append(e.NotifyGroupsObj, *cached)
			continue
		}

		fetched, err := UserGroupGetById(ctx, groupID)
		if err != nil {
			return err
		}
		if fetched == nil {
			// The group has probably been deleted; nothing to attach.
			continue
		}

		e.NotifyGroupsObj = append(e.NotifyGroupsObj, *fetched)
		cache[groupID] = fetched
	}
	return nil
}
// func (e *AlertHisEvent) FillTaskTplName(ctx *ctx.Context, cache map[int64]*UserGroup) error {
// }
// AlertHisEventTotal counts the history events whose last_eval_time lies
// between stime and etime and that match the optional filters.
//
// Slice filters apply when non-empty, severity and recovered apply when they
// are >= 0 (a negative value means "any"), and ruleId applies when positive.
// query is split on whitespace and every word must match rule_name or tags as
// a substring.
func AlertHisEventTotal(
	ctx *ctx.Context, prods []string, bgids []int64, stime, etime int64, severity int,
	recovered int, dsIds []int64, cates []string, ruleId int64, query string, eventIds []int64) (int64, error) {
	tx := DB(ctx).Model(&AlertHisEvent{}).Where("last_eval_time between ? and ?", stime, etime)

	if len(prods) > 0 {
		tx = tx.Where("rule_prod in ?", prods)
	}
	if len(bgids) > 0 {
		tx = tx.Where("group_id in ?", bgids)
	}
	if severity >= 0 {
		tx = tx.Where("severity = ?", severity)
	}
	if recovered >= 0 {
		tx = tx.Where("is_recovered = ?", recovered)
	}
	if len(dsIds) > 0 {
		tx = tx.Where("datasource_id in ?", dsIds)
	}
	if len(cates) > 0 {
		tx = tx.Where("cate in ?", cates)
	}
	if ruleId > 0 {
		tx = tx.Where("rule_id = ?", ruleId)
	}
	if len(eventIds) > 0 {
		tx = tx.Where("id in ?", eventIds)
	}

	// An empty query yields no words, so no keyword condition is added.
	for _, keyword := range strings.Fields(query) {
		pattern := "%" + keyword + "%"
		tx = tx.Where("rule_name like ? or tags like ?", pattern, pattern)
	}

	return Count(tx)
}
// AlertHisEventGets returns one page of history events whose last_eval_time
// lies between stime and etime and that match the optional filters, ordered by
// trigger_time and then id, both descending. Frontend fields are populated via
// DB2FE.
//
// Filters behave as in AlertHisEventTotal: slice filters apply when non-empty,
// severity and recovered when >= 0, ruleId when positive, and each
// whitespace-separated word of query must match rule_name or tags.
func AlertHisEventGets(ctx *ctx.Context, prods []string, bgids []int64, stime, etime int64,
	severity int, recovered int, dsIds []int64, cates []string, ruleId int64, query string,
	limit, offset int, eventIds []int64) ([]AlertHisEvent, error) {
	tx := DB(ctx).Where("last_eval_time between ? and ?", stime, etime)

	if len(prods) > 0 {
		tx = tx.Where("rule_prod in ?", prods)
	}
	if len(bgids) > 0 {
		tx = tx.Where("group_id in ?", bgids)
	}
	if severity >= 0 {
		tx = tx.Where("severity = ?", severity)
	}
	if recovered >= 0 {
		tx = tx.Where("is_recovered = ?", recovered)
	}
	if len(dsIds) > 0 {
		tx = tx.Where("datasource_id in ?", dsIds)
	}
	if len(cates) > 0 {
		tx = tx.Where("cate in ?", cates)
	}
	if ruleId > 0 {
		tx = tx.Where("rule_id = ?", ruleId)
	}
	if len(eventIds) > 0 {
		tx = tx.Where("id in ?", eventIds)
	}

	for _, keyword := range strings.Fields(query) {
		pattern := "%" + keyword + "%"
		tx = tx.Where("rule_name like ? or tags like ?", pattern, pattern)
	}

	var events []AlertHisEvent
	err := tx.Order("trigger_time desc, id desc").Limit(limit).Offset(offset).Find(&events).Error
	if err != nil {
		return events, err
	}

	for i := range events {
		events[i].DB2FE()
	}
	return events, nil
}
// AlertHisEventGet returns the first history event matching the where clause,
// with its frontend fields and notify groups filled in. It returns (nil, nil)
// when nothing matches. The error from FillNotifyGroups is ignored.
func AlertHisEventGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertHisEvent, error) {
	var events []*AlertHisEvent
	if err := DB(ctx).Where(where, args...).Find(&events).Error; err != nil {
		return nil, err
	}
	if len(events) == 0 {
		return nil, nil
	}

	first := events[0]
	first.DB2FE()
	first.FillNotifyGroups(ctx, make(map[int64]*UserGroup))
	return first, nil
}
// AlertHisEventGetById is a convenience wrapper around AlertHisEventGet that
// looks an event up by its primary key.
func AlertHisEventGetById(ctx *ctx.Context, id int64) (*AlertHisEvent, error) {
	const byID = "id=?"
	return AlertHisEventGet(ctx, byID, id)
}
// AlertHisEventGetByHash returns the most recently triggered history event
// with the given hash, or (nil, nil) when there is none. Unlike
// AlertHisEventGet it does not run DB2FE on the result.
func AlertHisEventGetByHash(ctx *ctx.Context, hash string) (*AlertHisEvent, error) {
	var matches []*AlertHisEvent
	query := DB(ctx).Where("hash = ?", hash).Order("trigger_time desc").Limit(1)
	if err := query.Find(&matches).Error; err != nil {
		return nil, err
	}
	if len(matches) > 0 {
		return matches[0], nil
	}
	return nil, nil
}
// AlertHisEventBatchDelete deletes up to limit history events whose
// last_eval_time is older than timestamp, optionally restricted to the given
// severities. It returns the number of affected rows and the query error.
func AlertHisEventBatchDelete(ctx *ctx.Context, timestamp int64, severities []int, limit int) (int64, error) {
	tx := DB(ctx).Where("last_eval_time < ?", timestamp)
	if len(severities) > 0 {
		tx = tx.Where("severity IN (?)", severities)
	}

	result := tx.Limit(limit).Delete(&AlertHisEvent{})
	return result.RowsAffected, result.Error
}
// UpdateFieldsMap writes the given column/value pairs for this event.
func (m *AlertHisEvent) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
	result := DB(ctx).Model(m).Updates(fields)
	return result.Error
}
// AlertHisEventUpgradeToV6 back-fills v6 columns on recent history events.
//
// It loads at most 10000 events triggered within the last 30 days (newest id
// first). For each event whose Cluster is a key of dsm it sets datasource_id,
// rebuilds rule_config from the event's PromQl and Severity, and defaults
// rule_prod / cate to METRIC / PROMETHEUS when empty. Per-event update
// failures are logged and do not abort the run.
func AlertHisEventUpgradeToV6(ctx *ctx.Context, dsm map[string]Datasource) error {
	var events []*AlertHisEvent
	err := DB(ctx).Where("trigger_time > ?", time.Now().Unix()-3600*24*30).Limit(10000).Order("id desc").Find(&events).Error
	if err != nil {
		return err
	}

	for _, ev := range events {
		ds, known := dsm[ev.Cluster]
		if !known {
			continue
		}
		ev.DatasourceId = ds.Id

		encoded, _ := json.Marshal(PromRuleConfig{
			Queries: []PromQuery{
				{PromQl: ev.PromQl, Severity: ev.Severity},
			},
		})
		ev.RuleConfig = string(encoded)

		if ev.RuleProd == "" {
			ev.RuleProd = METRIC
		}
		if ev.Cate == "" {
			ev.Cate = PROMETHEUS
		}

		updates := map[string]interface{}{
			"datasource_id": ev.DatasourceId,
			"rule_config":   ev.RuleConfig,
			"rule_prod":     ev.RuleProd,
			"cate":          ev.Cate,
		}
		if err = ev.UpdateFieldsMap(ctx, updates); err != nil {
			logger.Errorf("update alert rule:%d datasource ids failed, %v", ev.Id, err)
		}
	}
	return nil
}
// EventPersist stores event in the history table and keeps the active
// (current) event table in sync with it.
//
// Flow, as implemented below:
//  1. Check whether an active event with the same hash already exists.
//  2. Always add a history record built from the event.
//  3. If an active record exists, delete it; for a non-recovered event a new
//     active record is then added under the history record's id.
//  4. If no active record exists, a non-recovered event is added as a new
//     active record; a recovered event is only recorded in history.
//
// In every successful path event.Id ends up equal to the history record's id.
func EventPersist(ctx *ctx.Context, event *AlertCurEvent) error {
	has, err := AlertCurEventExists(ctx, "hash=?", event.Hash)
	if err != nil {
		return fmt.Errorf("event_persist_check_exists_fail: %v rule_id=%d hash=%s", err, event.RuleId, event.Hash)
	}
	his := event.ToHis(ctx)
	// Whether this is an alert or a recovery, it is always recorded in the full history.
	if err := his.Add(ctx); err != nil {
		return fmt.Errorf("add his event error:%v", err)
	}
	if has {
		// The active-alert table already has a record for this hash: delete it.
		err = AlertCurEventDelByHash(ctx, event.Hash)
		if err != nil {
			return fmt.Errorf("event_del_cur_fail: %v hash=%s", err, event.Hash)
		}
		if !event.IsRecovered {
			// A recovery event stays removed from the active list for good;
			// an alerting event (this branch) is re-added as a fresh record.
			// use his id as cur id
			event.Id = his.Id
			if event.Id > 0 {
				if err := event.Add(ctx); err != nil {
					return fmt.Errorf("add cur event err:%v", err)
				}
			}
		}
		// use his id as cur id
		event.Id = his.Id
		return nil
	}
	// use his id as cur id
	event.Id = his.Id
	if event.IsRecovered {
		// Nothing in alert_cur_event means there was no prior alert, yet a
		// recovery was reported. In theory this should not happen.
		return nil
	}
	if event.Id > 0 {
		if err := event.Add(ctx); err != nil {
			return fmt.Errorf("add cur event error:%v", err)
		}
	}
	return nil
}
// AlertHisEventGetByIds loads the history events with the given ids, newest
// trigger first, and runs DB2FE on each. An empty ids slice short-circuits to
// a nil result without touching the database.
func AlertHisEventGetByIds(ctx *ctx.Context, ids []int64) ([]*AlertHisEvent, error) {
	var events []*AlertHisEvent
	if len(ids) == 0 {
		return events, nil
	}

	err := DB(ctx).Where("id in ?", ids).Order("trigger_time desc").Find(&events).Error
	if err != nil {
		return events, err
	}

	for _, ev := range events {
		ev.DB2FE()
	}
	return events, nil
}
// ToCur converts a history event into an AlertCurEvent carrying the same
// field values. TagsJSON is rebuilt by splitting Tags on ",,", IsRecovered is
// true when the history flag equals 1, TriggerValues is filled from
// TriggerValue, and SetTagsMap is called on the result before it is returned.
func (e *AlertHisEvent) ToCur() *AlertCurEvent {
	cur := &AlertCurEvent{
		// identity and ownership
		Id:           e.Id,
		Hash:         e.Hash,
		Cate:         e.Cate,
		Cluster:      e.Cluster,
		DatasourceId: e.DatasourceId,
		GroupId:      e.GroupId,
		GroupName:    e.GroupName,

		// rule that produced the event
		RuleId:           e.RuleId,
		RuleName:         e.RuleName,
		RuleProd:         e.RuleProd,
		RuleAlgo:         e.RuleAlgo,
		RuleNote:         e.RuleNote,
		RuleConfig:       e.RuleConfig,
		RuleConfigJson:   e.RuleConfigJson,
		Severity:         e.Severity,
		PromForDuration:  e.PromForDuration,
		PromQl:           e.PromQl,
		PromEvalInterval: e.PromEvalInterval,
		RunbookUrl:       e.RunbookUrl,

		// notification settings
		Callbacks:          e.Callbacks,
		CallbacksJSON:      e.CallbacksJSON,
		NotifyRecovered:    e.NotifyRecovered,
		NotifyChannels:     e.NotifyChannels,
		NotifyChannelsJSON: e.NotifyChannelsJSON,
		NotifyGroups:       e.NotifyGroups,
		NotifyGroupsJSON:   e.NotifyGroupsJSON,
		NotifyCurNumber:    e.NotifyCurNumber,
		NotifyRuleIds:      e.NotifyRuleIds,
		NotifyRules:        e.NotifyRules,
		NotifyVersion:      e.NotifyVersion,

		// annotations, target and tags
		Annotations:      e.Annotations,
		AnnotationsJSON:  e.AnnotationsJSON,
		TargetIdent:      e.TargetIdent,
		TargetNote:       e.TargetNote,
		Tags:             e.Tags,
		TagsJSON:         strings.Split(e.Tags, ",,"),
		OriginalTags:     e.OriginalTags,
		OriginalTagsJSON: e.OriginalTagsJSON,

		// trigger state and timing
		TriggerTime:      e.TriggerTime,
		TriggerValue:     e.TriggerValue,
		TriggerValues:    e.TriggerValue,
		LastEvalTime:     e.LastEvalTime,
		FirstTriggerTime: e.FirstTriggerTime,
		IsRecovered:      e.IsRecovered == 1,
		RecoverTime:      e.RecoverTime,
	}

	cur.SetTagsMap()
	return cur
}
================================================
FILE: models/alert_mute.go
================================================
package models
import (
"encoding/json"
"fmt"
"regexp"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
"github.com/pkg/errors"
)
// TagFilter is a single tag-matching condition.
//
// Func (or Op, used as a fallback by Verify) names the comparison. Regexp and
// Vset are derived from Value by ParseTagFilter / GetTagFilters and are not
// part of the user-supplied input.
type TagFilter struct {
	Key    string              `json:"key"`   // tag key
	Func   string              `json:"func"`  // `==` | `=~` | `in` | `!=` | `!~` | `not in`
	Op     string              `json:"op"`    // `==` | `=~` | `in` | `!=` | `!~` | `not in`
	Value  interface{}         `json:"value"` // tag value
	Regexp *regexp.Regexp      // Value compiled to a regexp when Func is `=~` or `!~`
	Vset   map[string]struct{} // Value expanded to a string set when Func is `in` or `not in`
}

// Verify checks that the filter has a key and a supported operation.
// When Func is empty it is first filled in from Op.
func (t *TagFilter) Verify() error {
	if t.Key == "" {
		return errors.New("tag key cannot be empty")
	}

	if t.Func == "" {
		t.Func = t.Op
	}

	switch t.Func {
	case "==", "!=", "in", "not in", "=~", "!~":
		return nil
	default:
		return errors.New("invalid operation")
	}
}

// ParseTagFilter fills in the derived fields of every filter in bFilters, in
// place, and returns the same slice.
//
//   - `=~` / `!~`: Value must be a string or a number; it is compiled into
//     Regexp. A compile failure or an unsupported Value type is returned as an
//     error (with a nil slice).
//   - `in` / `not in`: Value is expanded into Vset. A string is split on
//     whitespace; slices contribute one entry per element; any other value is
//     formatted with %v and split on whitespace.
//
// Numbers decoded from JSON arrive as float64. They are rendered with the
// shortest exact decimal representation, so 2 becomes "2" and 1.5 stays "1.5"
// rather than being truncated.
func ParseTagFilter(bFilters []TagFilter) ([]TagFilter, error) {
	var err error
	for i := range bFilters {
		f := &bFilters[i]

		switch f.Func {
		case "=~", "!~":
			var pattern string
			switch v := f.Value.(type) {
			case string:
				pattern = v
			case int:
				pattern = strconv.Itoa(v)
			case float64:
				// encoding/json decodes every JSON number into float64.
				pattern = strconv.FormatFloat(v, 'f', -1, 64)
			default:
				return nil, fmt.Errorf("unsupported value type for regex: %T", v)
			}
			f.Regexp, err = regexp.Compile(pattern)
			if err != nil {
				return nil, err
			}

		case "in", "not in":
			f.Vset = make(map[string]struct{})
			switch v := f.Value.(type) {
			case string:
				for _, item := range strings.Fields(v) {
					f.Vset[item] = struct{}{}
				}
			case []int:
				for _, item := range v {
					f.Vset[strconv.Itoa(item)] = struct{}{}
				}
			case []string:
				for _, item := range v {
					f.Vset[item] = struct{}{}
				}
			case []interface{}:
				// Typical shape after JSON decoding; elements of other types are ignored.
				for _, raw := range v {
					switch item := raw.(type) {
					case string:
						f.Vset[item] = struct{}{}
					case int:
						f.Vset[strconv.Itoa(item)] = struct{}{}
					case float64:
						f.Vset[strconv.FormatFloat(item, 'f', -1, 64)] = struct{}{}
					}
				}
			default:
				// Fallback: stringify and split on whitespace.
				for _, item := range strings.Fields(fmt.Sprintf("%v", v)) {
					f.Vset[item] = struct{}{}
				}
			}
		}
	}
	return bFilters, nil
}
// GetTagFilters decodes a JSON array of tag filters and prepares each one for
// matching.
//
// A nil or empty jsonArr yields an empty, non-nil slice. Otherwise the bytes
// are unmarshalled into []TagFilter and handed to ParseTagFilter, which
// compiles regexps for `=~` / `!~` and builds value sets for `in` / `not in`.
// The preparation logic used to be duplicated here; delegating keeps both
// entry points in agreement.
func GetTagFilters(jsonArr ormx.JSONArr) ([]TagFilter, error) {
	if len(jsonArr) == 0 {
		return []TagFilter{}, nil
	}

	bFilters := make([]TagFilter, 0)
	if err := json.Unmarshal(jsonArr, &bFilters); err != nil {
		return nil, err
	}

	return ParseTagFilter(bFilters)
}
// Values of AlertMute.MuteTimeType.
const (
	// TimeRange mutes between Btime and Etime.
	TimeRange int = 0
	// Periodic mutes according to the PeriodicMute windows.
	Periodic int = 1
)
// AlertMute is a mute (silence) rule stored in the "alert_mute" table.
//
// Several attributes exist twice: a string column persisted in the database
// (DatasourceIds, PeriodicMutes, Severities) and a structured counterpart
// exposed over JSON (the *Json fields). FE2DB and DB2FE convert between them.
// Activated and ITags are computed at runtime and never persisted.
type AlertMute struct {
	Id int64 `json:"id" gorm:"primaryKey"`
	GroupId int64 `json:"group_id"`
	Note string `json:"note"`
	Cate string `json:"cate"`
	Prod string `json:"prod"`
	DatasourceIds string `json:"-" gorm:"datasource_ids"` // datasource ids, JSON-encoded (see FE2DB)
	DatasourceIdsJson []int64 `json:"datasource_ids" gorm:"-"` // for fe
	Cluster string `json:"cluster"` // take effect by clusters, separated by space
	Tags ormx.JSONArr `json:"tags"`
	Cause string `json:"cause"`
	Btime int64 `json:"btime"`
	Etime int64 `json:"etime"`
	Disabled int `json:"disabled"` // 0: enabled, 1: disabled
	Activated int `json:"activated" gorm:"-"` // 0: not activated, 1: activated; set by DB2FE
	CreateBy string `json:"create_by"`
	UpdateBy string `json:"update_by"`
	UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
	CreateAt int64 `json:"create_at"`
	UpdateAt int64 `json:"update_at"`
	ITags []TagFilter `json:"-" gorm:"-"` // inner tags, parsed from Tags by Parse
	MuteTimeType int `json:"mute_time_type"` // 0: mute by time range, 1: mute by periodic time
	PeriodicMutes string `json:"-" gorm:"periodic_mutes"` // JSON-encoded PeriodicMutesJson
	PeriodicMutesJson []PeriodicMute `json:"periodic_mutes" gorm:"-"`
	Severities string `json:"-" gorm:"severities"` // JSON-encoded SeveritiesJson
	SeveritiesJson []int `json:"severities" gorm:"-"`
}
// PeriodicMute is one recurring mute window: a set of weekdays plus a start
// and end time of day.
//
// NOTE(review): the field comments below mention space-separated lists of
// times, but IsWithinPeriodicMute compares EnableStime / EnableEtime as whole
// strings against an "HH:MM" value, so each appears to hold a single time —
// confirm against the front end.
type PeriodicMute struct {
	EnableStime string `json:"enable_stime"` // split by space: "00:00 10:00 12:00"
	EnableEtime string `json:"enable_etime"` // split by space: "00:00 10:00 12:00"
	EnableDaysOfWeek string `json:"enable_days_of_week"` // eg: "0 1 2 3 4 5 6"
}
// TableName reports the database table backing AlertMute.
func (m *AlertMute) TableName() string {
	const table = "alert_mute"
	return table
}
// AlertMuteGetById looks a mute rule up by primary key via AlertMuteGet.
func AlertMuteGetById(ctx *ctx.Context, id int64) (*AlertMute, error) {
	const byID = "id=?"
	return AlertMuteGet(ctx, byID, id)
}
// AlertMuteGet returns the first mute rule matching the where clause, or
// (nil, nil) when none matches. The rule is passed through DB2FE; if that
// conversion fails the rule is still returned together with the error.
func AlertMuteGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertMute, error) {
	var matches []*AlertMute
	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}
	if len(matches) == 0 {
		return nil, nil
	}

	mute := matches[0]
	return mute, mute.DB2FE()
}
// AlertMuteGets lists mute rules, newest id first.
//
// Filters (each is skipped when it holds its "any" value):
//   - bgid: business group id, -1 for any
//   - prods: product names, empty for any
//   - disabled: -1 for any, 0 for enabled, anything else for disabled
//   - expired: -1 for any; 1 selects time-range mutes whose etime has passed;
//     any other value selects unexpired time-range mutes and all periodic mutes
//   - query: every whitespace-separated word must appear in cause
//
// DB2FE is applied to each row; its errors are ignored.
func AlertMuteGets(ctx *ctx.Context, prods []string, bgid int64, disabled int, expired int, query string) (lst []AlertMute, err error) {
	tx := DB(ctx)

	if bgid != -1 {
		tx = tx.Where("group_id = ?", bgid)
	}
	if len(prods) > 0 {
		tx = tx.Where("prod in (?)", prods)
	}

	switch disabled {
	case -1:
		// no filter
	case 0:
		tx = tx.Where("disabled = 0")
	default:
		tx = tx.Where("disabled = 1")
	}

	switch expired {
	case -1:
		// no filter
	case 1:
		tx = tx.Where("mute_time_type = ? AND etime < ?", TimeRange, time.Now().Unix())
	default:
		ts := time.Now().Unix()
		tx = tx.Where("(mute_time_type = ? AND etime >= ?) OR mute_time_type = ?", TimeRange, ts, Periodic)
	}

	for _, word := range strings.Fields(query) {
		tx = tx.Where("cause like ?", "%"+word+"%")
	}

	err = tx.Order("id desc").Find(&lst).Error
	for i := range lst {
		lst[i].DB2FE()
	}
	return lst, err
}
// AlertMuteGetsByBG lists the mute rules of one business group, newest id
// first, applying DB2FE to each row (conversion errors are ignored).
func AlertMuteGetsByBG(ctx *ctx.Context, groupId int64) (lst []AlertMute, err error) {
	tx := DB(ctx).Where("group_id=?", groupId).Order("id desc")
	err = tx.Find(&lst).Error

	for idx := range lst {
		lst[idx].DB2FE()
	}
	return lst, err
}
// AlertMuteGetsByBGIds lists mute rules belonging to any of the given
// business groups, newest id first. With an empty bgids no group filter is
// applied. DB2FE is run on each row and its errors are ignored.
func AlertMuteGetsByBGIds(ctx *ctx.Context, bgids []int64) (lst []AlertMute, err error) {
	tx := DB(ctx)
	if len(bgids) > 0 {
		tx = tx.Where("group_id in (?)", bgids)
	}

	err = tx.Order("id desc").Find(&lst).Error
	for idx := range lst {
		lst[idx].DB2FE()
	}
	return lst, err
}
// Verify validates the mute rule and normalises it in place.
//
// It rejects a negative GroupId and an Etime that is not after Btime,
// collapses an "all datasources" selection to []int64{0}, and finally parses
// Tags into ITags via Parse.
func (m *AlertMute) Verify() error {
	if m.GroupId < 0 {
		return errors.New("group_id invalid")
	}

	if IsAllDatasource(m.DatasourceIdsJson) {
		m.DatasourceIdsJson = []int64{0}
	}

	if m.Etime <= m.Btime {
		return fmt.Errorf("oops... etime(%d) <= btime(%d)", m.Etime, m.Btime)
	}

	return m.Parse()
}
// Parse converts Tags into ITags using GetTagFilters. ITags is overwritten
// with whatever GetTagFilters returned, including on failure.
func (m *AlertMute) Parse() error {
	filters, err := GetTagFilters(m.Tags)
	m.ITags = filters
	return err
}
// Add validates the mute rule, serialises its structured fields, stamps
// CreateAt / UpdateAt with the current time and inserts it.
func (m *AlertMute) Add(ctx *ctx.Context) error {
	err := m.Verify()
	if err == nil {
		err = m.FE2DB()
	}
	if err != nil {
		return err
	}

	ts := time.Now().Unix()
	m.CreateAt, m.UpdateAt = ts, ts
	return Insert(ctx, m)
}
// Update overwrites the stored rule m with the contents of arm.
//
// Id, GroupId, CreateAt and CreateBy are taken from m; UpdateAt is set to
// now. arm is then verified and serialised, and all of its columns are
// written (Select("*")), including zero values.
func (m *AlertMute) Update(ctx *ctx.Context, arm AlertMute) error {
	arm.Id, arm.GroupId = m.Id, m.GroupId
	arm.CreateAt, arm.CreateBy = m.CreateAt, m.CreateBy
	arm.UpdateAt = time.Now().Unix()

	if err := arm.Verify(); err != nil {
		return err
	}
	if err := arm.FE2DB(); err != nil {
		return err
	}

	result := DB(ctx).Model(m).Select("*").Updates(arm)
	return result.Error
}
// FE2DB serialises the structured (front-end) fields into their database
// string columns: DatasourceIdsJson and PeriodicMutesJson are always encoded;
// SeveritiesJson is encoded only when it is non-empty, otherwise Severities
// is left as it was.
func (m *AlertMute) FE2DB() error {
	encoded, err := json.Marshal(m.DatasourceIdsJson)
	if err != nil {
		return err
	}
	m.DatasourceIds = string(encoded)

	encoded, err = json.Marshal(m.PeriodicMutesJson)
	if err != nil {
		return err
	}
	m.PeriodicMutes = string(encoded)

	if len(m.SeveritiesJson) == 0 {
		return nil
	}

	encoded, err = json.Marshal(m.SeveritiesJson)
	if err != nil {
		return err
	}
	m.Severities = string(encoded)
	return nil
}
// DB2FE fills the structured (front-end) fields from the database string
// columns and computes Activated.
//
// DatasourceIds, PeriodicMutes and Severities are JSON-decoded into their
// *Json counterparts. An empty column is treated as "no value" and skipped
// instead of being fed to the JSON decoder: previously an empty DatasourceIds
// or PeriodicMutes aborted the conversion with "unexpected end of JSON
// input", which left Severities undecoded and Activated uncomputed for
// callers that ignore the returned error. DatasourceIdsJson is never left nil.
//
// Activated is 1 when the current time falls inside the mute window selected
// by MuteTimeType, otherwise 0. An unknown MuteTimeType is logged and yields 0.
func (m *AlertMute) DB2FE() error {
	if m.DatasourceIds != "" {
		if err := json.Unmarshal([]byte(m.DatasourceIds), &m.DatasourceIdsJson); err != nil {
			return err
		}
	}
	if m.DatasourceIdsJson == nil {
		m.DatasourceIdsJson = []int64{}
	}

	if m.PeriodicMutes != "" {
		if err := json.Unmarshal([]byte(m.PeriodicMutes), &m.PeriodicMutesJson); err != nil {
			return err
		}
	}

	if m.Severities != "" {
		if err := json.Unmarshal([]byte(m.Severities), &m.SeveritiesJson); err != nil {
			return err
		}
	}

	// Check whether the rule is in effect right now.
	isWithinTime := false
	switch m.MuteTimeType {
	case TimeRange:
		isWithinTime = m.IsWithinTimeRange(time.Now().Unix())
	case Periodic:
		isWithinTime = m.IsWithinPeriodicMute(time.Now().Unix())
	default:
		logger.Warningf("mute time type invalid, %d", m.MuteTimeType)
	}

	if isWithinTime {
		m.Activated = 1
	} else {
		m.Activated = 0
	}
	return nil
}
// UpdateFieldsMap writes the given column/value pairs for this mute rule.
func (m *AlertMute) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
	result := DB(ctx).Model(m).Updates(fields)
	return result.Error
}
// IsWithinTimeRange reports whether checkTime lies in [Btime, Etime],
// both ends inclusive.
func (m *AlertMute) IsWithinTimeRange(checkTime int64) bool {
	return m.Btime <= checkTime && checkTime <= m.Etime
}
// IsWithinPeriodicMute reports whether checkTime (Unix seconds) falls inside
// any of the periodic mute windows.
//
// checkTime is converted with time.Unix and formatted as "15:04"; its weekday
// number must occur in the window's EnableDaysOfWeek string. For a matching
// day the window applies when:
//   - start equals end, or the window is "00:00"–"23:59" (whole day);
//   - start < end and start <= time < end;
//   - start > end (window wraps past midnight) and time >= start or time < end.
func (m *AlertMute) IsWithinPeriodicMute(checkTime int64) bool {
	at := time.Unix(checkTime, 0)
	clock := at.Format("15:04")
	weekday := strconv.Itoa(int(at.Weekday()))

	for _, window := range m.PeriodicMutesJson {
		if !strings.Contains(window.EnableDaysOfWeek, weekday) {
			continue
		}

		start, end := window.EnableStime, window.EnableEtime
		switch {
		case start == end || (start == "00:00" && end == "23:59"):
			return true
		case start < end:
			if clock >= start && clock < end {
				return true
			}
		default:
			if clock >= start || clock < end {
				return true
			}
		}
	}
	return false
}
// AlertMuteDel deletes the mute rules with the given ids. An empty ids slice
// is a no-op.
func AlertMuteDel(ctx *ctx.Context, ids []int64) error {
	if len(ids) > 0 {
		return DB(ctx).Where("id in ?", ids).Delete(new(AlertMute)).Error
	}
	return nil
}
// AlertMuteStatistics returns the row count and latest update_at of the
// alert_mute table. When ctx.IsCenter is false the value is fetched over HTTP
// via poster.GetByUrls instead of from the local database.
func AlertMuteStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=alert_mute")
	}

	var rows []*Statistics
	tx := DB(ctx).Model(&AlertMute{}).Select("count(*) as total", "max(update_at) as last_updated")
	if err := tx.Find(&rows).Error; err != nil {
		return nil, err
	}
	return rows[0], nil
}
// AlertMuteGetsAll returns the mute rules that may currently apply.
//
// When ctx.IsCenter is false the list is fetched over HTTP via
// poster.GetByUrls and FE2DB is run on each rule. Otherwise it queries
// enabled rules that are either periodic or time-range rules with
// btime <= now+10 and etime >= now, and runs DB2FE on each. Conversion
// errors are ignored in both paths.
func AlertMuteGetsAll(ctx *ctx.Context) ([]*AlertMute, error) {
	if !ctx.IsCenter {
		remote, err := poster.GetByUrls[[]*AlertMute](ctx, "/v1/n9e/active-alert-mutes")
		if err != nil {
			return nil, err
		}
		for _, mute := range remote {
			mute.FE2DB()
		}
		return remote, nil
	}

	// Only keep rules inside their effective time. The 10-second slack on
	// btime avoids missing rules that become effective while syncing.
	ts := time.Now().Unix()
	tx := DB(ctx).Model(&AlertMute{}).
		Where("disabled = 0").
		Where("(mute_time_type = ? AND btime <= ? AND etime >= ?) OR mute_time_type = ?", TimeRange, ts+10, ts, Periodic)

	var mutes []*AlertMute
	if err := tx.Find(&mutes).Error; err != nil {
		return nil, err
	}
	for _, mute := range mutes {
		mute.DB2FE()
	}
	return mutes, nil
}
// AlertMuteUpgradeToV6 derives datasource_ids for every mute rule from its
// legacy Cluster field.
//
// A Cluster of "$all" maps to [0]; otherwise each space-separated cluster
// name found in dsm contributes that datasource's id. Update failures are
// logged per rule and do not stop the migration.
func AlertMuteUpgradeToV6(ctx *ctx.Context, dsm map[string]Datasource) error {
	var mutes []*AlertMute
	if err := DB(ctx).Find(&mutes).Error; err != nil {
		return err
	}

	for _, mute := range mutes {
		var ids []int64
		if mute.Cluster == "$all" {
			ids = append(ids, 0)
		} else {
			for _, name := range strings.Fields(mute.Cluster) {
				if ds, known := dsm[name]; known {
					ids = append(ids, ds.Id)
				}
			}
		}

		encoded, err := json.Marshal(ids)
		if err != nil {
			continue
		}
		mute.DatasourceIds = string(encoded)

		updates := map[string]interface{}{
			"datasource_ids": mute.DatasourceIds,
		}
		if err = mute.UpdateFieldsMap(ctx, updates); err != nil {
			logger.Errorf("update alert rule:%d datasource ids failed, %v", mute.Id, err)
		}
	}
	return nil
}
================================================
FILE: models/alert_rule.go
================================================
package models
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/robfig/cron/v3"
"github.com/jinzhu/copier"
"github.com/pkg/errors"
"github.com/tidwall/match"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
)
// String identifiers for rule products and datasource categories.
//
// METRIC and PROMETHEUS are the defaults AlertRule.Verify assigns to Prod and
// Cate when those are empty. The remaining names presumably identify other
// products / datasource categories — confirm against their users.
const (
	METRIC = "metric"
	LOG = "logging"
	HOST = "host"
	LOKI = "loki"
	PROMETHEUS = "prometheus"
	TDENGINE = "tdengine"
	ELASTICSEARCH = "elasticsearch"
	MYSQL = "mysql"
	POSTGRESQL = "pgsql"
	DORIS = "doris"
	OPENSEARCH = "opensearch"
	CLICKHOUSE = "ck"
	VICTORIALOGS = "victorialogs"
)
// Named values for several integer fields of AlertRule.
const (
	// Disabled: 0 means enabled, 1 means disabled.
	AlertRuleEnabled = 0
	AlertRuleDisabled = 1
	// EnableInBG: 0 means global, 1 means enabled in one business group.
	AlertRuleEnableInGlobalBG = 0
	AlertRuleEnableInOneBG = 1
	// NotifyRecovered: whether to notify on recovery.
	AlertRuleNotNotifyRecovered = 0
	AlertRuleNotifyRecovered = 1
	// Presumably a default for NotifyRepeatStep (minutes) — confirm at call sites.
	AlertRuleNotifyRepeatStep60Min = 60
	// Presumably a default for RecoverDuration (seconds) — confirm at call sites.
	AlertRuleRecoverDuration0Sec = 0
)
// Severity levels used by alert rules and events; a lower number is more severe.
const (
	SeverityEmergency = 1
	SeverityWarning = 2
	SeverityNotice = 3
	SeverityLowest = 4
)
// AlertRule is an alerting rule stored in the "alert_rule" table.
//
// Many attributes come in pairs: a string column persisted in the database
// (tagged json:"-") and a structured counterpart for the front end (tagged
// gorm:"-"). Fields marked Deprecated are kept for backward compatibility.
type AlertRule struct {
	Id int64 `json:"id" gorm:"primaryKey"`
	GroupId int64 `json:"group_id"` // busi group id
	Cate string `json:"cate"` // alert rule cate (prometheus|elasticsearch)
	DatasourceIds string `json:"-" gorm:"datasource_ids"` // Deprecated: use DatasourceQueries instead
	DatasourceIdsJson []int64 `json:"datasource_ids,omitempty" gorm:"-"` // alert rule list page use this field
	DatasourceQueries []DatasourceQuery `json:"datasource_queries" gorm:"datasource_queries;type:text;serializer:json"` // datasource queries
	Cluster string `json:"cluster"` // Deprecated: use DatasourceQueries instead // take effect by clusters, separated by space
	Name string `json:"name"` // rule name
	Note string `json:"note"` // will sent in notify
	Prod string `json:"prod"` // product empty means n9e
	Algorithm string `json:"algorithm"` // algorithm (''|holtwinters), empty means threshold
	AlgoParams string `json:"-" gorm:"algo_params"` // params algorithm need
	AlgoParamsJson interface{} `json:"algo_params" gorm:"-"` // for fe
	Delay int `json:"delay"` // Time (in seconds) to delay evaluation
	Severity int `json:"severity"` // 1: Emergency 2: Warning 3: Notice
	Severities []int `json:"severities" gorm:"-"` // 1: Emergency 2: Warning 3: Notice
	Disabled int `json:"disabled"` // 0: enabled, 1: disabled
	PromForDuration int `json:"prom_for_duration"` // Deprecated: use cron pattern instead // prometheus for, unit:s
	PromQl string `json:"prom_ql"` // just one ql
	RuleConfig string `json:"-" gorm:"rule_config"` // rule config
	RuleConfigJson interface{} `json:"rule_config" gorm:"-"` // rule config for fe
	EventRelabelConfig []*pconf.RelabelConfig `json:"event_relabel_config" gorm:"-"` // event relabel config
	PromEvalInterval int `json:"prom_eval_interval"` // unit:s
	EnableStime string `json:"-"` // Deprecated // split by space: "00:00 10:00 12:00"
	EnableStimeJSON string `json:"enable_stime" gorm:"-"` // Deprecated // for fe
	EnableStimesJSON []string `json:"enable_stimes" gorm:"-"` // for fe
	EnableEtime string `json:"-"` // Deprecated // split by space: "00:00 10:00 12:00"
	EnableEtimeJSON string `json:"enable_etime" gorm:"-"` // Deprecated // for fe
	EnableEtimesJSON []string `json:"enable_etimes" gorm:"-"` // for fe
	EnableDaysOfWeek string `json:"-"` // Deprecated // eg: "0 1 2 3 4 5 6 ; 0 1 2"
	EnableDaysOfWeekJSON []string `json:"enable_days_of_week" gorm:"-"` // Deprecated // for fe
	EnableDaysOfWeeksJSON [][]string `json:"enable_days_of_weeks" gorm:"-"` // for fe
	EnableInBG int `json:"enable_in_bg"` // 0: global 1: enable one busi-group
	NotifyRecovered int `json:"notify_recovered"` // whether notify when recovery
	NotifyChannels string `json:"-"` // Deprecated // split by space: sms voice email dingtalk wecom
	NotifyChannelsJSON []string `json:"notify_channels" gorm:"-"` // Deprecated // for fe
	NotifyGroups string `json:"-"` // Deprecated // split by space: 233 43
	NotifyGroupsObj []UserGroup `json:"notify_groups_obj" gorm:"-"` // Deprecated // for fe
	NotifyGroupsJSON []string `json:"notify_groups" gorm:"-"` // Deprecated // for fe
	NotifyRepeatStep int `json:"notify_repeat_step"` // notify repeat interval, unit: min
	NotifyMaxNumber int `json:"notify_max_number"` // notify: max number
	RecoverDuration int64 `json:"recover_duration"` // unit: s
	Callbacks string `json:"-"` // Deprecated // split by space: http://a.com/api/x http://a.com/api/y'
	CallbacksJSON []string `json:"callbacks" gorm:"-"` // Deprecated // for fe
	RunbookUrl string `json:"runbook_url"` // sop url
	AppendTags string `json:"-"` // split by space: service=n9e mod=api
	AppendTagsJSON []string `json:"append_tags" gorm:"-"` // for fe
	Annotations string `json:"-"` //
	AnnotationsJSON map[string]string `json:"annotations" gorm:"-"` // for fe
	ExtraConfig string `json:"-" gorm:"extra_config"` // extra config
	ExtraConfigJSON interface{} `json:"extra_config" gorm:"-"` // for fe
	CreateAt int64 `json:"create_at"`
	CreateBy string `json:"create_by"`
	UpdateAt int64 `json:"update_at"`
	UpdateBy string `json:"update_by"`
	UUID int64 `json:"uuid" gorm:"-"` // tpl identifier
	CurEventCount int64 `json:"cur_event_count" gorm:"-"`
	UpdateByNickname string `json:"update_by_nickname" gorm:"-"` // for fe
	CronPattern string `json:"cron_pattern"`
	TimeZone string `json:"time_zone" gorm:"default:''"` // timezone for alert rule, e.g. "Asia/Shanghai", "UTC", empty for default
	NotifyRuleIds []int64 `json:"notify_rule_ids" gorm:"serializer:json"`
	PipelineConfigs []PipelineConfig `json:"pipeline_configs" gorm:"serializer:json"`
	NotifyVersion int `json:"notify_version"` // 0: old, 1: new
}
// ChildVarConfig is a nested level of variable configuration. It holds
// parameter values keyed by name and may itself point to a further child
// level, forming a chain.
type ChildVarConfig struct {
	ParamVal []map[string]ParamQuery `json:"param_val"`
	ChildVarConfigs *ChildVarConfig `json:"child_var_configs"`
}
// ParamQuery describes how one variable parameter is resolved.
type ParamQuery struct {
	ParamType string `json:"param_type"` // one of: host, device, enum, threshold
	Query interface{} `json:"query"`
}
// VarConfig is the top level of a query's variable configuration: the
// first-level parameters plus an optional chain of child levels.
type VarConfig struct {
	ParamVal []ParamQueryForFirst `json:"param_val"`
	ChildVarConfigs *ChildVarConfig `json:"child_var_configs"`
}
// ParamQueryForFirst is the same as ParamQuery but carries the parameter
// name inline; it only appears at the first level (VarConfig.ParamVal).
type ParamQueryForFirst struct {
	Name string `json:"name"`
	ParamType string `json:"param_type"`
	Query interface{} `json:"query"`
}
// Tpl references a task template by id and name together with a list of
// hosts. It is used in RuleConfig.TaskTpls.
type Tpl struct {
	TplId int64 `json:"tpl_id"`
	TplName string `json:"tpl_name"`
	Host []string `json:"host"`
}
// RuleConfig is a generic shape of a rule's JSON configuration. Every field
// is omitted from the encoded JSON when it holds its zero value. Queries is
// left untyped (interface{}).
type RuleConfig struct {
	Version string `json:"version,omitempty"`
	EventRelabelConfig []*pconf.RelabelConfig `json:"event_relabel_config,omitempty"`
	TaskTpls []*Tpl `json:"task_tpls,omitempty"`
	Queries interface{} `json:"queries,omitempty"`
	Triggers []Trigger `json:"triggers,omitempty"`
	Inhibit bool `json:"inhibit,omitempty"`
	PromQl string `json:"prom_ql,omitempty"`
	Severity int `json:"severity,omitempty"`
	AlgoParams interface{} `json:"algo_params,omitempty"`
	OverrideGlobalWebhook bool `json:"override_global_webhook,omitempty"`
}
// PromRuleConfig is the rule configuration for Prometheus-style rules: a
// list of PromQL queries, each with its own severity, plus rule-level options.
type PromRuleConfig struct {
	Queries []PromQuery `json:"queries"`
	Inhibit bool `json:"inhibit"`
	PromQl string `json:"prom_ql"`
	Severity int `json:"severity"`
	AlgoParams interface{} `json:"algo_params"`
}
// RecoverJudge selects how recovery of an alert is decided.
type RecoverJudge int

// Supported RecoverJudge values. The exact semantics are implemented by the
// evaluator, not in this file; the names suggest: default behaviour, do not
// recover when no data is returned, and recover when RecoverExp holds —
// confirm against the evaluation code.
const (
	Origin RecoverJudge = 0
	NotRecoverWhenNoData RecoverJudge = 1
	RecoverOnCondition RecoverJudge = 2
)
// RecoverConfig pairs a recovery strategy with an optional recovery
// expression.
type RecoverConfig struct {
	JudgeType RecoverJudge `json:"judge_type"`
	RecoverExp string `json:"recover_exp"`
}
// HostRuleConfig is the rule configuration for host rules: host selection
// queries plus the triggers evaluated against the selected hosts.
type HostRuleConfig struct {
	Queries []HostQuery `json:"queries"`
	Triggers []HostTrigger `json:"triggers"`
	Inhibit bool `json:"inhibit"`
}
// PromQuery is one PromQL query of a PromRuleConfig together with its
// severity, optional variable configuration and recovery settings.
type PromQuery struct {
	PromQl string `json:"prom_ql"`
	Severity int `json:"severity"`
	VarEnabled bool `json:"var_enabled"`
	VarConfig VarConfig `json:"var_config"`
	RecoverConfig RecoverConfig `json:"recover_config"`
	Unit string `json:"unit"`
}
// HostTrigger is one trigger condition of a host rule.
// NOTE(review): units of Duration and Percent are not visible here — confirm
// against the host rule evaluator.
type HostTrigger struct {
	Type string `json:"type"`
	Duration int `json:"duration"`
	Percent int `json:"percent"`
	Severity int `json:"severity"`
}
// RuleQuery is a rule configuration made of untyped queries plus expression,
// no-data and anomaly triggers.
type RuleQuery struct {
	Version string `json:"version"`
	Inhibit bool `json:"inhibit"`
	Queries []interface{} `json:"queries"`
	ExpTriggerDisable bool `json:"exp_trigger_disable"`
	Triggers []Trigger `json:"triggers"`
	NodataTrigger NodataTrigger `json:"nodata_trigger"`
	AnomalyTrigger interface{} `json:"anomaly_trigger"`
	TriggerType TriggerType `json:"trigger_type,omitempty"` // used in alert events
}
// NodataTrigger configures alerting when a query returns no data.
type NodataTrigger struct {
	Enable bool `json:"enable"`
	Severity int `json:"severity"`
	ResolveAfterEnable bool `json:"resolve_after_enable"`
	ResolveAfter int `json:"resolve_after"` // in seconds
}
// Trigger is one trigger condition of a rule: an expression (structured in
// Expressions or textual in Exp) with its severity, optional joins and
// recovery settings. Type, Duration and Percent are omitted from JSON when
// zero.
type Trigger struct {
	Expressions interface{} `json:"expressions"`
	Mode int `json:"mode"`
	Exp string `json:"exp"`
	Severity int `json:"severity"`
	Type string `json:"type,omitempty"`
	Duration int `json:"duration,omitempty"`
	Percent int `json:"percent,omitempty"`
	Joins []Join `json:"joins"`
	JoinRef string `json:"join_ref"`
	RecoverConfig RecoverConfig `json:"recover_config"`
}
// Join describes how a trigger joins another referenced query result:
// the join type, the referenced query and the labels to join on.
type Join struct {
	JoinType string `json:"join_type"`
	Ref string `json:"ref"`
	On []string `json:"on"`
}
// DataSourceQueryAll is the query that selects every datasource: MatchType 2
// is handled as "all datasources" by GetDatasourceIDsByDatasourceQueries.
var DataSourceQueryAll = DatasourceQuery{
	MatchType: 2,
	Op: "in",
	Values: []interface{}{DatasourceIdAll},
}
// DatasourceQuery is one datasource selection condition, as interpreted by
// GetDatasourceIDsByDatasourceQueries.
type DatasourceQuery struct {
	MatchType int `json:"match_type"` // 0: exact match by id, 1: fuzzy match by name, 2: all datasources
	Op string `json:"op"` // "in" or "not in"
	Values []interface{} `json:"values"` // numeric ids for MatchType 0, name patterns for MatchType 1
}
// GetDatasourceIDsByDatasourceQueries resolves datasourceQueries to a list of
// datasource ids.
//
// Queries are either exact (by id) or fuzzy (by name pattern) and use the
// operators "in" / "not in". The queries are intersected: evaluation starts
// from every id in idMap and each query narrows the set further.
//
//   - idMap is the full set of candidate datasources.
//   - nameMap maps every datasource name to its id and is used for fuzzy
//     name matching.
//
// It returns nil when datasourceQueries is empty. The order of the returned
// ids is unspecified.
func GetDatasourceIDsByDatasourceQueries[T any](datasourceQueries []DatasourceQuery, idMap map[int64]T, nameMap map[string]int64) []int64 {
	if len(datasourceQueries) == 0 {
		return nil
	}

	// Start from the full set; every query is applied on top of what is left.
	remaining := make(map[int64]struct{})
	for id := range idMap {
		remaining[id] = struct{}{}
	}

	for _, q := range datasourceQueries {
		// Ids selected by this query out of remaining.
		selected := make(map[int64]struct{})

		switch q.MatchType {
		case 0:
			// Exact match: collect the numeric values as ids; anything else is skipped.
			ids := make([]int64, 0, len(q.Values))
			for _, raw := range q.Values {
				switch n := raw.(type) {
				case int64:
					ids = append(ids, n)
				case int:
					ids = append(ids, int64(n))
				case float64:
					ids = append(ids, int64(n))
				case float32:
					ids = append(ids, int64(n))
				case int8:
					ids = append(ids, int64(n))
				case int16:
					ids = append(ids, int64(n))
				case int32:
					ids = append(ids, int64(n))
				}
			}

			switch q.Op {
			case "in":
				if len(ids) == 1 && ids[0] == DatasourceIdAll {
					for id := range remaining {
						selected[id] = struct{}{}
					}
				} else {
					for _, id := range ids {
						if _, ok := remaining[id]; ok {
							selected[id] = struct{}{}
						}
					}
				}
			case "not in":
				for _, id := range ids {
					delete(remaining, id)
				}
				selected = remaining
			}

		case 1:
			// Fuzzy match against datasource names; non-string values are skipped.
			switch q.Op {
			case "in":
				for dsName, dsID := range nameMap {
					if _, ok := remaining[dsID]; !ok {
						continue
					}
					for _, raw := range q.Values {
						pattern, isString := raw.(string)
						if !isString {
							continue
						}
						if match.Match(dsName, pattern) {
							selected[dsID] = struct{}{}
						}
					}
				}
			case "not in":
				for dsName, dsID := range nameMap {
					for _, raw := range q.Values {
						pattern, isString := raw.(string)
						if !isString {
							continue
						}
						if match.Match(dsName, pattern) {
							delete(remaining, dsID)
						}
					}
				}
				selected = remaining
			}

		case 2:
			// All datasources.
			for id := range remaining {
				selected[id] = struct{}{}
			}
		}

		remaining = selected
		if len(remaining) == 0 {
			break
		}
	}

	result := make([]int64, 0, len(remaining))
	for id := range remaining {
		result = append(result, id)
	}
	return result
}
// GetHostsQuery translates host-selection queries into SQL condition maps,
// one map per input query (in the same order).
//
// Each map key is a SQL fragment with placeholders and each value is its
// argument (or argument list). Supported keys of HostQuery.Key:
//
//   - "group_ids": "==" selects hosts in the groups, any other Op excludes
//     hosts that belong to any of them.
//   - "tags": "==" requires every tag to match both tags and host_tags
//     (LIKE %tag%); any other Op requires that none of them match.
//   - "hosts": "==" / "!=" are IN / NOT IN on ident; "=~" / "!~" are
//     LIKE / NOT LIKE with "*" translated to "%".
//
// Where several conditions share the same SQL text ("tags like ?",
// "ident like ?"), a growing run of trailing spaces is appended to keep the
// map keys distinct.
//
// Nil and non-string values in a "tags" or "hosts" query are skipped rather
// than causing a panic. An unknown Key produces an empty map.
func GetHostsQuery(queries []HostQuery) []map[string]interface{} {
	var query []map[string]interface{}
	for _, q := range queries {
		m := make(map[string]interface{})
		switch q.Key {
		case "group_ids":
			ids := ParseInt64(q.Values)
			if q.Op == "==" {
				m["target_busi_group.group_id in (?)"] = ids
			} else {
				m["NOT EXISTS (SELECT 1 FROM target_busi_group tbg WHERE tbg.target_ident = target.ident AND tbg.group_id IN (?))"] = ids
			}
		case "tags":
			lst := hostQueryStrings(q.Values)
			if q.Op == "==" {
				blank := " "
				for _, tag := range lst {
					m["tags like ?"+blank] = "%" + tag + "%"
					m["host_tags like ?"+blank] = "%" + tag + "%"
					blank += " "
				}
			} else {
				var args []interface{}
				var conds []string
				for _, tag := range lst {
					conds = append(conds, "tags not like ?",
						"(host_tags not like ? or host_tags is null)")
					args = append(args, "%"+tag+"%", "%"+tag+"%")
				}
				m[strings.Join(conds, " and ")] = args
			}
		case "hosts":
			lst := hostQueryStrings(q.Values)
			switch q.Op {
			case "==":
				m["ident in (?)"] = lst
			case "!=":
				m["ident not in (?)"] = lst
			case "=~":
				blank := " "
				for _, host := range lst {
					m["ident like ?"+blank] = strings.ReplaceAll(host, "*", "%")
					blank += " "
				}
			case "!~":
				var args []interface{}
				var conds []string
				for _, host := range lst {
					conds = append(conds, "ident not like ?")
					args = append(args, strings.ReplaceAll(host, "*", "%"))
				}
				m[strings.Join(conds, " and ")] = args
			}
		}
		query = append(query, m)
	}
	return query
}

// hostQueryStrings returns the string elements of values, skipping nil and
// any non-string entries. The result is never nil.
func hostQueryStrings(values []interface{}) []string {
	lst := make([]string, 0, len(values))
	for _, v := range values {
		if s, ok := v.(string); ok {
			lst = append(lst, s)
		}
	}
	return lst
}

// ParseInt64 converts values to []int64 by round-tripping through JSON.
// Conversion errors are ignored on purpose (best effort); elements that
// cannot be decoded as int64 are left at whatever json.Unmarshal produced.
func ParseInt64(values []interface{}) []int64 {
	b, _ := json.Marshal(values)
	var ret []int64
	json.Unmarshal(b, &ret)
	return ret
}

// HostQuery is one host-selection condition consumed by GetHostsQuery.
type HostQuery struct {
	Key string `json:"key"` // "group_ids", "tags" or "hosts"
	Op string `json:"op"` // "==", "!=", "=~" or "!~" depending on Key
	Values []interface{} `json:"values"`
}
// Str2Int parses each element of arr as a base-10 int64. Parse errors are
// ignored: the element contributes whatever strconv.ParseInt returned (0 for
// malformed input). An empty input yields a nil slice.
func Str2Int(arr []string) []int64 {
	var ret []int64
	for idx := 0; idx < len(arr); idx++ {
		n, _ := strconv.ParseInt(arr[idx], 10, 64)
		ret = append(ret, n)
	}
	return ret
}
// TableName reports the database table backing AlertRule.
func (ar *AlertRule) TableName() string {
	const table = "alert_rule"
	return table
}
// Verify validates the rule and normalises some fields in place.
//
// Checks, in order: GroupId is not negative; Name (trimmed) is non-empty;
// TimeZone, when set, is loadable; Name has no dangerous characters;
// RuleConfig is non-empty; every AppendTags entry is key=value with unique
// keys; every NotifyGroups entry is an integer; CronPattern is valid.
//
// Defaults applied: Prod -> METRIC, Cate -> PROMETHEUS, PromEvalInterval -> 15
// when not positive. Depending on NotifyVersion either NotifyRuleIds (old,
// 0) or the legacy channel / group / callback fields (new, > 0) are cleared.
func (ar *AlertRule) Verify() error {
	if ar.GroupId < 0 {
		return fmt.Errorf("GroupId(%d) invalid", ar.GroupId)
	}

	ar.Name = strings.TrimSpace(ar.Name)
	if ar.Name == "" {
		return errors.New("name is blank")
	}

	if ar.TimeZone != "" {
		if _, err := time.LoadLocation(ar.TimeZone); err != nil {
			return fmt.Errorf("invalid timezone: %s", ar.TimeZone)
		}
	}

	if str.Dangerous(ar.Name) {
		return errors.New("Name has invalid characters")
	}

	if ar.Prod == "" {
		ar.Prod = METRIC
	}
	if ar.Cate == "" {
		ar.Cate = PROMETHEUS
	}

	if ar.RuleConfig == "" {
		return errors.New("rule_config is blank")
	}

	if ar.PromEvalInterval <= 0 {
		ar.PromEvalInterval = 15
	}

	// PromQL syntax is checked in the front end, not here.

	ar.AppendTags = strings.TrimSpace(ar.AppendTags)
	seenKeys := make(map[string]struct{})
	for _, tag := range strings.Fields(ar.AppendTags) {
		eq := strings.Index(tag, "=")
		if eq < 0 {
			return fmt.Errorf("AppendTags(%s) invalid", tag)
		}
		key := tag[:eq]
		if _, dup := seenKeys[key]; dup {
			return fmt.Errorf("AppendTags has duplicate key: %s", key)
		}
		seenKeys[key] = struct{}{}
	}

	for _, gid := range strings.Fields(ar.NotifyGroups) {
		if _, err := strconv.ParseInt(gid, 10, 64); err != nil {
			return fmt.Errorf("NotifyGroups(%s) invalid", ar.NotifyGroups)
		}
	}

	if err := ar.validateCronPattern(); err != nil {
		return err
	}

	switch {
	case ar.NotifyVersion == 0:
		// Old notification model: drop any notify rule ids.
		ar.NotifyRuleIds = []int64{}
	case ar.NotifyVersion > 0:
		// New notification model: drop the legacy channels, groups and callbacks.
		ar.NotifyChannelsJSON = []string{}
		ar.NotifyGroupsJSON = []string{}
		ar.NotifyChannels = ""
		ar.NotifyGroups = ""
		ar.Callbacks = ""
		ar.CallbacksJSON = []string{}
	}

	return nil
}
// validateCronPattern reports an error when CronPattern is set but is not
// accepted by a cron scheduler created with cron.WithSeconds. An empty
// pattern is considered valid.
func (ar *AlertRule) validateCronPattern() error {
	if ar.CronPattern == "" {
		return nil
	}

	// A throwaway scheduler is used purely as a parser: registering a no-op
	// job fails exactly when the expression is invalid.
	if _, err := cron.New(cron.WithSeconds()).AddFunc(ar.CronPattern, func() {}); err != nil {
		return fmt.Errorf("invalid cron pattern: %s, error: %v", ar.CronPattern, err)
	}
	return nil
}
// Add verifies the rule, rejects it when another rule with the same name
// already exists in the same group, stamps CreateAt/UpdateAt with the current
// Unix time and inserts it.
func (ar *AlertRule) Add(ctx *ctx.Context) error {
	if err := ar.Verify(); err != nil {
		return err
	}

	switch dup, err := AlertRuleExists(ctx, 0, ar.GroupId, ar.Name); {
	case err != nil:
		return err
	case dup:
		return errors.New("AlertRule already exists")
	}

	ts := time.Now().Unix()
	ar.CreateAt, ar.UpdateAt = ts, ts

	return Insert(ctx, ar)
}
// Update overwrites the stored rule ar with the contents of arf.
//
// When the name changes, the new name must not collide with another rule in
// ar's group. arf is converted with FE2DB, inherits ar's Id, GroupId,
// CreateAt and CreateBy, gets a fresh UpdateAt, is verified, and is then
// written with all columns selected.
func (ar *AlertRule) Update(ctx *ctx.Context, arf AlertRule) error {
	if arf.Name != ar.Name {
		dup, err := AlertRuleExists(ctx, ar.Id, ar.GroupId, arf.Name)
		if err != nil {
			return err
		}
		if dup {
			return errors.New("AlertRule already exists")
		}
	}

	if err := arf.FE2DB(); err != nil {
		return err
	}

	// Identity and creation metadata always come from the stored rule.
	arf.Id, arf.GroupId = ar.Id, ar.GroupId
	arf.CreateAt, arf.CreateBy = ar.CreateAt, ar.CreateBy
	arf.UpdateAt = time.Now().Unix()

	if err := arf.Verify(); err != nil {
		return err
	}

	return DB(ctx).Model(ar).Select("*").Updates(arf).Error
}
// UpdateColumn updates a single logical column of the rule.
//
// A nil value is a no-op. Most columns are written as-is, but some need
// special handling:
//
//   - "datasource_ids": value is JSON-encoded before being stored.
//   - "severity": value must be a float64 (the type encoding/json produces
//     for numbers). The severity is patched inside rule_config; rules with
//     more than one query/trigger are left untouched.
//   - "runbook_url": value must be a string; it is merged into the
//     annotations JSON under the "runbook_url" key.
//   - "annotations": value must be a map of string values; it replaces the
//     annotations.
//   - "notify_rule_ids": switches the rule to notify_version 1 and clears
//     notify_channels and notify_groups.
//   - "notify_groups" / "notify_channels": switches the rule to
//     notify_version 0 and clears notify_rule_ids.
//
// A value of the wrong type yields an error instead of a panic.
func (ar *AlertRule) UpdateColumn(ctx *ctx.Context, column string, value interface{}) error {
	if value == nil {
		return nil
	}

	// saveJSON stores the JSON encoding of v in the given column.
	saveJSON := func(col string, v interface{}) error {
		b, err := json.Marshal(v)
		if err != nil {
			return err
		}
		return DB(ctx).Model(ar).UpdateColumn(col, string(b)).Error
	}

	switch column {
	case "datasource_ids":
		return saveJSON(column, value)

	case "severity":
		num, ok := value.(float64)
		if !ok {
			return fmt.Errorf("severity must be a number, got %T", value)
		}
		severity := int(num)

		switch ar.Cate {
		case PROMETHEUS:
			var cfg PromRuleConfig
			if err := json.Unmarshal([]byte(ar.RuleConfig), &cfg); err != nil {
				return err
			}
			switch len(cfg.Queries) {
			case 0:
				// No per-query entries: the severity lives on the config itself.
				cfg.Severity = severity
			case 1:
				cfg.Queries[0].Severity = severity
			default:
				// Several queries: a single severity cannot be applied.
				return nil
			}
			return saveJSON("rule_config", cfg)

		case HOST:
			var cfg HostRuleConfig
			if err := json.Unmarshal([]byte(ar.RuleConfig), &cfg); err != nil {
				return err
			}
			if len(cfg.Triggers) != 1 {
				return nil
			}
			cfg.Triggers[0].Severity = severity
			return saveJSON("rule_config", cfg)

		default:
			var cfg RuleQuery
			if err := json.Unmarshal([]byte(ar.RuleConfig), &cfg); err != nil {
				return err
			}
			if len(cfg.Triggers) != 1 {
				return nil
			}
			cfg.Triggers[0].Severity = severity
			return saveJSON("rule_config", cfg)
		}

	case "runbook_url":
		url, ok := value.(string)
		if !ok {
			return fmt.Errorf("runbook_url must be a string, got %T", value)
		}
		// An empty annotations string is not valid JSON; treat it as
		// "no annotations yet" instead of failing the update.
		if strings.TrimSpace(ar.Annotations) != "" {
			if err := json.Unmarshal([]byte(ar.Annotations), &ar.AnnotationsJSON); err != nil {
				return err
			}
		}
		if ar.AnnotationsJSON == nil {
			ar.AnnotationsJSON = make(map[string]string)
		}
		ar.AnnotationsJSON["runbook_url"] = url
		return saveJSON("annotations", ar.AnnotationsJSON)

	case "annotations":
		raw, ok := value.(map[string]interface{})
		if !ok {
			return fmt.Errorf("annotations must be an object, got %T", value)
		}
		annotations := make(map[string]string, len(raw))
		for k, v := range raw {
			s, ok := v.(string)
			if !ok {
				return fmt.Errorf("annotation %q must be a string, got %T", k, v)
			}
			annotations[k] = s
		}
		ar.AnnotationsJSON = annotations
		return saveJSON("annotations", ar.AnnotationsJSON)

	case "notify_rule_ids":
		updates := map[string]interface{}{
			"notify_version":  1,
			"notify_channels": "",
			"notify_groups":   "",
			"notify_rule_ids": value,
		}
		return DB(ctx).Model(ar).Updates(updates).Error

	case "notify_groups", "notify_channels":
		updates := map[string]interface{}{
			column:            value,
			"notify_version":  0,
			"notify_rule_ids": []int64{},
		}
		return DB(ctx).Model(ar).Updates(updates).Error
	}

	return DB(ctx).Model(ar).UpdateColumn(column, value).Error
}
// UpdateFieldsMap writes the given column/value pairs to this rule's row and
// returns the database error, if any.
func (ar *AlertRule) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
return DB(ctx).Model(ar).Updates(fields).Error
}
// FillDatasourceQueries keeps compatibility with the old storage format by
// deriving DatasourceQueries from DatasourceIds when no queries are set.
//
// DatasourceIds is decoded as a JSON list of ints (decode errors are
// ignored). The ids become the values of a single "in" query; an id of 0
// means "all datasources" and switches the query to MatchType 2, discarding
// the remaining ids. The method always returns nil.
func (ar *AlertRule) FillDatasourceQueries() error {
	if len(ar.DatasourceQueries) != 0 || len(ar.DatasourceIds) == 0 {
		return nil
	}

	var ids []int
	_ = json.Unmarshal([]byte(ar.DatasourceIds), &ids)

	dq := DatasourceQuery{
		MatchType: 0,
		Op:        "in",
		Values:    make([]interface{}, 0),
	}
	for _, id := range ids {
		if id == 0 {
			// 0 stands for every datasource.
			dq.MatchType = 2
			break
		}
		dq.Values = append(dq.Values, id)
	}

	ar.DatasourceQueries = []DatasourceQuery{dq}
	return nil
}
// FillSeverities appends the distinct severities found in RuleConfig to
// ar.Severities. An empty RuleConfig is a no-op.
//
// Prometheus and Loki rules whose config version is not "v2" are decoded as
// PromRuleConfig: with no queries the config-level severity is used,
// otherwise the per-query severities. All other rules use the severities of
// their triggers. The order of the appended values is unspecified because
// they are collected in a map.
func (ar *AlertRule) FillSeverities() error {
	if ar.RuleConfig == "" {
		return nil
	}

	raw := []byte(ar.RuleConfig)

	var generic RuleQuery
	if err := json.Unmarshal(raw, &generic); err != nil {
		return err
	}

	seen := make(map[int]struct{})
	promStyle := (ar.Cate == PROMETHEUS || ar.Cate == LOKI) && generic.Version != "v2"

	if promStyle {
		var prom PromRuleConfig
		if err := json.Unmarshal(raw, &prom); err != nil {
			return err
		}
		if len(prom.Queries) == 0 {
			ar.Severities = append(ar.Severities, prom.Severity)
			return nil
		}
		for idx := range prom.Queries {
			seen[prom.Queries[idx].Severity] = struct{}{}
		}
	} else {
		for idx := range generic.Triggers {
			seen[generic.Triggers[idx].Severity] = struct{}{}
		}
	}

	for sev := range seen {
		ar.Severities = append(ar.Severities, sev)
	}
	return nil
}
// FillNotifyGroups resolves the ids in NotifyGroupsJSON into NotifyGroupsObj.
//
// Groups are looked up in cache first and fetched with UserGroupGetById
// otherwise; fetched groups are added to cache. Ids whose group no longer
// exists are dropped from NotifyGroupsJSON/NotifyGroups and the pruned list
// is written back to the notify_groups column (the result of that write is
// not checked). Ids that fail to parse are looked up as id 0.
func (ar *AlertRule) FillNotifyGroups(ctx *ctx.Context, cache map[int64]*UserGroup) error {
	total := len(ar.NotifyGroupsJSON)
	if total == 0 {
		ar.NotifyGroupsObj = []UserGroup{}
		return nil
	}

	kept := make([]string, 0, total)
	missing := false

	for _, rawID := range ar.NotifyGroupsJSON {
		id, _ := strconv.ParseInt(rawID, 10, 64)

		if cached, ok := cache[id]; ok {
			kept = append(kept, rawID)
			ar.NotifyGroupsObj = append(ar.NotifyGroupsObj, *cached)
			continue
		}

		ug, err := UserGroupGetById(ctx, id)
		if err != nil {
			return err
		}
		if ug == nil {
			// The user group has been deleted in the meantime.
			missing = true
			continue
		}

		kept = append(kept, rawID)
		ar.NotifyGroupsObj = append(ar.NotifyGroupsObj, *ug)
		cache[id] = ug
	}

	if missing {
		// Persist the list without the deleted groups.
		ar.NotifyGroupsJSON = kept
		ar.NotifyGroups = strings.Join(kept, " ")
		DB(ctx).Model(ar).Update("notify_groups", ar.NotifyGroups)
	}

	return nil
}
// FE2DB converts the front-end (JSON) representation of the rule into the
// string columns stored in the database.
//
//   - Enable start/end times: the list form is joined with spaces when
//     present, otherwise the single-value form is used.
//   - Enable days of week: each group is joined with spaces and the groups
//     are joined with ";"; without groups the single list is used. The value
//     is always rebuilt from scratch, so a previous EnableDaysOfWeek value
//     never leaks into the result.
//   - Notify channels, notify groups and callbacks are joined with spaces.
//   - Append tags have inner spaces removed and, when non-empty, are joined
//     with spaces.
//   - AlgoParamsJson, RuleConfigJson, AnnotationsJSON and ExtraConfigJSON are
//     JSON-encoded into their string columns.
//
// Rules that still carry PromQl (or have no RuleConfigJson) get a
// PromRuleConfig with a single query built from PromQl and Severity; PromQl
// is cleared once RuleConfig has been produced.
func (ar *AlertRule) FE2DB() error {
	if len(ar.EnableStimesJSON) > 0 {
		ar.EnableStime = strings.Join(ar.EnableStimesJSON, " ")
		ar.EnableEtime = strings.Join(ar.EnableEtimesJSON, " ")
	} else {
		ar.EnableStime = ar.EnableStimeJSON
		ar.EnableEtime = ar.EnableEtimeJSON
	}

	if len(ar.EnableDaysOfWeeksJSON) > 0 {
		groups := make([]string, 0, len(ar.EnableDaysOfWeeksJSON))
		for _, days := range ar.EnableDaysOfWeeksJSON {
			groups = append(groups, strings.Join(days, " "))
		}
		ar.EnableDaysOfWeek = strings.Join(groups, ";")
	} else {
		ar.EnableDaysOfWeek = strings.Join(ar.EnableDaysOfWeekJSON, " ")
	}

	ar.NotifyChannels = strings.Join(ar.NotifyChannelsJSON, " ")
	ar.NotifyGroups = strings.Join(ar.NotifyGroupsJSON, " ")
	ar.Callbacks = strings.Join(ar.CallbacksJSON, " ")

	for i := range ar.AppendTagsJSON {
		// Tags are concatenated with spaces below, so a single tag must not
		// contain any space itself.
		ar.AppendTagsJSON[i] = strings.ReplaceAll(ar.AppendTagsJSON[i], " ", "")
	}

	if len(ar.AppendTagsJSON) > 0 {
		ar.AppendTags = strings.Join(ar.AppendTagsJSON, " ")
	}

	algoParamsByte, err := json.Marshal(ar.AlgoParamsJson)
	if err != nil {
		return fmt.Errorf("marshal algo_params err:%v", err)
	}
	ar.AlgoParams = string(algoParamsByte)

	// Old rules use the PromQl and Severity fields; new rules use RuleConfig.
	if ar.RuleConfigJson == nil || len(ar.PromQl) > 0 {
		query := PromQuery{
			PromQl:   ar.PromQl,
			Severity: ar.Severity,
		}
		ar.RuleConfigJson = PromRuleConfig{
			Queries: []PromQuery{query},
		}
	}

	if ar.RuleConfigJson != nil {
		b, err := json.Marshal(ar.RuleConfigJson)
		if err != nil {
			return fmt.Errorf("marshal rule_config err:%v", err)
		}
		ar.RuleConfig = string(b)
		ar.PromQl = ""
	}

	if ar.AnnotationsJSON != nil {
		b, err := json.Marshal(ar.AnnotationsJSON)
		if err != nil {
			return fmt.Errorf("marshal annotations err:%v", err)
		}
		ar.Annotations = string(b)
	}

	if ar.ExtraConfigJSON != nil {
		b, err := json.Marshal(ar.ExtraConfigJSON)
		if err != nil {
			return fmt.Errorf("marshal extra_config err:%v", err)
		}
		ar.ExtraConfig = string(b)
	}

	return nil
}
// DB2FE converts the string columns loaded from the database into the
// front-end (JSON) representation of the rule.
//
// Space separated columns are split into their list fields, the JSON columns
// (AlgoParams, RuleConfig, Annotations, ExtraConfig) are decoded on a
// best-effort basis (decode errors are ignored), EventRelabelConfig is
// extracted from RuleConfig, a missing CronPattern is derived from
// PromEvalInterval, and finally DatasourceQueries and Severities are filled.
// Only an error from FillDatasourceQueries is returned.
func (ar *AlertRule) DB2FE() error {
	ar.EnableStimesJSON = strings.Fields(ar.EnableStime)
	ar.EnableEtimesJSON = strings.Fields(ar.EnableEtime)
	// Both lists are indexed below, so both must be non-empty; checking only
	// one of them would panic when the other column is blank.
	if len(ar.EnableStimesJSON) > 0 && len(ar.EnableEtimesJSON) > 0 {
		ar.EnableStimeJSON = ar.EnableStimesJSON[0]
		ar.EnableEtimeJSON = ar.EnableEtimesJSON[0]
	}

	// Day-of-week groups are separated by ";", days inside a group by spaces.
	cache := strings.Split(ar.EnableDaysOfWeek, ";")
	for i := 0; i < len(cache); i++ {
		ar.EnableDaysOfWeeksJSON = append(ar.EnableDaysOfWeeksJSON, strings.Fields(cache[i]))
	}
	if len(ar.EnableDaysOfWeeksJSON) > 0 {
		ar.EnableDaysOfWeekJSON = ar.EnableDaysOfWeeksJSON[0]
	}

	if ar.NotifyRuleIds == nil {
		ar.NotifyRuleIds = make([]int64, 0)
	}

	ar.NotifyChannelsJSON = strings.Fields(ar.NotifyChannels)
	ar.NotifyGroupsJSON = strings.Fields(ar.NotifyGroups)
	ar.CallbacksJSON = strings.Fields(ar.Callbacks)
	ar.AppendTagsJSON = strings.Fields(ar.AppendTags)
	json.Unmarshal([]byte(ar.AlgoParams), &ar.AlgoParamsJson)
	json.Unmarshal([]byte(ar.RuleConfig), &ar.RuleConfigJson)
	json.Unmarshal([]byte(ar.Annotations), &ar.AnnotationsJSON)
	json.Unmarshal([]byte(ar.ExtraConfig), &ar.ExtraConfigJSON)

	// Pull the event relabel configuration out of RuleConfig.
	var ruleConfig struct {
		EventRelabelConfig []*pconf.RelabelConfig `json:"event_relabel_config"`
	}
	json.Unmarshal([]byte(ar.RuleConfig), &ruleConfig)
	ar.EventRelabelConfig = ruleConfig.EventRelabelConfig

	// Compatibility with old rules: derive cron_pattern from the interval.
	if ar.CronPattern == "" && ar.PromEvalInterval != 0 {
		ar.CronPattern = fmt.Sprintf("@every %ds", ar.PromEvalInterval)
	}

	err := ar.FillDatasourceQueries()
	if err != nil {
		return err
	}

	// Severities are informational; a failure to compute them is ignored.
	ar.FillSeverities()

	return nil
}
// AlertRuleDels deletes the rules with the given ids one by one. When bgid is
// supplied, only its first element is used and a rule is deleted only if it
// belongs to that group. It stops at the first delete error.
//
// For every rule that was actually removed, the current alert events that
// reference it are deleted as well (the result of that delete is not
// checked).
func AlertRuleDels(ctx *ctx.Context, ids []int64, bgid ...int64) error {
	for _, id := range ids {
		q := DB(ctx).Where("id = ?", id)
		if len(bgid) > 0 {
			q = q.Where("group_id = ?", bgid[0])
		}

		res := q.Delete(&AlertRule{})
		if res.Error != nil {
			return res.Error
		}

		// The rule is really gone: its active alerts can never recover and,
		// with the rule deleted, nobody cares about them any more.
		if res.RowsAffected > 0 {
			DB(ctx).Where("rule_id = ?", id).Delete(new(AlertCurEvent))
		}
	}

	return nil
}
// AlertRuleExists reports whether a rule named name exists in group groupId,
// ignoring the rule whose id equals id (pass 0 to ignore none).
//
// Only a row count is requested from the database, so no rule rows are
// loaded just to answer a yes/no question.
func AlertRuleExists(ctx *ctx.Context, id, groupId int64, name string) (bool, error) {
	var n int64
	err := DB(ctx).Model(&AlertRule{}).
		Where("id <> ? and group_id = ? and name = ?", id, groupId, name).
		Count(&n).Error
	if err != nil {
		return false, err
	}
	return n > 0, nil
}
// GetAlertRuleIdsByTaskId returns the ids of the rules that reference the
// given task id, either through a `"tpl_id":<id>}` fragment in rule_config or
// through a callback starting with "{ibex}/<id>". The result is nil when
// nothing matches.
func GetAlertRuleIdsByTaskId(ctx *ctx.Context, taskId int64) ([]int64, error) {
	id := fmt.Sprint(taskId)
	tplPattern := "%\"tpl_id\":" + id + "}%"
	cbPattern := "{ibex}/" + id + "%"

	var (
		rules []AlertRule
		ids   []int64
	)
	query := DB(ctx).Where("rule_config like ? or callbacks like ?", tplPattern, cbPattern)
	if err := query.Find(&rules).Error; err != nil {
		return ids, err
	}

	for idx := range rules {
		ids = append(ids, rules[idx].Id)
	}
	return ids, nil
}
// AlertRuleGets loads the rules of one group ordered by name and converts
// each of them with DB2FE (conversion errors are ignored).
func AlertRuleGets(ctx *ctx.Context, groupId int64) ([]AlertRule, error) {
	var rules []AlertRule
	if err := DB(ctx).Where("group_id=?", groupId).Order("name").Find(&rules).Error; err != nil {
		return rules, err
	}

	for idx := range rules {
		rules[idx].DB2FE()
	}
	return rules, nil
}
// AlertRuleGetsByBGIds loads the rules of the given groups ordered by name
// and converts each of them with DB2FE (conversion errors are ignored). With
// an empty bgids no filter and no ordering are applied.
func AlertRuleGetsByBGIds(ctx *ctx.Context, bgids []int64) ([]AlertRule, error) {
	q := DB(ctx)
	if len(bgids) > 0 {
		q = q.Where("group_id in (?)", bgids).Order("name")
	}

	var rules []AlertRule
	if err := q.Find(&rules).Error; err != nil {
		return rules, err
	}

	for idx := range rules {
		rules[idx].DB2FE()
	}
	return rules, nil
}
// AlertRuleGetsAll returns every enabled rule (disabled = 0).
//
// When the context is not the center, the rules are fetched from the
// "/v1/n9e/alert-rules?disabled=0" endpoint and converted with FE2DB.
// Otherwise they are read from the database and converted with DB2FE.
// Conversion errors are ignored in both cases.
func AlertRuleGetsAll(ctx *ctx.Context) ([]*AlertRule, error) {
	if !ctx.IsCenter {
		remote, err := poster.GetByUrls[[]*AlertRule](ctx, "/v1/n9e/alert-rules?disabled=0")
		if err != nil {
			return nil, err
		}
		for _, rule := range remote {
			rule.FE2DB()
		}
		return remote, nil
	}

	var rules []*AlertRule
	if err := DB(ctx).Where("disabled = ?", 0).Find(&rules).Error; err != nil {
		return rules, err
	}

	for _, rule := range rules {
		rule.DB2FE()
	}
	return rules, nil
}
// AlertRulesGetsBy loads rules matching the given filters and converts each
// of them with DB2FE (conversion errors are ignored).
//
// Empty filters are skipped. Every whitespace separated word of query must
// occur in append_tags, cluster is matched as a substring, and disabled is
// only applied when it is not -1.
func AlertRulesGetsBy(ctx *ctx.Context, prods []string, query, algorithm, cluster string,
	cates []string, disabled int) ([]*AlertRule, error) {
	q := DB(ctx)

	if len(prods) > 0 {
		q = q.Where("prod in (?)", prods)
	}

	for _, word := range strings.Fields(query) {
		q = q.Where("append_tags like ?", "%"+word+"%")
	}

	if algorithm != "" {
		q = q.Where("algorithm = ?", algorithm)
	}

	if cluster != "" {
		q = q.Where("cluster like ?", "%"+cluster+"%")
	}

	if len(cates) != 0 {
		q = q.Where("cate in (?)", cates)
	}

	if disabled != -1 {
		q = q.Where("disabled = ?", disabled)
	}

	var rules []*AlertRule
	if err := q.Find(&rules).Error; err != nil {
		return rules, err
	}

	for _, rule := range rules {
		rule.DB2FE()
	}
	return rules, nil
}
// AlertRuleGet returns the first rule matching the given where clause after
// converting it with DB2FE, or (nil, nil) when no rule matches.
func AlertRuleGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertRule, error) {
	var found []*AlertRule
	if err := DB(ctx).Where(where, args...).Find(&found).Error; err != nil {
		return nil, err
	}

	if len(found) == 0 {
		return nil, nil
	}

	first := found[0]
	first.DB2FE()
	return first, nil
}
// AlertRuleGetById looks a rule up by its id via AlertRuleGet, so it returns
// (nil, nil) when no such rule exists.
func AlertRuleGetById(ctx *ctx.Context, id int64) (*AlertRule, error) {
return AlertRuleGet(ctx, "id=?", id)
}
// AlertRuleGetsByIds loads the rules with the given ids and converts each of
// them with DB2FE (conversion errors are ignored).
func AlertRuleGetsByIds(ctx *ctx.Context, ids []int64) ([]AlertRule, error) {
	rules := make([]AlertRule, 0, len(ids))
	if err := DB(ctx).Model(new(AlertRule)).Where("id in ?", ids).Find(&rules).Error; err != nil {
		return rules, err
	}

	for idx := range rules {
		rules[idx].DB2FE()
	}
	return rules, nil
}
// AlertRuleStatistics returns the number of enabled rules and the most recent
// update_at among them. When the context is not the center, the statistics
// are fetched from the "/v1/n9e/statistic?name=alert_rule" endpoint instead.
func AlertRuleStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=alert_rule")
	}

	var rows []*Statistics
	query := DB(ctx).Model(&AlertRule{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Where("disabled = ?", 0)
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}

	return rows[0], nil
}
// IsPrometheusRule reports whether the rule has Prod METRIC and Cate
// PROMETHEUS.
func (ar *AlertRule) IsPrometheusRule() bool {
return ar.Prod == METRIC && ar.Cate == PROMETHEUS
}
// IsLokiRule reports whether either Prod or Cate of the rule is LOKI.
func (ar *AlertRule) IsLokiRule() bool {
return ar.Prod == LOKI || ar.Cate == LOKI
}
// IsTdengineRule reports whether the rule's Cate is TDENGINE.
func (ar *AlertRule) IsTdengineRule() bool {
return ar.Cate == TDENGINE
}
// IsHostRule reports whether the rule's Prod is HOST.
func (ar *AlertRule) IsHostRule() bool {
return ar.Prod == HOST
}
// IsInnerRule reports whether the rule's Prod is LOKI or its Cate is one of
// TDENGINE, CLICKHOUSE, ELASTICSEARCH, LOKI, MYSQL, POSTGRESQL, DORIS,
// OPENSEARCH or VICTORIALOGS.
func (ar *AlertRule) IsInnerRule() bool {
	if ar.Prod == LOKI {
		return true
	}

	switch ar.Cate {
	case TDENGINE, CLICKHOUSE, ELASTICSEARCH, LOKI, MYSQL,
		POSTGRESQL, DORIS, OPENSEARCH, VICTORIALOGS:
		return true
	}
	return false
}
// GetRuleType returns Cate for rules whose Prod is METRIC or LOG, and Prod
// for every other rule.
func (ar *AlertRule) GetRuleType() string {
	switch ar.Prod {
	case METRIC, LOG:
		return ar.Cate
	}
	return ar.Prod
}
// IsClickHouseRule reports whether the rule's Cate is CLICKHOUSE.
func (ar *AlertRule) IsClickHouseRule() bool {
return ar.Cate == CLICKHOUSE
}
// IsElasticSearch reports whether the rule's Cate is ELASTICSEARCH.
func (ar *AlertRule) IsElasticSearch() bool {
return ar.Cate == ELASTICSEARCH
}
// GenerateNewEvent creates an empty AlertCurEvent and fills it from the rule
// with UpdateEvent. The ctx parameter is not used by the body.
func (ar *AlertRule) GenerateNewEvent(ctx *ctx.Context) *AlertCurEvent {
event := &AlertCurEvent{}
ar.UpdateEvent(event)
return event
}
// UpdateEvent copies the rule's identity, evaluation settings and
// notification settings onto event. A nil event is ignored.
func (ar *AlertRule) UpdateEvent(event *AlertCurEvent) {
	if event == nil {
		return
	}

	// Rule identity.
	event.RuleId = ar.Id
	event.RuleName = ar.Name
	event.RuleNote = ar.Note
	event.RuleProd = ar.Prod
	event.RuleAlgo = ar.Algorithm
	event.GroupId = ar.GroupId
	event.Cate = ar.Cate

	// Evaluation settings and links.
	event.PromForDuration = ar.PromForDuration
	event.RunbookUrl = ar.RunbookUrl

	// Notification settings.
	event.NotifyRecovered = ar.NotifyRecovered
	event.NotifyChannels, event.NotifyChannelsJSON = ar.NotifyChannels, ar.NotifyChannelsJSON
	event.NotifyGroups, event.NotifyGroupsJSON = ar.NotifyGroups, ar.NotifyGroupsJSON
	event.Callbacks, event.CallbacksJSON = ar.Callbacks, ar.CallbacksJSON
}
// AlertRuleUpgradeToV6 rewrites every stored alert rule into the v6 layout.
//
// For each rule the cluster names in Cluster are mapped to datasource ids
// through dsm ("$all" becomes the single id 0; unknown cluster names are
// dropped), PromQl/Severity are wrapped into a PromRuleConfig, RunbookUrl is
// moved into the annotations JSON, and empty Prod/Cate default to
// METRIC/PROMETHEUS. A failed update of one rule is logged and does not stop
// the migration; only the initial load error is returned.
func AlertRuleUpgradeToV6(ctx *ctx.Context, dsm map[string]Datasource) error {
var lst []*AlertRule
err := DB(ctx).Find(&lst).Error
if err != nil {
return err
}
for i := 0; i < len(lst); i++ {
var ids []int64
if lst[i].Cluster == "$all" {
// id 0 is used for "$all".
ids = append(ids, 0)
} else {
clusters := strings.Fields(lst[i].Cluster)
for j := 0; j < len(clusters); j++ {
if ds, exists := dsm[clusters[j]]; exists {
ids = append(ids, ds.Id)
}
}
}
b, err := json.Marshal(ids)
if err != nil {
continue
}
lst[i].DatasourceIds = string(b)
// Rules without PromQl are skipped here, before anything is written to
// the database, so their datasource ids are only set in memory.
// NOTE(review): confirm that skipping the update for such rules is intended.
if lst[i].PromQl == "" {
continue
}
ruleConfig := PromRuleConfig{
Queries: []PromQuery{
{
PromQl: lst[i].PromQl,
Severity: lst[i].Severity,
},
},
}
b, _ = json.Marshal(ruleConfig)
lst[i].RuleConfig = string(b)
m := make(map[string]string)
if lst[i].RunbookUrl != "" {
m["runbook_url"] = lst[i].RunbookUrl
b, err = json.Marshal(m)
if err != nil {
continue
}
lst[i].Annotations = string(b)
}
if lst[i].Prod == "" {
lst[i].Prod = METRIC
}
if lst[i].Cate == "" {
lst[i].Cate = PROMETHEUS
}
err = lst[i].UpdateFieldsMap(ctx, map[string]interface{}{
"datasource_ids": lst[i].DatasourceIds,
"annotations": lst[i].Annotations,
"rule_config": lst[i].RuleConfig,
"prod": lst[i].Prod,
"cate": lst[i].Cate,
})
if err != nil {
logger.Errorf("update alert rule:%d datasource ids failed, %v", lst[i].Id, err)
}
}
return nil
}
// GetTargetsOfHostAlertRule returns, per engine name and host alert rule id,
// the idents of the targets selected by that rule's host queries.
//
// When the context is not the center, the map is fetched from the
// "/v1/n9e/targets-of-alert-rule" endpoint for engineName. Otherwise all
// enabled rules with prod "host" are loaded and their queries are run against
// the targets; rules whose config cannot be decoded or whose target query
// fails are logged and skipped.
func GetTargetsOfHostAlertRule(ctx *ctx.Context, engineName string) (map[string]map[int64][]string, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[map[string]map[int64][]string](ctx, "/v1/n9e/targets-of-alert-rule?engine_name="+engineName)
	}

	result := make(map[string]map[int64][]string)

	rules, err := AlertRulesGetsBy(ctx, []string{"host"}, "", "", "", []string{}, 0)
	if err != nil {
		return result, err
	}

	for _, rule := range rules {
		var cfg *HostRuleConfig
		if err := json.Unmarshal([]byte(rule.RuleConfig), &cfg); err != nil {
			logger.Errorf("rule:%d rule_config:%s, error:%v", rule.Id, rule.RuleConfig, err)
			continue
		}
		if cfg == nil {
			logger.Errorf("rule:%d rule_config:%s, error:rule is nil", rule.Id, rule.RuleConfig)
			continue
		}

		var targets []*Target
		hostsQuery := GetHostsQuery(cfg.Queries)
		if err := TargetFilterQueryBuild(ctx, hostsQuery, 0, 0).Find(&targets).Error; err != nil {
			logger.Errorf("failed to query targets: %v", err)
			continue
		}

		for _, target := range targets {
			byRule, ok := result[target.EngineName]
			if !ok {
				byRule = make(map[int64][]string)
				result[target.EngineName] = byRule
			}
			byRule[rule.Id] = append(byRule[rule.Id], target.Ident)
			logger.Debugf("get_targets_of_alert_rule engine:%s, rule:%d, target:%s", target.EngineName, rule.Id, target.Ident)
		}
	}

	return result, nil
}
// Copy returns a new AlertRule populated from ar with copier.Copy. A copy
// failure is logged and returned together with the (possibly incomplete) new
// rule.
func (ar *AlertRule) Copy(ctx *ctx.Context) (*AlertRule, error) {
	dup := new(AlertRule)
	if err := copier.Copy(dup, ar); err != nil {
		logger.Errorf("copy alert rule failed, %v", err)
		return dup, err
	}
	return dup, nil
}
// InsertAlertRule inserts the given rules with a single Create call. An empty
// list is a no-op.
func InsertAlertRule(ctx *ctx.Context, ars []*AlertRule) error {
if len(ars) == 0 {
return nil
}
return DB(ctx).Create(ars).Error
}
// Hash returns the MD5 (via str.MD5) of the rule's Id, DatasourceIds and
// RuleConfig joined with underscores.
func (ar *AlertRule) Hash() string {
return str.MD5(fmt.Sprintf("%d_%s_%s", ar.Id, ar.DatasourceIds, ar.RuleConfig))
}
// Clone returns a copy of the alert rule prepared for insertion as a new
// record: Id is reset to 0, GroupId is set to newBgid, and the create/update
// author and timestamps are set to operatorName and the current Unix time.
//
// The receiver is left untouched. The copy is shallow, so slice, map and
// pointer fields still share their underlying data with the original.
func (ar *AlertRule) Clone(operatorName string, newBgid int64) *AlertRule {
	now := time.Now().Unix()

	// Copy the struct value; assigning the pointer would only alias ar and
	// make every change below modify the original rule.
	cloned := *ar
	cloned.Id = 0
	cloned.GroupId = newBgid
	cloned.UpdateBy = operatorName
	cloned.UpdateAt = now
	cloned.CreateBy = operatorName
	cloned.CreateAt = now

	return &cloned
}
================================================
FILE: models/alert_subscribe.go
================================================
package models
import (
"encoding/json"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// AlertSubscribe is a subscription rule stored in the "alert_subscribe"
// table. Fields tagged `json:"-"` hold the database representation, while
// their `gorm:"-"` counterparts carry the same data for the front-end;
// FE2DB and DB2FE convert between the two.
type AlertSubscribe struct {
	Id                int64        `json:"id" gorm:"primaryKey"`
	Name              string       `json:"name"`     // AlertSubscribe name
	Disabled          int          `json:"disabled"` // 0: enabled, 1: disabled
	GroupId           int64        `json:"group_id"`
	Prod              string       `json:"prod"`
	Cate              string       `json:"cate"`
	DatasourceIds     string       `json:"-" gorm:"datasource_ids"` // datasource ids
	DatasourceIdsJson []int64      `json:"datasource_ids" gorm:"-"` // for fe
	Cluster           string       `json:"cluster"`                 // take effect by clusters, separated by space
	RuleId            int64        `json:"rule_id"`
	Severities        string       `json:"-" gorm:"severities"` // sub severity
	SeveritiesJson    []int        `json:"severities" gorm:"-"` // for fe
	ForDuration       int64        `json:"for_duration"`        // for duration, unit: second
	RuleName          string       `json:"rule_name" gorm:"-"`  // for fe
	Tags              ormx.JSONArr `json:"tags"`
	RedefineSeverity  int          `json:"redefine_severity"`
	NewSeverity       int          `json:"new_severity"`
	RedefineChannels  int          `json:"redefine_channels"`
	NewChannels       string       `json:"new_channels"`
	UserGroupIds      string       `json:"user_group_ids"`
	UserGroups        []UserGroup  `json:"user_groups" gorm:"-"` // for fe
	RedefineWebhooks  int          `json:"redefine_webhooks"`
	Webhooks          string       `json:"-" gorm:"webhooks"`
	WebhooksJson      []string     `json:"webhooks" gorm:"-"`
	ExtraConfig       string       `json:"-" gorm:"extra_config"`
	ExtraConfigJson   interface{}  `json:"extra_config" gorm:"-"` // for fe
	Note              string       `json:"note"`
	CreateBy          string       `json:"create_by"`
	CreateAt          int64        `json:"create_at"`
	UpdateBy          string       `json:"update_by"`
	UpdateAt          int64        `json:"update_at"`
	UpdateByNickname  string       `json:"update_by_nickname" gorm:"-"`
	ITags             []TagFilter  `json:"-" gorm:"-"` // inner tags
	BusiGroups        ormx.JSONArr `json:"busi_groups"`
	IBusiGroups       []TagFilter  `json:"-" gorm:"-"` // inner busiGroups
	RuleIds           []int64      `json:"rule_ids" gorm:"serializer:json"`
	NotifyRuleIds     []int64      `json:"notify_rule_ids" gorm:"serializer:json"`
	NotifyVersion     int          `json:"notify_version"`
	RuleNames         []string     `json:"rule_names" gorm:"-"`
}
// TableName returns the name of the database table backing AlertSubscribe,
// "alert_subscribe".
func (s *AlertSubscribe) TableName() string {
return "alert_subscribe"
}
// AlertSubscribeGets loads the subscriptions of one group, newest id first.
// The rows are returned as stored; DB2FE is not applied.
func AlertSubscribeGets(ctx *ctx.Context, groupId int64) (lst []AlertSubscribe, err error) {
	query := DB(ctx).Where("group_id=?", groupId).Order("id desc")
	err = query.Find(&lst).Error
	return lst, err
}
// AlertSubscribeGetsByBGIds loads the subscriptions of the given groups,
// newest id first. With an empty bgids every subscription is returned.
func AlertSubscribeGetsByBGIds(ctx *ctx.Context, bgids []int64) (lst []AlertSubscribe, err error) {
	query := DB(ctx)
	if len(bgids) > 0 {
		query = query.Where("group_id in (?)", bgids)
	}

	err = query.Order("id desc").Find(&lst).Error
	return lst, err
}
// AlertSubscribeGetsByService loads every subscription and converts each of
// them with DB2FE (conversion errors are ignored).
func AlertSubscribeGetsByService(ctx *ctx.Context) (lst []AlertSubscribe, err error) {
	if err = DB(ctx).Find(&lst).Error; err != nil {
		return lst, err
	}

	for idx := range lst {
		lst[idx].DB2FE()
	}
	return lst, err
}
// AlertSubscribeGet returns the first subscription matching the given where
// clause, or (nil, nil) when none matches. DB2FE is not applied.
func AlertSubscribeGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertSubscribe, error) {
	var found []*AlertSubscribe
	if err := DB(ctx).Where(where, args...).Find(&found).Error; err != nil {
		return nil, err
	}

	if len(found) > 0 {
		return found[0], nil
	}
	return nil, nil
}
// IsDisabled reports whether the subscription's Disabled flag is 1.
func (s *AlertSubscribe) IsDisabled() bool {
return s.Disabled == 1
}
// Verify validates the subscription and normalizes it in place.
//
// DatasourceIdsJson collapses to [0] when IsAllDatasource says so, the tag
// and business-group filters are parsed, and at least one severity is
// required. With NotifyVersion 1 at least one notify rule id is required and
// all old-style notification fields are cleared. Otherwise new_channels is
// required whenever user groups are given, every user group id must be an
// integer, and with NotifyVersion 0 the notify rule ids are cleared.
func (s *AlertSubscribe) Verify() error {
	if IsAllDatasource(s.DatasourceIdsJson) {
		s.DatasourceIdsJson = []int64{0}
	}

	if err := s.Parse(); err != nil {
		return err
	}

	if len(s.SeveritiesJson) == 0 {
		return errors.New("severities is required")
	}

	if s.NotifyVersion == 1 {
		if len(s.NotifyRuleIds) == 0 {
			return errors.New("no notify rules selected")
		}

		// New-style notification: wipe everything the old scheme used.
		s.UserGroupIds = ""
		s.RedefineChannels, s.NewChannels = 0, ""
		s.RedefineWebhooks, s.Webhooks = 0, ""
		s.RedefineSeverity, s.NewSeverity = 0, 0
		return nil
	}

	// When user groups are specified the channels must be specified too;
	// otherwise a rule without channels would leave the subscription with no
	// way to deliver the notification.
	if s.UserGroupIds != "" && s.NewChannels == "" {
		return errors.New("new_channels is required")
	}

	for _, gid := range strings.Fields(s.UserGroupIds) {
		if _, err := strconv.ParseInt(gid, 10, 64); err != nil {
			return errors.New("user_group_ids invalid")
		}
	}

	if s.NotifyVersion == 0 {
		s.NotifyRuleIds = []int64{}
	}

	return nil
}
// FE2DB converts the front-end fields of the subscription into the JSON
// strings stored in the database.
//
// DatasourceIdsJson, WebhooksJson and SeveritiesJson are only encoded when
// non-empty, so an empty list leaves the corresponding string untouched.
// ExtraConfigJson is always encoded. Any encoding error is returned instead
// of being silently dropped.
func (s *AlertSubscribe) FE2DB() error {
	if len(s.DatasourceIdsJson) > 0 {
		idsByte, err := json.Marshal(s.DatasourceIdsJson)
		if err != nil {
			return err
		}
		s.DatasourceIds = string(idsByte)
	}

	if len(s.WebhooksJson) > 0 {
		b, err := json.Marshal(s.WebhooksJson)
		if err != nil {
			return err
		}
		s.Webhooks = string(b)
	}

	b, err := json.Marshal(s.ExtraConfigJson)
	if err != nil {
		return err
	}
	s.ExtraConfig = string(b)

	if len(s.SeveritiesJson) > 0 {
		b, err := json.Marshal(s.SeveritiesJson)
		if err != nil {
			return err
		}
		s.Severities = string(b)
	}

	return nil
}
// DB2FE decodes the JSON strings stored in the database (datasource ids,
// webhooks, extra config, severities) into their front-end fields. Empty
// strings are skipped and the first decoding error is returned.
func (s *AlertSubscribe) DB2FE() error {
	columns := []struct {
		raw string
		dst interface{}
	}{
		{s.DatasourceIds, &s.DatasourceIdsJson},
		{s.Webhooks, &s.WebhooksJson},
		{s.ExtraConfig, &s.ExtraConfigJson},
		{s.Severities, &s.SeveritiesJson},
	}

	for _, col := range columns {
		if col.raw == "" {
			continue
		}
		if err := json.Unmarshal([]byte(col.raw), col.dst); err != nil {
			return err
		}
	}
	return nil
}
// Parse builds the internal tag filters (ITags) from Tags and the internal
// business-group filters (IBusiGroups) from BusiGroups using GetTagFilters.
func (s *AlertSubscribe) Parse() error {
	tags, err := GetTagFilters(s.Tags)
	s.ITags = tags
	if err != nil {
		return err
	}

	groups, err := GetTagFilters(s.BusiGroups)
	s.IBusiGroups = groups
	return err
}
// Add verifies the subscription, converts it to its database form, stamps
// CreateAt/UpdateAt with the current Unix time and inserts it.
func (s *AlertSubscribe) Add(ctx *ctx.Context) error {
	for _, step := range []func() error{s.Verify, s.FE2DB} {
		if err := step(); err != nil {
			return err
		}
	}

	ts := time.Now().Unix()
	s.CreateAt, s.UpdateAt = ts, ts
	return Insert(ctx, s)
}
// CompatibleWithOldRuleId seeds RuleIds with the single RuleId when RuleIds
// is empty and RuleId is set.
func (s *AlertSubscribe) CompatibleWithOldRuleId() {
	if s.RuleId == 0 || len(s.RuleIds) != 0 {
		return
	}
	s.RuleIds = append(s.RuleIds, s.RuleId)
}
// FillRuleNames sets RuleNames to the names of the rules listed in RuleIds,
// in the same order.
//
// Names are taken from cache when present; the remaining ids are loaded with
// AlertRuleGetsByIds and added to cache. An id of 0 maps to an empty name and
// an id that cannot be resolved maps to "Error: AlertRule not found".
func (s *AlertSubscribe) FillRuleNames(ctx *ctx.Context, cache map[int64]string) error {
	s.CompatibleWithOldRuleId()
	if len(s.RuleIds) == 0 {
		return nil
	}

	resolved := make(map[int64]string, len(s.RuleIds))
	uncached := make([]int64, 0)
	for _, rid := range s.RuleIds {
		if name, ok := cache[rid]; ok {
			resolved[rid] = name
			continue
		}
		uncached = append(uncached, rid)
	}

	if len(uncached) > 0 {
		rules, err := AlertRuleGetsByIds(ctx, uncached)
		if err != nil {
			return err
		}
		for idx := range rules {
			resolved[rules[idx].Id] = rules[idx].Name
			cache[rules[idx].Id] = rules[idx].Name
		}
	}

	names := make([]string, 0, len(s.RuleIds))
	for _, rid := range s.RuleIds {
		name, ok := resolved[rid]
		switch {
		case ok:
			// resolved name is used as is
		case rid == 0:
			name = ""
		default:
			name = "Error: AlertRule not found"
		}
		names = append(names, name)
	}

	s.RuleNames = names
	return nil
}
// FillDatasourceIds decodes DatasourceIds into DatasourceIdsJson when the
// string is non-empty; decode errors are ignored and nil is always returned.
// The original comment marks this as being for v5 rules. The ctx parameter
// is not used by the body.
func (s *AlertSubscribe) FillDatasourceIds(ctx *ctx.Context) error {
	if s.DatasourceIds == "" {
		return nil
	}

	_ = json.Unmarshal([]byte(s.DatasourceIds), &s.DatasourceIdsJson)
	return nil
}
// FillUserGroups resolves the ids in UserGroupIds into UserGroups.
//
// Groups are looked up in cache first and fetched with UserGroupGetById
// otherwise; fetched groups are added to cache. Ids whose group no longer
// exists are dropped: the pruned list is written back to the user_group_ids
// column (the result of that write is not checked) and stored in
// UserGroupIds. Ids that fail to parse are looked up as id 0.
func (s *AlertSubscribe) FillUserGroups(ctx *ctx.Context, cache map[int64]*UserGroup) error {
	rawIDs := strings.Fields(s.UserGroupIds)
	if len(rawIDs) == 0 {
		s.UserGroups = []UserGroup{}
		return nil
	}

	kept := make([]string, 0, len(rawIDs))
	missing := false

	for _, rawID := range rawIDs {
		id, _ := strconv.ParseInt(rawID, 10, 64)

		if cached, ok := cache[id]; ok {
			kept = append(kept, rawID)
			s.UserGroups = append(s.UserGroups, *cached)
			continue
		}

		ug, err := UserGroupGetById(ctx, id)
		if err != nil {
			return err
		}
		if ug == nil {
			// The user group has been deleted in the meantime.
			missing = true
			continue
		}

		kept = append(kept, rawID)
		s.UserGroups = append(s.UserGroups, *ug)
		cache[id] = ug
	}

	if missing {
		// Persist the list without the deleted groups.
		DB(ctx).Model(s).Update("user_group_ids", strings.Join(kept, " "))
		s.UserGroupIds = strings.Join(kept, " ")
	}

	return nil
}
// Update verifies the subscription, converts it to its database form and
// writes the selected fields to its row.
func (s *AlertSubscribe) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	err := s.Verify()
	if err == nil {
		err = s.FE2DB()
	}
	if err != nil {
		return err
	}

	return DB(ctx).Model(s).Select(selectField, selectFields...).Updates(s).Error
}
// AlertSubscribeDel deletes the subscriptions with the given ids. An empty
// id list is a no-op.
func AlertSubscribeDel(ctx *ctx.Context, ids []int64) error {
if len(ids) == 0 {
return nil
}
return DB(ctx).Where("id in ?", ids).Delete(new(AlertSubscribe)).Error
}
// AlertSubscribeStatistics returns the number of subscriptions and the most
// recent update_at among them. When the context is not the center, the
// statistics are fetched from the "/v1/n9e/statistic?name=alert_subscribe"
// endpoint instead.
func AlertSubscribeStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=alert_subscribe")
	}

	var rows []*Statistics
	query := DB(ctx).Model(&AlertSubscribe{}).
		Select("count(*) as total", "max(update_at) as last_updated")
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}

	return rows[0], nil
}
// AlertSubscribeGetsAll returns every subscription.
//
// When the context is not the center, the subscriptions are fetched from the
// "/v1/n9e/alert-subscribes" endpoint and converted with FE2DB (conversion
// errors are ignored). Otherwise they are read from the database as stored.
func AlertSubscribeGetsAll(ctx *ctx.Context) ([]*AlertSubscribe, error) {
	if !ctx.IsCenter {
		remote, err := poster.GetByUrls[[]*AlertSubscribe](ctx, "/v1/n9e/alert-subscribes")
		if err != nil {
			return nil, err
		}
		for _, sub := range remote {
			sub.FE2DB()
		}
		return remote, nil
	}

	var subs []*AlertSubscribe
	err := DB(ctx).Model(&AlertSubscribe{}).Find(&subs).Error
	return subs, err
}
// MatchProd reports whether the subscription applies to the given prod. An
// empty Prod on the subscription matches everything.
func (s *AlertSubscribe) MatchProd(prod string) bool {
	return s.Prod == "" || s.Prod == prod
}
// MatchCate reports whether the subscription applies to the given cate. Only
// a subscription whose Cate is "host" restricts the match (to cate "host");
// every other Cate value, including the empty one, matches everything.
func (s *AlertSubscribe) MatchCate(cate string) bool {
	if s.Cate != "host" {
		return true
	}
	return cate == "host"
}
// MatchCluster reports whether the subscription applies to the datasource
// dsId. It matches when the subscription has no datasource ids configured,
// when the event is not tied to a datasource (dsId is 0), or when the
// configured ids contain dsId or the wildcard id 0.
func (s *AlertSubscribe) MatchCluster(dsId int64) bool {
	if dsId == 0 || len(s.DatasourceIdsJson) == 0 {
		return true
	}

	for _, configured := range s.DatasourceIdsJson {
		if configured == 0 || configured == dsId {
			return true
		}
	}
	return false
}
// ModifyEvent applies the subscription's overrides to event.
//
// Severity and channels are replaced only when the matching Redefine flag is
// 1. Callbacks are replaced by the subscription's webhooks when
// RedefineWebhooks is 1 and cleared otherwise, so a subscribed event is not
// sent to the original callback addresses again. Notify rule ids and notify
// groups are always taken from the subscription.
func (s *AlertSubscribe) ModifyEvent(event *AlertCurEvent) {
	if s.RedefineSeverity == 1 {
		event.Severity = s.NewSeverity
	}

	if s.RedefineChannels == 1 {
		event.NotifyChannels = s.NewChannels
		event.NotifyChannelsJSON = strings.Fields(s.NewChannels)
	}

	// Default to no callbacks; only an explicit webhook override sets them.
	event.Callbacks, event.CallbacksJSON = "", []string{}
	if s.RedefineWebhooks == 1 {
		event.Callbacks, event.CallbacksJSON = s.Webhooks, s.WebhooksJson
	}

	event.NotifyRuleIds = []int64{}
	if len(s.NotifyRuleIds) > 0 {
		event.NotifyRuleIds = s.NotifyRuleIds
	}

	event.NotifyGroups = s.UserGroupIds
	event.NotifyGroupsJSON = strings.Fields(s.UserGroupIds)
}
// UpdateFieldsMap writes the given column/value pairs to this subscription's
// row and returns the database error, if any.
func (s *AlertSubscribe) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
return DB(ctx).Model(s).Updates(fields).Error
}
// AlertSubscribeUpgradeToV6 rewrites every stored subscription into the v6
// layout.
//
// For each subscription the cluster names in Cluster are mapped to
// datasource ids through dsm ("$all" becomes the single id 0; unknown cluster
// names are dropped), and empty Prod/Cate default to METRIC/PROMETHEUS. An
// existing Cate is preserved. A failed update of one subscription is logged
// and does not stop the migration; only the initial load error is returned.
func AlertSubscribeUpgradeToV6(ctx *ctx.Context, dsm map[string]Datasource) error {
	var lst []*AlertSubscribe
	err := DB(ctx).Find(&lst).Error
	if err != nil {
		return err
	}

	for i := 0; i < len(lst); i++ {
		var ids []int64
		if lst[i].Cluster == "$all" {
			// id 0 is used for "$all".
			ids = append(ids, 0)
		} else {
			clusters := strings.Fields(lst[i].Cluster)
			for j := 0; j < len(clusters); j++ {
				if ds, exists := dsm[clusters[j]]; exists {
					ids = append(ids, ds.Id)
				}
			}
		}

		b, err := json.Marshal(ids)
		if err != nil {
			continue
		}
		lst[i].DatasourceIds = string(b)

		if lst[i].Prod == "" {
			lst[i].Prod = METRIC
		}

		if lst[i].Cate == "" {
			lst[i].Cate = PROMETHEUS
		}

		// Persist the (possibly defaulted) values computed above; writing the
		// PROMETHEUS constant here would overwrite an existing category.
		err = lst[i].UpdateFieldsMap(ctx, map[string]interface{}{
			"datasource_ids": lst[i].DatasourceIds,
			"prod":           lst[i].Prod,
			"cate":           lst[i].Cate,
		})

		if err != nil {
			logger.Errorf("update alert subscribe:%d datasource ids failed, %v", lst[i].Id, err)
		}
	}
	return nil
}
================================================
FILE: models/alerting_engine.go
================================================
package models
import (
"fmt"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
)
// AlertingEngines is the gorm model for one row of the alerting_engines table.
// The functions in this file look rows up by instance, engine cluster and
// datasource id, and store a Unix timestamp in seconds in Clock.
type AlertingEngines struct {
Id int64 `json:"id" gorm:"primaryKey"`
Instance string `json:"instance"`
// NOTE(review): the gorm tag below is not a `column:` setting; presumably the
// column name comes from gorm's default naming — confirm.
EngineCluster string `json:"cluster" gorm:"engine_cluster"`
DatasourceId int64 `json:"datasource_id"`
Clock int64 `json:"clock"`
}
// TableName returns the database table name for AlertingEngines.
func (e *AlertingEngines) TableName() string {
return "alerting_engines"
}
// UpdateDatasourceId associates this engine row with the datasource id.
//
// The change is refused with an error when a different row (id <> e.Id)
// already exists for the same instance and datasource id. On success
// e.DatasourceId is set and only the datasource_id column is written.
func (e *AlertingEngines) UpdateDatasourceId(ctx *ctx.Context, id int64) error {
	duplicates := DB(ctx).Model(&AlertingEngines{}).Where("id<>? and instance=? and datasource_id=?", e.Id, e.Instance, id)
	n, err := Count(duplicates)
	switch {
	case err != nil:
		return err
	case n > 0:
		return fmt.Errorf("instance %s and datasource_id %d already exists", e.Instance, id)
	}

	e.DatasourceId = id
	tx := DB(ctx).Model(e).Select("datasource_id")
	return tx.Updates(e).Error
}
// AlertingEngineAdd inserts a row for the (instance, datasourceId) pair with
// Clock set to the current Unix time. It returns an error when a row for that
// pair already exists.
func AlertingEngineAdd(ctx *ctx.Context, instance string, datasourceId int64) error {
	duplicates := DB(ctx).Model(&AlertingEngines{}).Where("instance=? and datasource_id=?", instance, datasourceId)
	n, err := Count(duplicates)
	switch {
	case err != nil:
		return err
	case n > 0:
		return fmt.Errorf("instance %s and datasource_id %d already exists", instance, datasourceId)
	}

	row := AlertingEngines{
		Instance:     instance,
		DatasourceId: datasourceId,
		Clock:        time.Now().Unix(),
	}
	return DB(ctx).Create(&row).Error
}
// AlertingEngineDel deletes the alerting engine rows whose id is in ids.
// An empty ids slice is a no-op.
func AlertingEngineDel(ctx *ctx.Context, ids []int64) error {
	if len(ids) == 0 {
		return nil
	}

	tx := DB(ctx).Where("id in ?", ids)
	return tx.Delete(&AlertingEngines{}).Error
}
// AlertingEngineGetDatasourceIds returns the datasource ids of every row that
// belongs to instance. The slice is empty (not nil) when there are no rows or
// when the query fails.
func AlertingEngineGetDatasourceIds(ctx *ctx.Context, instance string) ([]int64, error) {
	var rows []AlertingEngines
	if err := DB(ctx).Where("instance=?", instance).Find(&rows).Error; err != nil {
		return []int64{}, err
	}

	ids := make([]int64, 0, len(rows))
	for _, row := range rows {
		ids = append(ids, row.DatasourceId)
	}
	return ids, nil
}
// AlertingEngineGets lists alerting engine rows ordered by instance. It backs
// the page where users see all n9e-server instances and assign a cluster to
// each. An empty where selects every row; otherwise where/args are passed to
// gorm as the filter.
func AlertingEngineGets(ctx *ctx.Context, where string, args ...interface{}) ([]*AlertingEngines, error) {
	query := DB(ctx).Order("instance")
	if where != "" {
		query = query.Where(where, args...)
	}

	var engines []*AlertingEngines
	err := query.Find(&engines).Error
	return engines, err
}
// AlertingEngineGet returns the first row produced by AlertingEngineGets for
// the same filter, or (nil, nil) when nothing matches.
func AlertingEngineGet(ctx *ctx.Context, where string, args ...interface{}) (*AlertingEngines, error) {
	engines, err := AlertingEngineGets(ctx, where, args...)
	if err != nil || len(engines) == 0 {
		// err is nil here in the "no rows" case, giving (nil, nil).
		return nil, err
	}
	return engines[0], nil
}
// AlertingEngineGetsClusters returns the distinct, non-empty engine_cluster
// values ordered by name. A non-empty where adds a further gorm filter.
func AlertingEngineGetsClusters(ctx *ctx.Context, where string, args ...interface{}) ([]string, error) {
	query := DB(ctx).Model(new(AlertingEngines)).Where("engine_cluster != ''").Order("engine_cluster").Distinct("engine_cluster")
	if where != "" {
		query = query.Where(where, args...)
	}

	var clusters []string
	err := query.Pluck("engine_cluster", &clusters).Error
	return clusters, err
}
// AlertingEngineGetsInstances returns the instance column of the alerting
// engine rows ordered by instance. A non-empty where adds a gorm filter.
func AlertingEngineGetsInstances(ctx *ctx.Context, where string, args ...interface{}) ([]string, error) {
	query := DB(ctx).Model(new(AlertingEngines)).Order("instance")
	if where != "" {
		query = query.Where(where, args...)
	}

	var instances []string
	err := query.Pluck("instance", &instances).Error
	return instances, err
}
// HeartbeatInfo is the payload AlertingEngineHeartbeatWithCluster posts to
// /v1/n9e/server-heartbeat when it does not run on the center node.
type HeartbeatInfo struct {
Instance string `json:"instance"`
EngineCluster string `json:"engine_cluster"`
DatasourceId int64 `json:"datasource_id"`
}
// AlertingEngineHeartbeatWithCluster records a heartbeat for the
// (instance, cluster, datasourceId) triple.
//
// When ctx.IsCenter is false the heartbeat is forwarded by posting a
// HeartbeatInfo to /v1/n9e/server-heartbeat. Otherwise the matching row is
// looked up: if none exists a new row is created, else only its clock column
// is refreshed. Clock is the current Unix time in seconds.
func AlertingEngineHeartbeatWithCluster(ctx *ctx.Context, instance, cluster string, datasourceId int64) error {
	if !ctx.IsCenter {
		return poster.PostByUrls(ctx, "/v1/n9e/server-heartbeat", HeartbeatInfo{
			Instance:      instance,
			EngineCluster: cluster,
			DatasourceId:  datasourceId,
		})
	}

	const match = "instance=? and engine_cluster = ? and datasource_id=?"

	var total int64
	if err := DB(ctx).Model(new(AlertingEngines)).Where(match, instance, cluster, datasourceId).Count(&total).Error; err != nil {
		return err
	}

	if total != 0 {
		// Row already present: refresh the clock only.
		fields := map[string]interface{}{"clock": time.Now().Unix()}
		return DB(ctx).Model(new(AlertingEngines)).Where(match, instance, cluster, datasourceId).Updates(fields).Error
	}

	// First heartbeat for this triple: insert a new row.
	return DB(ctx).Create(&AlertingEngines{
		Instance:      instance,
		DatasourceId:  datasourceId,
		EngineCluster: cluster,
		Clock:         time.Now().Unix(),
	}).Error
}
================================================
FILE: models/anomaly_point.go
================================================
package models
import (
"fmt"
"math"
"strings"
"github.com/ccfos/nightingale/v6/pkg/unit"
"github.com/prometheus/common/model"
)
// AnomalyPoint is a single labelled sample, as built by NewAnomalyPoint or
// extracted from a query result by ConvertAnomalyPoints.
type AnomalyPoint struct {
Key string `json:"key"`
Labels model.Metric `json:"labels"`
// Timestamp is filled from Unix() seconds by ConvertAnomalyPoints.
Timestamp int64 `json:"timestamp"`
Value float64 `json:"value"`
Severity int `json:"severity"`
Triggered bool `json:"triggered"`
Query string `json:"query"`
Values string `json:"values"`
// ValuesUnit, when non-empty, takes precedence over Value in ReadableValue.
ValuesUnit map[string]unit.FormattedValue `json:"values_unit"`
RecoverConfig RecoverConfig `json:"recover_config"`
TriggerType TriggerType `json:"trigger_type"`
}
// TriggerType is the string-valued kind stored in AnomalyPoint.TriggerType.
type TriggerType string
// Known TriggerType values.
const (
TriggerTypeNormal TriggerType = "normal"
TriggerTypeNodata TriggerType = "nodata"
)
// NewAnomalyPoint builds an AnomalyPoint from plain values. labels is copied
// into a model.Metric and key is additionally stored under the metric-name
// label (model.MetricNameLabel), overriding any entry of that name in labels.
func NewAnomalyPoint(key string, labels map[string]string, ts int64, value float64, severity int) AnomalyPoint {
	metric := make(model.Metric, len(labels)+1)
	for name, val := range labels {
		metric[model.LabelName(name)] = model.LabelValue(val)
	}
	metric[model.MetricNameLabel] = model.LabelValue(key)

	point := AnomalyPoint{
		Key:       key,
		Timestamp: ts,
		Value:     value,
		Severity:  severity,
	}
	point.Labels = metric
	return point
}
// ReadableValue returns a display string for the point.
//
// When ValuesUnit has entries, the Text of one of them is returned; with more
// than one entry the choice follows Go's unspecified map iteration order.
// Otherwise Value is formatted with five decimals and trailing zeros, then a
// trailing decimal point, are stripped.
func (v *AnomalyPoint) ReadableValue() string {
	// A value with a configured unit wins over the raw number. Ranging over an
	// empty or nil map simply falls through.
	for _, formatted := range v.ValuesUnit {
		return formatted.Text
	}

	text := strings.TrimRight(fmt.Sprintf("%.5f", v.Value), "0")
	return strings.TrimRight(text, ".")
}
// ConvertAnomalyPoints flattens a Prometheus query result into AnomalyPoints.
//
//   - Vector: one point per sample.
//   - Matrix: one point per series, taken from the last sample of the series;
//     a series without samples is skipped.
//   - Scalar: a single point keyed "{}" with an empty label set.
//
// NaN samples are skipped. A nil value or an unsupported value type yields a
// nil result, as does a value whose dynamic type does not match its Type().
func ConvertAnomalyPoints(value model.Value) (lst []AnomalyPoint) {
	if value == nil {
		return
	}

	switch value.Type() {
	case model.ValVector:
		items, ok := value.(model.Vector)
		if !ok {
			return
		}
		for _, item := range items {
			if math.IsNaN(float64(item.Value)) {
				continue
			}
			lst = append(lst, AnomalyPoint{
				Key:       item.Metric.String(),
				Timestamp: item.Timestamp.Unix(),
				Value:     float64(item.Value),
				Labels:    item.Metric,
			})
		}
	case model.ValMatrix:
		items, ok := value.(model.Matrix)
		if !ok {
			return
		}
		for _, item := range items {
			// An empty series must only skip itself; returning here would
			// silently drop every series that follows it.
			if len(item.Values) == 0 {
				continue
			}
			last := item.Values[len(item.Values)-1]
			if math.IsNaN(float64(last.Value)) {
				continue
			}
			lst = append(lst, AnomalyPoint{
				Key:       item.Metric.String(),
				Labels:    item.Metric,
				Timestamp: last.Timestamp.Unix(),
				Value:     float64(last.Value),
			})
		}
	case model.ValScalar:
		item, ok := value.(*model.Scalar)
		if !ok {
			return
		}
		if math.IsNaN(float64(item.Value)) {
			return
		}
		lst = append(lst, AnomalyPoint{
			Key:       "{}",
			Timestamp: item.Timestamp.Unix(),
			Value:     float64(item.Value),
			Labels:    model.Metric{},
		})
	}
	return
}
================================================
FILE: models/board.go
================================================
package models
import (
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/google/uuid"
"github.com/pkg/errors"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// Values for Board.PublicCate: anonymous, login and busi, in that order.
const (
PublicAnonymous = 0
PublicLogin = 1
PublicBusi = 2
)
// Board is the gorm model for a row of the board table. Fields tagged
// gorm:"-" are not persisted with the row; Configs is filled by BoardGet from
// the board's payload.
type Board struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"`
Name string `json:"name"`
Ident string `json:"ident"`
Tags string `json:"tags"`
Note string `json:"note"`
// CreateAt and UpdateAt are Unix seconds (set from time.Now().Unix() in Add).
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
Configs string `json:"configs" gorm:"-"`
Public int `json:"public"` // 0: false, 1: true
PublicCate int `json:"public_cate"` // 0: anonymous, 1: login, 2: busi
Bgids []int64 `json:"bgids" gorm:"-"`
BuiltIn int `json:"built_in"` // 0: false, 1: true
Hide int `json:"hide"` // 0: false, 1: true
}
// TableName returns the database table name for Board.
func (b *Board) TableName() string {
return "board"
}
// Verify checks the board name: it must not be empty and must not be reported
// as dangerous by str.Dangerous.
func (b *Board) Verify() error {
	switch {
	case b.Name == "":
		return errors.New("Name is blank")
	case str.Dangerous(b.Name):
		return errors.New("Name has invalid characters")
	}
	return nil
}
// Clone returns a new, unsaved Board that copies Name, Tags and Note from b.
//
// The copy belongs to group newBgid and has operatorName as creator and
// updater. A non-empty suffix is appended to the name after a single space.
// If b has an Ident the copy receives a freshly generated UUID as its Ident;
// otherwise its Ident stays empty.
func (b *Board) Clone(operatorName string, newBgid int64, suffix string) *Board {
	name := b.Name
	if suffix != "" {
		name = name + " " + suffix
	}

	ident := ""
	if b.Ident != "" {
		ident = uuid.NewString()
	}

	return &Board{
		Name:     name,
		Ident:    ident,
		Tags:     b.Tags,
		Note:     b.Note,
		GroupId:  newBgid,
		CreateBy: operatorName,
		UpdateBy: operatorName,
	}
}
// CanRenameIdent reports whether ident may be used for this board, i.e. no
// other board (id <> b.Id) already has it. An empty ident is always allowed.
func (b *Board) CanRenameIdent(ctx *ctx.Context, ident string) (bool, error) {
	if ident == "" {
		return true, nil
	}

	others := DB(ctx).Model(b).Where("ident=? and id <> ?", ident, b.Id)
	n, err := Count(others)
	if err != nil {
		return false, err
	}
	return n == 0, nil
}
// Add validates and inserts the board.
//
// It fails when Verify fails, when a non-empty Ident is already used by
// another board, or when a board with the same name exists in the same group.
// CreateAt and UpdateAt are set to the current Unix time before the insert.
func (b *Board) Add(ctx *ctx.Context) error {
	if err := b.Verify(); err != nil {
		return err
	}

	// Ident uniqueness is only checked when an ident is set.
	if b.Ident != "" {
		n, err := Count(DB(ctx).Model(b).Where("ident=?", b.Ident))
		switch {
		case err != nil:
			return err
		case n > 0:
			return errors.New("Ident duplicate")
		}
	}

	n, err := Count(DB(ctx).Model(b).Where("name = ? and group_id = ?", b.Name, b.GroupId))
	switch {
	case err != nil:
		return err
	case n > 0:
		return errors.New("Name duplicate")
	}

	ts := time.Now().Unix()
	b.CreateAt, b.UpdateAt = ts, ts
	return Insert(ctx, b)
}
// AtomicAdd adds the board and, when payload is non-empty, saves its payload
// inside one database transaction. Both steps run against a Context that
// carries only the transaction handle.
func (b *Board) AtomicAdd(c *ctx.Context, payload string) error {
	addWithPayload := func(tx *gorm.DB) error {
		txCtx := &ctx.Context{DB: tx}

		if err := b.Add(txCtx); err != nil {
			return err
		}
		if payload == "" {
			return nil
		}
		return BoardPayloadSave(txCtx, b.Id, payload)
	}
	return DB(c).Transaction(addWithPayload)
}
// Update validates the board with Verify and then writes the selected
// columns (selectField plus selectFields) from b.
func (b *Board) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	if err := b.Verify(); err != nil {
		return err
	}

	tx := DB(ctx).Model(b).Select(selectField, selectFields...)
	return tx.Updates(b).Error
}
// Del removes the board's payload row and then the board row itself, both
// selected by b.Id, inside a single transaction.
func (b *Board) Del(ctx *ctx.Context) error {
	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		// Payload first, then the board.
		if err := tx.Where("id=?", b.Id).Delete(&BoardPayload{}).Error; err != nil {
			return err
		}
		return tx.Where("id=?", b.Id).Delete(&Board{}).Error
	})
}
// BoardGetByID loads the board with the given id, without its payload.
// It returns (nil, nil) when no such board exists.
func BoardGetByID(ctx *ctx.Context, id int64) (*Board, error) {
	var boards []*Board
	if err := DB(ctx).Where("id = ?", id).Find(&boards).Error; err != nil {
		return nil, err
	}

	if len(boards) > 0 {
		return boards[0], nil
	}
	return nil, nil
}
// BoardGet loads the first board matching where/args for the detail page and
// fills its Configs with the stored payload. It returns (nil, nil) when no
// board matches.
func BoardGet(ctx *ctx.Context, where string, args ...interface{}) (*Board, error) {
	var boards []*Board
	if err := DB(ctx).Where(where, args...).Find(&boards).Error; err != nil {
		return nil, err
	}
	if len(boards) == 0 {
		return nil, nil
	}

	board := boards[0]
	configs, err := BoardPayloadGet(ctx, board.Id)
	if err != nil {
		return nil, err
	}
	board.Configs = configs
	return board, nil
}
// BoardCount returns the number of boards matching where/args, as computed by
// the package's Count helper.
func BoardCount(ctx *ctx.Context, where string, args ...interface{}) (num int64, err error) {
return Count(DB(ctx).Model(&Board{}).Where(where, args...))
}
// BoardExists reports whether at least one board matches where/args. The
// error from BoardCount is returned alongside the result.
func BoardExists(ctx *ctx.Context, where string, args ...interface{}) (bool, error) {
	total, err := BoardCount(ctx, where, args...)
	found := total > 0
	return found, err
}
// BoardGetsByGroupId lists the boards of one group for the list page, ordered
// by name.
//
// query is split on whitespace. A word starting with "-" excludes boards whose
// name or tags contain the rest of the word; any other word requires the name
// or the tags to contain it. All words must be satisfied.
func BoardGetsByGroupId(ctx *ctx.Context, groupId int64, query string) ([]Board, error) {
	session := DB(ctx).Where("group_id=?", groupId).Order("name")

	for _, word := range strings.Fields(query) {
		pattern := "%" + strings.TrimPrefix(word, "-") + "%"
		if strings.HasPrefix(word, "-") {
			session = session.Where("name not like ? and tags not like ?", pattern, pattern)
			continue
		}
		session = session.Where("(name like ? or tags like ?)", pattern, pattern)
	}

	var boards []Board
	err := session.Find(&boards).Error
	return boards, err
}
// BoardGetsByBGIds lists boards filtered by query, using the same word syntax
// as BoardGetsByGroupId ("-word" excludes, "word" requires a match in name or
// tags).
//
// When gids is non-empty the result is restricted to those groups and ordered
// by name; when it is empty no group filter and no ordering are applied.
func BoardGetsByBGIds(ctx *ctx.Context, gids []int64, query string) ([]Board, error) {
	session := DB(ctx)
	if len(gids) > 0 {
		session = session.Where("group_id in (?)", gids).Order("name")
	}

	for _, word := range strings.Fields(query) {
		pattern := "%" + strings.TrimPrefix(word, "-") + "%"
		if strings.HasPrefix(word, "-") {
			session = session.Where("name not like ? and tags not like ?", pattern, pattern)
			continue
		}
		session = session.Where("(name like ? or tags like ?)", pattern, pattern)
	}

	var boards []Board
	err := session.Find(&boards).Error
	return boards, err
}
// BoardGets lists boards ordered by name. A non-empty where (with args) is
// applied as a gorm filter, and query is then applied word by word: "-word"
// excludes boards whose name or tags contain word, any other word requires a
// match in name or tags.
func BoardGets(ctx *ctx.Context, query, where string, args ...interface{}) ([]Board, error) {
	session := DB(ctx).Order("name")
	if where != "" {
		session = session.Where(where, args...)
	}

	for _, word := range strings.Fields(query) {
		pattern := "%" + strings.TrimPrefix(word, "-") + "%"
		if strings.HasPrefix(word, "-") {
			session = session.Where("name not like ? and tags not like ?", pattern, pattern)
			continue
		}
		session = session.Where("(name like ? or tags like ?)", pattern, pattern)
	}

	var boards []Board
	err := session.Find(&boards).Error
	return boards, err
}
// BoardSetHide makes ids the exact set of hidden built-in boards. Inside one
// transaction it first clears hide on every built-in board and then sets hide
// on the built-in boards whose id is in ids.
func BoardSetHide(ctx *ctx.Context, ids []int64) error {
	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		unhideAll := tx.Model(&Board{}).Where("built_in = 1").Update("hide", 0)
		if unhideAll.Error != nil {
			return unhideAll.Error
		}
		return tx.Model(&Board{}).Where("id in (?) and built_in = 1", ids).Update("hide", 1).Error
	})
}
// BoardGetsByBids returns, for each board whose id is in bids, a map with the
// keys "busi_group_name", "busi_group_id", "board_id" and "board_name".
// Boards whose business group cannot be found are left out of the result.
func BoardGetsByBids(ctx *ctx.Context, bids []int64) ([]map[string]interface{}, error) {
	var boards []Board
	if err := DB(ctx).Where("id IN ?", bids).Find(&boards).Error; err != nil {
		return nil, err
	}

	// Collect each distinct group id once, in first-seen order.
	seen := make(map[int64]struct{})
	groupIDs := make([]int64, 0)
	for i := range boards {
		gid := boards[i].GroupId
		if _, dup := seen[gid]; dup {
			continue
		}
		seen[gid] = struct{}{}
		groupIDs = append(groupIDs, gid)
	}

	// Load all required business groups with a single query.
	var busiGroups []BusiGroup
	if err := DB(ctx).Where("id IN ?", groupIDs).Find(&busiGroups).Error; err != nil {
		return nil, err
	}

	// Index the business groups by id.
	groupByID := make(map[int64]BusiGroup)
	for _, bg := range busiGroups {
		groupByID[bg.Id] = bg
	}

	result := make([]map[string]interface{}, 0, len(boards))
	for i := range boards {
		bg, ok := groupByID[boards[i].GroupId]
		if !ok {
			// No matching business group: skip this board.
			continue
		}
		result = append(result, map[string]interface{}{
			"busi_group_name": bg.Name,
			"busi_group_id":   bg.Id,
			"board_id":        boards[i].Id,
			"board_name":      boards[i].Name,
		})
	}
	return result, nil
}
================================================
FILE: models/board_busi.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"gorm.io/gorm"
)
// BoardBusigroup is the gorm model for a row of the board_busigroup table,
// pairing a business group id with a board id.
type BoardBusigroup struct {
BusiGroupId int64 `json:"busi_group_id"`
BoardId int64 `json:"board_id"`
}
// TableName returns the database table name for BoardBusigroup.
func (BoardBusigroup) TableName() string {
return "board_busigroup"
}
// BoardBusigroupAdd inserts one board_busigroup row per id in busiGroupIds,
// each pairing that business group with boardId. It runs on the supplied tx so
// callers can make it part of a transaction, and stops at the first insert
// error. An empty busiGroupIds is a no-op.
func BoardBusigroupAdd(tx *gorm.DB, boardId int64, busiGroupIds []int64) error {
	for _, busiGroupId := range busiGroupIds {
		link := BoardBusigroup{
			BusiGroupId: busiGroupId,
			BoardId:     boardId,
		}
		// gorm documents Create as taking a pointer to the record.
		if err := tx.Create(&link).Error; err != nil {
			return err
		}
	}
	return nil
}
// BoardBusigroupUpdate replaces the business groups linked to boardId: inside
// one transaction it deletes the board's existing rows and inserts one row per
// id in busiGroupIds.
func BoardBusigroupUpdate(ctx *ctx.Context, boardId int64, busiGroupIds []int64) error {
	replace := func(tx *gorm.DB) error {
		if err := tx.Where("board_id=?", boardId).Delete(&BoardBusigroup{}).Error; err != nil {
			return err
		}
		return BoardBusigroupAdd(tx, boardId, busiGroupIds)
	}
	return DB(ctx).Transaction(replace)
}
// BoardBusigroupDelByBoardId deletes every board_busigroup row whose board_id
// equals boardId.
func BoardBusigroupDelByBoardId(ctx *ctx.Context, boardId int64) error {
return DB(ctx).Where("board_id=?", boardId).Delete(&BoardBusigroup{}).Error
}
// BoardBusigroupCheck reports whether boardId is linked to at least one of the
// business groups in busiGroupIds. The error from the count query is returned
// alongside the result.
func BoardBusigroupCheck(ctx *ctx.Context, boardId int64, busiGroupIds []int64) (bool, error) {
	links := DB(ctx).Where("board_id=? and busi_group_id in (?)", boardId, busiGroupIds).Model(&BoardBusigroup{})
	n, err := Count(links)
	linked := n > 0
	return linked, err
}
// BoardBusigroupGets returns every row of the board_busigroup table.
func BoardBusigroupGets(ctx *ctx.Context) ([]BoardBusigroup, error) {
	var links []BoardBusigroup
	tx := DB(ctx).Find(&links)
	return links, tx.Error
}
// BoardIdsByBusiGroupIds returns the board_id of every board_busigroup row
// whose business group id is in busiGroupIds.
func BoardIdsByBusiGroupIds(ctx *ctx.Context, busiGroupIds []int64) ([]int64, error) {
	query := DB(ctx).Model(&BoardBusigroup{}).Where("busi_group_id in (?)", busiGroupIds)

	var boardIDs []int64
	err := query.Pluck("board_id", &boardIDs).Error
	return boardIDs, err
}
================================================
FILE: models/board_payload.go
================================================
package models
import (
"errors"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// BoardPayload is the gorm model for a row of the board_payload table. Rows
// are looked up by the owning board's id (see BoardGet and BoardPayloadSave).
type BoardPayload struct {
Id int64 `json:"id" gorm:"primaryKey"`
Payload string `json:"payload"`
}
// TableName returns the database table name for BoardPayload.
func (p *BoardPayload) TableName() string {
return "board_payload"
}
// Update writes the selected columns (selectField plus selectFields) of p
// through gorm's Model(p).Select(...).Updates(p).
func (p *BoardPayload) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
return DB(ctx).Model(p).Select(selectField, selectFields...).Updates(p).Error
}
// BoardPayloadGets loads the payload rows whose id is in ids. An empty ids
// slice is rejected with an "empty ids" error instead of running a query.
func BoardPayloadGets(ctx *ctx.Context, ids []int64) ([]*BoardPayload, error) {
	if len(ids) == 0 {
		return nil, errors.New("empty ids")
	}

	var payloads []*BoardPayload
	tx := DB(ctx).Where("id in ?", ids).Find(&payloads)
	return payloads, tx.Error
}
// BoardPayloadGet returns the payload stored for id, or the empty string when
// there is no row for it.
func BoardPayloadGet(ctx *ctx.Context, id int64) (string, error) {
	rows, err := BoardPayloadGets(ctx, []int64{id})
	switch {
	case err != nil:
		return "", err
	case len(rows) == 0:
		return "", nil
	}
	return rows[0].Payload, nil
}
// BoardPayloadSave stores payload for id: the payload column of an existing
// row is updated, otherwise a new row is inserted.
func BoardPayloadSave(ctx *ctx.Context, id int64, payload string) error {
	var existing BoardPayload
	if err := DB(ctx).Where("id = ?", id).Find(&existing).Error; err != nil {
		return err
	}

	// A non-positive Id after the lookup is treated as "no row yet".
	if existing.Id <= 0 {
		return Insert(ctx, &BoardPayload{
			Id:      id,
			Payload: payload,
		})
	}

	existing.Payload = payload
	return existing.Update(ctx, "payload")
}
================================================
FILE: models/builtin_cate.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// BuiltinCate is the gorm model for a row of the builtin_cate table; the
// functions in this file address rows by name and user id.
type BuiltinCate struct {
Id int64 `json:"id" gorm:"primaryKey"`
Name string `json:"name"`
UserId int64 `json:"user_id"`
}
// TableName returns the database table name for BuiltinCate.
func (b *BuiltinCate) TableName() string {
return "builtin_cate"
}
// Create inserts b as a new builtin_cate row via the package's Insert helper.
func (b *BuiltinCate) Create(c *ctx.Context) error {
return Insert(c, b)
}
// BuiltinCateDelete deletes the builtin_cate rows that match both name and
// userId.
func BuiltinCateDelete(c *ctx.Context, name string, userId int64) error {
return DB(c).Where("name=? and user_id=?", name, userId).Delete(&BuiltinCate{}).Error
}
// BuiltinCateGetByUserId loads the builtin_cate rows of userId and returns
// them keyed by Name. The map is always non-nil; the query error, if any, is
// returned alongside whatever was loaded.
func BuiltinCateGetByUserId(c *ctx.Context, userId int64) (map[string]BuiltinCate, error) {
	var cates []BuiltinCate
	err := DB(c).Where("user_id=?", userId).Find(&cates).Error

	byName := make(map[string]BuiltinCate)
	for i := range cates {
		byName[cates[i].Name] = cates[i]
	}
	return byName, err
}
================================================
FILE: models/builtin_component.go
================================================
package models
import (
"errors"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// SYSTEM is the updated_by value that the builtin metric queries in this
// package filter out (updated_by != SYSTEM).
const SYSTEM = "system"
// BuiltinComponent represents a builtin component along with its metadata.
// It is the gorm model for the builtin_components table; CreatedAt and
// UpdatedAt hold Unix seconds (see Add and Update).
type BuiltinComponent struct {
ID uint64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement;comment:'unique identifier'"`
Ident string `json:"ident" gorm:"type:varchar(191);not null;index:idx_ident"`
Logo string `json:"logo" gorm:"type:mediumtext;comment:'logo of component'"`
Readme string `json:"readme" gorm:"type:text;not null;comment:'readme of component'"`
Disabled int `json:"disabled" gorm:"type:int;not null;default:0;comment:'is disabled or not'"`
CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreatedBy string `json:"created_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
UpdatedAt int64 `json:"updated_at" gorm:"type:bigint;not null;default:0;comment:'update time'"`
UpdatedBy string `json:"updated_by" gorm:"type:varchar(191);not null;default:'';comment:'updater'"`
}
// PostgresBuiltinComponent mirrors BuiltinComponent for the same
// builtin_components table. Its gorm tags differ in that Logo is declared as
// text instead of mediumtext and Ident carries a column comment.
// NOTE(review): presumably used for schema creation on PostgreSQL — confirm
// against the migration code.
type PostgresBuiltinComponent struct {
ID uint64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement;comment:'unique identifier'"`
Ident string `json:"ident" gorm:"type:varchar(191);not null;index:idx_ident;comment:'identifier of component'"`
Logo string `json:"logo" gorm:"type:text;comment:'logo of component'"`
Readme string `json:"readme" gorm:"type:text;not null;comment:'readme of component'"`
Disabled int `json:"disabled" gorm:"type:int;not null;default:0;comment:'is disabled or not'"`
CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreatedBy string `json:"created_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
UpdatedAt int64 `json:"updated_at" gorm:"type:bigint;not null;default:0;comment:'update time'"`
UpdatedBy string `json:"updated_by" gorm:"type:varchar(191);not null;default:'';comment:'updater'"`
}
// TableName returns the database table name for PostgresBuiltinComponent,
// which is the same table BuiltinComponent uses.
func (bc *PostgresBuiltinComponent) TableName() string {
return "builtin_components"
}
// TableName returns the database table name for BuiltinComponent.
func (bc *BuiltinComponent) TableName() string {
return "builtin_components"
}
// Verify trims surrounding whitespace from Ident in place and returns an
// error when the result is empty.
func (bc *BuiltinComponent) Verify() error {
	if bc.Ident = strings.TrimSpace(bc.Ident); bc.Ident == "" {
		return errors.New("ident is blank")
	}
	return nil
}
// BuiltinComponentExists reports whether a builtin component with the same
// Ident as bc is already stored.
func BuiltinComponentExists(ctx *ctx.Context, bc *BuiltinComponent) (bool, error) {
	var matches int64
	if err := DB(ctx).Model(bc).Where("ident = ?", bc.Ident).Count(&matches).Error; err != nil {
		return false, err
	}
	return matches > 0, nil
}
// Add validates and inserts the component. It fails when Verify fails or when
// a component with the same Ident already exists. The created/updated
// timestamps are set to the current Unix time and the created/updated user
// fields to username.
func (bc *BuiltinComponent) Add(ctx *ctx.Context, username string) error {
	if err := bc.Verify(); err != nil {
		return err
	}

	exists, err := BuiltinComponentExists(ctx, bc)
	switch {
	case err != nil:
		return err
	case exists:
		return errors.New("builtin component already exists")
	}

	ts := time.Now().Unix()
	bc.CreatedAt, bc.UpdatedAt = ts, ts
	bc.CreatedBy, bc.UpdatedBy = username, username
	return Insert(ctx, bc)
}
// Update overwrites the stored component bc with the values in req.
//
// req is validated first. When the Ident changes, the new Ident must not
// already be in use. req.UpdatedAt is set to the current Unix time and all
// columns are written (Select("*")).
//
// NOTE(review): unlike BuiltinMetric.Update, the created_* values of bc are
// not carried over into req before the write — confirm callers populate them.
func (bc *BuiltinComponent) Update(ctx *ctx.Context, req BuiltinComponent) error {
	if err := req.Verify(); err != nil {
		return err
	}

	identChanged := bc.Ident != req.Ident
	if identChanged {
		taken, err := BuiltinComponentExists(ctx, &req)
		switch {
		case err != nil:
			return err
		case taken:
			return errors.New("builtin component already exists")
		}
	}

	req.UpdatedAt = time.Now().Unix()
	tx := DB(ctx).Model(bc).Select("*")
	return tx.Updates(req).Error
}
// BuiltinComponentDels deletes the builtin components whose id is in ids.
// An empty ids slice is a no-op.
func BuiltinComponentDels(ctx *ctx.Context, ids []int64) error {
	if len(ids) == 0 {
		return nil
	}

	tx := DB(ctx).Where("id in ?", ids)
	return tx.Delete(&BuiltinComponent{}).Error
}
// BuiltinComponentGets lists builtin components ordered by disabled
// ascending, updated_at descending and ident ascending.
//
// A non-empty query keeps only components whose ident contains it. disabled
// filters on the disabled column when it is 0 or 1; any other value disables
// that filter.
func BuiltinComponentGets(ctx *ctx.Context, query string, disabled int) ([]*BuiltinComponent, error) {
	session := DB(ctx)
	if query != "" {
		session = session.Where("ident LIKE ?", "%"+query+"%")
	}

	switch disabled {
	case 0, 1:
		session = session.Where("disabled = ?", disabled)
	}

	var components []*BuiltinComponent
	err := session.Order("disabled ASC, updated_at DESC, ident ASC").Find(&components).Error
	return components, err
}
// BuiltinComponentGet returns the first builtin component matching
// where/args, or (nil, nil) when nothing matches.
func BuiltinComponentGet(ctx *ctx.Context, where string, args ...interface{}) (*BuiltinComponent, error) {
	var components []*BuiltinComponent
	if err := DB(ctx).Where(where, args...).Find(&components).Error; err != nil {
		return nil, err
	}

	if len(components) > 0 {
		return components[0], nil
	}
	return nil, nil
}
================================================
FILE: models/builtin_metrics.go
================================================
package models
import (
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"gorm.io/gorm"
)
// BuiltinMetric represents a metric along with its metadata.
// It is the gorm model for the builtin_metrics table. Translation and
// ExtraFields are stored through gorm's json serializer; CreatedAt and
// UpdatedAt hold Unix seconds (see Add and Update).
type BuiltinMetric struct {
ID int64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement;comment:'unique identifier'"`
UUID int64 `json:"uuid" gorm:"type:bigint;not null;default:0;comment:'uuid'"`
Collector string `json:"collector" gorm:"type:varchar(191);not null;index:idx_collector,sort:asc;comment:'type of collector'"`
Typ string `json:"typ" gorm:"type:varchar(191);not null;index:idx_typ,sort:asc;comment:'type of metric'"`
Name string `json:"name" gorm:"type:varchar(191);not null;index:idx_builtinmetric_name,sort:asc;comment:'name of metric'"`
Unit string `json:"unit" gorm:"type:varchar(191);not null;comment:'unit of metric'"`
Note string `json:"note" gorm:"type:varchar(4096);not null;comment:'description of metric'"`
Lang string `json:"lang" gorm:"type:varchar(191);not null;default:'zh';index:idx_lang,sort:asc;comment:'language'"`
Translation []Translation `json:"translation" gorm:"type:text;serializer:json;comment:'translation of metric'"`
Expression string `json:"expression" gorm:"type:varchar(4096);not null;comment:'expression of metric'"`
ExpressionType string `json:"expression_type" gorm:"type:varchar(32);not null;default:'promql';comment:'expression type: metric_name or promql'"`
MetricType string `json:"metric_type" gorm:"type:varchar(191);not null;default:'';comment:'metric type like counter/gauge'"`
ExtraFields json.RawMessage `json:"extra_fields" gorm:"type:text;serializer:json;comment:'custom extra fields'"`
CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreatedBy string `json:"created_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
UpdatedAt int64 `json:"updated_at" gorm:"type:bigint;not null;default:0;comment:'update time'"`
UpdatedBy string `json:"updated_by" gorm:"type:varchar(191);not null;default:'';comment:'updater'"`
}
// Translation is one entry of BuiltinMetric.Translation: a name and a note
// for the language given in Lang.
type Translation struct {
Lang string `json:"lang"`
Name string `json:"name"`
Note string `json:"note"`
}
// TableName returns the database table name for BuiltinMetric.
func (bm *BuiltinMetric) TableName() string {
return "builtin_metrics"
}
// TableOptions returns a MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset).
// NOTE(review): presumably consumed by the migration code when the table is
// created — confirm.
func (bm *BuiltinMetric) TableOptions() string {
return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// Verify validates the metric in this order: at least one translation must be
// present, then Collector and Typ are each trimmed in place and must be
// non-empty. Checking stops at the first failure, so later fields are left
// untrimmed in that case.
func (bm *BuiltinMetric) Verify() error {
	if len(bm.Translation) == 0 {
		return errors.New("translation is required")
	}
	if bm.Collector = strings.TrimSpace(bm.Collector); bm.Collector == "" {
		return errors.New("collector is blank")
	}
	if bm.Typ = strings.TrimSpace(bm.Typ); bm.Typ == "" {
		return errors.New("type is blank")
	}
	return nil
}
// BuiltinMetricExists reports whether a builtin metric with the same
// expression, collector and typ as bm is already stored.
func BuiltinMetricExists(ctx *ctx.Context, bm *BuiltinMetric) (bool, error) {
	query := DB(ctx).Model(bm).Where("expression = ? and collector = ? and typ = ?", bm.Expression, bm.Collector, bm.Typ)

	var matches int64
	if err := query.Count(&matches).Error; err != nil {
		return false, err
	}
	return matches > 0, nil
}
// Add validates and inserts the metric. It fails when Verify fails or when an
// equivalent metric (see BuiltinMetricExists) is already stored. The
// created/updated timestamps are set to the current Unix time and the
// created/updated user fields to username.
func (bm *BuiltinMetric) Add(ctx *ctx.Context, username string) error {
	if err := bm.Verify(); err != nil {
		return err
	}

	// Refuse duplicates before inserting.
	exists, err := BuiltinMetricExists(ctx, bm)
	switch {
	case err != nil:
		return err
	case exists:
		return errors.New("builtin metric already exists")
	}

	ts := time.Now().Unix()
	bm.CreatedAt, bm.UpdatedAt = ts, ts
	bm.CreatedBy, bm.UpdatedBy = username, username
	return Insert(ctx, bm)
}
// Update overwrites the stored metric bm with the values in req. req is
// validated first; CreatedAt, CreatedBy and UUID are carried over from bm,
// UpdatedAt is set to the current Unix time, and all columns are written
// (Select("*")).
func (bm *BuiltinMetric) Update(ctx *ctx.Context, req BuiltinMetric) error {
	if err := req.Verify(); err != nil {
		return err
	}

	// Keep the fields that an update must not change.
	req.UUID = bm.UUID
	req.CreatedBy = bm.CreatedBy
	req.CreatedAt = bm.CreatedAt
	req.UpdatedAt = time.Now().Unix()

	tx := DB(ctx).Model(bm).Select("*")
	return tx.Updates(req).Error
}
// BuiltinMetricDels deletes the builtin metrics whose id is in ids.
// An empty ids slice is a no-op.
func BuiltinMetricDels(ctx *ctx.Context, ids []int64) error {
	if len(ids) == 0 {
		return nil
	}

	tx := DB(ctx).Where("id in ?", ids)
	return tx.Delete(&BuiltinMetric{}).Error
}
// BuiltinMetricGets lists builtin metrics that pass the filters built by
// builtinMetricQueryBuild, ordered by collector, typ and name. On a query
// error the returned slice is nil.
func BuiltinMetricGets(ctx *ctx.Context, lang, collector, typ, query, unit string) ([]*BuiltinMetric, error) {
	session := builtinMetricQueryBuild(lang, collector, DB(ctx), typ, query, unit)

	var metrics []*BuiltinMetric
	err := session.Order("collector asc, typ asc, name asc").Find(&metrics).Error
	if err != nil {
		return nil, err
	}
	return metrics, nil
}
// builtinMetricQueryBuild adds the builtin metric list filters to session and
// returns the resulting query.
//
// Rows whose updated_by equals SYSTEM are always excluded. lang, collector and
// typ filter by equality when non-empty. unit is a comma-separated list of
// accepted units. query is split on single spaces: a word starting with "-"
// excludes rows whose name, note or expression contain the rest of the word,
// any other word requires one of those columns to contain it.
func builtinMetricQueryBuild(lang, collector string, session *gorm.DB, typ string, query, unit string) *gorm.DB {
	session = session.Where("updated_by != ?", SYSTEM)

	if lang != "" {
		session = session.Where("lang = ?", lang)
	}
	if collector != "" {
		session = session.Where("collector = ?", collector)
	}
	if typ != "" {
		session = session.Where("typ = ?", typ)
	}
	if unit != "" {
		session = session.Where("unit in (?)", strings.Split(unit, ","))
	}
	if query == "" {
		return session
	}

	for _, word := range strings.Split(query, " ") {
		pattern := "%" + strings.TrimPrefix(word, "-") + "%"
		if strings.HasPrefix(word, "-") {
			session = session.Where("name NOT LIKE ? AND note NOT LIKE ? AND expression NOT LIKE ?", pattern, pattern, pattern)
			continue
		}
		session = session.Where("name LIKE ? OR note LIKE ? OR expression LIKE ?", pattern, pattern, pattern)
	}
	return session
}
// BuiltinMetricGet returns the first builtin metric matching where/args, or
// (nil, nil) when nothing matches.
func BuiltinMetricGet(ctx *ctx.Context, where string, args ...interface{}) (*BuiltinMetric, error) {
	var metrics []*BuiltinMetric
	if err := DB(ctx).Where(where, args...).Find(&metrics).Error; err != nil {
		return nil, err
	}

	if len(metrics) > 0 {
		return metrics[0], nil
	}
	return nil, nil
}
// BuiltinMetricTypes returns the distinct typ values of builtin metrics whose
// updated_by is not SYSTEM. lang and collector filter by equality when
// non-empty; a non-empty query keeps only typ values containing it.
func BuiltinMetricTypes(ctx *ctx.Context, lang, collector, query string) ([]string, error) {
	session := DB(ctx).Model(&BuiltinMetric{}).Where("updated_by != ?", SYSTEM)
	if lang != "" {
		session = session.Where("lang = ?", lang)
	}
	if collector != "" {
		session = session.Where("collector = ?", collector)
	}
	if query != "" {
		pattern := "%" + query + "%"
		session = session.Where("typ like ?", pattern)
	}

	var types []string
	err := session.Select("distinct(typ)").Pluck("typ", &types).Error
	return types, err
}
// BuiltinMetricCollectors returns the distinct collector values of builtin
// metrics whose updated_by is not SYSTEM. lang and typ filter by equality when
// non-empty; a non-empty query keeps only collectors containing it.
func BuiltinMetricCollectors(ctx *ctx.Context, lang, typ, query string) ([]string, error) {
	session := DB(ctx).Model(&BuiltinMetric{}).Where("updated_by != ?", SYSTEM)
	if lang != "" {
		session = session.Where("lang = ?", lang)
	}
	if typ != "" {
		session = session.Where("typ = ?", typ)
	}
	if query != "" {
		pattern := "%" + query + "%"
		session = session.Where("collector like ?", pattern)
	}

	var collectors []string
	err := session.Select("distinct(collector)").Pluck("collector", &collectors).Error
	return collectors, err
}
// BuiltinMetricBatchUpdateColumn sets column col to new, and updated_by to
// updatedBy, on every builtin metric whose col currently equals old. It does
// nothing when old and new are equal.
//
// NOTE(review): col is interpolated into the SQL condition as-is, so it must
// be a trusted column name — confirm against callers.
func BuiltinMetricBatchUpdateColumn(ctx *ctx.Context, col, old, new, updatedBy string) error {
	if old == new {
		return nil
	}

	cond := fmt.Sprintf("%s = ?", col)
	changes := map[string]interface{}{col: new, "updated_by": updatedBy}
	return DB(ctx).Model(&BuiltinMetric{}).Where(cond, old).Updates(changes).Error
}
================================================
FILE: models/builtin_metrics_filter.go
================================================
package models
import (
"errors"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// MetricFilter is the gorm model for a row of the metric_filter table.
// GroupsPerm is stored through gorm's json serializer, UpdateByNickname is
// not persisted (gorm:"-"), and CreateAt/UpdateAt hold Unix seconds (see Add
// and Update).
type MetricFilter struct {
ID int64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement;comment:'unique identifier'"`
Name string `json:"name" gorm:"type:varchar(191);not null;index:idx_metricfilter_name,sort:asc;comment:'name of metric filter'"`
Configs string `json:"configs" gorm:"type:varchar(4096);not null;comment:'configuration of metric filter'"`
GroupsPerm []GroupPerm `json:"groups_perm" gorm:"type:text;serializer:json;"`
CreateAt int64 `json:"create_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreateBy string `json:"create_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
UpdateAt int64 `json:"update_at" gorm:"type:bigint;not null;default:0;comment:'update time'"`
UpdateBy string `json:"update_by" gorm:"type:varchar(191);not null;default:'';comment:'updater'"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// GroupPerm is one entry of MetricFilter.GroupsPerm: a group id together with
// a flag saying whether that group has write permission.
type GroupPerm struct {
Gid int64 `json:"gid"`
Write bool `json:"write"` // write permission
}
// TableName returns the database table name for MetricFilter.
func (f *MetricFilter) TableName() string {
return "metric_filter"
}
// Verify trims Name and then Configs in place and returns an error for the
// first one that is empty after trimming. Configs is left untrimmed when Name
// is already rejected.
func (f *MetricFilter) Verify() error {
	if f.Name = strings.TrimSpace(f.Name); f.Name == "" {
		return errors.New("name is blank")
	}
	if f.Configs = strings.TrimSpace(f.Configs); f.Configs == "" {
		return errors.New("configs is blank")
	}
	return nil
}
// Add validates the filter, stamps CreateAt and UpdateAt with the current
// Unix time and inserts it.
func (f *MetricFilter) Add(ctx *ctx.Context) error {
	if err := f.Verify(); err != nil {
		return err
	}

	ts := time.Now().Unix()
	f.CreateAt, f.UpdateAt = ts, ts
	return Insert(ctx, f)
}
// Update validates the filter, sets UpdateAt to the current Unix time and
// writes only the name, configs, groups_perm, update_at and update_by columns.
func (f *MetricFilter) Update(ctx *ctx.Context) error {
	if err := f.Verify(); err != nil {
		return err
	}

	f.UpdateAt = time.Now().Unix()
	tx := DB(ctx).Model(f).Select("name", "configs", "groups_perm", "update_at", "update_by")
	return tx.Updates(f).Error
}
// MetricFilterDel deletes the metric filters whose id is in ids.
// An empty ids slice is a no-op.
func MetricFilterDel(ctx *ctx.Context, ids []int64) error {
	if len(ids) == 0 {
		return nil
	}

	tx := DB(ctx).Where("id in ?", ids)
	return tx.Delete(&MetricFilter{}).Error
}
// MetricFilterGets returns the metric filters matching where/args. The query
// error, if any, is returned alongside whatever was loaded.
func MetricFilterGets(ctx *ctx.Context, where string, args ...interface{}) ([]MetricFilter, error) {
var lst []MetricFilter
err := DB(ctx).Where(where, args...).Find(&lst).Error
return lst, err
}
// MetricFilterGet loads the metric filter with the given id using First.
// The returned pointer is never nil; callers must check the error (which is
// passed through from gorm unchanged) before using the value.
func MetricFilterGet(ctx *ctx.Context, id int64) (*MetricFilter, error) {
	filter := new(MetricFilter)
	err := DB(ctx).Where("id = ?", id).First(filter).Error
	return filter, err
}
================================================
FILE: models/builtin_payload.go
================================================
package models
import (
"errors"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// BuiltinPayload is the GORM model for one built-in payload row in the
// "builtin_payloads" table (see TableName). BuiltinPayloadExists treats the
// combination of Type, ComponentID, Name and Cate as the identity of a payload.
// Content is declared as a longtext column here; PostgresBuiltinPayload
// declares the same table with a text column instead.
type BuiltinPayload struct {
ID int64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement;comment:'unique identifier'"`
Type string `json:"type" gorm:"type:varchar(191);not null;index:idx_type,sort:asc;comment:'type of payload'"` // Alert Dashboard Collect
Component string `json:"component" gorm:"type:varchar(191);not null;index:idx_component,sort:asc;comment:'component of payload'"` //
ComponentID uint64 `json:"component_id" gorm:"type:bigint;index:idx_component,sort:asc;comment:'component_id of payload'"` // ComponentID which the payload belongs to
Cate string `json:"cate" gorm:"type:varchar(191);not null;comment:'category of payload'"` // categraf_v1 telegraf_v1
Name string `json:"name" gorm:"type:varchar(191);not null;index:idx_buildinpayload_name,sort:asc;comment:'name of payload'"` //
Tags string `json:"tags" gorm:"type:varchar(191);not null;default:'';comment:'tags of payload'"` // {"host":"
Content string `json:"content" gorm:"type:longtext;not null;comment:'content of payload'"`
UUID int64 `json:"uuid" gorm:"type:bigint;not null;index:idx_uuid;comment:'uuid of payload'"`
Note string `json:"note" gorm:"type:varchar(1024);not null;default:'';comment:'note of payload'"`
CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreatedBy string `json:"created_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
UpdatedAt int64 `json:"updated_at" gorm:"type:bigint;not null;default:0;comment:'update time'"`
UpdatedBy string `json:"updated_by" gorm:"type:varchar(191);not null;default:'';comment:'updater'"`
}
// TableName returns the name of the database table that stores BuiltinPayload rows.
func (*BuiltinPayload) TableName() string {
	const table = "builtin_payloads"
	return table
}
// PostgresBuiltinPayload mirrors BuiltinPayload field for field and maps to
// the same "builtin_payloads" table; the only difference visible in the tags
// is that Content is declared as a text column instead of longtext.
// NOTE(review): presumably used for schema migration on PostgreSQL — confirm against callers.
type PostgresBuiltinPayload struct {
ID int64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement;comment:'unique identifier'"`
Type string `json:"type" gorm:"type:varchar(191);not null;index:idx_type,sort:asc;comment:'type of payload'"`
Component string `json:"component" gorm:"type:varchar(191);not null;index:idx_component,sort:asc;comment:'component of payload'"`
ComponentID uint64 `json:"component_id" gorm:"type:bigint;index:idx_component,sort:asc;comment:'component_id of payload'"`
Cate string `json:"cate" gorm:"type:varchar(191);not null;comment:'category of payload'"`
Name string `json:"name" gorm:"type:varchar(191);not null;index:idx_buildinpayload_name,sort:asc;comment:'name of payload'"`
Tags string `json:"tags" gorm:"type:varchar(191);not null;default:'';comment:'tags of payload'"`
Content string `json:"content" gorm:"type:text;not null;comment:'content of payload'"`
UUID int64 `json:"uuid" gorm:"type:bigint;not null;index:idx_uuid;comment:'uuid of payload'"`
Note string `json:"note" gorm:"type:varchar(1024);not null;default:'';comment:'note of payload'"`
CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;default:0;comment:'create time'"`
CreatedBy string `json:"created_by" gorm:"type:varchar(191);not null;default:'';comment:'creator'"`
UpdatedAt int64 `json:"updated_at" gorm:"type:bigint;not null;default:0;comment:'update time'"`
UpdatedBy string `json:"updated_by" gorm:"type:varchar(191);not null;default:'';comment:'updater'"`
}
// TableName returns the table name for PostgresBuiltinPayload, which is the
// same table used by BuiltinPayload.
func (*PostgresBuiltinPayload) TableName() string {
	const table = "builtin_payloads"
	return table
}
// Verify trims Type in place and then checks, in order, that Type is
// non-empty, ComponentID is non-zero and Name is non-empty. The first failed
// check determines the returned error. Name is not trimmed.
func (bp *BuiltinPayload) Verify() error {
	bp.Type = strings.TrimSpace(bp.Type)

	switch {
	case bp.Type == "":
		return errors.New("type is blank")
	case bp.ComponentID == 0:
		return errors.New("component_id is blank")
	case bp.Name == "":
		return errors.New("name is blank")
	}

	return nil
}
// BuiltinPayloadExists reports whether at least one stored payload has the
// same type, component_id, name and cate as bp.
// On a query error it returns false together with that error.
func BuiltinPayloadExists(ctx *ctx.Context, bp *BuiltinPayload) (bool, error) {
	var matched int64

	query := DB(ctx).Model(bp).Where("type = ? AND component_id = ? AND name = ? AND cate = ?", bp.Type, bp.ComponentID, bp.Name, bp.Cate)
	if err := query.Count(&matched).Error; err != nil {
		return false, err
	}

	return matched > 0, nil
}
// Add validates bp, rejects it when BuiltinPayloadExists finds an equivalent
// payload, then stamps the created/updated timestamps (current Unix time) and
// the created/updated user (username) before inserting it.
func (bp *BuiltinPayload) Add(ctx *ctx.Context, username string) error {
	if err := bp.Verify(); err != nil {
		return err
	}

	switch dup, err := BuiltinPayloadExists(ctx, bp); {
	case err != nil:
		return err
	case dup:
		return errors.New("builtin payload already exists")
	}

	ts := time.Now().Unix()
	bp.CreatedAt, bp.UpdatedAt = ts, ts
	bp.CreatedBy, bp.UpdatedBy = username, username

	return Insert(ctx, bp)
}
// Update overwrites the stored payload bp with the values carried by req.
//
// req is validated with Verify first. When any identity field (Type,
// ComponentID, Name or Cate — the same four columns BuiltinPayloadExists
// matches on) differs from bp, the update is rejected if another payload
// with the new identity already exists. Cate is part of this comparison so
// that a cate-only change cannot create a duplicate.
//
// ID, UUID, CreatedBy and CreatedAt are always taken from bp, and UpdatedAt is
// set to the current Unix time; every other column comes from req because the
// statement selects all columns.
func (bp *BuiltinPayload) Update(ctx *ctx.Context, req BuiltinPayload) error {
	if err := req.Verify(); err != nil {
		return err
	}

	identityChanged := bp.Type != req.Type ||
		bp.ComponentID != req.ComponentID ||
		bp.Name != req.Name ||
		bp.Cate != req.Cate
	if identityChanged {
		exists, err := BuiltinPayloadExists(ctx, &req)
		if err != nil {
			return err
		}
		if exists {
			return errors.New("builtin payload already exists")
		}
	}

	// Select("*") writes every column of req, so pin the immutable fields to
	// the stored record; in particular a zero or foreign req.ID must never be
	// able to rewrite the primary key.
	req.ID = bp.ID
	req.UUID = bp.UUID
	req.CreatedBy = bp.CreatedBy
	req.CreatedAt = bp.CreatedAt
	req.UpdatedAt = time.Now().Unix()

	return DB(ctx).Model(bp).Select("*").Updates(req).Error
}
// BuiltinPayloadDels deletes the payloads whose ids are listed.
// An empty id list is a no-op and returns nil without touching the database.
func BuiltinPayloadDels(ctx *ctx.Context, ids []int64) error {
	if len(ids) > 0 {
		return DB(ctx).Where("id in ?", ids).Delete(&BuiltinPayload{}).Error
	}
	return nil
}
// BuiltinPayloadGet loads a single payload matching the where clause.
// It returns (nil, err) on a query error and (nil, nil) when the query
// affected no rows, i.e. nothing matched.
func BuiltinPayloadGet(ctx *ctx.Context, where string, args ...interface{}) (*BuiltinPayload, error) {
	payload := new(BuiltinPayload)
	res := DB(ctx).Where(where, args...).Find(payload)

	switch {
	case res.Error != nil:
		return nil, res.Error
	case res.RowsAffected == 0:
		// No record was found.
		return nil, nil
	}

	return payload, nil
}
// BuiltinPayloadGets lists payloads whose updated_by differs from SYSTEM.
//
// Optional filters are applied only when non-zero: typ (type column),
// componentId (component_id column) and cate. query is split on whitespace
// and every word must match either the name or the tags column as a
// substring (LIKE %word%).
func BuiltinPayloadGets(ctx *ctx.Context, componentId uint64, typ, cate, query string) ([]*BuiltinPayload, error) {
	session := DB(ctx).Where("updated_by != ?", SYSTEM)

	if typ != "" {
		session = session.Where("type = ?", typ)
	}
	if componentId != 0 {
		session = session.Where("component_id = ?", componentId)
	}
	if cate != "" {
		session = session.Where("cate = ?", cate)
	}

	// An empty query yields no words, so no extra condition is added.
	for _, word := range strings.Fields(query) {
		pattern := "%" + word + "%"
		session = session.Where("name like ? or tags like ?", pattern, pattern)
	}

	var payloads []*BuiltinPayload
	err := session.Find(&payloads).Error
	return payloads, err
}
// BuiltinPayloadCates returns the distinct cate values of payloads with the
// given type and component id whose updated_by differs from SYSTEM.
func BuiltinPayloadCates(ctx *ctx.Context, typ string, componentID uint64) ([]string, error) {
	var cates []string

	query := DB(ctx).Model(&BuiltinPayload{}).
		Where("type = ? and component_id = ? and updated_by != ?", typ, componentID, SYSTEM).
		Distinct("cate")
	err := query.Pluck("cate", &cates).Error

	return cates, err
}
// BuiltinPayloadComponents returns the first distinct component value found
// among payloads with the given type and cate, or "" when none exists.
// A query error is returned with an empty string.
func BuiltinPayloadComponents(ctx *ctx.Context, typ, cate string) (string, error) {
	var components []string

	query := DB(ctx).Model(&BuiltinPayload{}).Where("type = ? and cate = ?", typ, cate).Distinct("component")
	if err := query.Pluck("component", &components).Error; err != nil {
		return "", err
	}

	if len(components) > 0 {
		return components[0], nil
	}
	return "", nil
}
// InitBuiltinPayloads keeps old- and new-format BuiltinPayload rows compatible.
//
// It loads all components, then every payload whose component_id is 0 or
// NULL, and fills ComponentID from the component whose Ident equals the
// payload's Component. Payloads without a matching component keep their
// ComponentID. All loaded payloads are then written back with Save; when no
// payload needs attention nothing is written.
func InitBuiltinPayloads(ctx *ctx.Context) error {
	components, err := BuiltinComponentGets(ctx, "", -1)
	if err != nil {
		return err
	}

	idByIdent := make(map[string]uint64)
	for _, c := range components {
		idByIdent[c.Ident] = c.ID
	}

	var pending []*BuiltinPayload
	if err = DB(ctx).Where("component_id = 0 or component_id is NULL").Find(&pending).Error; err != nil {
		return err
	}
	if len(pending) == 0 {
		return nil
	}

	for _, payload := range pending {
		if id, found := idByIdent[payload.Component]; found {
			payload.ComponentID = id
		}
	}

	return DB(ctx).Save(&pending).Error
}
================================================
FILE: models/busi_group.go
================================================
package models
import (
"fmt"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/pkg/errors"
"gorm.io/gorm"
)
// BusiGroup is the GORM model for a business group, stored in the
// "busi_group" table (see TableName).
// UpdateByNickname, UserGroups and DB are excluded from persistence
// (gorm:"-"); UserGroups is filled on demand by FillUserGroups and DB is only
// set by New and never serialized to JSON.
type BusiGroup struct {
Id int64 `json:"id" gorm:"primaryKey"`
Name string `json:"name"`
LabelEnable int `json:"label_enable"`
LabelValue string `json:"label_value"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
UserGroups []UserGroupWithPermFlag `json:"user_groups" gorm:"-"`
DB *gorm.DB `json:"-" gorm:"-"`
}
// New returns a BusiGroup whose DB field is set to db; every other field
// keeps its zero value.
func New(db *gorm.DB) *BusiGroup {
	bg := new(BusiGroup)
	bg.DB = db
	return bg
}
// UserGroupWithPermFlag pairs a user group with the permission flag it holds
// on a business group; FillUserGroups builds these from BusiGroupMember rows.
type UserGroupWithPermFlag struct {
UserGroup *UserGroup `json:"user_group"`
PermFlag string `json:"perm_flag"`
}
// TableName returns the name of the database table that stores BusiGroup rows.
func (*BusiGroup) TableName() string {
	const table = "busi_group"
	return table
}
// FillUserGroups appends to bg.UserGroups one entry per member row of this
// business group, resolving each member's user group by id (one lookup per
// member). It stops at the first lookup error; entries appended before the
// error remain in bg.UserGroups.
func (bg *BusiGroup) FillUserGroups(ctx *ctx.Context) error {
	members, err := BusiGroupMemberGetsByBusiGroupId(ctx, bg.Id)
	if err != nil {
		return err
	}

	for _, member := range members {
		group, err := UserGroupGetById(ctx, member.UserGroupId)
		if err != nil {
			return err
		}

		entry := UserGroupWithPermFlag{
			UserGroup: group,
			PermFlag:  member.PermFlag,
		}
		bg.UserGroups = append(bg.UserGroups, entry)
	}

	return nil
}
// BusiGroupGetMap returns all business groups indexed by id.
// When ctx.IsCenter is false the list is fetched through poster from
// "/v1/n9e/busi-groups"; otherwise it is read from the local database.
func BusiGroupGetMap(ctx *ctx.Context) (map[int64]*BusiGroup, error) {
	var (
		groups []*BusiGroup
		err    error
	)

	if ctx.IsCenter {
		err = DB(ctx).Find(&groups).Error
	} else {
		groups, err = poster.GetByUrls[[]*BusiGroup](ctx, "/v1/n9e/busi-groups")
	}
	if err != nil {
		return nil, err
	}

	byID := make(map[int64]*BusiGroup)
	for _, group := range groups {
		byID[group.Id] = group
	}

	return byID, nil
}
// BusiGroupGetAll returns every business group stored in the database,
// together with any query error.
func BusiGroupGetAll(ctx *ctx.Context) ([]*BusiGroup, error) {
	var groups []*BusiGroup
	result := DB(ctx).Find(&groups)
	return groups, result.Error
}
// BusiGroupGet returns the first business group matching the where clause.
// It returns (nil, nil) when nothing matches and (nil, err) on a query error.
func BusiGroupGet(ctx *ctx.Context, where string, args ...interface{}) (*BusiGroup, error) {
	var matches []*BusiGroup

	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}

	if len(matches) > 0 {
		return matches[0], nil
	}
	return nil, nil
}
// BusiGroupGetById returns the business group with the given id by delegating
// to BusiGroupGet, so it yields (nil, nil) when no such group exists.
func BusiGroupGetById(ctx *ctx.Context, id int64) (*BusiGroup, error) {
	const byID = "id=?"
	return BusiGroupGet(ctx, byID, id)
}
// BusiGroupGetByIds returns the business groups whose id is contained in ids,
// together with any query error.
func BusiGroupGetByIds(ctx *ctx.Context, ids []int64) ([]*BusiGroup, error) {
	var groups []*BusiGroup
	result := DB(ctx).Where("id in ?", ids).Find(&groups)
	return groups, result.Error
}
// BusiGroupExists reports whether at least one business group matches the
// where clause. The boolean is derived from a row count, and the count error
// (if any) is returned alongside it.
func BusiGroupExists(ctx *ctx.Context, where string, args ...interface{}) (bool, error) {
	query := DB(ctx).Model(&BusiGroup{}).Where(where, args...)
	total, err := Count(query)
	return total > 0, err
}
// RegisterGroupDelCheckEntries lets external code register additional tables
// that must be checked before a business group is deleted. The entries are
// appended to the package-level entries list, which BusiGroup.Del iterates.
// The append is not synchronized.
func RegisterGroupDelCheckEntries(e []CheckEntry) {
	if len(e) == 0 {
		return
	}
	entries = append(entries, e...)
}
// CheckEntry describes one table that blocks deletion of a business group.
// BusiGroup.Del uses Entry as the gorm model, FieldName as the column compared
// against the group's id, and ErrorMessage as the error text returned when a
// matching row still exists.
type CheckEntry struct {
Entry interface{}
ErrorMessage string
FieldName string
}
// entries is the list of tables BusiGroup.Del checks before deleting a
// business group: if any of them still holds a row whose FieldName column
// equals the group's id, deletion is refused with the entry's ErrorMessage.
// RegisterGroupDelCheckEntries appends further entries to this list.
var entries = []CheckEntry{
{
Entry: &AlertRule{},
ErrorMessage: "Some alert rules still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &AlertMute{},
ErrorMessage: "Some alert mutes still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &AlertSubscribe{},
ErrorMessage: "Some alert subscribes still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &Board{},
ErrorMessage: "Some Board still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &Target{},
ErrorMessage: "Some targets still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &RecordingRule{},
ErrorMessage: "Some recording rules still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &TaskTpl{},
ErrorMessage: "Some recovery scripts still in the BusiGroup",
FieldName: "group_id",
},
{
Entry: &TargetBusiGroup{},
ErrorMessage: "Some target busigroups still in the BusiGroup",
FieldName: "group_id",
},
}
// Del deletes the business group.
//
// Before deleting, every registered CheckEntry is consulted: if any of those
// tables still references this group the deletion is refused with the
// entry's error message. These checks run outside the transaction.
// The deletion itself removes, in one transaction, the group's member rows,
// the group row, and the current alert events of the group.
func (bg *BusiGroup) Del(ctx *ctx.Context) error {
	for _, check := range entries {
		cond := fmt.Sprintf("%s=?", check.FieldName)
		inUse, err := Exists(DB(ctx).Model(check.Entry).Where(cond, bg.Id))
		if err != nil {
			return err
		}
		if inUse {
			return errors.New(check.ErrorMessage)
		}
	}

	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		if err := tx.Where("busi_group_id=?", bg.Id).Delete(&BusiGroupMember{}).Error; err != nil {
			return err
		}

		if err := tx.Where("id=?", bg.Id).Delete(&BusiGroup{}).Error; err != nil {
			return err
		}

		// Design decision worth weighing: the group's active alert events are
		// removed together with the group. A deleted group has no alert rules
		// left, so those events could never recover, and nobody is looking
		// after them any more — hence they are dropped as well.
		return tx.Where("group_id=?", bg.Id).Delete(&AlertCurEvent{}).Error
	})
}
// AddMembers adds (or updates, see BusiGroupMemberAdd) each given member in
// order, stopping at the first error, and then records the current Unix time
// and username in the group's update_at / update_by columns.
// The member writes are not wrapped in a transaction.
func (bg *BusiGroup) AddMembers(ctx *ctx.Context, members []BusiGroupMember, username string) error {
	for _, member := range members {
		if err := BusiGroupMemberAdd(ctx, member); err != nil {
			return err
		}
	}

	audit := map[string]interface{}{
		"update_at": time.Now().Unix(),
		"update_by": username,
	}
	return DB(ctx).Model(bg).Updates(audit).Error
}
// DelMembers removes the given members one by one. Before each removal it
// counts the other user groups still attached to the member's business group
// and refuses to remove the last one. Processing stops at the first error;
// removals already performed are not rolled back. On success the group's
// update_at / update_by columns are refreshed.
func (bg *BusiGroup) DelMembers(ctx *ctx.Context, members []BusiGroupMember, username string) error {
	for _, member := range members {
		others, err := BusiGroupMemberCount(ctx, "busi_group_id = ? and user_group_id <> ?", member.BusiGroupId, member.UserGroupId)
		if err != nil {
			return err
		}

		if others == 0 {
			// This is the last user group; removing it would leave nobody
			// able to manage the business group.
			return fmt.Errorf("the business group must retain at least one team")
		}

		if err = BusiGroupMemberDel(ctx, "busi_group_id = ? and user_group_id = ?", member.BusiGroupId, member.UserGroupId); err != nil {
			return err
		}
	}

	audit := map[string]interface{}{
		"update_at": time.Now().Unix(),
		"update_by": username,
	}
	return DB(ctx).Model(bg).Updates(audit).Error
}
// Update changes the group's name and label settings.
//
// It is a no-op when name, labelEnable and labelValue all equal the current
// values. Otherwise the new name must not be used by another group, and when
// labelEnable is 1 the label value must not be used by another label-enabled
// group. When labelEnable is not 1 the stored label value is cleared.
// update_at and update_by are refreshed with the current Unix time and updateBy.
func (bg *BusiGroup) Update(ctx *ctx.Context, name string, labelEnable int, labelValue string, updateBy string) error {
	unchanged := bg.Name == name && bg.LabelEnable == labelEnable && bg.LabelValue == labelValue
	if unchanged {
		return nil
	}

	taken, err := BusiGroupExists(ctx, "name = ? and id <> ?", name, bg.Id)
	if err != nil {
		return errors.WithMessage(err, "failed to count BusiGroup")
	}
	if taken {
		return errors.New("BusiGroup already exists")
	}

	if labelEnable != 1 {
		labelValue = ""
	} else {
		taken, err = BusiGroupExists(ctx, "label_enable = 1 and label_value = ? and id <> ?", labelValue, bg.Id)
		if err != nil {
			return errors.WithMessage(err, "failed to count BusiGroup")
		}
		if taken {
			return errors.New("BusiGroup already exists")
		}
	}

	changes := map[string]interface{}{
		"name":         name,
		"label_enable": labelEnable,
		"label_value":  labelValue,
		"update_at":    time.Now().Unix(),
		"update_by":    updateBy,
	}
	return DB(ctx).Model(bg).Updates(changes).Error
}
// BusiGroupAdd creates a business group together with its member rows.
//
// The name must be unused, and when labelEnable is 1 the label value must not
// be used by another label-enabled group; otherwise the label value is stored
// empty. Every member's user group must exist. The group row and its member
// rows are then created inside a single transaction, with creator recorded as
// both creator and updater.
func BusiGroupAdd(ctx *ctx.Context, name string, labelEnable int, labelValue string, members []BusiGroupMember, creator string) error {
	taken, err := BusiGroupExists(ctx, "name=?", name)
	if err != nil {
		return errors.WithMessage(err, "failed to count BusiGroup")
	}
	if taken {
		return errors.New("BusiGroup already exists")
	}

	if labelEnable != 1 {
		labelValue = ""
	} else {
		taken, err = BusiGroupExists(ctx, "label_enable = 1 and label_value = ?", labelValue)
		if err != nil {
			return errors.WithMessage(err, "failed to count BusiGroup")
		}
		if taken {
			return errors.New("BusiGroup already exists")
		}
	}

	// Every referenced user group has to exist before anything is written.
	for _, member := range members {
		group, err := UserGroupGet(ctx, "id=?", member.UserGroupId)
		if err != nil {
			return errors.WithMessage(err, "failed to get UserGroup")
		}
		if group == nil {
			return errors.New("Some UserGroup id not exists")
		}
	}

	ts := time.Now().Unix()
	created := &BusiGroup{
		Name:        name,
		LabelEnable: labelEnable,
		LabelValue:  labelValue,
		CreateAt:    ts,
		CreateBy:    creator,
		UpdateAt:    ts,
		UpdateBy:    creator,
	}

	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		if err := tx.Create(created).Error; err != nil {
			return err
		}

		for _, member := range members {
			row := &BusiGroupMember{
				BusiGroupId: created.Id,
				UserGroupId: member.UserGroupId,
				PermFlag:    member.PermFlag,
			}
			if err := tx.Create(row).Error; err != nil {
				return err
			}
		}

		return nil
	})
}
// BusiGroupStatistics returns the row count and the maximum update_at of the
// business group table. When ctx.IsCenter is false the value is fetched
// through poster from "/v1/n9e/statistic?name=busi_group" instead.
func BusiGroupStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=busi_group")
	}

	var rows []*Statistics
	query := DB(ctx).Model(&BusiGroup{}).Select("count(*) as total", "max(update_at) as last_updated")
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}

	// The aggregate query is indexed directly, as the caller expects one row.
	return rows[0], nil
}
================================================
FILE: models/busi_group_member.go
================================================
package models
import "github.com/ccfos/nightingale/v6/pkg/ctx"
// BusiGroupMember is the GORM model linking a user group to a business group
// together with the permission flag the user group holds on it. It is stored
// in the "busi_group_member" table (see TableName) and declares no primary key.
type BusiGroupMember struct {
BusiGroupId int64 `json:"busi_group_id"`
UserGroupId int64 `json:"user_group_id"`
PermFlag string `json:"perm_flag"`
}
// TableName returns the name of the database table that stores
// BusiGroupMember rows.
func (BusiGroupMember) TableName() string {
	const table = "busi_group_member"
	return table
}
// BusiGroupIds returns the busi_group_id of every member row whose user group
// is in userGroupIds. When permFlag is given, only its first element is used
// as an additional perm_flag filter. An empty userGroupIds yields an empty,
// non-nil slice without querying the database.
func BusiGroupIds(ctx *ctx.Context, userGroupIds []int64, permFlag ...string) ([]int64, error) {
	if len(userGroupIds) == 0 {
		return []int64{}, nil
	}

	query := DB(ctx).Model(&BusiGroupMember{}).Where("user_group_id in ?", userGroupIds)
	if len(permFlag) > 0 {
		query = query.Where("perm_flag=?", permFlag[0])
	}

	var groupIDs []int64
	err := query.Pluck("busi_group_id", &groupIDs).Error
	return groupIDs, err
}
// UserGroupIdsOfBusiGroup returns the user_group_id of every member row of
// the given business group. When permFlag is given, only its first element
// is used as an additional perm_flag filter.
func UserGroupIdsOfBusiGroup(ctx *ctx.Context, busiGroupId int64, permFlag ...string) ([]int64, error) {
	query := DB(ctx).Model(&BusiGroupMember{}).Where("busi_group_id = ?", busiGroupId)
	if len(permFlag) > 0 {
		query = query.Where("perm_flag=?", permFlag[0])
	}

	var userGroupIDs []int64
	err := query.Pluck("user_group_id", &userGroupIDs).Error
	return userGroupIDs, err
}
// BusiGroupMemberCount returns the number of member rows matching the where
// clause and arguments.
func BusiGroupMemberCount(ctx *ctx.Context, where string, args ...interface{}) (int64, error) {
	query := DB(ctx).Model(&BusiGroupMember{}).Where(where, args...)
	return Count(query)
}
// BusiGroupMemberAdd upserts a membership: when no row exists for the
// (busi group, user group) pair it inserts one; when a row exists with a
// different perm_flag it updates that flag; when the flag already matches it
// does nothing.
func BusiGroupMemberAdd(ctx *ctx.Context, member BusiGroupMember) error {
	const pairCond = "busi_group_id = ? and user_group_id = ?"

	existing, err := BusiGroupMemberGet(ctx, pairCond, member.BusiGroupId, member.UserGroupId)
	if err != nil {
		return err
	}

	if existing == nil {
		// No row yet: insert a fresh one.
		row := &BusiGroupMember{
			BusiGroupId: member.BusiGroupId,
			UserGroupId: member.UserGroupId,
			PermFlag:    member.PermFlag,
		}
		return Insert(ctx, row)
	}

	if existing.PermFlag == member.PermFlag {
		return nil
	}

	// Row exists with a different flag: update it in place.
	return DB(ctx).Model(&BusiGroupMember{}).
		Where(pairCond, member.BusiGroupId, member.UserGroupId).
		Update("perm_flag", member.PermFlag).Error
}
// BusiGroupMemberGet returns the first member row matching the where clause.
// It returns (nil, nil) when nothing matches and (nil, err) on a query error.
func BusiGroupMemberGet(ctx *ctx.Context, where string, args ...interface{}) (*BusiGroupMember, error) {
	var matches []*BusiGroupMember

	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}

	if len(matches) > 0 {
		return matches[0], nil
	}
	return nil, nil
}
// BusiGroupMemberDel deletes every member row matching the where clause and
// arguments.
func BusiGroupMemberDel(ctx *ctx.Context, where string, args ...interface{}) error {
	result := DB(ctx).Where(where, args...).Delete(&BusiGroupMember{})
	return result.Error
}
// BusiGroupMemberGets returns the member rows matching the where clause,
// ordered by perm_flag, together with any query error.
func BusiGroupMemberGets(ctx *ctx.Context, where string, args ...interface{}) ([]BusiGroupMember, error) {
	var members []BusiGroupMember
	result := DB(ctx).Where(where, args...).Order("perm_flag").Find(&members)
	return members, result.Error
}
// BusiGroupMemberGetsByBusiGroupId returns all member rows of the given
// business group by delegating to BusiGroupMemberGets.
func BusiGroupMemberGetsByBusiGroupId(ctx *ctx.Context, busiGroupId int64) ([]BusiGroupMember, error) {
	const byGroup = "busi_group_id=?"
	return BusiGroupMemberGets(ctx, byGroup, busiGroupId)
}
================================================
FILE: models/chart.go
================================================
package models
import "github.com/ccfos/nightingale/v6/pkg/ctx"
// Chart is the GORM model for a single chart, stored in the "chart" table
// (see TableName). GroupId is the column ChartsOf filters on with a chart
// group id, and Weight is the column charts are ordered by.
type Chart struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"`
Configs string `json:"configs"`
Weight int `json:"weight"`
}
// TableName returns the name of the database table that stores Chart rows.
func (*Chart) TableName() string {
	const table = "chart"
	return table
}
// ChartsOf returns the charts belonging to the given chart group, ordered by
// weight, together with any query error.
func ChartsOf(ctx *ctx.Context, chartGroupId int64) ([]Chart, error) {
	var charts []Chart
	result := DB(ctx).Where("group_id = ?", chartGroupId).Order("weight").Find(&charts)
	return charts, result.Error
}
// Add inserts the chart through Insert; no validation is performed here.
func (c *Chart) Add(ctx *ctx.Context) error {
	err := Insert(ctx, c)
	return err
}
// Update writes the columns named by selectField and selectFields from c to
// the row identified by c. The arguments are forwarded unchanged to gorm's
// Select.
func (c *Chart) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	stmt := DB(ctx).Model(c).Select(selectField, selectFields...)
	return stmt.Updates(c).Error
}
// Del deletes the chart row whose id equals c.Id.
func (c *Chart) Del(ctx *ctx.Context) error {
	result := DB(ctx).Where("id=?", c.Id).Delete(&Chart{})
	return result.Error
}
================================================
FILE: models/chart_group.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// ChartGroup is the GORM model for a group of charts on a dashboard, stored
// in the "chart_group" table (see TableName). DashboardId is the column
// ChartGroupsOf filters on, and Weight is the column groups are ordered by.
type ChartGroup struct {
Id int64 `json:"id" gorm:"primaryKey"`
DashboardId int64 `json:"dashboard_id"`
Name string `json:"name"`
Weight int `json:"weight"`
}
// TableName returns the name of the database table that stores ChartGroup rows.
func (*ChartGroup) TableName() string {
	const table = "chart_group"
	return table
}
// Verify checks that DashboardId is positive and that Name passes the
// str.Dangerous check. The dashboard id is checked first.
func (cg *ChartGroup) Verify() error {
	switch {
	case cg.DashboardId <= 0:
		return errors.New("Arg(dashboard_id) invalid")
	case str.Dangerous(cg.Name):
		return errors.New("Name has invalid characters")
	}
	return nil
}
// Add validates the chart group via Verify and inserts it. A validation
// failure is returned before anything is written.
func (cg *ChartGroup) Add(ctx *ctx.Context) error {
	err := cg.Verify()
	if err != nil {
		return err
	}
	return Insert(ctx, cg)
}
// Update validates the chart group via Verify and then writes the columns
// named by selectField and selectFields from cg to the row identified by cg.
func (cg *ChartGroup) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	err := cg.Verify()
	if err != nil {
		return err
	}

	stmt := DB(ctx).Model(cg).Select(selectField, selectFields...)
	return stmt.Updates(cg).Error
}
// Del removes the chart group and all charts that belong to it inside a
// single transaction: the charts are deleted first, then the group row.
func (cg *ChartGroup) Del(ctx *ctx.Context) error {
	removeAll := func(tx *gorm.DB) error {
		if err := tx.Where("group_id=?", cg.Id).Delete(&Chart{}).Error; err != nil {
			return err
		}
		return tx.Where("id=?", cg.Id).Delete(&ChartGroup{}).Error
	}

	return DB(ctx).Transaction(removeAll)
}
// NewDefaultChartGroup inserts a chart group named "Default chart group" with
// weight 0 for the given dashboard. It goes through Insert directly, so
// ChartGroup.Verify is not applied.
func NewDefaultChartGroup(ctx *ctx.Context, dashId int64) error {
	group := &ChartGroup{
		DashboardId: dashId,
		Name:        "Default chart group",
		Weight:      0,
	}
	return Insert(ctx, group)
}
// ChartGroupIdsOf returns the ids of all chart groups of the given dashboard,
// together with any query error.
func ChartGroupIdsOf(ctx *ctx.Context, dashId int64) ([]int64, error) {
	var groupIDs []int64
	query := DB(ctx).Model(&ChartGroup{}).Where("dashboard_id = ?", dashId)
	err := query.Pluck("id", &groupIDs).Error
	return groupIDs, err
}
// ChartGroupsOf returns the chart groups of the given dashboard ordered by
// weight, together with any query error.
func ChartGroupsOf(ctx *ctx.Context, dashId int64) ([]ChartGroup, error) {
	var groups []ChartGroup
	result := DB(ctx).Where("dashboard_id = ?", dashId).Order("weight").Find(&groups)
	return groups, result.Error
}
================================================
FILE: models/chart_share.go
================================================
package models
import "github.com/ccfos/nightingale/v6/pkg/ctx"
// ChartShare is the GORM model for a shared chart snapshot, stored in the
// "chart_share" table (see TableName).
// NOTE(review): Cluster and DatasourceId presumably identify where the chart's
// data comes from — confirm against callers.
type ChartShare struct {
Id int64 `json:"id" gorm:"primaryKey"`
Cluster string `json:"cluster"`
DatasourceId int64 `json:"datasource_id"`
Configs string `json:"configs"`
CreateBy string `json:"create_by"`
CreateAt int64 `json:"create_at"`
}
// TableName returns the name of the database table that stores ChartShare rows.
func (*ChartShare) TableName() string {
	const table = "chart_share"
	return table
}
// Add inserts the chart share through Insert; no validation is performed here.
func (cs *ChartShare) Add(ctx *ctx.Context) error {
	err := Insert(ctx, cs)
	return err
}
// ChartShareGetsByIds returns the chart shares whose id is in ids, ordered by
// id. An empty ids yields a nil slice without querying the database.
func ChartShareGetsByIds(ctx *ctx.Context, ids []int64) ([]ChartShare, error) {
	if len(ids) == 0 {
		return nil, nil
	}

	var shares []ChartShare
	result := DB(ctx).Where("id in ?", ids).Order("id").Find(&shares)
	return shares, result.Error
}
================================================
FILE: models/common.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// AdminRole is the name of the administrator role.
const AdminRole = "Admin"
// DatasourceIdAll is the datasource id that stands for "all datasources":
// MatchDatasource treats an id equal to it as matching any list.
const DatasourceIdAll = 0
// DB returns the gorm handle carried by ctx. Every model helper in this
// package obtains its database connection through this accessor.
func DB(ctx *ctx.Context) *gorm.DB {
	handle := ctx.DB
	return handle
}
// Count runs a COUNT on the prepared statement tx and returns the number of
// rows together with any query error.
func Count(tx *gorm.DB) (int64, error) {
	var total int64
	result := tx.Count(&total)
	return total, result.Error
}
// Exists reports whether the prepared statement tx matches at least one row.
// It is implemented on top of Count, whose error is returned unchanged.
func Exists(tx *gorm.DB) (bool, error) {
	total, err := Count(tx)
	found := total > 0
	return found, err
}
// Insert creates obj in the database reached through ctx and returns the
// error reported by gorm's Create.
func Insert(ctx *ctx.Context, obj interface{}) error {
	result := DB(ctx).Create(obj)
	return result.Error
}
// CryptoPass hashes a raw password with the configured salt.
//
// The salt is read from the configs under the SALT key; the result is the
// MD5 of salt + a fixed pepper string + raw. An error is returned only when
// the salt cannot be loaded.
// NOTE(review): MD5 is a weak password hash; changing it would invalidate
// stored hashes, so it is left as is.
func CryptoPass(ctx *ctx.Context, raw string) (string, error) {
	salt, err := ConfigsGet(ctx, SALT)
	if err != nil {
		return "", err
	}

	material := salt + "<-*Uk30^96eY*->" + raw
	return str.MD5(material), nil
}
// Statistics holds the result of the "count(*) as total, max(update_at) as
// last_updated" aggregate used by StatisticsGet and BusiGroupStatistics.
// NOTE(review): the gorm tags carry no "column:" key; presumably the column
// mapping relies on gorm's default field naming — confirm.
type Statistics struct {
Total int64 `gorm:"total"`
LastUpdated int64 `gorm:"last_updated"`
}
// StatisticsGet returns the row count and the maximum update_at of the table
// backing model. The model's table is expected to have an update_at column,
// since the query selects max(update_at).
func StatisticsGet[T any](ctx *ctx.Context, model T) (*Statistics, error) {
	var rows []*Statistics

	query := DB(ctx).Model(model).Select("count(*) as total", "max(update_at) as last_updated")
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}

	// The aggregate query is indexed directly, as the caller expects one row.
	return rows[0], nil
}
// MatchDatasource reports whether id is accepted by the list ids.
// An id equal to DatasourceIdAll always matches, even for an empty list;
// any other id matches only when it is contained in ids.
func MatchDatasource(ids []int64, id int64) bool {
	matched := id == DatasourceIdAll
	for idx := 0; !matched && idx < len(ids); idx++ {
		matched = ids[idx] == id
	}
	return matched
}
// IsAllDatasource reports whether datasourceIds contains the id 0, i.e. the
// value that stands for "all datasources".
func IsAllDatasource(datasourceIds []int64) bool {
	for idx := range datasourceIds {
		if datasourceIds[idx] == 0 {
			return true
		}
	}
	return false
}
// LabelAndKey pairs a label with a key; LabelAndKeyHasKey searches a slice of
// these by Key.
// NOTE(review): Label is presumably a display name — confirm against callers.
type LabelAndKey struct {
Label string `json:"label"`
Key string `json:"key"`
}
// LabelAndKeyHasKey reports whether any element of keys has a Key field equal
// to key. The comparison is exact and case-sensitive.
func LabelAndKeyHasKey(keys []LabelAndKey, key string) bool {
	for _, item := range keys {
		if item.Key == key {
			return true
		}
	}
	return false
}
================================================
FILE: models/configs.go
================================================
package models
import (
"encoding/json"
"fmt"
"log"
"os"
"regexp"
"sync"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/runner"
"github.com/toolkits/pkg/str"
)
// Configs is the GORM model for one key/value configuration row in the
// "configs" table (see TableName). Configs.Add treats the pair
// (Ckey, External) as the identity of a row. UpdateByNickname is excluded
// from persistence (gorm:"-").
type Configs struct { //ckey+external
Id int64 `json:"id" gorm:"primaryKey"`
Ckey string `json:"ckey"` // Before inserting external configs, check if they are already defined as built-in configs.
Cval string `json:"cval"`
Note string `json:"note"`
External int `json:"external"` //Controls frontend list display: 0 hides built-in (default), 1 shows external
Encrypted int `json:"encrypted"` //Indicates whether the value(cval) is encrypted (1 for ciphertext, 0 for plaintext(default))
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// TableName returns the name of the database table that stores Configs rows.
func (Configs) TableName() string {
	const table = "configs"
	return table
}
// Values of the Configs.External and Configs.Encrypted columns that mark a
// row as external and as encrypted respectively (0 is the default for both).
var (
ConfigExternal = 1 //external type
ConfigEncrypted = 1 //ciphertext
)
// Keys (ckey values) of the built-in configuration entries read and written
// through ConfigsGet / ConfigsSet.
const (
SALT = "salt"
RSA_PRIVATE_KEY = "rsa_private_key"
RSA_PUBLIC_KEY = "rsa_public_key"
RSA_PASSWORD = "rsa_password"
JWT_SIGNING_KEY = "jwt_signing_key"
PHONE_ENCRYPTION_ENABLED = "phone_encryption_enabled" // switch that turns phone number encryption on or off
)
// phoneEncryptionCache is the in-memory cache of the phone number encryption
// configuration. It is filled by LoadPhoneEncryptionConfig under the write
// lock and read by GetPhoneEncryptionConfigFromCache under the read lock;
// loaded stays false until the first successful load.
var (
phoneEncryptionCache struct {
sync.RWMutex
enabled bool
privateKey []byte
publicKey []byte
password string
loaded bool
}
)
// LoadPhoneEncryptionConfig loads the phone number encryption configuration
// into the cache.
//
// It reads the enabled switch and the RSA key material first; only when both
// reads succeed is the cache replaced (under the write lock) and marked as
// loaded. On error the cache is left untouched.
func LoadPhoneEncryptionConfig(ctx *ctx.Context) error {
	enabled, err := GetPhoneEncryptionEnabled(ctx)
	if err != nil {
		return errors.WithMessage(err, "failed to get phone encryption enabled")
	}

	privateKey, publicKey, password, err := GetRSAKeys(ctx)
	if err != nil {
		return errors.WithMessage(err, "failed to get RSA keys")
	}

	cache := &phoneEncryptionCache
	cache.Lock()
	defer cache.Unlock()

	cache.enabled = enabled
	cache.privateKey = privateKey
	cache.publicKey = publicKey
	cache.password = password
	cache.loaded = true

	logger.Debugf("Phone encryption config loaded: enabled=%v", enabled)
	return nil
}
// GetPhoneEncryptionConfigFromCache returns the cached phone number
// encryption configuration, read under the read lock.
// Note the result order: the public key comes before the private key.
// loaded is false when the cache has never been filled.
func GetPhoneEncryptionConfigFromCache() (enabled bool, publicKey []byte, privateKey []byte, password string, loaded bool) {
	phoneEncryptionCache.RLock()
	defer phoneEncryptionCache.RUnlock()

	enabled = phoneEncryptionCache.enabled
	publicKey = phoneEncryptionCache.publicKey
	privateKey = phoneEncryptionCache.privateKey
	password = phoneEncryptionCache.password
	loaded = phoneEncryptionCache.loaded

	return enabled, publicKey, privateKey, password, loaded
}
// RefreshPhoneEncryptionCache refreshes the cache; call it after the related
// configuration has been modified. It simply reloads the configuration via
// LoadPhoneEncryptionConfig.
func RefreshPhoneEncryptionCache(ctx *ctx.Context) error {
	err := LoadPhoneEncryptionConfig(ctx)
	return err
}
// InitJWTSigningKey returns the JWT signing key stored in the configs,
// generating and persisting one when none is stored yet.
//
// A new key is the MD5 of hostname, pid, current nanosecond timestamp and six
// random letters. Any failure to read or store the key terminates the process
// via log.Fatalln.
func InitJWTSigningKey(ctx *ctx.Context) string {
	stored, err := ConfigsGet(ctx, JWT_SIGNING_KEY)
	if err != nil {
		log.Fatalln("init jwt signing key in mysql", err)
	}
	if stored != "" {
		return stored
	}

	seed := fmt.Sprintf("%s%d%d%s", runner.Hostname, os.Getpid(), time.Now().UnixNano(), str.RandLetters(6))
	key := str.MD5(seed)

	if err = ConfigsSet(ctx, JWT_SIGNING_KEY, key); err != nil {
		log.Fatalln("init jwt signing key in mysql", err)
	}

	return key
}
// InitSalt makes sure a password salt is stored in the configs.
//
// When a salt already exists nothing happens. Otherwise a new one is derived
// as the MD5 of hostname, pid, current nanosecond timestamp and six random
// letters, and persisted. Any failure to read or store the salt terminates
// the process via log.Fatalln.
func InitSalt(ctx *ctx.Context) {
	stored, err := ConfigsGet(ctx, SALT)
	if err != nil {
		log.Fatalln("init salt in mysql", err)
	}
	if stored != "" {
		return
	}

	seed := fmt.Sprintf("%s%d%d%s", runner.Hostname, os.Getpid(), time.Now().UnixNano(), str.RandLetters(6))

	if err = ConfigsSet(ctx, SALT, str.MD5(seed)); err != nil {
		log.Fatalln("init salt in mysql", err)
	}
}
// InitRSAPassWord returns the RSA password stored in the configs, generating
// and persisting one when none is stored yet.
//
// A new password is the MD5 of hostname, pid, current nanosecond timestamp
// and six random letters. Unlike InitSalt and InitJWTSigningKey, failures are
// returned as wrapped errors instead of terminating the process.
func InitRSAPassWord(ctx *ctx.Context) (string, error) {
	stored, err := ConfigsGet(ctx, RSA_PASSWORD)
	if err != nil {
		return "", errors.WithMessage(err, "failed to get rsa password")
	}
	if stored != "" {
		return stored, nil
	}

	seed := fmt.Sprintf("%s%d%d%s", runner.Hostname, os.Getpid(), time.Now().UnixNano(), str.RandLetters(6))
	pwd := str.MD5(seed)

	if err = ConfigsSet(ctx, RSA_PASSWORD, pwd); err != nil {
		return "", errors.WithMessage(err, "failed to set rsa password")
	}

	return pwd, nil
}
// ConfigsGet returns the value of the built-in (external=0) config stored
// under ckey, or "" when no such row exists.
// When ctx.IsCenter is false the value is fetched through poster from
// "/v1/n9e/config?key=<ckey>" instead of the local database.
func ConfigsGet(ctx *ctx.Context, ckey string) (string, error) { //select built-in type configs
	if !ctx.IsCenter {
		return poster.GetByUrls[string](ctx, "/v1/n9e/config?key="+ckey)
	}

	var values []string
	query := DB(ctx).Model(&Configs{}).Where("ckey=? and external=? ", ckey, 0)
	if err := query.Pluck("cval", &values).Error; err != nil {
		return "", errors.WithMessage(err, "failed to query configs")
	}

	if len(values) == 0 {
		return "", nil
	}
	return values[0], nil
}
// ConfigsGetAll returns all built-in (external=0) configs with a non-empty
// ckey; only the id, ckey and cval columns are selected.
// When ctx.IsCenter is false the list is fetched through poster from
// "/v1/n9e/all-configs" instead of the local database.
func ConfigsGetAll(ctx *ctx.Context) ([]*Configs, error) { // select built-in type configs
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*Configs](ctx, "/v1/n9e/all-configs")
	}

	var rows []*Configs
	query := DB(ctx).Model(&Configs{}).Select("id, ckey, cval").Where("ckey!='' and external=? ", 0)
	if err := query.Find(&rows).Error; err != nil {
		return nil, errors.WithMessage(err, "failed to query configs")
	}

	return rows, nil
}
// ConfigsSet stores cval under ckey as a built-in config, recording "default"
// as the acting user. It delegates to ConfigsSetWithUname.
func ConfigsSet(ctx *ctx.Context, ckey, cval string) error {
	const defaultUser = "default"
	return ConfigsSetWithUname(ctx, ckey, cval, defaultUser)
}
// ConfigsSetWithUname stores cval under ckey as a built-in (external=0)
// config on behalf of uName.
//
// When no built-in row exists for ckey a new one is inserted with uName as
// creator and updater. Otherwise the existing built-in row is updated. The
// update is scoped to external=0 — the same filter used by the existence
// check and by ConfigsGet — so that an external config sharing the same ckey
// is never overwritten.
//
// The existence check and the write are separate statements and are not
// wrapped in a transaction.
func ConfigsSetWithUname(ctx *ctx.Context, ckey, cval, uName string) error { //built-in
	const builtinScope = "ckey=? and external=?"

	num, err := Count(DB(ctx).Model(&Configs{}).Where(builtinScope, ckey, 0)) //built-in type
	if err != nil {
		return errors.WithMessage(err, "failed to count configs")
	}

	now := time.Now().Unix()

	if num == 0 {
		// insert
		return DB(ctx).Create(&Configs{
			Ckey:     ckey,
			Cval:     cval,
			CreateBy: uName,
			UpdateBy: uName,
			CreateAt: now,
			UpdateAt: now,
		}).Error
	}

	// update: restrict to the built-in row that was counted above
	return DB(ctx).Model(&Configs{}).Where(builtinScope, ckey, 0).Updates(map[string]interface{}{
		"cval":      cval,
		"update_by": uName,
		"update_at": now,
	}).Error
}
// ConfigsGetFlashDutyAppKey returns the FlashDuty app key stored under the
// "flashduty_app_key" config.
//
// An error is returned when the config is missing or empty. When the first
// matching row is marked as encrypted (Encrypted == 1) the RSA private key
// and password are loaded from the configs and the value is taken from the
// decrypted user-variable map; otherwise the stored value is returned as is.
func ConfigsGetFlashDutyAppKey(ctx *ctx.Context) (string, error) {
	const ckey = "flashduty_app_key"

	configs, err := ConfigsSelectByCkey(ctx, ckey)
	if err != nil {
		return "", err
	}
	if len(configs) == 0 || configs[0].Cval == "" {
		return "", errors.New("flashduty_app_key is empty")
	}

	// Plaintext values are returned directly.
	if configs[0].Encrypted != 1 {
		return configs[0].Cval, nil
	}

	// Encrypted equals 1 means the value is encrypted.
	privateKeyVal, keyErr := ConfigsGet(ctx, RSA_PRIVATE_KEY)
	passwordVal, pwdErr := ConfigsGet(ctx, RSA_PASSWORD)
	if keyErr != nil || pwdErr != nil {
		return "", errors.New("failed to load RSA credentials from config")
	}

	decryptMap, decryptErr := ConfigUserVariableGetDecryptMap(ctx, []byte(privateKeyVal), passwordVal)
	if decryptErr != nil {
		return "", decryptErr
	}

	val, ok := decryptMap[ckey]
	if !ok {
		return "", errors.New("flashduty_app_key is empty")
	}
	return val, nil
}
// ConfigsGetSiteInfo returns the raw value of the "site_info" config.
// It fails when the config is missing or its value is empty.
func ConfigsGetSiteInfo(ctx *ctx.Context) (string, error) {
	configs, err := ConfigsSelectByCkey(ctx, "site_info")
	if err != nil {
		return "", err
	}

	missing := len(configs) == 0 || configs[0].Cval == ""
	if missing {
		return "", errors.New("site_info is empty")
	}

	return configs[0].Cval, nil
}
// ConfigsGetSiteUrl extracts the "site_url" field from the site_info config.
// It fails when site_info cannot be loaded, is not a JSON object, or when
// site_url is absent, not a string, or empty.
func ConfigsGetSiteUrl(ctx *ctx.Context) (string, error) {
	raw, err := ConfigsGetSiteInfo(ctx)
	if err != nil {
		return "", err
	}

	// Decode the JSON document so the site_url field can be read from it.
	fields := map[string]interface{}{}
	if err = json.Unmarshal([]byte(raw), &fields); err != nil {
		return "", errors.WithMessage(err, "failed to unmarshal site_info")
	}

	siteUrl, isString := fields["site_url"].(string)
	if isString && siteUrl != "" {
		return siteUrl, nil
	}
	return "", errors.New("site_url is empty in site_info")
}
// GetPhoneEncryptionEnabled reports whether phone number encryption is turned
// on. The switch counts as enabled only when the stored value is exactly
// "true" or "1"; a read error yields false together with that error.
func GetPhoneEncryptionEnabled(ctx *ctx.Context) (bool, error) {
	val, err := ConfigsGet(ctx, PHONE_ENCRYPTION_ENABLED)
	if err != nil {
		return false, err
	}

	switch val {
	case "true", "1":
		return true, nil
	}
	return false, nil
}
// SetPhoneEncryptionEnabled stores the phone number encryption switch as the
// string "true" or "false" under PHONE_ENCRYPTION_ENABLED.
func SetPhoneEncryptionEnabled(ctx *ctx.Context, enabled bool) error {
	if enabled {
		return ConfigsSet(ctx, PHONE_ENCRYPTION_ENABLED, "true")
	}
	return ConfigsSet(ctx, PHONE_ENCRYPTION_ENABLED, "false")
}
// GetRSAKeys loads the RSA private key, public key and key password from the
// configs table. The first lookup that fails aborts the call and its error is
// returned with a message naming the missing piece.
func GetRSAKeys(ctx *ctx.Context) (privateKey []byte, publicKey []byte, password string, err error) {
	var priv, pub, pass string

	if priv, err = ConfigsGet(ctx, RSA_PRIVATE_KEY); err != nil {
		return nil, nil, "", errors.WithMessage(err, "failed to get RSA private key")
	}
	if pub, err = ConfigsGet(ctx, RSA_PUBLIC_KEY); err != nil {
		return nil, nil, "", errors.WithMessage(err, "failed to get RSA public key")
	}
	if pass, err = ConfigsGet(ctx, RSA_PASSWORD); err != nil {
		return nil, nil, "", errors.WithMessage(err, "failed to get RSA password")
	}

	return []byte(priv), []byte(pub), pass, nil
}
// ConfigsSelectByCkey returns every configs row whose ckey equals the given
// key. On non-center nodes no query is issued and an empty slice is returned.
func ConfigsSelectByCkey(ctx *ctx.Context, ckey string) ([]Configs, error) {
	if !ctx.IsCenter {
		return []Configs{}, nil
	}

	var rows []Configs
	if err := DB(ctx).Where("ckey=?", ckey).Find(&rows).Error; err != nil {
		return nil, errors.WithMessage(err, "failed to select conf")
	}
	return rows, nil
}
// ConfigGet looks up a configs row by primary key. It returns (nil, nil) when
// no row has that id.
func ConfigGet(ctx *ctx.Context, id int64) (*Configs, error) {
	var matches []*Configs
	if err := DB(ctx).Where("id=?", id).Find(&matches).Error; err != nil {
		return nil, err
	}
	if len(matches) > 0 {
		return matches[0], nil
	}
	return nil, nil
}
// ConfigsGets returns one page of configs rows ordered by id descending. When
// prefix is non-empty only rows whose ckey starts with it are included.
func ConfigsGets(ctx *ctx.Context, prefix string, limit, offset int) ([]*Configs, error) {
	query := DB(ctx)
	if prefix != "" {
		query = query.Where("ckey like ?", prefix+"%")
	}

	var page []*Configs
	err := query.Order("id desc").Limit(limit).Offset(offset).Find(&page).Error
	return page, err
}
// Add inserts a new configs row built from c, refusing to do so when a row
// with the same (ckey, external) pair already exists. Only Ckey, Cval,
// External and the create fields are copied; the update fields are
// initialised from the create fields.
func (c *Configs) Add(ctx *ctx.Context) error {
	dupQuery := DB(ctx).Model(&Configs{}).Where("ckey=? and external=? ", c.Ckey, c.External)
	existing, err := Count(dupQuery)
	if err != nil {
		return errors.WithMessage(err, "failed to count configs")
	}
	if existing > 0 {
		return errors.New("key is exists")
	}

	// insert
	row := Configs{
		Ckey:     c.Ckey,
		Cval:     c.Cval,
		External: c.External,
		CreateBy: c.CreateBy,
		UpdateBy: c.CreateBy,
		CreateAt: c.CreateAt,
		UpdateAt: c.CreateAt,
	}
	return DB(ctx).Create(&row).Error
}
// Update writes c over the row with the same id, after checking that no other
// row already uses the same (ckey, external) pair.
func (c *Configs) Update(ctx *ctx.Context) error {
	others := DB(ctx).Model(&Configs{}).Where("id<>? and ckey=? and external=? ", c.Id, c.Ckey, c.External)
	clashes, err := Count(others)
	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to count configs")
	case clashes > 0:
		return errors.New("key is exists")
	}

	return DB(ctx).Model(&Configs{}).Where("id=?", c.Id).Updates(c).Error
}
// ConfigsDel deletes the configs rows whose id is listed in ids.
func ConfigsDel(ctx *ctx.Context, ids []int64) error {
	result := DB(ctx).Where("id in ?", ids).Delete(&Configs{})
	return result.Error
}
// ConfigsGetUserVariable lists all configs rows marked as external
// (user variables), newest id first.
func ConfigsGetUserVariable(context *ctx.Context) ([]Configs, error) {
	var variables []Configs
	query := DB(context).Where("external = ?", ConfigExternal).Order("id desc")
	if err := query.Find(&variables).Error; err != nil {
		return nil, errors.WithMessage(err, "failed to gets user variable")
	}
	return variables, nil
}
// ConfigsUserVariableInsert stores conf as a new user variable. The id supplied
// by the caller is discarded and the row is always marked as external; the key
// is validated by userVariableCheck before the insert.
func ConfigsUserVariableInsert(context *ctx.Context, conf Configs) error {
	conf.Id = 0
	conf.External = ConfigExternal

	if err := userVariableCheck(context, conf.Ckey, conf.Id); err != nil {
		return err
	}
	return DB(context).Create(&conf).Error
}
// ConfigsUserVariableUpdate overwrites the editable columns (ckey, cval, note,
// encrypted, update_by, update_at) of an existing user variable.
//
// The key is validated with userVariableCheck first. The target row must exist
// and be marked as external, otherwise "not valid configs(id)" is returned. A
// database failure while loading the existing row is returned as-is instead of
// being reported as an invalid id.
func ConfigsUserVariableUpdate(context *ctx.Context, conf Configs) error {
	if err := userVariableCheck(context, conf.Ckey, conf.Id); err != nil {
		return err
	}

	configOld, err := ConfigGet(context, conf.Id)
	if err != nil {
		return err
	}
	if configOld == nil || configOld.External != ConfigExternal { // not valid id
		return fmt.Errorf("not valid configs(id)")
	}

	return DB(context).Model(&Configs{Id: conf.Id}).Select(
		"ckey", "cval", "note", "encrypted", "update_by", "update_at").Updates(conf).Error
}
// cStyleIdentifierRe matches a C-style identifier: a letter or underscore
// followed by any number of letters, digits or underscores. It is compiled
// once at package initialisation instead of on every call.
var cStyleIdentifierRe = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`)

// isCStyleIdentifier reports whether str is a non-empty C-style identifier.
func isCStyleIdentifier(str string) bool {
	return cStyleIdentifierRe.MatchString(str)
}
// userVariableCheck validates a user-variable key before insert or update.
//
// The key must be a C-style identifier, must not be one of the reserved words,
// and must not already be used by another external configs row. id is the row
// being updated (it is excluded from the duplicate search); 0 means insert.
func userVariableCheck(context *ctx.Context, ckey string, id int64) error {
	if !isCStyleIdentifier(ckey) {
		return fmt.Errorf("invalid key(%q), please use ^[a-zA-Z_][a-zA-Z0-9_]*$ ", ckey)
	}

	// reserved words
	switch ckey {
	case "Scheme", "Host", "Hostname", "Port", "Path", "Query", "Fragment":
		return fmt.Errorf("invalid key(%q), reserved words, please use other key", ckey)
	}

	query := DB(context)
	if id == 0 {
		query = query.Where("ckey = ? and external=?", ckey, ConfigExternal)
	} else { // update: the row itself does not count as a duplicate
		query = query.Where("id <> ? and ckey = ? and external=?", &id, ckey, ConfigExternal)
	}

	var clashes []*Configs
	if err := query.Find(&clashes).Error; err != nil {
		return err
	}
	if len(clashes) > 0 {
		return fmt.Errorf("duplicate ckey value found: %s", ckey)
	}
	return nil
}
// ConfigsUserVariableStatistics returns the row count and the latest update_at
// of external configs rows. Non-center nodes fetch the figures over HTTP from
// the statistic endpoint instead of querying the database.
func ConfigsUserVariableStatistics(context *ctx.Context) (*Statistics, error) {
	if !context.IsCenter {
		return poster.GetByUrls[*Statistics](context, "/v1/n9e/statistic?name=user_variable")
	}

	var rows []*Statistics
	err := DB(context).Model(&Configs{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Where("external = ?", ConfigExternal).
		Find(&rows).Error
	if err != nil {
		return nil, err
	}
	return rows[0], nil
}
// ConfigUserVariableGetDecryptMap returns all user variables as a ckey -> value
// map with encrypted values decrypted using privateKey and passWord.
//
// Non-center nodes fetch the already-decrypted map over HTTP. On the center, a
// value that fails to decrypt is logged and stored as the empty string; the
// call itself still succeeds.
func ConfigUserVariableGetDecryptMap(context *ctx.Context, privateKey []byte, passWord string) (map[string]string, error) {
	if !context.IsCenter {
		remote, err := poster.GetByUrls[map[string]string](context, "/v1/n9e/user-variable/decrypt")
		if err != nil {
			return nil, err
		}
		return remote, nil
	}

	variables, err := ConfigsGetUserVariable(context)
	if err != nil {
		return nil, err
	}

	plain := make(map[string]string, len(variables))
	for _, v := range variables {
		if v.Encrypted != ConfigEncrypted {
			plain[v.Ckey] = v.Cval
			continue
		}

		value, decErr := secu.Decrypt(v.Cval, privateKey, passWord)
		if decErr != nil {
			logger.Errorf("RSA Decrypt failed: %v. Ckey: %s", decErr, v.Ckey)
			value = ""
		}
		plain[v.Ckey] = value
	}
	return plain, nil
}
// ConfigCvalStatistics returns the row count and latest update_at of built-in
// configs rows (non-empty ckey, external = 0). Non-center nodes fetch the
// figures over HTTP from the statistic endpoint.
func ConfigCvalStatistics(context *ctx.Context) (*Statistics, error) {
	if !context.IsCenter {
		return poster.GetByUrls[*Statistics](context, "/v1/n9e/statistic?name=cval")
	}

	var rows []*Statistics
	err := DB(context).Model(&Configs{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Where("ckey!='' and external=? ", 0). // built-in config
		Find(&rows).Error
	if err != nil {
		return nil, err
	}
	return rows[0], nil
}
================================================
FILE: models/dash_annotation.go
================================================
package models
import (
"encoding/json"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// DashAnnotation is one row of the dash_annotation table: a tagged,
// time-bounded note attached to a dashboard panel.
type DashAnnotation struct {
Id int64 `json:"id" gorm:"primaryKey"`
DashboardId int64 `json:"dashboard_id"`
PanelId string `json:"panel_id"`
// Tags is the stored form of the tag list: the JSON encoding of TagsJSON
// (see FE2DB / DB2FE). It is hidden from JSON output.
Tags string `json:"-"`
// TagsJSON is the decoded tag list exposed as "tags" in JSON; gorm ignores it.
TagsJSON []string `json:"tags" gorm:"-"`
Description string `json:"description"`
Config string `json:"config"`
TimeStart int64 `json:"time_start"`
TimeEnd int64 `json:"time_end"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
// UpdateByNickname is output-only; gorm ignores it.
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// TableName returns the name of the table that stores DashAnnotation rows.
func (da *DashAnnotation) TableName() string {
	const table = "dash_annotation"
	return table
}
// DB2FE decodes the stored Tags JSON string into TagsJSON.
//
// An empty Tags column is treated as "no tags" (TagsJSON is reset to nil)
// instead of being passed to json.Unmarshal, which rejects empty input with
// "unexpected end of JSON input".
func (da *DashAnnotation) DB2FE() error {
	if da.Tags == "" {
		da.TagsJSON = nil
		return nil
	}
	return json.Unmarshal([]byte(da.Tags), &da.TagsJSON)
}
// FE2DB encodes TagsJSON as JSON and stores the result in Tags. Tags is left
// untouched when encoding fails.
func (da *DashAnnotation) FE2DB() error {
	encoded, err := json.Marshal(da.TagsJSON)
	if err == nil {
		da.Tags = string(encoded)
	}
	return err
}
// Add serialises the tag list via FE2DB and inserts the annotation.
func (da *DashAnnotation) Add(ctx *ctx.Context) error {
	err := da.FE2DB()
	if err != nil {
		return err
	}
	return Insert(ctx, da)
}
// Update serialises the tag list via FE2DB and writes the editable columns of
// the annotation; id and the create_* columns are not in the selected set.
func (da *DashAnnotation) Update(ctx *ctx.Context) error {
	err := da.FE2DB()
	if err != nil {
		return err
	}

	tx := DB(ctx).Model(da).Select(
		"dashboard_id", "panel_id", "tags", "description", "config",
		"time_start", "time_end", "update_at", "update_by",
	)
	return tx.Updates(da).Error
}
// DashAnnotationDel deletes the annotation with the given id.
func DashAnnotationDel(ctx *ctx.Context, id int64) error {
	result := DB(ctx).Where("id = ?", id).Delete(&DashAnnotation{})
	return result.Error
}
// DashAnnotationGet returns the first annotation matching the where clause, or
// (nil, nil) when nothing matches. The tag list is decoded before returning; a
// decode error is returned together with the annotation.
func DashAnnotationGet(ctx *ctx.Context, where string, args ...interface{}) (*DashAnnotation, error) {
	var found []*DashAnnotation
	if err := DB(ctx).Where(where, args...).Find(&found).Error; err != nil {
		return nil, err
	}
	if len(found) == 0 {
		return nil, nil
	}

	first := found[0]
	decodeErr := first.DB2FE()
	return first, decodeErr
}
// DashAnnotationGets lists up to limit annotations of a dashboard whose
// [time_start, time_end] range overlaps [from, to], ordered by id.
func DashAnnotationGets(ctx *ctx.Context, dashboardId int64, from, to int64, limit int) ([]DashAnnotation, error) {
	var lst []DashAnnotation
	err := DB(ctx).
		Where("dashboard_id = ? AND time_start <= ? AND time_end >= ?", dashboardId, to, from).
		Order("id").
		Limit(limit).
		Find(&lst).Error
	if err != nil {
		return nil, err
	}

	for i := range lst {
		// A tag decode error is ignored here; the entry is still returned.
		_ = lst[i].DB2FE()
	}
	return lst, nil
}
================================================
FILE: models/dashboard.go
================================================
package models
import (
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// Dashboard is one row of the dashboard table.
type Dashboard struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"`
Name string `json:"name"`
// Tags is the stored, whitespace-separated tag string; hidden from JSON.
Tags string `json:"-"`
// TagsLst is Tags split on whitespace, exposed as "tags" in JSON; gorm ignores it.
TagsLst []string `json:"tags" gorm:"-"`
Configs string `json:"configs"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
}
// TableName returns the name of the table that stores Dashboard rows.
func (d *Dashboard) TableName() string {
	const table = "dashboard"
	return table
}
// Verify rejects a dashboard whose name is empty or flagged by str.Dangerous.
func (d *Dashboard) Verify() error {
	switch {
	case d.Name == "":
		return errors.New("Name is blank")
	case str.Dangerous(d.Name):
		return errors.New("Name has invalid characters")
	default:
		return nil
	}
}
// Add validates the dashboard, makes sure no dashboard with the same name
// exists in the same group, stamps the create/update times and inserts it.
func (d *Dashboard) Add(ctx *ctx.Context) error {
	if err := d.Verify(); err != nil {
		return err
	}

	taken, err := DashboardExists(ctx, "group_id=? and name=?", d.GroupId, d.Name)
	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to count dashboard")
	case taken:
		return errors.New("Dashboard already exists")
	}

	stamp := time.Now().Unix()
	d.CreateAt, d.UpdateAt = stamp, stamp
	return Insert(ctx, d)
}
// Update validates the dashboard and writes only the selected columns.
func (d *Dashboard) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	err := d.Verify()
	if err != nil {
		return err
	}
	tx := DB(ctx).Model(d).Select(selectField, selectFields...)
	return tx.Updates(d).Error
}
// Del removes the dashboard inside a single transaction. When the dashboard
// owns chart groups, their charts and the groups themselves are deleted first.
func (d *Dashboard) Del(ctx *ctx.Context) error {
	cgids, err := ChartGroupIdsOf(ctx, d.Id)
	if err != nil {
		return err
	}

	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		if len(cgids) > 0 {
			if err := tx.Where("group_id in ?", cgids).Delete(&Chart{}).Error; err != nil {
				return err
			}
			if err := tx.Where("dashboard_id=?", d.Id).Delete(&ChartGroup{}).Error; err != nil {
				return err
			}
		}
		return tx.Where("id=?", d.Id).Delete(&Dashboard{}).Error
	})
}
// DashboardGet returns the first dashboard matching the where clause with its
// tag list populated, or (nil, nil) when nothing matches.
func DashboardGet(ctx *ctx.Context, where string, args ...interface{}) (*Dashboard, error) {
	var found []*Dashboard
	if err := DB(ctx).Where(where, args...).Find(&found).Error; err != nil {
		return nil, err
	}
	if len(found) == 0 {
		return nil, nil
	}

	first := found[0]
	first.TagsLst = strings.Fields(first.Tags)
	return first, nil
}
// DashboardCount counts the dashboards matching the where clause.
func DashboardCount(ctx *ctx.Context, where string, args ...interface{}) (num int64, err error) {
	filtered := DB(ctx).Model(&Dashboard{}).Where(where, args...)
	return Count(filtered)
}
// DashboardExists reports whether at least one dashboard matches the where
// clause. The count error, if any, is returned alongside the result.
func DashboardExists(ctx *ctx.Context, where string, args ...interface{}) (bool, error) {
	total, err := DashboardCount(ctx, where, args...)
	found := total > 0
	return found, err
}
// DashboardGets lists the dashboards of a group ordered by name, without the
// configs column.
//
// query is split on whitespace. Each plain term must appear in the name or the
// tags; a term prefixed with "-" must appear in neither.
func DashboardGets(ctx *ctx.Context, groupId int64, query string) ([]Dashboard, error) {
	session := DB(ctx).Where("group_id=?", groupId).Order("name")

	for _, term := range strings.Fields(query) {
		if strings.HasPrefix(term, "-") {
			pattern := "%" + term[1:] + "%"
			session = session.Where("name not like ? and tags not like ?", pattern, pattern)
			continue
		}
		pattern := "%" + term + "%"
		session = session.Where("(name like ? or tags like ?)", pattern, pattern)
	}

	var objs []Dashboard
	err := session.Select("id", "group_id", "name", "tags", "create_at", "create_by", "update_at", "update_by").Find(&objs).Error
	if err != nil {
		return objs, err
	}

	for i := range objs {
		objs[i].TagsLst = strings.Fields(objs[i].Tags)
	}
	return objs, nil
}
// DashboardGetsByIds returns the dashboards with the given ids ordered by
// name. An empty id list yields an empty slice without touching the database.
func DashboardGetsByIds(ctx *ctx.Context, ids []int64) ([]Dashboard, error) {
	if len(ids) == 0 {
		return []Dashboard{}, nil
	}

	var boards []Dashboard
	query := DB(ctx).Where("id in ?", ids).Order("name")
	err := query.Find(&boards).Error
	return boards, err
}
// DashboardGetAll loads every dashboard row.
func DashboardGetAll(ctx *ctx.Context) ([]Dashboard, error) {
	var all []Dashboard
	result := DB(ctx).Find(&all)
	return all, result.Error
}
================================================
FILE: models/datasource.go
================================================
package models
import (
"crypto/tls"
"crypto/x509"
"encoding/json"
"fmt"
"math/rand"
"net/http"
"net/url"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// Datasource is one row of the datasource table plus the decoded and encrypted
// views of its JSON columns used when exchanging it with the front end or with
// edge nodes.
type Datasource struct {
Id int64 `json:"id"`
Name string `json:"name"`
Identifier string `json:"identifier"`
Description string `json:"description"`
PluginId int64 `json:"plugin_id"`
PluginType string `json:"plugin_type"` // prometheus
PluginTypeName string `json:"plugin_type_name"` // Prometheus Like
Category string `json:"category"` // timeseries
ClusterName string `json:"cluster_name"`
// Settings is the stored JSON text of SettingsJson (see FE2DB / DB2FE).
Settings string `json:"-" gorm:"settings"`
SettingsJson map[string]interface{} `json:"settings" gorm:"-"`
// SettingsEncoded carries the encrypted form of Settings (see Encrypt / Decrypt).
SettingsEncoded string `json:"settings_encoded" gorm:"-"`
Status string `json:"status"`
// HTTP is the stored JSON text of HTTPJson.
HTTP string `json:"-" gorm:"http"`
HTTPJson HTTP `json:"http" gorm:"-"`
// Auth is the stored JSON text of AuthJson.
Auth string `json:"-" gorm:"auth"`
AuthJson Auth `json:"auth" gorm:"-"`
// AuthEncoded carries the encrypted form of Auth (see Encrypt / Decrypt).
AuthEncoded string `json:"auth_encoded" gorm:"-"`
CreatedAt int64 `json:"created_at"`
UpdatedAt int64 `json:"updated_at"`
CreatedBy string `json:"created_by"`
UpdatedBy string `json:"updated_by"`
IsDefault bool `json:"is_default"`
Weight int `json:"weight"`
// Transport is excluded from both JSON and the database.
Transport *http.Transport `json:"-" gorm:"-"`
ForceSave bool `json:"force_save" gorm:"-"`
}
// Auth holds the basic-auth settings of a datasource; it is the decoded form
// of the Datasource.Auth JSON column.
type Auth struct {
BasicAuth bool `json:"basic_auth"`
BasicAuthUser string `json:"basic_auth_user"`
BasicAuthPassword string `json:"basic_auth_password"`
}
// rsaConfig is the package-wide RSA configuration installed by SetRsaConfig
// and read by GetRsaConfig and Datasource.Decrypt. It is nil until set.
var rsaConfig *RsaConfig
// RsaConfig describes whether RSA protection is enabled and carries the key
// material used for it.
type RsaConfig struct {
OpenRSA bool `json:"open_rsa"`
RSAPublicKey string `json:"rsa_public_key,omitempty"`
RSAPrivateKey string `json:"rsa_private_key,omitempty"`
RSAPassWord string `json:"rsa_password,omitempty"`
// PrivateKeyBytes is the private key passed to secu.Decrypt by Datasource.Decrypt.
PrivateKeyBytes []byte
}
// SetRsaConfig installs cfg as the package-wide RSA configuration. A nil cfg
// is ignored with a warning and the current configuration is kept.
func SetRsaConfig(cfg *RsaConfig) {
	if cfg == nil {
		logger.Warning("Rsa config is nil")
		return
	}
	rsaConfig = cfg
}
// GetRsaConfig returns the package-wide RSA configuration, which is nil until
// SetRsaConfig has been called with a non-nil value.
func GetRsaConfig() *RsaConfig {
	current := rsaConfig
	return current
}
// HTTP holds the connection settings of a datasource; it is the decoded form
// of the Datasource.HTTP JSON column.
type HTTP struct {
// Timeout, DialTimeout and MaxIdleConnsPerHost are defaulted by
// Datasource.DB2FE when zero (10000, 10000 and 100 respectively).
Timeout int64 `json:"timeout"`
DialTimeout int64 `json:"dial_timeout"`
TLS TLS `json:"tls"`
MaxIdleConnsPerHost int `json:"max_idle_conns_per_host"`
// Url is the single endpoint; it is used by GetUrls only when Urls is empty.
Url string `json:"url"`
Urls []string `json:"urls"`
Headers map[string]string `json:"headers"`
}
// IsLoki reports whether the settings look like they point at Loki: either
// Url contains "loki" (case-sensitive) or some header name contains "loki"
// (case-insensitive).
func (h HTTP) IsLoki() bool {
	const marker = "loki"

	if strings.Contains(h.Url, marker) {
		return true
	}
	for name := range h.Headers {
		if strings.Contains(strings.ToLower(name), marker) {
			return true
		}
	}
	return false
}
// GetUrls returns the configured endpoints in random order. When Urls is empty
// the result is a single-element slice holding Url. The returned slice is
// always a fresh copy, so h.Urls is never modified.
func (h HTTP) GetUrls() []string {
	urls := []string{h.Url}
	if n := len(h.Urls); n > 0 {
		// Copy so that shuffling does not touch the original data.
		urls = make([]string, n)
		copy(urls, h.Urls)
	}

	// Fisher-Yates shuffle, walking from the tail towards the head.
	for last := len(urls) - 1; last > 0; last-- {
		pick := rand.Intn(last + 1)
		urls[last], urls[pick] = urls[pick], urls[last]
	}
	return urls
}
// NewReq builds a GET request for the first endpoint (in GetUrls order) that
// http.NewRequest accepts and stores that endpoint in *reqUrl. When every
// endpoint is rejected, the error from the last attempt is returned.
func (h HTTP) NewReq(reqUrl *string) (req *http.Request, err error) {
	for _, candidate := range h.GetUrls() {
		req, err = http.NewRequest("GET", candidate, nil)
		if err != nil {
			continue
		}
		*reqUrl = candidate
		return req, nil
	}
	return req, err
}
// ParseUrl parses one of the configured endpoints, chosen in GetUrls order.
//
// GetUrls falls back to []string{h.Url}, so an unconfigured datasource shows
// up as a slice of empty strings rather than an empty slice; url.Parse accepts
// "" without error. Blank entries are therefore skipped, and "no urls" is
// returned when no non-empty endpoint is configured.
func (h HTTP) ParseUrl() (target *url.URL, err error) {
	for _, candidate := range h.GetUrls() {
		if candidate == "" {
			continue
		}
		return url.Parse(candidate)
	}
	return nil, errors.New("no urls")
}
// TLS holds the TLS / mTLS settings of a datasource endpoint.
type TLS struct {
SkipTlsVerify bool `json:"skip_tls_verify"`
// mTLS settings
CACert string `json:"ca_cert"` // CA certificate content (PEM format)
ClientCert string `json:"client_cert"` // client certificate content (PEM format)
ClientKey string `json:"client_key"` // client key content (PEM format)
ClientKeyPassword string `json:"client_key_password"` // key password (optional)
ServerName string `json:"server_name"` // TLS ServerName (optional, used for certificate verification)
MinVersion string `json:"min_version"` // minimum TLS version (1.0, 1.1, 1.2, 1.3)
MaxVersion string `json:"max_version"` // maximum TLS version
}
// TLSConfig builds a *tls.Config from the certificate material held in t.
// All certificate contents are PEM-encoded strings.
//
// Version strings that are empty or not present in tlsVersionMap leave the
// corresponding bound unset. A client certificate is loaded only when both the
// certificate and the key are non-blank. An error is returned when the client
// key pair or the CA certificate cannot be parsed.
func (t *TLS) TLSConfig() (*tls.Config, error) {
	cfg := &tls.Config{
		InsecureSkipVerify: t.SkipTlsVerify,
		ServerName:         t.ServerName,
	}

	// TLS version bounds.
	if t.MinVersion != "" {
		if ver, known := tlsVersionMap[t.MinVersion]; known {
			cfg.MinVersion = ver
		}
	}
	if t.MaxVersion != "" {
		if ver, known := tlsVersionMap[t.MaxVersion]; known {
			cfg.MaxVersion = ver
		}
	}

	certPEM := strings.TrimSpace(t.ClientCert)
	keyPEM := strings.TrimSpace(t.ClientKey)
	caPEM := strings.TrimSpace(t.CACert)

	// mTLS: present a client certificate when one is configured.
	if certPEM != "" && keyPEM != "" {
		pair, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM))
		if err != nil {
			return nil, fmt.Errorf("failed to load client certificate: %w", err)
		}
		cfg.Certificates = []tls.Certificate{pair}
	}

	// Custom CA used to verify the server certificate.
	if caPEM != "" {
		pool := x509.NewCertPool()
		if ok := pool.AppendCertsFromPEM([]byte(caPEM)); !ok {
			return nil, fmt.Errorf("failed to parse CA certificate")
		}
		cfg.RootCAs = pool
	}

	return cfg, nil
}
// tlsVersionMap maps the textual TLS versions accepted in TLS.MinVersion and
// TLS.MaxVersion to the crypto/tls version constants.
var tlsVersionMap = map[string]uint16{
"1.0": tls.VersionTLS10,
"1.1": tls.VersionTLS11,
"1.2": tls.VersionTLS12,
"1.3": tls.VersionTLS13,
}
// TableName returns the name of the table that stores Datasource rows.
func (ds *Datasource) TableName() string {
	const table = "datasource"
	return table
}
// Verify rejects names flagged by str.Dangerous and then refreshes the stored
// JSON columns from their decoded counterparts via FE2DB.
func (ds *Datasource) Verify() error {
	if str.Dangerous(ds.Name) {
		return errors.New("Name has invalid characters")
	}
	return ds.FE2DB()
}
// Update validates the datasource and writes the selected columns with gorm
// hooks skipped. UpdatedAt is set to the current time when the caller left it
// at zero.
func (ds *Datasource) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	err := ds.Verify()
	if err != nil {
		return err
	}

	if ds.UpdatedAt == 0 {
		ds.UpdatedAt = time.Now().Unix()
	}

	tx := DB(ctx).Model(ds).Session(&gorm.Session{SkipHooks: true})
	return tx.Select(selectField, selectFields...).Updates(ds).Error
}
// Add validates the datasource, stamps CreatedAt and UpdatedAt with the
// current time and inserts it.
func (ds *Datasource) Add(ctx *ctx.Context) error {
	err := ds.Verify()
	if err != nil {
		return err
	}

	stamp := time.Now().Unix()
	ds.CreatedAt, ds.UpdatedAt = stamp, stamp
	return Insert(ctx, ds)
}
// DatasourceDel deletes the datasources with the given ids. An empty id list
// is a no-op.
func DatasourceDel(ctx *ctx.Context, ids []int64) error {
	if len(ids) > 0 {
		return DB(ctx).Where("id in ?", ids).Delete(new(Datasource)).Error
	}
	return nil
}
// DatasourceGet loads the datasource with the given id and decodes its JSON
// columns. A lookup failure (including "record not found" from First) is
// returned as an error; a decode error is returned together with the row.
func DatasourceGet(ctx *ctx.Context, id int64) (*Datasource, error) {
	var ds *Datasource
	if err := DB(ctx).Where("id = ?", id).First(&ds).Error; err != nil {
		return nil, err
	}

	decodeErr := ds.DB2FE()
	return ds, decodeErr
}
// DatasourceInfo is the lightweight projection (id, name, plugin_type) of a
// datasource row returned by GetDatasourceInfosByIds.
type DatasourceInfo struct {
Id int64 `json:"id"`
Name string `json:"name"`
PluginType string `json:"plugin_type"`
}
// GetDatasourceInfosByIds returns id, name and plugin_type for the datasources
// with the given ids. An empty id list yields an empty slice without querying.
func GetDatasourceInfosByIds(ctx *ctx.Context, ids []int64) ([]*DatasourceInfo, error) {
	if len(ids) == 0 {
		return []*DatasourceInfo{}, nil
	}

	var infos []*DatasourceInfo
	query := DB(ctx).Model(&Datasource{}).Select("id", "name", "plugin_type").Where("id in ?", ids)
	if err := query.Find(&infos).Error; err != nil {
		return nil, err
	}
	return infos, nil
}
// Get reloads ds from the database using ds.Id and decodes its JSON columns.
func (ds *Datasource) Get(ctx *ctx.Context) error {
	if err := DB(ctx).Where("id = ?", ds.Id).First(ds).Error; err != nil {
		return err
	}
	return ds.DB2FE()
}
// GetDatasources returns every datasource with its JSON columns decoded.
//
// Non-center nodes fetch the list over HTTP, decrypt each entry and refresh
// its stored JSON strings; an entry that fails to decrypt is logged and left
// as received. On the center the rows are read from the database and decoded.
//
// Failure logs carry only the datasource id and name: the struct may already
// hold decrypted settings or credentials, which must not end up in log files.
// Conversion errors are logged rather than silently dropped; they do not fail
// the call.
func GetDatasources(ctx *ctx.Context) ([]Datasource, error) {
	if !ctx.IsCenter {
		lst, err := poster.GetByUrls[[]Datasource](ctx, "/v1/n9e/datasources")
		if err != nil {
			return nil, err
		}
		for i := range lst {
			if err := lst[i].Decrypt(); err != nil {
				logger.Errorf("decrypt datasource id=%d name=%s fail: %v", lst[i].Id, lst[i].Name, err)
				continue
			}
			if err := lst[i].FE2DB(); err != nil {
				logger.Warningf("encode datasource id=%d name=%s fail: %v", lst[i].Id, lst[i].Name, err)
			}
		}
		return lst, nil
	}

	var dss []Datasource
	err := DB(ctx).Find(&dss).Error
	for i := range dss {
		if convErr := dss[i].DB2FE(); convErr != nil {
			logger.Warningf("decode datasource id=%d name=%s fail: %v", dss[i].Id, dss[i].Name, convErr)
		}
	}
	return dss, err
}
// GetDatasourceIdsByEngineName returns the ids of the datasources whose
// cluster_name equals engineName.
//
// Non-center nodes ask the center over HTTP. engineName is query-escaped so
// that names containing characters such as spaces, '&' or '#' are transmitted
// intact instead of corrupting the query string.
func GetDatasourceIdsByEngineName(ctx *ctx.Context, engineName string) ([]int64, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]int64](ctx, "/v1/n9e/datasource-ids?name="+url.QueryEscape(engineName))
	}

	var dss []Datasource
	var ids []int64
	if err := DB(ctx).Where("cluster_name = ?", engineName).Find(&dss).Error; err != nil {
		return ids, err
	}

	for i := range dss {
		ids = append(ids, dss[i].Id)
	}
	return ids, nil
}
// GetDatasourcesCountByName counts datasources; when name is non-empty only
// rows with exactly that name are counted.
func GetDatasourcesCountByName(ctx *ctx.Context, name string) (int64, error) {
	query := DB(ctx).Model(&Datasource{})
	if name == "" {
		return Count(query)
	}
	return Count(query.Where("name = ?", name))
}
// GetDatasourcesCountBy counts the datasources matching the optional filters.
//
// name is split on whitespace and every term must occur somewhere in the
// datasource name. typ and cate, when non-empty, must equal plugin_type and
// category respectively.
func GetDatasourcesCountBy(ctx *ctx.Context, typ, cate, name string) (int64, error) {
	session := DB(ctx).Model(&Datasource{})

	for _, term := range strings.Fields(name) {
		// The argument is a %term% pattern, so it has to be compared with
		// LIKE; with "=" it could only match a name that literally contains
		// the percent signs.
		session = session.Where("name like ?", "%"+term+"%")
	}
	if typ != "" {
		session = session.Where("plugin_type = ?", typ)
	}
	if cate != "" {
		session = session.Where("category = ?", cate)
	}

	return Count(session)
}
// GetDatasourcesGetsBy lists the datasources matching the optional filters,
// newest id first, with their JSON columns decoded.
//
// name is split on whitespace and every term must occur somewhere in the
// datasource name. typ, cate and status, when non-empty, must equal
// plugin_type, category and status respectively.
func GetDatasourcesGetsBy(ctx *ctx.Context, typ, cate, name, status string) ([]*Datasource, error) {
	session := DB(ctx)

	for _, term := range strings.Fields(name) {
		// The argument is a %term% pattern, so it has to be compared with
		// LIKE; with "=" it could only match a name that literally contains
		// the percent signs.
		session = session.Where("name like ?", "%"+term+"%")
	}
	if typ != "" {
		session = session.Where("plugin_type = ?", typ)
	}
	if cate != "" {
		session = session.Where("category = ?", cate)
	}
	if status != "" {
		session = session.Where("status = ?", status)
	}

	var lst []*Datasource
	err := session.Order("id desc").Find(&lst).Error
	if err != nil {
		return lst, err
	}

	for i := range lst {
		// A decode error is ignored here; the row is still returned.
		_ = lst[i].DB2FE()
	}
	return lst, nil
}
// GetDatasourcesGetsByTypes loads the datasources whose plugin_type is in
// types and returns them keyed by name (a later row replaces an earlier one
// with the same name). On query failure the empty map is returned with the
// error.
func GetDatasourcesGetsByTypes(ctx *ctx.Context, types []string) (map[string]*Datasource, error) {
	byName := make(map[string]*Datasource)

	var rows []*Datasource
	if err := DB(ctx).Where("plugin_type in ?", types).Find(&rows).Error; err != nil {
		return byName, err
	}

	for _, row := range rows {
		// A decode error is ignored here; the row is still included.
		_ = row.DB2FE()
		byName[row.Name] = row
	}
	return byName, nil
}
// FE2DB refreshes the stored JSON strings from their decoded counterparts:
// Settings from SettingsJson (only when it is non-nil), HTTP from HTTPJson and
// Auth from AuthJson. It stops at the first encoding error.
func (ds *Datasource) FE2DB() error {
	if ds.SettingsJson != nil {
		settings, err := json.Marshal(ds.SettingsJson)
		if err != nil {
			return err
		}
		ds.Settings = string(settings)
	}

	httpConf, err := json.Marshal(ds.HTTPJson)
	if err != nil {
		return err
	}
	ds.HTTP = string(httpConf)

	authConf, err := json.Marshal(ds.AuthJson)
	if err != nil {
		return err
	}
	ds.Auth = string(authConf)

	return nil
}
// DB2FE decodes the stored JSON strings (Settings, HTTP, Auth) into their
// structured counterparts and fills in HTTP defaults.
//
// Zero Timeout and DialTimeout become 10000, zero MaxIdleConnsPerHost becomes
// 100, and an Elasticsearch datasource without Urls gets Urls = [Url]. The
// defaults are applied before Auth is decoded, so they are in place even when
// the Auth decode fails.
func (ds *Datasource) DB2FE() error {
	if ds.Settings != "" {
		if err := json.Unmarshal([]byte(ds.Settings), &ds.SettingsJson); err != nil {
			return err
		}
	}

	if ds.HTTP != "" {
		if err := json.Unmarshal([]byte(ds.HTTP), &ds.HTTPJson); err != nil {
			return err
		}
	}

	conf := &ds.HTTPJson
	if conf.Timeout == 0 {
		conf.Timeout = 10000
	}
	if conf.DialTimeout == 0 {
		conf.DialTimeout = 10000
	}
	if conf.MaxIdleConnsPerHost == 0 {
		conf.MaxIdleConnsPerHost = 100
	}
	if ds.PluginType == ELASTICSEARCH && len(conf.Urls) == 0 {
		conf.Urls = []string{conf.Url}
	}

	if ds.Auth == "" {
		return nil
	}
	return json.Unmarshal([]byte(ds.Auth), &ds.AuthJson)
}
// Encrypt encrypts the datasource secrets. It is a no-op when openRsa is
// false. Otherwise non-empty Settings and Auth strings are encrypted with
// publicKeyData into SettingsEncoded and AuthEncoded, and the plaintext fields
// are then wiped with ClearPlaintext. An encryption failure is logged and
// returned, leaving the plaintext in place.
func (ds *Datasource) Encrypt(openRsa bool, publicKeyData []byte) error {
	if !openRsa {
		return nil
	}

	if ds.Settings != "" {
		enc, err := secu.EncryptValue(ds.Settings, publicKeyData)
		if err != nil {
			logger.Errorf("encrypt settings failed: datasource=%s err=%v", ds.Name, err)
			return err
		}
		ds.SettingsEncoded = enc
	}

	if ds.Auth != "" {
		enc, err := secu.EncryptValue(ds.Auth, publicKeyData)
		if err != nil {
			logger.Errorf("encrypt basic failed: datasource=%s err=%v", ds.Name, err)
			return err
		}
		ds.AuthEncoded = enc
	}

	ds.ClearPlaintext()
	return nil
}
// Decrypt is used by edge nodes to decrypt a datasource synced from the
// center; the center must not call it.
//
// It does nothing when no RSA configuration is installed or RSA is disabled.
// Otherwise non-empty SettingsEncoded and AuthEncoded are decrypted with the
// configured private key and password, stored in Settings and Auth, and
// decoded into SettingsJson and AuthJson.
func (ds *Datasource) Decrypt() error {
	switch {
	case rsaConfig == nil:
		logger.Debugf("datasource %s rsa config is nil", ds.Name)
		return nil
	case !rsaConfig.OpenRSA:
		return nil
	}

	key, pass := rsaConfig.PrivateKeyBytes, rsaConfig.RSAPassWord

	if ds.SettingsEncoded != "" {
		plain, err := secu.Decrypt(ds.SettingsEncoded, key, pass)
		if err != nil {
			return err
		}
		ds.Settings = plain
		if err := json.Unmarshal([]byte(plain), &ds.SettingsJson); err != nil {
			return err
		}
	}

	if ds.AuthEncoded != "" {
		plain, err := secu.Decrypt(ds.AuthEncoded, key, pass)
		if err != nil {
			return err
		}
		ds.Auth = plain
		if err := json.Unmarshal([]byte(plain), &ds.AuthJson); err != nil {
			return err
		}
	}

	return nil
}
// ClearPlaintext wipes the sensitive plaintext fields: the Settings and Auth
// strings, the decoded settings map, and the basic-auth user and password.
// The BasicAuth flag itself is kept.
func (ds *Datasource) ClearPlaintext() {
	ds.Settings, ds.Auth = "", ""
	ds.SettingsJson = nil
	ds.AuthJson = Auth{BasicAuth: ds.AuthJson.BasicAuth}
}
// DatasourceGetMap returns every datasource keyed by id with its JSON columns
// decoded.
//
// Non-center nodes fetch the list over HTTP, decrypt each entry and refresh
// its stored JSON strings. On the center the rows are read from the database
// and decoded. An entry that fails to decrypt or convert is logged and still
// included in the map.
//
// Failure logs carry only the datasource id and name: the struct holds
// settings and credentials, which must not end up in log files.
func DatasourceGetMap(ctx *ctx.Context) (map[int64]*Datasource, error) {
	var lst []*Datasource

	if !ctx.IsCenter {
		var err error
		lst, err = poster.GetByUrls[[]*Datasource](ctx, "/v1/n9e/datasources")
		if err != nil {
			return nil, err
		}
		for i := range lst {
			if err := lst[i].Decrypt(); err != nil {
				logger.Errorf("decrypt datasource id=%d name=%s fail: %v", lst[i].Id, lst[i].Name, err)
				continue
			}
			if err := lst[i].FE2DB(); err != nil {
				logger.Warningf("encode datasource id=%d name=%s fail: %v", lst[i].Id, lst[i].Name, err)
			}
		}
	} else {
		if err := DB(ctx).Find(&lst).Error; err != nil {
			return nil, err
		}
		for i := range lst {
			if err := lst[i].DB2FE(); err != nil {
				logger.Warningf("get ds id=%d name=%s err:%v", lst[i].Id, lst[i].Name, err)
			}
		}
	}

	ds := make(map[int64]*Datasource, len(lst))
	for i := range lst {
		ds[lst[i].Id] = lst[i]
	}
	return ds, nil
}
// DatasourceStatistics returns the row count and the latest updated_at of the
// datasource table. Non-center nodes fetch the figures over HTTP from the
// statistic endpoint.
func DatasourceStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=datasource")
	}

	var rows []*Statistics
	err := DB(ctx).Model(&Datasource{}).
		Select("count(*) as total", "max(updated_at) as last_updated").
		Find(&rows).Error
	if err != nil {
		return nil, err
	}
	return rows[0], nil
}
================================================
FILE: models/embedded_product.go
================================================
package models
import (
"encoding/json"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
"gorm.io/gorm/clause"
)
// EmbeddedProduct is one row of the embedded_product table: a named URL that
// can optionally be restricted to a set of teams.
type EmbeddedProduct struct {
ID int64 `json:"id" gorm:"primaryKey"` // primary key
Name string `json:"name" gorm:"column:name;type:varchar(255)"`
URL string `json:"url" gorm:"column:url;type:varchar(255)"`
// IsPrivate requires TeamIDs to be non-empty (enforced by Verify).
IsPrivate bool `json:"is_private" gorm:"column:is_private;type:boolean"`
// TeamIDs is stored through gorm's JSON serializer; AfterFind replaces nil with an empty slice.
TeamIDs []int64 `json:"team_ids" gorm:"serializer:json"`
CreateAt int64 `json:"create_at" gorm:"column:create_at;not null;default:0"`
CreateBy string `json:"create_by" gorm:"column:create_by;type:varchar(64);not null;default:''"`
UpdateAt int64 `json:"update_at" gorm:"column:update_at;not null;default:0"`
UpdateBy string `json:"update_by" gorm:"column:update_by;type:varchar(64);not null;default:''"`
// UpdateByNickname is output-only; gorm ignores it.
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// TableName returns the name of the table that stores EmbeddedProduct rows.
func (e *EmbeddedProduct) TableName() string {
	const table = "embedded_product"
	return table
}
// AfterFind is a gorm hook that replaces a nil TeamIDs with an empty slice so
// the field never serialises as JSON null. It always returns nil.
func (e *EmbeddedProduct) AfterFind(tx *gorm.DB) (err error) {
	if e.TeamIDs != nil {
		return nil
	}
	e.TeamIDs = []int64{}
	return nil
}
// Verify checks that the product has a non-empty, non-dangerous name, a
// non-empty URL and, when private, at least one team id. The first failed
// check determines the returned error.
func (e *EmbeddedProduct) Verify() error {
	switch {
	case e.Name == "":
		return errors.New("Name is blank")
	case str.Dangerous(e.Name):
		return errors.New("Name has invalid characters")
	case e.URL == "":
		return errors.New("URL is blank")
	case e.IsPrivate && len(e.TeamIDs) == 0:
		return errors.New("TeamIDs is blank")
	}
	return nil
}
// AddEmbeddedProduct validates every entry, stamps CreateAt and UpdateAt with
// the current time and writes the batch as an upsert: on conflict all columns
// of the existing row are updated. The first invalid entry aborts the call
// before anything is written.
func AddEmbeddedProduct(ctx *ctx.Context, eps []EmbeddedProduct) error {
	stamp := time.Now().Unix()
	for i := range eps {
		ep := &eps[i]
		if err := ep.Verify(); err != nil {
			return errors.Wrapf(err, "invalid entry %v", eps[i])
		}
		ep.CreateAt, ep.UpdateAt = stamp, stamp
	}

	// Upsert: on conflict, update all fields.
	upsert := clause.OnConflict{UpdateAll: true}
	return DB(ctx).Clauses(upsert).Create(&eps).Error
}
// EmbeddedProductGets loads every embedded_product row.
func EmbeddedProductGets(ctx *ctx.Context) ([]*EmbeddedProduct, error) {
	var products []*EmbeddedProduct
	result := DB(ctx).Find(&products)
	return products, result.Error
}
// GetEmbeddedProductByID loads the embedded product with the given id. The
// returned pointer is never nil; on lookup failure it points at a zero value
// and the error is set.
func GetEmbeddedProductByID(ctx *ctx.Context, id int64) (*EmbeddedProduct, error) {
	product := new(EmbeddedProduct)
	result := DB(ctx).Where("id = ?", id).First(product)
	return product, result.Error
}
// UpdateEmbeddedProduct validates ep and saves all of its fields.
func UpdateEmbeddedProduct(ctx *ctx.Context, ep *EmbeddedProduct) error {
	err := ep.Verify()
	if err != nil {
		return err
	}
	return DB(ctx).Save(ep).Error
}
// DeleteEmbeddedProduct deletes the embedded product with the given id.
func DeleteEmbeddedProduct(ctx *ctx.Context, id int64) error {
	result := DB(ctx).Where("id = ?", id).Delete(&EmbeddedProduct{})
	return result.Error
}
// CanMigrateEP reports whether the embedded_product table is empty. A failed
// count is logged and reported as false.
func CanMigrateEP(ctx *ctx.Context) bool {
	var total int64
	if err := DB(ctx).Model(&EmbeddedProduct{}).Count(&total).Error; err != nil {
		logger.Errorf("failed to get embedded-product table count, err:%v", err)
		return false
	}
	return total <= 0
}
// MigrateEP copies the legacy "embedded-dashboards" built-in config entry into
// the embedded_product table.
//
// The config value is a JSON array of DashboardConfig. Each element becomes a
// public EmbeddedProduct created by "system". Nothing is written when the
// config entry is missing or holds no elements. Every failure (reading the
// config, decoding it, inserting the rows) is logged; the function has no
// return value, so logging is the only way a failed migration becomes visible.
func MigrateEP(ctx *ctx.Context) {
	var lst []string
	err := DB(ctx).Model(&Configs{}).Where("ckey=? and external=? ", "embedded-dashboards", 0).Pluck("cval", &lst).Error
	if err != nil {
		logger.Errorf("failed to load embedded-dashboards config, err:%v", err)
		return
	}
	if len(lst) == 0 {
		return
	}

	var oldData []DashboardConfig
	if err := json.Unmarshal([]byte(lst[0]), &oldData); err != nil {
		logger.Errorf("failed to unmarshal embedded-dashboards config, err:%v", err)
		return
	}
	if len(oldData) < 1 {
		return
	}

	now := time.Now().Unix()
	newData := make([]EmbeddedProduct, 0, len(oldData))
	for _, v := range oldData {
		newData = append(newData, EmbeddedProduct{
			Name:      v.Name,
			URL:       v.URL,
			IsPrivate: false,
			TeamIDs:   []int64{},
			CreateBy:  "system",
			CreateAt:  now,
			UpdateAt:  now,
			UpdateBy:  "system",
		})
	}

	if err := DB(ctx).Create(&newData).Error; err != nil {
		logger.Errorf("failed to create embedded-product, err:%v", err)
	}
}
// DashboardConfig is one element of the legacy "embedded-dashboards" config
// value that MigrateEP decodes; only Name and URL are carried over.
type DashboardConfig struct {
ID string `json:"id"`
Name string `json:"name"`
URL string `json:"url"`
}
================================================
FILE: models/es_index_pattern.go
================================================
package models
import (
"encoding/json"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/pkg/errors"
)
// EsIndexPattern is one row of the es_index_pattern table.
type EsIndexPattern struct {
Id int64 `json:"id" gorm:"primaryKey"`
DatasourceId int64 `json:"datasource_id"`
Name string `json:"name"`
TimeField string `json:"time_field"`
// AllowHideSystemIndices is the stored integer flag (1 means enabled); hidden from JSON.
AllowHideSystemIndices int `json:"-" gorm:"allow_hide_system_indices"`
// AllowHideSystemIndicesBool is the JSON-facing form of the flag; gorm
// ignores it. See DB2FE / FE2DB for the conversion.
AllowHideSystemIndicesBool bool `json:"allow_hide_system_indices" gorm:"-"`
FieldsFormat string `json:"fields_format"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
// UpdateByNickname is output-only; gorm ignores it.
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
CrossClusterEnabled int `json:"cross_cluster_enabled"`
Note string `json:"note"`
}
// TableName returns the name of the table that stores EsIndexPattern rows.
func (t *EsIndexPattern) TableName() string {
	const table = "es_index_pattern"
	return table
}
// Add inserts the index pattern after making sure no pattern with the same
// datasource and name exists. The boolean flag is converted to its stored form
// before the insert.
func (r *EsIndexPattern) Add(ctx *ctx.Context) error {
	existing, err := EsIndexPatternGet(ctx, "datasource_id = ? and name = ?", r.DatasourceId, r.Name)
	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to query es index pattern")
	case existing != nil:
		return errors.New("es index pattern datasource and name already exists")
	}

	r.FE2DB()
	return Insert(ctx, r)
}
// EsIndexPatternDel deletes the index patterns with the given ids. An empty id
// list is a no-op. Nothing is deleted when any of the patterns is still
// referenced by an alert rule; the error names the first such pattern and the
// rules that use it.
func EsIndexPatternDel(ctx *ctx.Context, ids []int64) error {
	if len(ids) == 0 {
		return nil
	}

	// Refuse to delete index patterns that alert rules still reference.
	for _, id := range ids {
		rules, err := GetAlertRulesByEsIndexPatternId(ctx, id)
		if err != nil {
			return errors.WithMessage(err, "failed to check alert rules")
		}
		if len(rules) == 0 {
			continue
		}

		names := make([]string, len(rules))
		for i, rule := range rules {
			names[i] = rule.Name
		}
		return errors.Errorf("index pattern(id=%d) is used by alert rules: %s", id, strings.Join(names, ", "))
	}

	return DB(ctx).Where("id in ?", ids).Delete(new(EsIndexPattern)).Error
}
// GetAlertRulesByEsIndexPatternId returns the alert rules that reference the
// given index pattern.
//
// The reference lives inside the rule_config JSON column, in the queries
// array, e.g. {"queries":[{"index_type":"index_pattern","index_pattern":123,...}]}.
// A LIKE query narrows the candidates first; each candidate is then parsed and
// checked for an exact match, because the LIKE pattern for 123 also matches
// 1234.
func GetAlertRulesByEsIndexPatternId(ctx *ctx.Context, indexPatternId int64) ([]*AlertRule, error) {
	likeArg := fmt.Sprintf(`%%"index_pattern":%d%%`, indexPatternId)

	var candidates []*AlertRule
	if err := DB(ctx).Where("rule_config LIKE ?", likeArg).Find(&candidates).Error; err != nil {
		return nil, err
	}

	// Exact filter: keep only rules whose parsed config really uses the id.
	var matched []*AlertRule
	for _, candidate := range candidates {
		if !ruleUsesIndexPattern(candidate.RuleConfig, indexPatternId) {
			continue
		}
		matched = append(matched, candidate)
	}
	return matched, nil
}
// ruleUsesIndexPattern reports whether the rule_config JSON document contains
// a queries entry whose index_pattern equals indexPatternId. A document that
// cannot be decoded is treated as not using the pattern.
func ruleUsesIndexPattern(ruleConfig string, indexPatternId int64) bool {
	type queryRef struct {
		IndexPattern int64 `json:"index_pattern"`
	}
	var parsed struct {
		Queries []queryRef `json:"queries"`
	}

	if json.Unmarshal([]byte(ruleConfig), &parsed) != nil {
		return false
	}

	for i := range parsed.Queries {
		if parsed.Queries[i].IndexPattern == indexPatternId {
			return true
		}
	}
	return false
}
// Update overwrites the stored row of ei with the values in eip.
// When the name or datasource changes, it first verifies that no other row
// already uses the new (datasource_id, name) pair. The id and creation
// metadata are carried over from ei and UpdateAt is set to the current time.
func (ei *EsIndexPattern) Update(ctx *ctx.Context, eip EsIndexPattern) error {
	keyChanged := ei.Name != eip.Name || ei.DatasourceId != eip.DatasourceId
	if keyChanged {
		duplicate, err := EsIndexPatternExists(ctx, ei.Id, eip.DatasourceId, eip.Name)
		if err != nil {
			return err
		}
		if duplicate {
			return errors.New("EsIndexPattern already exists")
		}
	}

	eip.Id, eip.CreateAt, eip.CreateBy = ei.Id, ei.CreateAt, ei.CreateBy
	eip.UpdateAt = time.Now().Unix()
	eip.FE2DB()

	// Select("*") makes Updates write every column, including zero values.
	return DB(ctx).Model(ei).Select("*").Updates(eip).Error
}
// DB2FE derives the boolean AllowHideSystemIndicesBool from the integer
// AllowHideSystemIndices (1 -> true, anything else -> false).
// The boolean is always overwritten so a stale true value cannot remain when
// the integer is not 1.
func (dbIndexPattern *EsIndexPattern) DB2FE() {
	dbIndexPattern.AllowHideSystemIndicesBool = dbIndexPattern.AllowHideSystemIndices == 1
}
// FE2DB derives the integer AllowHideSystemIndices from the boolean
// AllowHideSystemIndicesBool (true -> 1, false -> 0).
// The integer is always overwritten so a previously set 1 cannot survive
// after the boolean has been switched off.
func (feIndexPattern *EsIndexPattern) FE2DB() {
	feIndexPattern.AllowHideSystemIndices = 0
	if feIndexPattern.AllowHideSystemIndicesBool {
		feIndexPattern.AllowHideSystemIndices = 1
	}
}
// EsIndexPatternGets returns the index patterns matching the where clause.
// On a non-center node the list is fetched from the
// /v1/n9e/es-index-pattern-list endpoint instead and where/args are not used.
// Rows read from the database get their frontend fields filled via DB2FE.
func EsIndexPatternGets(ctx *ctx.Context, where string, args ...interface{}) ([]*EsIndexPattern, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*EsIndexPattern](ctx, "/v1/n9e/es-index-pattern-list")
	}

	var patterns []*EsIndexPattern
	if err := DB(ctx).Where(where, args...).Find(&patterns).Error; err != nil {
		return nil, errors.WithMessage(err, "failed to query es index pattern")
	}

	for idx := range patterns {
		patterns[idx].DB2FE()
	}
	return patterns, nil
}
// EsIndexPatternGet returns the first index pattern matching the where
// clause, or (nil, nil) when no row matches.
func EsIndexPatternGet(ctx *ctx.Context, where string, args ...interface{}) (*EsIndexPattern, error) {
	var found []*EsIndexPattern
	if err := DB(ctx).Where(where, args...).Find(&found).Error; err != nil {
		return nil, err
	}

	if len(found) > 0 {
		first := found[0]
		first.DB2FE()
		return first, nil
	}
	return nil, nil
}
// EsIndexPatternGetById looks up a single index pattern by primary key via
// EsIndexPatternGet.
func EsIndexPatternGetById(ctx *ctx.Context, id int64) (*EsIndexPattern, error) {
	const byID = "id=?"
	return EsIndexPatternGet(ctx, byID, id)
}
// EsIndexPatternExists reports whether a row other than the one with the
// given id already uses the (datasourceId, name) pair.
func EsIndexPatternExists(ctx *ctx.Context, id, datasourceId int64, name string) (bool, error) {
	var others []EsIndexPattern
	query := DB(ctx).Where("id <> ? and datasource_id = ? and name = ?", id, datasourceId, name)
	if err := query.Find(&others).Error; err != nil {
		return false, err
	}
	return len(others) != 0, nil
}
================================================
FILE: models/event_pipeline.go
================================================
package models
import (
"errors"
"fmt"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
)
// EventPipeline is the event pipeline model; TableName maps it to the
// event_pipeline table. Slice and map fields are persisted as JSON text via
// the gorm json serializer, and fields tagged gorm:"-" are not persisted.
type EventPipeline struct {
	ID int64 `json:"id" gorm:"primaryKey"`
	Name string `json:"name" gorm:"type:varchar(128)"`
	Typ string `json:"typ" gorm:"type:varchar(128)"` // builtin, user-defined // event_pipeline, event_summary, metric_explorer
	UseCase string `json:"use_case" gorm:"type:varchar(128)"` // metric_explorer, event_summary, event_pipeline
	TriggerMode string `json:"trigger_mode" gorm:"type:varchar(128)"` // event, api, cron
	Disabled bool `json:"disabled" gorm:"type:boolean"`
	TeamIds []int64 `json:"team_ids" gorm:"type:text;serializer:json"`
	TeamNames []string `json:"team_names" gorm:"-"`
	Description string `json:"description" gorm:"type:varchar(255)"`
	FilterEnable bool `json:"filter_enable" gorm:"type:boolean"`
	LabelFilters []TagFilter `json:"label_filters" gorm:"type:text;serializer:json"`
	AttrFilters []TagFilter `json:"attribute_filters" gorm:"type:text;serializer:json"`
	ProcessorConfigs []ProcessorConfig `json:"processors" gorm:"type:text;serializer:json"`
	// Workflow node list.
	Nodes []WorkflowNode `json:"nodes,omitempty" gorm:"type:text;serializer:json"`
	// Connections between nodes.
	Connections Connections `json:"connections,omitempty" gorm:"type:text;serializer:json"`
	// Input parameters (workflow-level configuration variables).
	Inputs []InputVariable `json:"inputs,omitempty" gorm:"type:text;serializer:json"`
	CreateAt int64 `json:"create_at" gorm:"type:bigint"`
	CreateBy string `json:"create_by" gorm:"type:varchar(64)"`
	UpdateAt int64 `json:"update_at" gorm:"type:bigint"`
	UpdateBy string `json:"update_by" gorm:"type:varchar(64)"`
	UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// ProcessorConfig is one entry of EventPipeline.ProcessorConfigs: a processor
// type name plus its untyped configuration payload.
type ProcessorConfig struct {
	Typ string `json:"typ"`
	Config interface{} `json:"config"`
}
// TableName reports the database table name used for EventPipeline rows.
func (e *EventPipeline) TableName() string {
	const table = "event_pipeline"
	return table
}
// Verify validates the required fields and normalizes empty collections.
// It returns an error when Name or TeamIds is empty. Otherwise it replaces
// empty filter/processor slices and nil workflow fields with empty,
// non-nil values so they do not serialize as null.
func (e *EventPipeline) Verify() error {
	switch {
	case e.Name == "":
		return errors.New("name cannot be empty")
	case len(e.TeamIds) == 0:
		return errors.New("team_ids cannot be empty")
	}

	if len(e.LabelFilters) == 0 {
		e.LabelFilters = []TagFilter{}
	}
	if len(e.AttrFilters) == 0 {
		e.AttrFilters = []TagFilter{}
	}
	if len(e.ProcessorConfigs) == 0 {
		e.ProcessorConfigs = []ProcessorConfig{}
	}

	// Initialize nil workflow fields to empty values to avoid null.
	if e.Nodes == nil {
		e.Nodes = []WorkflowNode{}
	}
	if e.Connections == nil {
		e.Connections = Connections{}
	}
	if e.Inputs == nil {
		e.Inputs = []InputVariable{}
	}

	return nil
}
// CreateEventPipeline inserts the given event pipeline into the database.
func CreateEventPipeline(ctx *ctx.Context, pipeline *EventPipeline) error {
	result := DB(ctx).Create(pipeline)
	return result.Error
}
// GetEventPipeline loads a single event pipeline by id.
// The query error (including a not-found error from First) is returned as-is.
func GetEventPipeline(ctx *ctx.Context, id int64) (*EventPipeline, error) {
	pipeline := new(EventPipeline)
	if err := DB(ctx).Where("id = ?", id).First(pipeline).Error; err != nil {
		return nil, err
	}

	// Verify is called for its normalization side effects; its validation
	// result is intentionally not propagated here.
	pipeline.Verify()
	return pipeline, nil
}
// GetEventPipelinesByIds loads the event pipelines whose id is in ids.
// The slice and the query error are returned together, unfiltered.
func GetEventPipelinesByIds(ctx *ctx.Context, ids []int64) ([]*EventPipeline, error) {
	var found []*EventPipeline
	result := DB(ctx).Where("id in ?", ids).Find(&found)
	return found, result.Error
}
// UpdateEventPipeline persists the given event pipeline using Save.
func UpdateEventPipeline(ctx *ctx.Context, pipeline *EventPipeline) error {
	result := DB(ctx).Save(pipeline)
	return result.Error
}
// DeleteEventPipeline deletes the event pipeline with the given primary key.
func DeleteEventPipeline(ctx *ctx.Context, id int64) error {
	result := DB(ctx).Delete(&EventPipeline{}, id)
	return result.Error
}
// ListEventPipelines returns all event pipelines.
// On a non-center node the list is fetched from /v1/n9e/event-pipelines.
// On the center node rows are read ordered by name and each one is passed
// through Verify for normalization.
func ListEventPipelines(ctx *ctx.Context) ([]*EventPipeline, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*EventPipeline](ctx, "/v1/n9e/event-pipelines")
	}

	var all []*EventPipeline
	if err := DB(ctx).Order("name asc").Find(&all).Error; err != nil {
		return nil, err
	}

	for idx := range all {
		// Validation result is not propagated; only normalization matters here.
		all[idx].Verify()
	}
	return all, nil
}
// DeleteEventPipelines deletes every event pipeline whose id is in ids.
func DeleteEventPipelines(ctx *ctx.Context, ids []int64) error {
	result := DB(ctx).Where("id in ?", ids).Delete(&EventPipeline{})
	return result.Error
}
// Update overwrites the stored row of e with the values in ref.
// The id and creation metadata are copied from e into ref, UpdateAt is set
// to the current time, and ref must pass Verify before anything is written.
func (e *EventPipeline) Update(ctx *ctx.Context, ref *EventPipeline) error {
	ref.ID, ref.CreateAt, ref.CreateBy = e.ID, e.CreateAt, e.CreateBy
	ref.UpdateAt = time.Now().Unix()

	if err := ref.Verify(); err != nil {
		return err
	}

	// Select("*") makes Updates write every column, including zero values.
	return DB(ctx).Model(e).Select("*").Updates(*ref).Error
}
// FillTeamNames populates TeamNames from TeamIds.
// TeamNames is always reset to a non-nil slice. Names follow the order of
// TeamIds; ids missing from the lookup map are skipped.
func (e *EventPipeline) FillTeamNames(ctx *ctx.Context) error {
	e.TeamNames = make([]string, 0, len(e.TeamIds))
	if len(e.TeamIds) == 0 {
		return nil
	}

	nameByID, err := UserGroupIdAndNameMap(ctx, e.TeamIds)
	if err != nil {
		return err
	}

	// Fill names in the original TeamIds order.
	for i := range e.TeamIds {
		name, ok := nameByID[e.TeamIds[i]]
		if !ok {
			continue
		}
		e.TeamNames = append(e.TeamNames, name)
	}
	return nil
}
// EventPipelineStatistics returns the row count and the latest update_at of
// the event pipeline table. On a non-center node the value is fetched from
// the /v1/n9e/statistic endpoint instead.
func EventPipelineStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=event_pipeline")
	}

	var rows []*Statistics
	query := DB(ctx).Model(&EventPipeline{}).Select("count(*) as total", "max(update_at) as last_updated")
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}

	if len(rows) > 0 {
		return rows[0], nil
	}
	return nil, fmt.Errorf("no event pipeline found")
}
// GetWorkflowNodes returns the pipeline's nodes as []WorkflowNode regardless
// of whether the pipeline uses the new or the legacy format.
// Non-empty Nodes are returned as-is; otherwise each ProcessorConfigs entry
// is converted into a node named after its type with the id "node_<index>".
func (e *EventPipeline) GetWorkflowNodes() []WorkflowNode {
	// The new format takes precedence.
	if len(e.Nodes) > 0 {
		return e.Nodes
	}

	// Legacy format: convert ProcessorConfigs into WorkflowNode values.
	converted := make([]WorkflowNode, 0, len(e.ProcessorConfigs))
	for idx := range e.ProcessorConfigs {
		cfg := e.ProcessorConfigs[idx]
		converted = append(converted, WorkflowNode{
			ID:     fmt.Sprintf("node_%d", idx),
			Name:   cfg.Typ,
			Type:   cfg.Typ,
			Config: cfg.Config,
		})
	}
	return converted
}
// GetWorkflowConnections returns the connections between workflow nodes.
// Explicitly defined, non-empty Connections are returned as-is; otherwise a
// linear chain (node_0 -> node_1 -> node_2 -> ...) is generated from
// GetWorkflowNodes.
func (e *EventPipeline) GetWorkflowConnections() Connections {
	// Explicit connections take precedence.
	if len(e.Connections) > 0 {
		return e.Connections
	}

	// Link every node to its successor.
	nodes := e.GetWorkflowNodes()
	chain := make(Connections)
	for next := 1; next < len(nodes); next++ {
		target := ConnectionTarget{Node: nodes[next].ID, Type: "main", Index: 0}
		chain[nodes[next-1].ID] = NodeConnections{
			Main: [][]ConnectionTarget{{target}},
		}
	}
	return chain
}
// FillWorkflowFields fills Nodes and Connections from the legacy
// ProcessorConfigs when the pipeline has processors but no nodes yet.
// It does nothing otherwise.
func (e *EventPipeline) FillWorkflowFields() {
	if len(e.Nodes) != 0 || len(e.ProcessorConfigs) == 0 {
		return
	}
	e.Nodes = e.GetWorkflowNodes()
	e.Connections = e.GetWorkflowConnections()
}
// GetInputsMap flattens Inputs into a key -> value map. When a key occurs
// more than once the last occurrence wins. The result is never nil.
func (e *EventPipeline) GetInputsMap() map[string]string {
	byKey := map[string]string{}
	for i := range e.Inputs {
		input := e.Inputs[i]
		byKey[input.Key] = input.Value
	}
	return byKey
}
================================================
FILE: models/event_pipeline_execution.go
================================================
package models
import (
"encoding/json"
"errors"
"fmt"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"gorm.io/gorm"
)
// Execution status constants, used as values of EventPipelineExecution.Status.
const (
	ExecutionStatusRunning = "running"
	ExecutionStatusSuccess = "success"
	ExecutionStatusFailed = "failed"
)
// EventPipelineExecution is a workflow execution record; TableName maps it
// to the event_pipeline_execution table.
type EventPipelineExecution struct {
	ID string `json:"id" gorm:"primaryKey;type:varchar(36)"`
	PipelineID int64 `json:"pipeline_id" gorm:"index"`
	PipelineName string `json:"pipeline_name" gorm:"type:varchar(128)"`
	EventID int64 `json:"event_id" gorm:"index"`
	// Trigger mode: event (triggered by an alert), api (triggered via API), cron (scheduled).
	Mode string `json:"mode" gorm:"type:varchar(16);index"`
	// Status: running, success, failed.
	Status string `json:"status" gorm:"type:varchar(16);index"`
	// Execution results of each node (JSON); see SetNodeResults / GetNodeResults.
	NodeResults string `json:"node_results" gorm:"type:mediumtext"`
	// Error information.
	ErrorMessage string `json:"error_message" gorm:"type:varchar(1024)"`
	ErrorNode string `json:"error_node" gorm:"type:varchar(36)"`
	// Timing.
	CreatedAt int64 `json:"created_at" gorm:"index"`
	FinishedAt int64 `json:"finished_at"`
	DurationMs int64 `json:"duration_ms"`
	// Information about who triggered the execution.
	TriggerBy string `json:"trigger_by" gorm:"type:varchar(64)"`
	// Snapshot of the input parameters (stored after desensitization).
	InputsSnapshot string `json:"inputs_snapshot,omitempty" gorm:"type:text"`
}
// TableName reports the database table name used for execution records.
func (e *EventPipelineExecution) TableName() string {
	const table = "event_pipeline_execution"
	return table
}
// SetNodeResults serializes results to JSON and stores the text in
// NodeResults. NodeResults is left untouched when marshaling fails.
func (e *EventPipelineExecution) SetNodeResults(results []*NodeExecutionResult) error {
	encoded, err := json.Marshal(results)
	if err == nil {
		e.NodeResults = string(encoded)
	}
	return err
}
// GetNodeResults deserializes NodeResults from JSON.
// An empty NodeResults yields (nil, nil). The decoded slice and the decode
// error are returned together.
func (e *EventPipelineExecution) GetNodeResults() ([]*NodeExecutionResult, error) {
	raw := e.NodeResults
	if len(raw) == 0 {
		return nil, nil
	}

	var decoded []*NodeExecutionResult
	decodeErr := json.Unmarshal([]byte(raw), &decoded)
	return decoded, decodeErr
}
// SetInputsSnapshot serializes inputs to JSON and stores the text in
// InputsSnapshot. This method stores the map exactly as given; any
// desensitization has to be done by the caller beforehand.
func (e *EventPipelineExecution) SetInputsSnapshot(inputs map[string]string) error {
	encoded, err := json.Marshal(inputs)
	if err == nil {
		e.InputsSnapshot = string(encoded)
	}
	return err
}
// GetInputsSnapshot deserializes InputsSnapshot from JSON.
// An empty InputsSnapshot yields (nil, nil). The decoded map and the decode
// error are returned together.
func (e *EventPipelineExecution) GetInputsSnapshot() (map[string]string, error) {
	raw := e.InputsSnapshot
	if len(raw) == 0 {
		return nil, nil
	}

	var decoded map[string]string
	decodeErr := json.Unmarshal([]byte(raw), &decoded)
	return decoded, decodeErr
}
// CreateEventPipelineExecution stores a new execution record.
// The center node inserts it into the database; any other node posts it to
// the /v1/n9e/event-pipeline-execution endpoint.
func CreateEventPipelineExecution(c *ctx.Context, execution *EventPipelineExecution) error {
	if c.IsCenter {
		return DB(c).Create(execution).Error
	}
	return poster.PostByUrls(c, "/v1/n9e/event-pipeline-execution", execution)
}
// UpdateEventPipelineExecution persists the given execution record using Save.
func UpdateEventPipelineExecution(c *ctx.Context, execution *EventPipelineExecution) error {
	result := DB(c).Save(execution)
	return result.Error
}
// GetEventPipelineExecution loads a single execution record by id.
// A missing record is not an error: it yields (nil, nil).
func GetEventPipelineExecution(c *ctx.Context, id string) (*EventPipelineExecution, error) {
	var record EventPipelineExecution
	err := DB(c).Where("id = ?", id).First(&record).Error

	switch {
	case err == nil:
		return &record, nil
	case errors.Is(err, gorm.ErrRecordNotFound):
		return nil, nil
	default:
		return nil, err
	}
}
// ListEventPipelineExecutions returns one page of execution records of a
// pipeline, newest first, together with the total number of matching rows.
// Empty mode or status values disable the corresponding filter.
func ListEventPipelineExecutions(c *ctx.Context, pipelineID int64, mode, status string, limit, offset int) ([]*EventPipelineExecution, int64, error) {
	query := DB(c).Model(&EventPipelineExecution{}).Where("pipeline_id = ?", pipelineID)

	// Optional equality filters, applied in the order mode, status.
	optional := [...]struct{ cond, value string }{
		{"mode = ?", mode},
		{"status = ?", status},
	}
	for _, f := range optional {
		if f.value != "" {
			query = query.Where(f.cond, f.value)
		}
	}

	var total int64
	if err := query.Count(&total).Error; err != nil {
		return nil, 0, err
	}

	var page []*EventPipelineExecution
	if err := query.Order("created_at desc").Limit(limit).Offset(offset).Find(&page).Error; err != nil {
		return nil, 0, err
	}
	return page, total, nil
}
// ListEventPipelineExecutionsByEventID returns all execution records of the
// given event, newest first. The slice and the query error are returned
// together.
func ListEventPipelineExecutionsByEventID(c *ctx.Context, eventID int64) ([]*EventPipelineExecution, error) {
	var records []*EventPipelineExecution
	result := DB(c).Where("event_id = ?", eventID).Order("created_at desc").Find(&records)
	return records, result.Error
}
// ListAllEventPipelineExecutions returns one page of execution records
// across all pipelines, newest first, together with the total number of
// matching rows. Each filter is optional: pipelineId applies only when
// positive, pipelineName is a substring match, and empty mode/status values
// are ignored.
func ListAllEventPipelineExecutions(c *ctx.Context, pipelineId int64, pipelineName, mode, status string, limit, offset int) ([]*EventPipelineExecution, int64, error) {
	query := DB(c).Model(&EventPipelineExecution{})

	if pipelineId > 0 {
		query = query.Where("pipeline_id = ?", pipelineId)
	}
	if pipelineName != "" {
		query = query.Where("pipeline_name LIKE ?", "%"+pipelineName+"%")
	}

	// Optional equality filters, applied in the order mode, status.
	optional := [...]struct{ cond, value string }{
		{"mode = ?", mode},
		{"status = ?", status},
	}
	for _, f := range optional {
		if f.value != "" {
			query = query.Where(f.cond, f.value)
		}
	}

	var total int64
	if err := query.Count(&total).Error; err != nil {
		return nil, 0, err
	}

	var page []*EventPipelineExecution
	if err := query.Order("created_at desc").Limit(limit).Offset(offset).Find(&page).Error; err != nil {
		return nil, 0, err
	}
	return page, total, nil
}
// DeleteEventPipelineExecutions deletes every execution record created
// before beforeTime and returns the number of affected rows.
func DeleteEventPipelineExecutions(c *ctx.Context, beforeTime int64) (int64, error) {
	deleted := DB(c).Where("created_at < ?", beforeTime).Delete(&EventPipelineExecution{})
	return deleted.RowsAffected, deleted.Error
}
// DeleteEventPipelineExecutionsInBatches deletes at most limit execution
// records created before beforeTime and returns how many rows were removed
// by this call.
//
// It works in two steps: select up to limit ids, then delete by id.
func DeleteEventPipelineExecutionsInBatches(c *ctx.Context, beforeTime int64, limit int) (int64, error) {
	// Step 1: collect the ids to delete.
	var batch []string
	selectErr := DB(c).Model(&EventPipelineExecution{}).
		Where("created_at < ?", beforeTime).
		Limit(limit).
		Pluck("id", &batch).Error
	if selectErr != nil {
		return 0, selectErr
	}
	if len(batch) == 0 {
		return 0, nil
	}

	// Step 2: delete exactly those ids.
	deleted := DB(c).Where("id IN ?", batch).Delete(&EventPipelineExecution{})
	return deleted.RowsAffected, deleted.Error
}
// DeleteEventPipelineExecutionsByPipelineID deletes all execution records
// that belong to the given pipeline.
func DeleteEventPipelineExecutionsByPipelineID(c *ctx.Context, pipelineID int64) error {
	deleted := DB(c).Where("pipeline_id = ?", pipelineID).Delete(&EventPipelineExecution{})
	return deleted.Error
}
// EventPipelineExecutionStatistics holds execution statistics of a pipeline,
// as filled in by GetEventPipelineExecutionStatistics: row counts in total
// and per status, the average duration of successful executions in
// milliseconds, and the creation time of the most recent execution.
type EventPipelineExecutionStatistics struct {
	Total int64 `json:"total"`
	Success int64 `json:"success"`
	Failed int64 `json:"failed"`
	Running int64 `json:"running"`
	AvgDurMs int64 `json:"avg_duration_ms"`
	LastRunAt int64 `json:"last_run_at"`
}
// GetEventPipelineExecutionStatistics collects execution statistics for one
// pipeline: total count, per-status counts, the average duration of
// successful executions, and the creation time of the latest execution.
// Any error from the count or average queries aborts the call; a failure to
// load the latest execution only leaves LastRunAt at zero.
func GetEventPipelineExecutionStatistics(c *ctx.Context, pipelineID int64) (*EventPipelineExecutionStatistics, error) {
	stats := new(EventPipelineExecutionStatistics)

	// Total number of executions.
	if err := DB(c).Model(&EventPipelineExecution{}).Where("pipeline_id = ?", pipelineID).Count(&stats.Total).Error; err != nil {
		return nil, err
	}

	// Per-status counts, queried in the order success, failed, running.
	perStatus := []struct {
		status string
		dest   *int64
	}{
		{ExecutionStatusSuccess, &stats.Success},
		{ExecutionStatusFailed, &stats.Failed},
		{ExecutionStatusRunning, &stats.Running},
	}
	for _, ps := range perStatus {
		err := DB(c).Model(&EventPipelineExecution{}).Where("pipeline_id = ? AND status = ?", pipelineID, ps.status).Count(ps.dest).Error
		if err != nil {
			return nil, err
		}
	}

	// Average duration, computed over successful executions only.
	var avgRow struct {
		AvgDur float64 `gorm:"column:avg_dur"`
	}
	avgErr := DB(c).Model(&EventPipelineExecution{}).
		Select("AVG(duration_ms) as avg_dur").
		Where("pipeline_id = ? AND status = ?", pipelineID, ExecutionStatusSuccess).
		Scan(&avgRow).Error
	if avgErr != nil {
		return nil, avgErr
	}
	stats.AvgDurMs = int64(avgRow.AvgDur)

	// Time of the most recent execution; lookup errors are ignored.
	var latest EventPipelineExecution
	if DB(c).Where("pipeline_id = ?", pipelineID).Order("created_at desc").First(&latest).Error == nil {
		stats.LastRunAt = latest.CreatedAt
	}

	return stats, nil
}
// EventPipelineExecutionDetail is an execution record together with the
// parsed forms of its JSON text fields (node results and inputs snapshot).
type EventPipelineExecutionDetail struct {
	EventPipelineExecution
	NodeResultsParsed []*NodeExecutionResult `json:"node_results_parsed"`
	InputsSnapshotParsed map[string]string `json:"inputs_snapshot_parsed"`
}
// GetEventPipelineExecutionDetail loads an execution record by id and parses
// its node results and inputs snapshot.
// When the record does not exist, an empty detail value is returned with a
// nil error. A parse failure of either JSON field is returned as an error.
func GetEventPipelineExecutionDetail(c *ctx.Context, id string) (*EventPipelineExecutionDetail, error) {
	record, err := GetEventPipelineExecution(c, id)
	switch {
	case err != nil:
		return nil, err
	case record == nil:
		return &EventPipelineExecutionDetail{}, nil
	}

	// Parse the node results.
	parsedResults, err := record.GetNodeResults()
	if err != nil {
		return nil, fmt.Errorf("parse node results error: %w", err)
	}

	// Parse the inputs snapshot.
	parsedInputs, err := record.GetInputsSnapshot()
	if err != nil {
		return nil, fmt.Errorf("parse inputs snapshot error: %w", err)
	}

	return &EventPipelineExecutionDetail{
		EventPipelineExecution: *record,
		NodeResultsParsed:      parsedResults,
		InputsSnapshotParsed:   parsedInputs,
	}, nil
}
================================================
FILE: models/event_processor.go
================================================
package models
import (
"fmt"
"strings"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// Processor is the interface implemented by event processors.
type Processor interface {
	Init(settings interface{}) (Processor, error) // initialize from configuration
	Process(ctx *ctx.Context, wfCtx *WorkflowContext) (*WorkflowContext, string, error)
	// A processor has three possible outcomes:
	// 1. Success: return the processed WorkflowContext.
	// 2. Success without a processed context to return: only the processing
	//    result is returned, placed in the string (e.g. the eventdrop and
	//    callback processors).
	// 3. Failure: return the error in the error value.
	// WorkflowContext contains: Event (the event), Env (environment variables /
	// input parameters), Metadata (execution metadata).
}
// BranchProcessor is the branch processor interface.
// It is meant for processors such as if, switch and foreach that need to
// return a branch index or other special output.
type BranchProcessor interface {
	Processor
	// ProcessWithBranch processes the event and returns a NodeOutput.
	// NodeOutput contains: the processed context, a message, whether to
	// terminate, and the branch index.
	ProcessWithBranch(ctx *ctx.Context, wfCtx *WorkflowContext) (*NodeOutput, error)
}
// NewProcessorFn builds a Processor from untyped settings.
type NewProcessorFn func(settings interface{}) (Processor, error)
// processorRegister maps a processor type name to its constructor. It is
// written by RegisterProcessor and read by GetProcessorByType; it is a plain
// map with no locking.
var processorRegister = map[string]NewProcessorFn{}
// RegisterProcessor registers p's Init method as the constructor for typ.
// The first registration wins: if typ is already registered the call is a
// no-op.
func RegisterProcessor(typ string, p Processor) {
	if _, exists := processorRegister[typ]; !exists {
		processorRegister[typ] = p.Init
	}
}
// GetProcessorByType builds a processor of the given type from settings.
// typ is trimmed of surrounding whitespace before the lookup. An unknown
// type or a constructor failure yields a nil processor and an error.
func GetProcessorByType(typ string, settings interface{}) (Processor, error) {
	typ = strings.TrimSpace(typ)

	newFn, registered := processorRegister[typ]
	if !registered {
		return nil, fmt.Errorf("processor type %s not found", typ)
	}

	built, err := newFn(settings)
	if err != nil {
		// Drop whatever the constructor returned alongside the error.
		return nil, err
	}
	return built, nil
}
================================================
FILE: models/host_meta.go
================================================
package models
import "encoding/json"
// HostMeta carries metadata about a host (agent version, OS, CPU/memory
// utilization, addresses, labels and so on). It serializes to JSON and, via
// MarshalBinary/UnmarshalBinary, uses that same JSON as its binary form.
type HostMeta struct {
	AgentVersion string `json:"agent_version"`
	OS string `json:"os"`
	Arch string `json:"arch"`
	Hostname string `json:"hostname"`
	CpuNum int `json:"cpu_num"`
	CpuUtil float64 `json:"cpu_util"`
	MemUtil float64 `json:"mem_util"`
	Offset int64 `json:"offset"`
	UnixTime int64 `json:"unixtime"`
	RemoteAddr string `json:"remote_addr"`
	HostIp string `json:"host_ip"`
	EngineName string `json:"engine_name"`
	GlobalLabels map[string]string `json:"global_labels"`
	ExtendInfo map[string]interface{} `json:"extend_info"`
	Config interface{} `json:"config"`
}
// HostUpdateTime pairs a host ident with an update time.
// NOTE(review): the unit of UpdateTime is not visible here; presumably a
// Unix timestamp — confirm against callers.
type HostUpdateTime struct {
	Ident string `json:"ident"`
	UpdateTime int64 `json:"update_time"`
}
// MarshalBinary encodes h as JSON.
func (h HostUpdateTime) MarshalBinary() ([]byte, error) {
	encoded, err := json.Marshal(h)
	return encoded, err
}
// UnmarshalBinary decodes JSON data into h.
func (h *HostUpdateTime) UnmarshalBinary(data []byte) error {
	err := json.Unmarshal(data, h)
	return err
}
// HostUnixTime pairs a host ident with a Unix time value.
type HostUnixTime struct {
	Ident string `json:"ident"`
	UnixTime int64 `json:"unixtime"`
}
// MarshalBinary encodes h as JSON.
func (h HostMeta) MarshalBinary() ([]byte, error) {
	encoded, err := json.Marshal(h)
	return encoded, err
}
// UnmarshalBinary decodes JSON data into h.
func (h *HostMeta) UnmarshalBinary(data []byte) error {
	err := json.Unmarshal(data, h)
	return err
}
// WrapIdent returns ident prefixed with "n9e_meta_".
func WrapIdent(ident string) string {
	const prefix = "n9e_meta_"
	return prefix + ident
}
// WrapExtendIdent returns ident prefixed with "n9e_extend_meta_".
func WrapExtendIdent(ident string) string {
	const prefix = "n9e_extend_meta_"
	return prefix + ident
}
// WrapIdentUpdateTime returns ident prefixed with "n9e_meta_update_time_".
func WrapIdentUpdateTime(ident string) string {
	const prefix = "n9e_meta_update_time_"
	return prefix + ident
}
================================================
FILE: models/message_tpl.go
================================================
package models
import (
"bytes"
"fmt"
"html/template"
"regexp"
"strings"
texttemplate "text/template"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// MessageTemplate is the message template model; TableName maps it to the
// message_template table.
type MessageTemplate struct {
	ID int64 `json:"id" gorm:"primarykey"`
	Name string `json:"name"` // template name
	Ident string `json:"ident"` // template identifier
	Content map[string]string `json:"content" gorm:"serializer:json"` // template content
	UserGroupIds []int64 `json:"user_group_ids" gorm:"serializer:json"`
	NotifyChannelIdent string `json:"notify_channel_ident"` // notify channel ident
	Private int `json:"private"` // 0 = public, 1 = private
	Weight int `json:"weight"` // weight; built-in templates are sorted by this field
	CreateAt int64 `json:"create_at"`
	CreateBy string `json:"create_by"`
	UpdateAt int64 `json:"update_at"`
	UpdateBy string `json:"update_by"`
	UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// MessageTemplateStatistics returns the row count and the latest update_at
// of the message template table. On a non-center node the value is fetched
// from the /v1/n9e/statistic endpoint instead.
//
// It returns an error when the query fails or yields no row.
func MessageTemplateStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		s, err := poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=message_template")
		return s, err
	}

	session := DB(ctx).Model(&MessageTemplate{}).Select("count(*) as total", "max(update_at) as last_updated")

	var stats []*Statistics
	if err := session.Find(&stats).Error; err != nil {
		return nil, err
	}

	// Guard the index below so an empty result becomes an error instead of
	// an index-out-of-range panic.
	if len(stats) == 0 {
		return nil, fmt.Errorf("no message template found")
	}

	return stats[0], nil
}
// MessageTemplateGetsAll returns every message template.
// On a non-center node the list is fetched from /v1/n9e/message-templates.
func MessageTemplateGetsAll(ctx *ctx.Context) ([]*MessageTemplate, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*MessageTemplate](ctx, "/v1/n9e/message-templates")
	}

	var all []*MessageTemplate
	if err := DB(ctx).Find(&all).Error; err != nil {
		return nil, err
	}
	return all, nil
}
// MessageTemplateGets returns the message templates matching the given
// filters. A zero id or an empty name/ident disables that filter. The slice
// and the query error are returned together.
func MessageTemplateGets(ctx *ctx.Context, id int64, name, ident string) ([]*MessageTemplate, error) {
	query := DB(ctx)
	if id != 0 {
		query = query.Where("id = ?", id)
	}

	// Optional string filters, applied in the order name, ident.
	optional := [...]struct{ cond, value string }{
		{"name = ?", name},
		{"ident = ?", ident},
	}
	for _, f := range optional {
		if f.value != "" {
			query = query.Where(f.cond, f.value)
		}
	}

	var found []*MessageTemplate
	result := query.Find(&found)
	return found, result.Error
}
// TableName reports the database table name used for MessageTemplate rows.
func (t *MessageTemplate) TableName() string {
	const table = "message_template"
	return table
}
// messageTemplateIdentPattern is the pattern a template identifier must
// match. It is compiled once at package initialization rather than on every
// Verify call.
var messageTemplateIdentPattern = regexp.MustCompile("^[a-zA-Z0-9_-]+$")

// Verify validates the template and returns the first violation found:
//   - Name and Ident must be non-empty;
//   - Ident may only contain letters, digits, '_' and '-';
//   - Content must not contain an empty key;
//   - a private template (Private == 1) needs at least one user group id;
//   - Private must be 0 or 1.
func (t *MessageTemplate) Verify() error {
	if t.Name == "" {
		return errors.New("template name cannot be empty")
	}

	if t.Ident == "" {
		return errors.New("template identifier cannot be empty")
	}

	if !messageTemplateIdentPattern.MatchString(t.Ident) {
		return fmt.Errorf("template identifier must be ^[a-zA-Z0-9_-]+$, current: %s", t.Ident)
	}

	for key := range t.Content {
		if key == "" {
			return errors.New("template content cannot have empty keys")
		}
	}

	if t.Private == 1 && len(t.UserGroupIds) == 0 {
		return errors.New("user group IDs of private msg tpl cannot be empty")
	}

	if t.Private != 0 && t.Private != 1 {
		return errors.New("private flag must be 0 or 1")
	}

	return nil
}
// Update overwrites the stored row of t with the values in ref.
// The ident is immutable: a differing ref.Ident is rejected. The id and
// creation metadata are carried over from t, UpdateAt is set to the current
// time, and ref must pass Verify before anything is written.
func (t *MessageTemplate) Update(ctx *ctx.Context, ref MessageTemplate) error {
	if ref.Ident != t.Ident {
		return errors.New("cannot update ident")
	}

	ref.ID, ref.CreateAt, ref.CreateBy = t.ID, t.CreateAt, t.CreateBy
	ref.UpdateAt = time.Now().Unix()

	if err := ref.Verify(); err != nil {
		return err
	}

	// Select("*") makes Updates write every column, including zero values.
	return DB(ctx).Model(t).Select("*").Updates(ref).Error
}
// DB2FE replaces a nil UserGroupIds with an empty, non-nil slice.
func (t *MessageTemplate) DB2FE() {
	if t.UserGroupIds != nil {
		return
	}
	t.UserGroupIds = []int64{}
}
// MessageTemplateGet returns the first message template matching the where
// clause, or (nil, nil) when nothing matches.
func MessageTemplateGet(ctx *ctx.Context, where string, args ...interface{}) (*MessageTemplate, error) {
	matches, err := MessageTemplatesGet(ctx, where, args...)
	if err != nil {
		return nil, err
	}
	if len(matches) == 0 {
		return nil, nil
	}
	return matches[0], nil
}
// MessageTemplatesGet returns the message templates matching the where
// clause. The clause is applied only when where is non-empty AND at least
// one argument is supplied; otherwise all rows are returned. Every result is
// passed through DB2FE.
func MessageTemplatesGet(ctx *ctx.Context, where string, args ...interface{}) ([]*MessageTemplate, error) {
	query := DB(ctx)
	if filtered := where != "" && len(args) > 0; filtered {
		query = query.Where(where, args...)
	}

	found := []*MessageTemplate{}
	if err := query.Find(&found).Error; err != nil {
		return nil, err
	}

	for idx := range found {
		found[idx].DB2FE()
	}
	return found, nil
}
// MessageTemplatesGetBy returns message templates ordered by ascending
// weight. When notifyChannelIdents is non-empty only templates of those
// notify channels are returned. Every result is passed through DB2FE.
func MessageTemplatesGetBy(ctx *ctx.Context, notifyChannelIdents []string) ([]*MessageTemplate, error) {
	query := DB(ctx)
	if len(notifyChannelIdents) != 0 {
		query = query.Where("notify_channel_ident IN (?)", notifyChannelIdents)
	}

	found := []*MessageTemplate{}
	if err := query.Order("weight asc").Find(&found).Error; err != nil {
		return nil, err
	}

	for idx := range found {
		found[idx].DB2FE()
	}
	return found, nil
}
// MsgTplList is a list of message templates with helper methods.
type MsgTplList []*MessageTemplate
// GetIdentSet returns the set of template IDs in the list.
// Despite the name, the set is keyed by the numeric ID field, not by Ident.
func (t MsgTplList) GetIdentSet() map[int64]struct{} {
	ids := make(map[int64]struct{}, len(t))
	for i := range t {
		ids[t[i].ID] = struct{}{}
	}
	return ids
}
// IfUsed reports whether any notify config of nr references a template in
// the list, matching on TemplateID.
func (t MsgTplList) IfUsed(nr *NotifyRule) bool {
	known := t.GetIdentSet()
	for i := range nr.NotifyConfigs {
		_, referenced := known[nr.NotifyConfigs[i].TemplateID]
		if referenced {
			return true
		}
	}
	return false
}
// Title template strings written in Go template syntax; each references
// fields of $event.
// NOTE(review): presumably the default message titles for the DingTalk,
// Feishu card and Lark card channels — confirm against where they are used.
const (
	DingtalkTitle = `{{if $event.IsRecovered}} Recovered {{else}}Triggered{{end}}: {{$event.RuleName}}`
	FeishuCardTitle = `🔔 {{$event.RuleName}}`
	LarkCardTitle = `🔔 {{$event.RuleName}}`
)
var NewTplMap = map[string]string{
"ali-voice": `{{$event.RuleName}}`,
"ali-sms": `{{$event.RuleName}}`,
"tx-voice": `S{{$event.Severity}}{{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}{{$event.RuleName}}`,
"tx-sms": `级别状态: S{{$event.Severity}} {{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}规则名称: {{$event.RuleName}}`,
Dingtalk: `#### {{if $event.IsRecovered}}💚{{$event.RuleName}} {{else}}💔{{$event.RuleName}} {{end}}
---
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}
- **告警级别**: {{$event.Severity}}级
{{- if $event.RuleNote}}
- **规则备注**: {{$event.RuleNote}}
{{- end}}
{{- if not $event.IsRecovered}}
- **当次触发时值**: {{$event.TriggerValue}}
- **当次触发时间**: {{timeformat $event.TriggerTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- else}}
{{- if $event.AnnotationsJSON.recovery_value}}
- **恢复时值**: {{formatDecimal $event.AnnotationsJSON.recovery_value 4}}
{{- end}}
- **恢复时间**: {{timeformat $event.LastEvalTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- end}}
- **告警事件标签**:
{{- range $key, $val := $event.TagsMap}}
{{- if ne $key "rulename" }}
- {{$key}}: {{$val}}
{{- end}}
{{- end}}
{{if $event.AnnotationsJSON}}
- **附加信息**:
{{- range $key, $val := $event.AnnotationsJSON}}
- {{$key}}: {{$val}}
{{- end}}
{{end}}
[事件详情]({{.domain}}/share/alert-his-events/{{$event.Id}}) | [屏蔽1小时]({{.domain}}/alert-mutes/add?__event_id={{$event.Id}}){{if eq $event.Cate "prometheus"}} | [查看曲线]({{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph){{end}}`,
Email: `
夜莺告警通知
{{if $event.IsRecovered}}
级别状态:
S{{$event.Severity}} Recovered
{{else}}
级别状态:
S{{$event.Severity}} Triggered
{{end}}
策略备注:
{{$event.RuleNote}}
设备备注:
{{$event.TargetNote}}
{{if not $event.IsRecovered}}
触发时值:
{{$event.TriggerValue}}
{{end}}
{{if $event.TargetIdent}}
监控对象:
{{$event.TargetIdent}}
{{end}}
监控指标:
{{$event.TagsJSON}}
{{if $event.IsRecovered}}
恢复时间:
{{timeformat $event.LastEvalTime}}
{{else}}
触发时间:
{{timeformat $event.TriggerTime}}
{{end}}
发送时间:
{{timestamp}}
`,
Feishu: `级别状态: S{{$event.Severity}} {{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{$event.RuleName}}{{if $event.RuleNote}}
规则备注: {{$event.RuleNote}}{{end}}
监控指标: {{$event.TagsJSON}}
附加信息:
{{- range $key, $val := $event.AnnotationsJSON}}
{{$key}}: {{$val}}
{{- end}}
{{if $event.IsRecovered}}恢复时间:{{timeformat $event.LastEvalTime}}{{else}}触发时间: {{timeformat $event.TriggerTime}}
触发时值: {{$event.TriggerValue}}{{end}}
发送时间: {{timestamp}}
事件详情: {{.domain}}/share/alert-his-events/{{$event.Id}}
屏蔽1小时: {{.domain}}/alert-mutes/add?__event_id={{$event.Id}}`,
FeishuCard: `{{- if $event.IsRecovered -}}
{{- if ne $event.Cate "host" -}}
**告警集群:** {{$event.Cluster}}{{end}}
**级别状态:** S{{$event.Severity}} Recovered
**告警名称:** {{$event.RuleName}}
**事件标签:** {{$event.TagsJSON}}
**恢复时间:** {{timeformat $event.LastEvalTime}}
**告警描述:** **服务已恢复**
{{- else }}
{{- if ne $event.Cate "host"}}
**告警集群:** {{$event.Cluster}}{{end}}
**级别状态:** S{{$event.Severity}} Triggered
**告警名称:** {{$event.RuleName}}
**事件标签:** {{$event.TagsJSON}}
**触发时间:** {{timeformat $event.TriggerTime}}
**发送时间:** {{timestamp}}
**触发时值:** {{$event.TriggerValue}}
{{if $event.RuleNote }}**告警描述:** **{{$event.RuleNote}}**{{end}}
{{- end -}}
{{if $event.AnnotationsJSON}}
**附加信息**:
{{- range $key, $val := $event.AnnotationsJSON}}
{{$key}}: {{$val}}
{{- end}}
{{- end}}
[事件详情]({{.domain}}/share/alert-his-events/{{$event.Id}})|[屏蔽1小时]({{.domain}}/alert-mutes/add?__event_id={{$event.Id}}){{if eq $event.Cate "prometheus"}}|[查看曲线]({{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph){{end}}`,
EmailSubject: `{{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}: {{$event.RuleName}} {{$event.TagsJSON}}`,
Mm: `级别状态: S{{$event.Severity}} {{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{$event.RuleName}}{{if $event.RuleNote}}
规则备注: {{$event.RuleNote}}{{end}}
监控指标: {{$event.TagsJSON}}
{{if $event.IsRecovered}}恢复时间:{{timeformat $event.LastEvalTime}}{{else}}触发时间: {{timeformat $event.TriggerTime}}
触发时值: {{$event.TriggerValue}}{{end}}
发送时间: {{timestamp}}`,
Telegram: `级别状态: {{if $event.IsRecovered}}💚 S{{$event.Severity}} Recovered{{else}}⚠️ S{{$event.Severity}} Triggered{{end}}
规则标题 : {{$event.RuleName}}{{if $event.RuleNote}}
规则备注 : {{$event.RuleNote}}{{end}}{{if $event.TargetIdent}}
监控对象 : {{$event.TargetIdent}}{{end}}
监控指标 : {{$event.TagsJSON}}{{if not $event.IsRecovered}}
触发时值 : {{$event.TriggerValue}}{{end}}
{{if $event.IsRecovered}}恢复时间 : {{timeformat $event.LastEvalTime}}{{else}}首次触发时间 : {{timeformat $event.FirstTriggerTime}}{{end}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}距离首次告警 : {{humanizeDurationInterface $time_duration}}
发送时间 : {{timestamp}}`,
Wecom: `**级别状态**: {{if $event.IsRecovered}}💚S{{$event.Severity}} Recovered {{else}}💔S{{$event.Severity}} Triggered {{end}}
**规则标题**: {{$event.RuleName}}{{if $event.RuleNote}}
**规则备注**: {{$event.RuleNote}}{{end}}{{if $event.TargetIdent}}
**监控对象**: {{$event.TargetIdent}}{{end}}
**监控指标**: {{$event.TagsJSON}}
{{if $event.AnnotationsJSON}}**附加信息**:{{range $key, $val := $event.AnnotationsJSON}}{{$key}}:{{$val}} {{end}} {{end}}{{if not $event.IsRecovered}}
**触发时值**: {{$event.TriggerValue}}{{end}}
{{if $event.IsRecovered}}**恢复时间**: {{timeformat $event.LastEvalTime}}{{else}}**首次触发时间**: {{timeformat $event.FirstTriggerTime}}{{end}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}**距离首次告警**: {{humanizeDurationInterface $time_duration}}
**发送时间**: {{timestamp}}
[事件详情]({{.domain}}/share/alert-his-events/{{$event.Id}})|[屏蔽1小时]({{.domain}}/alert-mutes/add?__event_id={{$event.Id}}){{if eq $event.Cate "prometheus"}}|[查看曲线]({{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph){{end}}`,
Lark: `级别状态: S{{$event.Severity}} {{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{$event.RuleName}}{{if $event.RuleNote}}
规则备注: {{$event.RuleNote}}{{end}}
监控指标: {{$event.TagsJSON}}
{{if $event.IsRecovered}}恢复时间:{{timeformat $event.LastEvalTime}}{{else}}触发时间: {{timeformat $event.TriggerTime}}
触发时值: {{$event.TriggerValue}}{{end}}
发送时间: {{timestamp}}
事件详情: {{.domain}}/share/alert-his-events/{{$event.Id}}
屏蔽1小时: {{.domain}}/alert-mutes/add?__event_id={{$event.Id}}`,
LarkCard: `{{ if $event.IsRecovered }}
{{- if ne $event.Cate "host"}}
**告警集群:** {{$event.Cluster}}{{end}}
**级别状态:** S{{$event.Severity}} Recovered
**告警名称:** {{$event.RuleName}}
**事件标签:** {{$event.TagsJSON}}
**恢复时间:** {{timeformat $event.LastEvalTime}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}**持续时长**: {{humanizeDurationInterface $time_duration}}
**告警描述:** **服务已恢复**
{{- else }}
{{- if ne $event.Cate "host"}}
**告警集群:** {{$event.Cluster}}{{end}}
**级别状态:** S{{$event.Severity}} Triggered
**告警名称:** {{$event.RuleName}}
**事件标签:** {{$event.TagsJSON}}
**触发时间:** {{timeformat $event.TriggerTime}}
**发送时间:** {{timestamp}}
**触发时值:** {{$event.TriggerValue}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}**持续时长**: {{humanizeDurationInterface $time_duration}}
{{if $event.RuleNote }}**告警描述:** **{{$event.RuleNote}}**{{end}}
{{- end -}}
[事件详情]({{.domain}}/share/alert-his-events/{{$event.Id}})|[屏蔽1小时]({{.domain}}/alert-mutes/add?__event_id={{$event.Id}}){{if eq $event.Cate "prometheus"}}|[查看曲线]({{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph){{end}}`,
SlackWebhook: `{{ if $event.IsRecovered }}
{{- if ne $event.Cate "host"}}
*Alarm cluster:* {{$event.Cluster}}{{end}}
*Level Status:* S{{$event.Severity}} Recovered
*Alarm name:* {{$event.RuleName}}
*Recovery time:* {{timeformat $event.LastEvalTime}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}
{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}
*Duration*: {{humanizeDurationInterface $time_duration}}
*Alarm description:* *Service has been restored*
{{- else }}
{{- if ne $event.Cate "host"}}
*Alarm cluster:* {{$event.Cluster}}{{end}}
*Level Status:* S{{$event.Severity}} Triggered
*Alarm name:* {{$event.RuleName}}
*Trigger time:* {{timeformat $event.TriggerTime}}
*Sending time:* {{timestamp}}
*Trigger time value:* {{$event.TriggerValue}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}
{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}
*Duration*: {{humanizeDurationInterface $time_duration}}
{{if $event.RuleNote }}*Alarm description:* *{{$event.RuleNote}}*{{end}}
{{- end -}}
<{{.domain}}/share/alert-his-events/{{$event.Id}}|Event Details>
<{{.domain}}/alert-mutes/add?__event_id={{$event.Id}}|Block for 1 hour>
<{{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph|View Curve>`,
Discord: `**Level Status**: {{if $event.IsRecovered}}S{{$event.Severity}} Recovered{{else}}S{{$event.Severity}} Triggered{{end}}
**Rule Title**: {{$event.RuleName}}{{if $event.RuleNote}}
**Rule Note**: {{$event.RuleNote}}{{end}}{{if $event.TargetIdent}}
**Monitor Target**: {{$event.TargetIdent}}{{end}}
**Metrics**: {{$event.TagsJSON}}{{if not $event.IsRecovered}}
**Trigger Value**: {{$event.TriggerValue}}{{end}}
{{if $event.IsRecovered}}**Recovery Time**: {{timeformat $event.LastEvalTime}}{{else}}**First Trigger Time**: {{timeformat $event.FirstTriggerTime}}{{end}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}**Time Since First Alert**: {{humanizeDurationInterface $time_duration}}
**Send Time**: {{timestamp}}
[Event Details]({{.domain}}/share/alert-his-events/{{$event.Id}}) | [Silence 1h]({{.domain}}/alert-mutes/add?__event_id={{$event.Id}}) | [View Graph]({{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph)`,
MattermostWebhook: `{{ if $event.IsRecovered }}
{{- if ne $event.Cate "host"}}
**Alarm cluster:** {{$event.Cluster}}{{end}}
**Level Status:** S{{$event.Severity}} Recovered
**Alarm name:** {{$event.RuleName}}
**Recovery time:** {{timeformat $event.LastEvalTime}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}**Duration**: {{humanizeDurationInterface $time_duration}}
**Alarm description:** **Service has been restored**
{{- else }}
{{- if ne $event.Cate "host"}}
**Alarm cluster:** {{$event.Cluster}}{{end}}
**Level Status:** S{{$event.Severity}} Triggered
**Alarm name:** {{$event.RuleName}}
**Trigger time:** {{timeformat $event.TriggerTime}}
**Sending time:** {{timestamp}}
**Trigger time value:** {{$event.TriggerValue}}
{{$time_duration := sub now.Unix $event.FirstTriggerTime }}{{if $event.IsRecovered}}{{$time_duration = sub $event.LastEvalTime $event.FirstTriggerTime }}{{end}}**Duration**: {{humanizeDurationInterface $time_duration}}
{{if $event.RuleNote }}**Alarm description:** **{{$event.RuleNote}}**{{end}}
{{- end -}}
[Event Details]({{.domain}}/share/alert-his-events/{{$event.Id}})|[Block for 1 hour]({{.domain}}/alert-mutes/add?__event_id={{$event.Id}})|[View Curve]({{.domain}}/metric/explorer?__event_id={{$event.Id}}&mode=graph)`,
// Jira and JSMAlert share the same template format
Jira: `Severity: S{{$event.Severity}} {{if $event.IsRecovered}}Recovered{{else}}Triggered{{end}}
Rule Name: {{$event.RuleName}}{{if $event.RuleNote}}
Rule Notes: {{$event.RuleNote}}{{end}}
Metrics: {{$event.TagsJSON}}
Annotations:
{{- range $key, $val := $event.AnnotationsJSON}}
{{$key}}: {{$val}}
{{- end}}\n{{if $event.IsRecovered}}Recovery Time: {{timeformat $event.LastEvalTime}}{{else}}Trigger Time: {{timeformat $event.TriggerTime}}
Trigger Value: {{$event.TriggerValue}}{{end}}
Send Time: {{timestamp}}
Event Details: {{.domain}}/share/alert-his-events/{{$event.Id}}
Mute for 1 Hour: {{.domain}}/alert-mutes/add?__event_id={{$event.Id}}`,
}
// MsgTplMap lists the built-in message templates, one entry per notify
// channel ident. Content maps a field name ("content", "title", "subject",
// "incident") to the template text taken from NewTplMap.
// Weight is used to order page elements: the larger the weight, the later the
// entry is sorted.
var MsgTplMap = []MessageTemplate{
{Name: "Jira", Ident: Jira, Weight: 18, Content: map[string]string{"content": NewTplMap[Jira]}},
{Name: "JSMAlert", Ident: JSMAlert, Weight: 17, Content: map[string]string{"content": NewTplMap[Jira]}},
{Name: "Callback", Ident: "callback", Weight: 16, Content: map[string]string{"content": ""}},
{Name: "MattermostWebhook", Ident: MattermostWebhook, Weight: 15, Content: map[string]string{"content": NewTplMap[MattermostWebhook]}},
{Name: "MattermostBot", Ident: MattermostBot, Weight: 14, Content: map[string]string{"content": NewTplMap[MattermostWebhook]}},
{Name: "SlackWebhook", Ident: SlackWebhook, Weight: 13, Content: map[string]string{"content": NewTplMap[SlackWebhook]}},
{Name: "SlackBot", Ident: SlackBot, Weight: 12, Content: map[string]string{"content": NewTplMap[SlackWebhook]}},
{Name: "Discord", Ident: Discord, Weight: 11, Content: map[string]string{"content": NewTplMap[Discord]}},
{Name: "Aliyun Voice", Ident: "ali-voice", Weight: 10, Content: map[string]string{"incident": NewTplMap["ali-voice"]}},
{Name: "Aliyun SMS", Ident: "ali-sms", Weight: 9, Content: map[string]string{"incident": NewTplMap["ali-sms"]}},
{Name: "Tencent Voice", Ident: "tx-voice", Weight: 8, Content: map[string]string{"content": NewTplMap["tx-voice"]}},
{Name: "Tencent SMS", Ident: "tx-sms", Weight: 7, Content: map[string]string{"content": NewTplMap["tx-sms"]}},
{Name: "Telegram", Ident: Telegram, Weight: 6, Content: map[string]string{"content": NewTplMap[Telegram]}},
{Name: "LarkCard", Ident: LarkCard, Weight: 5, Content: map[string]string{"title": LarkCardTitle, "content": NewTplMap[LarkCard]}},
{Name: "Lark", Ident: Lark, Weight: 5, Content: map[string]string{"content": NewTplMap[Lark]}},
{Name: "Feishu", Ident: Feishu, Weight: 4, Content: map[string]string{"content": NewTplMap[Feishu]}},
{Name: "FeishuCard", Ident: FeishuCard, Weight: 4, Content: map[string]string{"title": FeishuCardTitle, "content": NewTplMap[FeishuCard]}},
{Name: "Wecom", Ident: Wecom, Weight: 3, Content: map[string]string{"content": NewTplMap[Wecom]}},
{Name: "Dingtalk", Ident: Dingtalk, Weight: 2, Content: map[string]string{"title": NewTplMap[EmailSubject], "content": NewTplMap[Dingtalk]}},
{Name: "Email", Ident: Email, Weight: 1, Content: map[string]string{"subject": NewTplMap[EmailSubject], "content": NewTplMap[Email]}},
}
// InitMessageTemplate seeds the built-in message templates listed in
// MsgTplMap. It is a no-op unless ctx.IsCenter is set.
//
// Every entry is upserted by its ident with "system" recorded as creator and
// updater. A failed upsert is logged as a warning and does not stop the
// remaining entries from being processed.
func InitMessageTemplate(ctx *ctx.Context) {
	if !ctx.IsCenter {
		return
	}

	for i := range MsgTplMap {
		builtin := MsgTplMap[i]
		seed := MessageTemplate{
			Name:               builtin.Name,
			Ident:              builtin.Ident,
			Content:            builtin.Content,
			NotifyChannelIdent: builtin.Ident,
			CreateBy:           "system",
			CreateAt:           time.Now().Unix(),
			UpdateBy:           "system",
			UpdateAt:           time.Now().Unix(),
			Weight:             builtin.Weight,
		}

		if err := seed.Upsert(ctx, seed.Ident); err != nil {
			logger.Warningf("failed to upsert msg tpls %v", err)
		}
	}
}
// Upsert stores t under the given ident.
//
// When no row with that ident exists, t is inserted. When a row exists and its
// UpdateBy is empty or "system", the row is updated with t. Rows whose
// UpdateBy holds any other value are left untouched and nil is returned
// (presumably to keep user edits — confirm with the callers).
func (t *MessageTemplate) Upsert(ctx *ctx.Context, ident string) error {
	existing, err := MessageTemplateGet(ctx, "ident = ?", ident)

	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to get message tpl")
	case existing == nil:
		return Insert(ctx, t)
	case existing.UpdateBy == "" || existing.UpdateBy == "system":
		return existing.Update(ctx, *t)
	default:
		return nil
	}
}
// GetDefs returns the template definitions that RenderEvent prepends to every
// message template before parsing it. It is a package-level variable so the
// implementation can be replaced; init assigns getDefs to it.
var GetDefs func(map[string]interface{}) []string
// getDefs is the default GetDefs implementation. It returns the template
// variable definitions ($events, $event, $labels, $value) that message
// templates rely on. renderData is not consulted.
func getDefs(renderData map[string]interface{}) []string {
	const (
		eventsDef = "{{ $events := .events }}"
		eventDef  = "{{ $event := index $events 0 }}"
		labelsDef = "{{ $labels := $event.TagsMap }}"
		valueDef  = "{{ $value := $event.TriggerValue }}"
	)

	return []string{eventsDef, eventDef, labelsDef, valueDef}
}
// init installs getDefs as the default GetDefs implementation.
func init() {
GetDefs = getDefs
}
// RenderEvent renders every entry of t.Content against the given events and
// returns the results keyed by the same content keys. It returns nil when t is
// nil.
//
// Each template text is prefixed with the definitions returned by GetDefs and
// executed with "events" and "domain" (siteUrl) as data. How the result is
// produced depends on t.NotifyChannelIdent:
//
//   - "email": rendered with the text template package and stored as a string.
//   - "slackwebhook" / "slackbot": rendered with html/template, then `"`, "\n"
//     and "\r" are backslash-escaped (presumably so the text can be embedded in
//     a JSON string — confirm with the senders) and "&lt;" is turned back into
//     "<"; stored as template.HTML.
//   - any other channel: same as Slack but without restoring "<".
//
// Parse and execute failures are logged. For every channel except Slack the
// failure message is also stored under the key; for Slack the key is left out
// of the result, which is the pre-existing behaviour.
func (t *MessageTemplate) RenderEvent(events []*AlertCurEvent, siteUrl string) map[string]interface{} {
	if t == nil {
		return nil
	}

	renderData := map[string]interface{}{
		"events": events,
		"domain": siteUrl,
	}

	// escapeQuotesAndBreaks backslash-escapes double quotes and line breaks.
	escapeQuotesAndBreaks := func(s string) string {
		s = strings.ReplaceAll(s, `"`, `\"`)
		s = strings.ReplaceAll(s, "\n", "\\n")
		return strings.ReplaceAll(s, "\r", "\\r")
	}

	// Render the events into every message template entry.
	tplContent := make(map[string]interface{})
	for key, msgTpl := range t.Content {
		text := strings.Join(append(GetDefs(renderData), msgTpl), "")
		var body bytes.Buffer

		switch t.NotifyChannelIdent {
		case "email":
			tpl, err := texttemplate.New(key).Funcs(tplx.TemplateFuncMap).Parse(text)
			if err != nil {
				logger.Errorf("failed to parse template: %v", err)
				tplContent[key] = fmt.Sprintf("failed to parse template: %v", err)
				continue
			}
			if err = tpl.Execute(&body, renderData); err != nil {
				logger.Errorf("failed to execute template: %v", err)
				tplContent[key] = fmt.Sprintf("failed to execute template: %v", err)
				continue
			}
			tplContent[key] = body.String()

		case "slackwebhook", "slackbot":
			tpl, err := template.New(key).Funcs(tplx.TemplateFuncMap).Parse(text)
			if err != nil {
				logger.Errorf("failed to parse template: %v events: %v", err, events)
				continue
			}
			if err = tpl.Execute(&body, renderData); err != nil {
				logger.Errorf("failed to execute template: %v events: %v", err, events)
				continue
			}
			escaped := escapeQuotesAndBreaks(body.String())
			// html/template writes a literal "<" in template text as "&lt;".
			// Slack link markup has the form <url|label>, so the entity must be
			// turned back into "<". Replacing "<" with "<" would be a no-op.
			escaped = strings.ReplaceAll(escaped, "&lt;", "<")
			tplContent[key] = template.HTML(escaped)

		default:
			tpl, err := template.New(key).Funcs(tplx.TemplateFuncMap).Parse(text)
			if err != nil {
				logger.Errorf("failed to parse template: %v events: %v", err, events)
				tplContent[key] = fmt.Sprintf("failed to parse template: %v", err)
				continue
			}
			if err = tpl.Execute(&body, renderData); err != nil {
				logger.Errorf("failed to execute template: %v events: %v", err, events)
				tplContent[key] = fmt.Sprintf("failed to execute template: %v", err)
				continue
			}
			tplContent[key] = template.HTML(escapeQuotesAndBreaks(body.String()))
		}
	}

	return tplContent
}
================================================
FILE: models/metric_view.go
================================================
package models
import (
"errors"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// MetricView stores the aggregation rules that need to be kept when browsing
// the alert aggregation view (translated from the original comment).
// Rows live in the "metric_view" table (see TableName).
type MetricView struct {
Id int64 `json:"id" gorm:"primaryKey"`
Name string `json:"name"`
Cate int `json:"cate"`
Configs string `json:"configs"`
CreateAt int64 `json:"create_at"`
CreateBy int64 `json:"create_by"`
UpdateAt int64 `json:"update_at"`
}
// TableName returns the database table name, "metric_view".
func (v *MetricView) TableName() string {
return "metric_view"
}
// Verify trims surrounding whitespace from Name and Configs in place and
// returns an error when either of them ends up empty. Name is checked first;
// Configs is left untouched when Name is rejected.
func (v *MetricView) Verify() error {
	if v.Name = strings.TrimSpace(v.Name); v.Name == "" {
		return errors.New("name is blank")
	}

	if v.Configs = strings.TrimSpace(v.Configs); v.Configs == "" {
		return errors.New("configs is blank")
	}

	return nil
}
// Add validates v with Verify, stamps CreateAt and UpdateAt with the current
// Unix time and inserts the row. A validation error is returned unchanged and
// nothing is inserted.
func (v *MetricView) Add(ctx *ctx.Context) error {
	err := v.Verify()
	if err != nil {
		return err
	}

	ts := time.Now().Unix()
	v.CreateAt, v.UpdateAt = ts, ts

	return Insert(ctx, v)
}
// Update overwrites the name, configs and cate of v and persists the columns
// name, configs, cate, update_at and create_by.
//
// The incoming name and configs are validated (and whitespace-trimmed) with
// Verify before anything is changed; on a validation error v is left
// unmodified. CreateBy is only filled in from createBy when it is still zero.
func (v *MetricView) Update(ctx *ctx.Context, name, configs string, cate int, createBy int64) error {
	// Validate the values about to be written. Verifying v itself would only
	// re-check what is already stored and let a blank name/configs through.
	incoming := MetricView{Name: name, Configs: configs}
	if err := incoming.Verify(); err != nil {
		return err
	}

	v.UpdateAt = time.Now().Unix()
	v.Name = incoming.Name
	v.Configs = incoming.Configs
	v.Cate = cate

	if v.CreateBy == 0 {
		v.CreateBy = createBy
	}

	return DB(ctx).Model(v).Select("name", "configs", "cate", "update_at", "create_by").Updates(v).Error
}
// MetricViewDel deletes the metric views with the given ids. An empty ids
// slice is a no-op. When createBy is supplied, only its first element is used
// and the delete is restricted to rows created by that user (safe delete).
func MetricViewDel(ctx *ctx.Context, ids []int64, createBy ...interface{}) error {
	if len(ids) == 0 {
		return nil
	}

	session := DB(ctx)
	if len(createBy) == 0 {
		return session.Where("id in ?", ids).Delete(new(MetricView)).Error
	}

	return session.Where("id in ? and create_by = ?", ids, createBy[0]).Delete(new(MetricView)).Error
}
// MetricViewGets returns the views created by createBy together with every
// view whose cate is 0, ordered by Cate ascending and then by Name ascending.
// On a query error the (possibly empty) slice is returned alongside the error.
func MetricViewGets(ctx *ctx.Context, createBy interface{}) ([]MetricView, error) {
	var views []MetricView

	if err := DB(ctx).Where("create_by = ? or cate = 0", createBy).Find(&views).Error; err != nil {
		return views, err
	}

	if len(views) > 1 {
		sort.Slice(views, func(a, b int) bool {
			left, right := views[a], views[b]
			if left.Cate != right.Cate {
				return left.Cate < right.Cate
			}
			return left.Name < right.Name
		})
	}

	return views, nil
}
// MetricViewGet returns the first view matching the where clause and its
// arguments. It returns (nil, nil) when nothing matches and (nil, err) when
// the query fails.
func MetricViewGet(ctx *ctx.Context, where string, args ...interface{}) (*MetricView, error) {
	var matches []*MetricView

	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}

	if len(matches) > 0 {
		return matches[0], nil
	}

	return nil, nil
}
================================================
FILE: models/migrate/migrate.go
================================================
package migrate
import (
"fmt"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ormx"
imodels "github.com/flashcatcloud/ibex/src/models"
"github.com/toolkits/pkg/logger"
"gorm.io/driver/mysql"
"gorm.io/gorm"
)
// Migrate runs the schema migrations: MigrateTables first, then
// MigrateEsIndexPatternTable. Both steps log their own failures, so the
// returned errors are deliberately discarded here.
func Migrate(db *gorm.DB) {
	_ = MigrateTables(db)
	_ = MigrateEsIndexPatternTable(db)
}
// MigrateIbexTables auto-migrates the ibex task tables (TaskMeta,
// TaskScheduler, TaskHostDoing, TaskAction) and creates the tables
// task_host_0 .. task_host_99 from imodels.TaskHost when they do not exist.
// On MySQL the tables are created with "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4".
// Migration failures are logged and do not abort the remaining steps.
func MigrateIbexTables(db *gorm.DB) {
	if _, isMySQL := db.Dialector.(*mysql.Dialector); isMySQL {
		db = db.Set("gorm:table_options", "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4")
	}

	for _, model := range []interface{}{&imodels.TaskMeta{}, &imodels.TaskScheduler{}, &TaskHostDoing{}, &imodels.TaskAction{}} {
		if err := db.AutoMigrate(model); err != nil {
			logger.Errorf("failed to migrate table:%v %v", model, err)
		}
	}

	for shard := 0; shard < 100; shard++ {
		tableName := fmt.Sprintf("task_host_%d", shard)

		// Existing shard tables are left as they are.
		if db.Migrator().HasTable(tableName) {
			continue
		}

		if err := db.Table(tableName).AutoMigrate(&imodels.TaskHost{}); err != nil {
			logger.Errorf("failed to migrate table:%s %v", tableName, err)
		}
	}
}
// isPostgres reports whether db's dialector identifies itself as "postgres".
func isPostgres(db *gorm.DB) bool {
	return db.Dialector.Name() == "postgres"
}
// MigrateTables brings the database schema up to date with AutoMigrate and a
// few explicit column/index fixes. Failures of individual steps are logged and
// the function carries on; it always returns nil.
func MigrateTables(db *gorm.DB) error {
// On MySQL, new tables are created with InnoDB and utf8mb4; other dialects
// get no extra table options.
var tableOptions string
switch db.Dialector.(type) {
case *mysql.Dialector:
tableOptions = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
if tableOptions != "" {
db = db.Set("gorm:table_options", tableOptions)
}
// Models migrated synchronously further down.
dts := []interface{}{&RecordingRule{}, &AlertRule{}, &AlertSubscribe{}, &AlertMute{},
&TaskRecord{}, &ChartShare{}, &Target{}, &Configs{}, &Datasource{}, &NotifyTpl{},
&Board{}, &BoardBusigroup{}, &Users{}, &SsoConfig{}, &models.BuiltinMetric{},
&models.MetricFilter{}, &models.NotificationRecord{}, &models.TargetBusiGroup{},
&models.UserToken{}, &models.DashAnnotation{}, MessageTemplate{}, NotifyRule{}, NotifyChannelConfig{}, &EsIndexPatternMigrate{},
&models.EventPipeline{}, &models.EventPipelineExecution{}, &models.EmbeddedProduct{}, &models.SourceToken{},
&models.SavedView{}, &models.UserViewFavorite{}}
// The builtin component model differs for PostgreSQL. In both cases the
// idx_ident index/constraint is dropped before the model is migrated.
if isPostgres(db) {
dts = append(dts, &models.PostgresBuiltinComponent{})
DropUniqueFiledLimit(db, &models.PostgresBuiltinComponent{}, "idx_ident", "idx_ident")
} else {
dts = append(dts, &models.BuiltinComponent{})
DropUniqueFiledLimit(db, &models.BuiltinComponent{}, "idx_ident", "idx_ident")
}
// TaskSchedulerHealth is only migrated while its "scheduler" column is missing.
if !db.Migrator().HasColumn(&imodels.TaskSchedulerHealth{}, "scheduler") {
dts = append(dts, &imodels.TaskSchedulerHealth{})
}
// The event tables are migrated in a background goroutine so this function
// does not wait for them (presumably because they can be large — TODO
// confirm). A panic inside the goroutine is recovered and logged.
asyncDts := []interface{}{&AlertHisEvent{}, &AlertCurEvent{}}
go func() {
defer func() {
if r := recover(); r != nil {
logger.Errorf("panic to migrate table: %v", r)
}
}()
for _, dt := range asyncDts {
if err := db.AutoMigrate(dt); err != nil {
logger.Errorf("failed to migrate table %+v err:%v", dt, err)
}
}
}()
// When the builtin payload table does not exist yet it is created from the
// full model (PostgreSQL has its own variant); otherwise only the columns
// declared in BuiltinPayloads are migrated.
if !db.Migrator().HasTable(&models.BuiltinPayload{}) {
if isPostgres(db) {
dts = append(dts, &models.PostgresBuiltinPayload{})
} else {
dts = append(dts, &models.BuiltinPayload{})
}
} else {
dts = append(dts, &BuiltinPayloads{})
}
for _, dt := range dts {
err := db.AutoMigrate(dt)
if err != nil {
logger.Errorf("failed to migrate table:%v %v", dt, err)
}
}
// Rename the legacy "cluster" column of the alerting engines table to
// "engine_cluster".
if db.Migrator().HasColumn(&AlertingEngines{}, "cluster") {
err := db.Migrator().RenameColumn(&AlertingEngines{}, "cluster", "engine_cluster")
if err != nil {
logger.Errorf("failed to renameColumn table: %v", err)
}
}
// Drop the legacy "dashboard_id" column of the chart share table.
if db.Migrator().HasColumn(&ChartShare{}, "dashboard_id") {
err := db.Migrator().DropColumn(&ChartShare{}, "dashboard_id")
if err != nil {
logger.Errorf("failed to DropColumn table: %v", err)
}
}
DropUniqueFiledLimit(db, &Configs{}, "ckey", "configs_ckey_key")
// Drop the idx_collector_typ_name unique index of the builtin_metrics table.
DropUniqueFiledLimit(db, &models.BuiltinMetric{}, "idx_collector_typ_name", "idx_collector_typ_name")
return nil
}
// DropUniqueFiledLimit removes a unique restriction such as UNIQUE KEY (`ckey`)
// from the table behind dst. uniqueFiled is the index name to drop and
// pgUniqueFiled the constraint name to drop; each is only dropped when it
// exists. Failures are logged and not returned.
func DropUniqueFiledLimit(db *gorm.DB, dst interface{}, uniqueFiled string, pgUniqueFiled string) {
	// Check the table first: when it does not exist there is nothing to drop.
	if !db.Migrator().HasTable(dst) {
		return
	}

	// MySQL: DROP INDEX.
	if db.Migrator().HasIndex(dst, uniqueFiled) {
		if dropErr := db.Migrator().DropIndex(dst, uniqueFiled); dropErr != nil {
			logger.Errorf("failed to DropIndex(%s) error: %v", uniqueFiled, dropErr)
		}
	}

	// PostgreSQL: DROP CONSTRAINT.
	if db.Migrator().HasConstraint(dst, pgUniqueFiled) {
		if dropErr := db.Migrator().DropConstraint(dst, pgUniqueFiled); dropErr != nil {
			logger.Errorf("failed to DropConstraint(%s) error: %v", pgUniqueFiled, dropErr)
		}
	}
}
// columnHasIndex reports whether any index of dst's table covers the column
// indexColumn. When the indexes cannot be listed the error is logged and false
// is returned.
func columnHasIndex(db *gorm.DB, dst interface{}, indexColumn string) bool {
	indexes, err := db.Migrator().GetIndexes(dst)
	if err != nil {
		logger.Errorf("failed to table getIndexes: %v", err)
		return false
	}

	for _, idx := range indexes {
		for _, column := range idx.Columns() {
			if column == indexColumn {
				return true
			}
		}
	}

	return false
}
// AlertRule declares alert rule columns; MigrateTables passes it to
// AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type AlertRule struct {
ExtraConfig string `gorm:"type:text;column:extra_config"`
CronPattern string `gorm:"type:varchar(64);column:cron_pattern"`
TimeZone string `gorm:"type:varchar(64);column:time_zone;not null;default:''"`
DatasourceQueries []models.DatasourceQuery `gorm:"datasource_queries;type:text;serializer:json"` // datasource queries, stored as JSON text
NotifyRuleIds []int64 `gorm:"column:notify_rule_ids;type:varchar(1024)"`
NotifyVersion int `gorm:"column:notify_version;type:int;default:0"`
PipelineConfigs []models.PipelineConfig `gorm:"column:pipeline_configs;type:text;serializer:json"`
}
// AlertSubscribe declares alert subscription columns; MigrateTables passes it
// to AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type AlertSubscribe struct {
ExtraConfig string `gorm:"type:text;column:extra_config"` // extra config
Severities string `gorm:"column:severities;type:varchar(32);not null;default:''"`
BusiGroups ormx.JSONArr `gorm:"column:busi_groups;type:varchar(4096)"`
Note string `gorm:"column:note;type:varchar(1024);default:'';comment:note"`
RuleIds []int64 `gorm:"column:rule_ids;type:varchar(1024)"`
NotifyRuleIds []int64 `gorm:"column:notify_rule_ids;type:varchar(1024)"`
NotifyVersion int `gorm:"column:notify_version;type:int;default:0"`
}
// AlertMute declares alert mute columns; MigrateTables passes it to
// AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type AlertMute struct {
Severities string `gorm:"column:severities;type:varchar(32);not null;default:''"`
Tags string `gorm:"column:tags;type:varchar(4096);default:'[]';comment:json,map,tagkey->regexp|value"`
}
// RecordingRule declares recording rule columns; MigrateTables passes it to
// AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type RecordingRule struct {
QueryConfigs string `gorm:"type:text;not null;column:query_configs"` // query_configs
DatasourceIds string `gorm:"column:datasource_ids;type:varchar(255);default:'';comment:datasource ids"`
CronPattern string `gorm:"column:cron_pattern;type:varchar(255);default:'';comment:cron pattern"`
DatasourceQueries []models.DatasourceQuery `json:"datasource_queries" gorm:"datasource_queries;type:text;serializer:json"` // datasource queries, stored as JSON text
}
// AlertingEngines identifies the alerting engines table for MigrateTables,
// which renames its legacy "cluster" column to "engine_cluster".
type AlertingEngines struct {
EngineCluster string `gorm:"column:engine_cluster;type:varchar(128);default:'';comment:n9e engine cluster"`
}
// ChartShare declares the datasource_id column of the chart share table;
// MigrateTables passes it to AutoMigrate and also drops the legacy
// "dashboard_id" column from the same table.
type ChartShare struct {
DatasourceId int64 `gorm:"column:datasource_id;bigint(20);not null;default:0;comment:datasource id"`
}
// TaskRecord declares the indexed event_id column of the task record table;
// MigrateTables passes it to AutoMigrate.
type TaskRecord struct {
EventId int64 `gorm:"column:event_id;bigint(20);not null;default:0;comment:event id;index:idx_event_id"`
}
// AlertHisEvent declares columns of the historical alert event table.
// MigrateTables migrates it in its background goroutine.
type AlertHisEvent struct {
LastEvalTime int64 `gorm:"column:last_eval_time;bigint(20);not null;default:0;comment:for time filter;index:idx_last_eval_time"`
OriginalTags string `gorm:"column:original_tags;type:text;comment:labels key=val,,k2=v2"`
NotifyRuleIds []int64 `gorm:"column:notify_rule_ids;type:text;serializer:json;comment:notify rule ids"`
}
// AlertCurEvent declares columns of the current alert event table.
// MigrateTables migrates it in its background goroutine.
type AlertCurEvent struct {
OriginalTags string `gorm:"column:original_tags;type:text;comment:labels key=val,,k2=v2"`
NotifyRuleIds []int64 `gorm:"column:notify_rule_ids;type:text;serializer:json;comment:notify rule ids"`
}
// Target declares monitored-target columns (host IP, agent version, engine
// name, OS, host tags); MigrateTables passes it to AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type Target struct {
HostIp string `gorm:"column:host_ip;type:varchar(15);default:'';comment:IPv4 string;index:idx_host_ip"`
AgentVersion string `gorm:"column:agent_version;type:varchar(255);default:'';comment:agent version;index:idx_agent_version"`
EngineName string `gorm:"column:engine_name;type:varchar(255);default:'';comment:engine name;index:idx_engine_name"`
OS string `gorm:"column:os;type:varchar(31);default:'';comment:os type;index:idx_os"`
HostTags []string `gorm:"column:host_tags;type:text;comment:global labels set in conf file;serializer:json"`
}
// Datasource declares datasource columns; MigrateTables passes it to
// AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type Datasource struct {
IsDefault bool `gorm:"column:is_default;type:boolean;comment:is default datasource"`
Identifier string `gorm:"column:identifier;type:varchar(255);default:'';comment:identifier"`
Weight int `gorm:"column:weight;type:int;default:0;comment:weight for sorting"`
}
// Configs declares columns of the configs table; MigrateTables passes it to
// AutoMigrate and drops the unique restriction on "ckey" from the same table.
type Configs struct {
Note string `gorm:"column:note;type:varchar(1024);default:'';comment:note"`
Cval string `gorm:"column:cval;type:text;comment:config value"`
// MySQL: tinyint; PostgreSQL: smallint.
External int `gorm:"column:external;type:int;default:0;comment:0\\:built-in 1\\:external"`
Encrypted int `gorm:"column:encrypted;type:int;default:0;comment:0\\:plaintext 1\\:ciphertext"`
CreateAt int64 `gorm:"column:create_at;type:int;default:0;comment:create_at"`
CreateBy string `gorm:"column:create_by;type:varchar(64);default:'';comment:create_by"`
UpdateAt int64 `gorm:"column:update_at;type:int;default:0;comment:update_at"`
UpdateBy string `gorm:"column:update_by;type:varchar(64);default:'';comment:update_by"`
}
// NotifyTpl declares the audit columns (create/update time and author) of the
// notify template table; MigrateTables passes it to AutoMigrate.
type NotifyTpl struct {
CreateAt int64 `gorm:"column:create_at;type:int;default:0;comment:create_at"`
CreateBy string `gorm:"column:create_by;type:varchar(64);default:'';comment:create_by"`
UpdateAt int64 `gorm:"column:update_at;type:int;default:0;comment:update_at"`
UpdateBy string `gorm:"column:update_by;type:varchar(64);default:'';comment:update_by"`
}
// Board declares dashboard columns; MigrateTables passes it to AutoMigrate.
// PublicCate values per the column comment: 0 anonymous, 1 login, 2 busi.
type Board struct {
PublicCate int `gorm:"column:public_cate;int;not null;default:0;comment:0 anonymous 1 login 2 busi"`
Note string `gorm:"column:note;type:varchar(1024);not null;default:'';comment:note"`
}
// BoardBusigroup declares the columns linking a board to a business group;
// MigrateTables passes it to AutoMigrate.
type BoardBusigroup struct {
BusiGroupId int64 `gorm:"column:busi_group_id;bigint(20);not null;default:0;comment:busi group id"`
BoardId int64 `gorm:"column:board_id;bigint(20);not null;default:0;comment:board id"`
}
// Users declares user columns; MigrateTables passes it to AutoMigrate.
// NOTE(review): looks like a partial column set, not the full model — confirm.
type Users struct {
Belong string `gorm:"column:belong;type:varchar(16);default:'';comment:belong"`
LastActiveTime int64 `gorm:"column:last_active_time;type:int;default:0;comment:last_active_time"`
Phone string `gorm:"column:phone;type:varchar(1024);not null;default:''"`
}
// SsoConfig declares the update_at column of the SSO config table;
// MigrateTables passes it to AutoMigrate.
type SsoConfig struct {
UpdateAt int64 `gorm:"column:update_at;type:int;default:0;comment:update_at"`
}
// BuiltinPayloads declares builtin payload columns. MigrateTables migrates it
// only when the builtin payload table already exists; otherwise the full
// models.BuiltinPayload (or its PostgreSQL variant) is used instead.
type BuiltinPayloads struct {
UUID int64 `json:"uuid" gorm:"type:bigint;not null;index:idx_uuid;comment:'uuid of payload'"`
ComponentID int64 `json:"component_id" gorm:"type:bigint;index:idx_component,sort:asc;not null;default:0;comment:'component_id of payload'"`
Note string `json:"note" gorm:"type:varchar(1024);not null;default:'';comment:'note of payload'"`
}
// TaskHostDoing is the schema of the "task_host_doing" table, migrated by
// MigrateIbexTables. Id is indexed but explicitly not a primary key, and
// AlertTriggered is excluded from the table (gorm:"-").
type TaskHostDoing struct {
Id int64 `gorm:"column:id;index;primaryKey:false"`
Host string `gorm:"column:host;size:128;not null;index"`
Clock int64 `gorm:"column:clock;not null;default:0"`
Action string `gorm:"column:action;size:16;not null"`
AlertTriggered bool `gorm:"-"`
}
// TableName returns the database table name, "task_host_doing".
func (TaskHostDoing) TableName() string {
return "task_host_doing"
}
// EsIndexPatternMigrate declares columns added to the "es_index_pattern"
// table; MigrateTables passes it to AutoMigrate.
type EsIndexPatternMigrate struct {
CrossClusterEnabled int `gorm:"column:cross_cluster_enabled;type:int;default:0"`
Note string `gorm:"column:note;type:varchar(1024);default:''"`
}
// TableName returns the database table name, "es_index_pattern".
func (EsIndexPatternMigrate) TableName() string {
return "es_index_pattern"
}
// DashAnnotation describes the "dash_annotation" table.
// NOTE(review): MigrateTables migrates models.DashAnnotation rather than this
// type — confirm whether this declaration is still used.
type DashAnnotation struct {
Id int64 `gorm:"column:id;primaryKey;autoIncrement"`
DashboardId int64 `gorm:"column:dashboard_id;not null"`
PanelId string `gorm:"column:panel_id;type:varchar(191);not null"`
Tags string `gorm:"column:tags;type:text"`
Description string `gorm:"column:description;type:text"`
Config string `gorm:"column:config;type:text"`
TimeStart int64 `gorm:"column:time_start;not null;default:0"`
TimeEnd int64 `gorm:"column:time_end;not null;default:0"`
CreateAt int64 `gorm:"column:create_at;not null;default:0"`
CreateBy string `gorm:"column:create_by;type:varchar(64);not null;default:''"`
UpdateAt int64 `gorm:"column:update_at;not null;default:0"`
UpdateBy string `gorm:"column:update_by;type:varchar(64);not null;default:''"`
}
// TableName returns the database table name, "dash_annotation".
func (DashAnnotation) TableName() string {
return "dash_annotation"
}
// MessageTemplate is the migration schema of the "message_template" table;
// MigrateTables passes it to AutoMigrate.
type MessageTemplate struct {
ID int64 `gorm:"column:id;primaryKey;autoIncrement"`
Name string `gorm:"column:name;type:varchar(64);not null"`
Ident string `gorm:"column:ident;type:varchar(64);not null"`
Content map[string]string `gorm:"column:content;type:text"`
UserGroupIds []int64 `gorm:"column:user_group_ids;type:varchar(64)"`
NotifyChannelIdent string `gorm:"column:notify_channel_ident;type:varchar(64);not null;default:''"`
Private int `gorm:"column:private;type:int;not null;default:0"`
Weight int `gorm:"column:weight;type:int;not null;default:0"`
CreateAt int64 `gorm:"column:create_at;not null;default:0"`
CreateBy string `gorm:"column:create_by;type:varchar(64);not null;default:''"`
UpdateAt int64 `gorm:"column:update_at;not null;default:0"`
UpdateBy string `gorm:"column:update_by;type:varchar(64);not null;default:''"`
}
// TableName returns the database table name, "message_template".
func (t *MessageTemplate) TableName() string {
return "message_template"
}
// NotifyRule is the migration schema of the "notify_rule" table;
// MigrateTables passes it to AutoMigrate.
type NotifyRule struct {
ID int64 `gorm:"column:id;primaryKey;autoIncrement"`
Name string `gorm:"column:name;type:varchar(255);not null"`
Description string `gorm:"column:description;type:text"`
Enable bool `gorm:"column:enable;not null;default:false"`
UserGroupIds []int64 `gorm:"column:user_group_ids;type:varchar(255)"`
NotifyConfigs []models.NotifyConfig `gorm:"column:notify_configs;type:text"`
PipelineConfigs []models.PipelineConfig `gorm:"column:pipeline_configs;type:text"`
ExtraConfig interface{} `gorm:"column:extra_config;type:text"`
CreateAt int64 `gorm:"column:create_at;not null;default:0"`
CreateBy string `gorm:"column:create_by;type:varchar(64);not null;default:''"`
UpdateAt int64 `gorm:"column:update_at;not null;default:0"`
UpdateBy string `gorm:"column:update_by;type:varchar(64);not null;default:''"`
}
// TableName returns the database table name, "notify_rule".
func (r *NotifyRule) TableName() string {
return "notify_rule"
}
// NotifyChannelConfig is the migration schema of the "notify_channel" table;
// MigrateTables passes it to AutoMigrate.
type NotifyChannelConfig struct {
ID int64 `gorm:"column:id;primaryKey;autoIncrement"`
Name string `gorm:"column:name;type:varchar(255);not null"`
Ident string `gorm:"column:ident;type:varchar(255);not null"`
Description string `gorm:"column:description;type:text"`
Enable bool `gorm:"column:enable;not null;default:false"`
ParamConfig models.NotifyParamConfig `gorm:"column:param_config;type:text"`
RequestType string `gorm:"column:request_type;type:varchar(50);not null"`
RequestConfig *models.RequestConfig `gorm:"column:request_config;type:text"`
Weight int `gorm:"column:weight;type:int;not null;default:0"`
CreateAt int64 `gorm:"column:create_at;not null;default:0"`
CreateBy string `gorm:"column:create_by;type:varchar(64);not null;default:''"`
UpdateAt int64 `gorm:"column:update_at;not null;default:0"`
UpdateBy string `gorm:"column:update_by;type:varchar(64);not null;default:''"`
}
// TableName returns the database table name, "notify_channel".
func (c *NotifyChannelConfig) TableName() string {
return "notify_channel"
}
================================================
FILE: models/migrate/migrate_es_index_pattern.go
================================================
package migrate
import (
"github.com/toolkits/pkg/logger"
"gorm.io/gorm"
)
// EsIndexPattern is the schema MigrateEsIndexPatternTable uses to create the
// "es_index_pattern" table. DatasourceId and Name together form the unique
// index idx_ds_name.
// NOTE(review): the column types (bigint unsigned, tinyint(1)) look
// MySQL-specific — confirm behaviour on other dialects.
type EsIndexPattern struct {
Id int64 `gorm:"primaryKey;type:bigint unsigned"`
DatasourceId int64 `gorm:"type:bigint not null default '0';uniqueIndex:idx_ds_name"`
Name string `gorm:"type:varchar(191) not null default '';uniqueIndex:idx_ds_name"`
TimeField string `gorm:"type:varchar(128) not null default ''"`
AllowHideSystemIndices int `gorm:"type:tinyint(1) not null default 0"`
FieldsFormat string `gorm:"type:varchar(4096) not null default ''"`
CreateAt int64 `gorm:"type:bigint default '0'"`
CreateBy string `gorm:"type:varchar(64) default ''"`
UpdateAt int64 `gorm:"type:bigint default '0'"`
UpdateBy string `gorm:"type:varchar(64) default ''"`
}
// MigrateEsIndexPatternTable creates the "es_index_pattern" table from
// EsIndexPattern when it does not exist yet. An existing table is left
// untouched and nil is returned. A creation failure is logged and returned.
func MigrateEsIndexPatternTable(db *gorm.DB) error {
	if db.Migrator().HasTable("es_index_pattern") {
		return nil
	}

	// "gorm:table_options" is appended to the CREATE TABLE statement as-is.
	// CHARSET=utf8mb4 is MySQL syntax, so it is only set for the MySQL
	// dialect, matching the dialect guard used by the other migrations in
	// this package.
	if db.Dialector.Name() == "mysql" {
		db = db.Set("gorm:table_options", "CHARSET=utf8mb4")
	}

	if err := db.Table("es_index_pattern").AutoMigrate(&EsIndexPattern{}); err != nil {
		logger.Errorf("failed to migrate es index pattern table: %v", err)
		return err
	}

	return nil
}
================================================
FILE: models/migrate/migrate_test.go
================================================
package migrate
import (
"fmt"
"testing"
"github.com/ccfos/nightingale/v6/models"
"gorm.io/driver/mysql"
"gorm.io/gorm"
"gorm.io/gorm/schema"
)
// TestInsertPermPoints inserts a fixed set of operations for the "Standard"
// role into role_operation, skipping those that are already present.
//
// It needs the local MySQL instance named in the DSN below and is skipped when
// that database cannot be opened, instead of carrying on with an unusable
// handle. Query and insert failures are reported through t.
func TestInsertPermPoints(t *testing.T) {
	db, err := gorm.Open(mysql.Open("root:1234@tcp(127.0.0.1:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"), &gorm.Config{NamingStrategy: schema.NamingStrategy{
		SingularTable: true,
	}})
	if err != nil {
		t.Skipf("failed to connect database: %v", err)
	}

	var ops []models.RoleOperation
	for _, operation := range []string{
		"/alert-mutes/put",
		"/log/index-patterns",
		"/help/variable-configs",
		"/ibex-settings",
	} {
		ops = append(ops, models.RoleOperation{
			RoleName:  "Standard",
			Operation: operation,
		})
	}

	db = db.Debug()
	for _, op := range ops {
		var count int64
		err := db.Raw("SELECT COUNT(*) FROM role_operation WHERE operation = ? AND role_name = ?",
			op.Operation, op.RoleName).Scan(&count).Error
		fmt.Printf("count: %d\n", count)
		if err != nil {
			t.Errorf("check role operation exists failed, %v", err)
			continue
		}

		// Already present: nothing to insert.
		if count > 0 {
			continue
		}

		if err = db.Create(&op).Error; err != nil {
			t.Errorf("insert role operation failed, %v", err)
		}
	}
}
================================================
FILE: models/notification_record.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/strx"
"github.com/toolkits/pkg/logger"
)
// Values for NotificationRecord.Status. They start at 1 (iota + 1), so the
// zero value of the field matches neither constant.
const (
	NotiStatusSuccess = iota + 1 // 1: success
	NotiStatusFailure // 2: failure
)
// NotificationRecord is one row of the notification_record table: a single
// notification sent through Channel to Target for the event EventId.
// The column meanings are spelled out in the gorm "comment" tags below.
type NotificationRecord struct {
	Id int64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement"`
	NotifyRuleID int64 `json:"notify_rule_id" gorm:"type:bigint;comment:notify rule id"`
	EventId int64 `json:"event_id" gorm:"type:bigint;not null;index:idx_evt,priority:1;comment:event history id"`
	SubId int64 `json:"sub_id" gorm:"type:bigint;comment:subscribed rule id"`
	Channel string `json:"channel" gorm:"type:varchar(255);not null;comment:notification channel name"`
	Status int `json:"status" gorm:"type:int;comment:notification status"` // 1 = success, 2 = failure (NotiStatusSuccess / NotiStatusFailure)
	Target string `json:"target" gorm:"type:varchar(1024);not null;comment:notification target"`
	Details string `json:"details" gorm:"type:varchar(2048);default:'';comment:notification other info"`
	CreatedAt int64 `json:"created_at" gorm:"type:bigint;not null;comment:create time"`
}
// NewNotificationRecord builds the record for a notification about event that
// goes through channel to target on behalf of notifyRuleID. EventId and SubId
// are copied from the event. The record starts out as NotiStatusSuccess; Id,
// Details and CreatedAt are left at their zero values.
func NewNotificationRecord(event *AlertCurEvent, notifyRuleID int64, channel, target string) *NotificationRecord {
	record := new(NotificationRecord)
	record.NotifyRuleID = notifyRuleID
	record.EventId = event.Id
	record.SubId = event.SubRuleId
	record.Channel = channel
	record.Status = NotiStatusSuccess
	record.Target = target
	return record
}
// SetStatus stores status on the record. Calling it on a nil receiver is a
// no-op.
func (n *NotificationRecord) SetStatus(status int) {
	if n != nil {
		n.Status = status
	}
}
// SetDetails stores details on the record. Calling it on a nil receiver is a
// no-op.
func (n *NotificationRecord) SetDetails(details string) {
	if n != nil {
		n.Details = details
	}
}
// TableName returns the name of the table backing NotificationRecord,
// "notification_record".
func (n *NotificationRecord) TableName() string {
	return "notification_record"
}
// Add persists the record by handing it to the package-level Insert helper and
// returns that helper's error.
func (n *NotificationRecord) Add(ctx *ctx.Context) error {
	return Insert(ctx, n)
}
// GetGroupIds resolves the user group IDs associated with this record.
//
// When the record belongs to a subscription (SubId > 0) the IDs are parsed from
// that subscription's UserGroupIds; otherwise they are parsed from the
// NotifyGroups of the history event EventId. Both fields are split on a single
// space. A nil receiver or a failed lookup yields nil; lookup failures are only
// logged.
func (n *NotificationRecord) GetGroupIds(ctx *ctx.Context) (groupIds []int64) {
	if n == nil {
		return nil
	}

	if n.SubId <= 0 {
		// No subscription involved: fall back to the event's notify groups.
		event, err := AlertHisEventGetById(ctx, n.EventId)
		if err != nil {
			logger.Errorf("AlertHisEventGetById failed, err: %v", err)
			return nil
		}
		return strx.IdsInt64ForAPI(event.NotifyGroups, " ")
	}

	sub, err := AlertSubscribeGet(ctx, "id=?", n.SubId)
	if err != nil {
		logger.Errorf("AlertSubscribeGet failed, err: %v", err)
		return nil
	}
	return strx.IdsInt64ForAPI(sub.UserGroupIds, " ")
}
// NotificationRecordsGetByEventId returns the notification records whose
// event_id column equals eid. It delegates to NotificationRecordsGet.
func NotificationRecordsGetByEventId(ctx *ctx.Context, eid int64) ([]*NotificationRecord, error) {
	return NotificationRecordsGet(ctx, "event_id=?", eid)
}
// NotificationRecordsGet returns the notification records matching the given
// WHERE clause and its arguments. On a query error it returns nil and the
// error.
func NotificationRecordsGet(ctx *ctx.Context, where string, args ...interface{}) ([]*NotificationRecord, error) {
	var records []*NotificationRecord
	if err := DB(ctx).Where(where, args...).Find(&records).Error; err != nil {
		return nil, err
	}
	return records, nil
}
================================================
FILE: models/notify_channel.go
================================================
package models
import (
"bytes"
"crypto/hmac"
"crypto/sha256"
"crypto/tls"
"encoding/hex"
"encoding/json"
"fmt"
"html/template"
"io"
"net"
"net/http"
"net/url"
"os"
"os/exec"
"path"
"regexp"
"sort"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/ccfos/nightingale/v6/pkg/cmdx"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/google/uuid"
"github.com/pkg/errors"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/logger"
"gopkg.in/gomail.v2"
)
// EmailContext bundles a composed mail with the notify rule ID and the events
// it was built for. SendEmail pushes values of this type onto its channel.
type EmailContext struct {
	NotifyRuleId int64
	Events []*AlertCurEvent
	Mail *gomail.Message
}
// NotifyChannelConfig describes a notification channel (medium) and how
// notifications are delivered through it. It maps to the notify_channel table.
type NotifyChannelConfig struct {
	ID int64 `json:"id" gorm:"primaryKey"`
	// Basic settings
	Name string `json:"name"` // channel name
	Ident string `json:"ident"` // channel type identifier
	Description string `json:"description"` // channel description
	Enable bool `json:"enable"` // whether the channel is enabled
	// User parameter settings
	ParamConfig *NotifyParamConfig `json:"param_config,omitempty" gorm:"serializer:json"`
	// Notification request settings
	RequestType string `json:"request_type"` // http, smtp, script, flashduty, pagerduty (the values accepted by Verify)
	RequestConfig *RequestConfig `json:"request_config,omitempty" gorm:"serializer:json"`
	Weight int `json:"weight"` // weight; built-in templates are sorted by this field
	CreateAt int64 `json:"create_at"`
	CreateBy string `json:"create_by"`
	UpdateAt int64 `json:"update_at"`
	UpdateBy string `json:"update_by"`
	UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// TableName returns the name of the table backing NotifyChannelConfig,
// "notify_channel".
func (ncc *NotifyChannelConfig) TableName() string {
	return "notify_channel"
}
// RequestConfig groups the per-request-type delivery settings of a channel.
// Each field is a pointer and may be nil; the senders in this file read the
// field that corresponds to the transport they implement.
type RequestConfig struct {
	HTTPRequestConfig *HTTPRequestConfig `json:"http_request_config,omitempty" gorm:"serializer:json"`
	SMTPRequestConfig *SMTPRequestConfig `json:"smtp_request_config,omitempty" gorm:"serializer:json"`
	ScriptRequestConfig *ScriptRequestConfig `json:"script_request_config,omitempty" gorm:"serializer:json"`
	FlashDutyRequestConfig *FlashDutyRequestConfig `json:"flashduty_request_config,omitempty" gorm:"serializer:json"`
	PagerDutyRequestConfig *PagerDutyRequestConfig `json:"pagerduty_request_config,omitempty" gorm:"serializer:json"`
}
// NotifyParamConfig holds the parameter settings of a notification channel.
type NotifyParamConfig struct {
	UserInfo *UserInfo `json:"user_info,omitempty"`
	Custom Params `json:"custom"` // custom parameter settings
}
// Params wraps the list of custom parameter definitions of a channel.
type Params struct {
	Params []ParamItem `json:"params"`
}
// UserInfo names the user contact field a channel uses.
type UserInfo struct {
	ContactKey string `json:"contact_key"` // phone, email, dingtalk_robot_token, etc.
}
// FlashDutyRequestConfig holds the settings for flashduty-type channels.
type FlashDutyRequestConfig struct {
	Proxy string `json:"proxy"`
	IntegrationUrl string `json:"integration_url"`
	Timeout int `json:"timeout"` // timeout in milliseconds
	RetryTimes int `json:"retry_times"` // number of retries
	RetrySleep int `json:"retry_sleep"` // wait between retries in milliseconds
}
// PagerDutyRequestConfig holds the settings for pagerduty-type channels.
type PagerDutyRequestConfig struct {
	Proxy string `json:"proxy"`
	ApiKey string `json:"api_key"` // API key of the PagerDuty account or user, not an integration's Integration Key (routing key)
	Timeout int `json:"timeout"` // timeout in milliseconds
	RetryTimes int `json:"retry_times"` // number of retries
	RetrySleep int `json:"retry_sleep"` // wait between retries in milliseconds
}
// ParamItem is a single custom parameter definition.
type ParamItem struct {
	Key string `json:"key"` // parameter key
	CName string `json:"cname"` // parameter alias; Verify requires it whenever Key is set
	Type string `json:"type"` // parameter type; currently only string is supported
}
// SMTPRequestConfig holds the SMTP settings of an email channel. SendEmailNow
// dials Host:Port with Username/Password, uses From as the sender address and
// skips TLS certificate verification when InsecureSkipVerify is set.
type SMTPRequestConfig struct {
	Host string `json:"host"`
	Port int `json:"port"`
	Username string `json:"username"`
	Password string `json:"password"`
	From string `json:"from"`
	InsecureSkipVerify bool `json:"insecure_skip_verify"`
	Batch int `json:"batch"` // NOTE(review): not read by the code visible here; presumably a batch size — confirm against the email sender
}
// ScriptRequestConfig holds the settings of a script channel. SendScript runs
// the file at Path when it is set and otherwise writes Script to a local file
// and runs that.
type ScriptRequestConfig struct {
	ScriptType string `json:"script_type"` // script type; currently python and shell are supported
	Timeout int `json:"timeout"` // timeout in milliseconds
	Script string `json:"script"` // script content
	Path string `json:"path"` // script path
}
// HTTPRequestConfig holds the request settings of an HTTP notification
// channel. GetHTTPClient fills in defaults for Concurrency, RetryTimes and
// RetryInterval when they are zero.
type HTTPRequestConfig struct {
	URL string `json:"url"`
	Method string `json:"method"` // GET, POST, PUT
	Headers map[string]string `json:"headers"`
	Proxy string `json:"proxy"`
	Timeout int `json:"timeout"` // timeout in milliseconds
	Concurrency int `json:"concurrency"` // concurrency
	RetryTimes int `json:"retry_times"` // number of retries
	RetryInterval int `json:"retry_interval"` // interval between retries in milliseconds
	TLS *TLSConfig `json:"tls,omitempty"`
	Request RequestDetail `json:"request"`
}
// TLSConfig holds the TLS settings of an HTTP channel.
// NOTE(review): within the code visible here GetHTTPClient only consults
// SkipVerify; confirm whether the other fields are used elsewhere.
type TLSConfig struct {
	Enable bool `json:"enable"`
	CertFile string `json:"cert_file"`
	KeyFile string `json:"key_file"`
	CAFile string `json:"ca_file"`
	SkipVerify bool `json:"skip_verify"`
}
// RequestDetail holds the detailed request settings of an HTTP channel.
type RequestDetail struct {
	Parameters map[string]string `json:"parameters"` // URL parameters
	Form string `json:"form"` // source
	Body string `json:"body"` // request body (rendered as a template by parseRequestBody)
}
// SendScript runs the channel's notification script and feeds it the events,
// template, params and recipients as one JSON document on stdin (see
// getStdinBytes).
//
// When ScriptRequestConfig.Path is set, that file is executed as is. Otherwise
// the inline Script is written to ".notify_script_<channel id>" in the working
// directory (rewritten only when its content changed), made executable, and
// executed through its absolute path.
//
// It returns the command line, the combined stdout/stderr (truncated to at
// most 512 bytes on a UTF-8 rune boundary and followed by "...") and an error
// when the configuration is missing, the script file cannot be prepared, the
// run fails, or the run times out.
func (ncc *NotifyChannelConfig) SendScript(events []*AlertCurEvent, tpl map[string]interface{}, params map[string]string, sendtos []string) (string, string, error) {
	if ncc.RequestConfig == nil || ncc.RequestConfig.ScriptRequestConfig == nil {
		return "", "", fmt.Errorf("script request config not found")
	}
	config := ncc.RequestConfig.ScriptRequestConfig
	if config.Script == "" && config.Path == "" {
		return "", "", fmt.Errorf("script or path is empty")
	}

	fpath := config.Path
	if fpath == "" {
		fpath = ".notify_script_" + strconv.FormatInt(ncc.ID, 10)

		// Only touch the file when the stored script differs from the configured one.
		rewrite := true
		if file.IsExist(fpath) {
			oldContent, err := file.ToString(fpath)
			if err != nil {
				return "", "", fmt.Errorf("failed to read script file: %v", err)
			}
			if oldContent == config.Script {
				rewrite = false
			}
		}

		if rewrite {
			if _, err := file.WriteString(fpath, config.Script); err != nil {
				return "", "", fmt.Errorf("failed to write script file: %v", err)
			}
			if err := os.Chmod(fpath, 0777); err != nil {
				return "", "", fmt.Errorf("failed to chmod script file: %v", err)
			}
		}

		// exec.Command resolves a bare file name through PATH, so the script
		// must be addressed by its absolute path.
		cur, err := os.Getwd()
		if err != nil {
			return "", "", fmt.Errorf("failed to get working directory: %v", err)
		}
		fpath = path.Join(cur, fpath)
	}

	// Marshal the stdin payload once and reuse it for the process and the log line.
	stdin := getStdinBytes(events, tpl, params, sendtos)

	cmd := exec.Command(fpath)
	cmd.Stdin = bytes.NewReader(stdin)

	var buf bytes.Buffer
	cmd.Stdout = &buf
	cmd.Stderr = &buf

	err, isTimeout := cmdx.RunTimeout(cmd, time.Duration(config.Timeout)*time.Millisecond)
	logger.Infof("event_script_notify_result: exec %s output: %s isTimeout: %v err: %v stdin: %s", fpath, buf.String(), isTimeout, err, string(stdin))

	res := buf.String()

	// Truncate over-long output.
	if len(res) > 512 {
		// Make sure the cut falls on a valid UTF-8 rune boundary.
		validLen := 0
		for i := 0; i < 512 && i < len(res); {
			_, size := utf8.DecodeRuneInString(res[i:])
			if i+size > 512 {
				break
			}
			i += size
			validLen = i
		}
		res = res[:validLen] + "..."
	}

	if isTimeout {
		if err == nil {
			return cmd.String(), res, errors.New("timeout and killed process")
		}
		return cmd.String(), res, err
	}
	if err != nil {
		return cmd.String(), res, fmt.Errorf("failed to execute script: %v", err)
	}

	return cmd.String(), res, nil
}
// getStdinBytes serializes the notification payload handed to scripts on
// stdin. The JSON object has the keys "event" (the first event), "events",
// "tpl", "params" and "sendtos". It returns an empty, non-nil slice when there
// are no events and nil when marshalling fails.
func getStdinBytes(events []*AlertCurEvent, tpl map[string]interface{}, params map[string]string, sendtos []string) []byte {
	if len(events) == 0 {
		return []byte("")
	}

	payload := make(map[string]interface{}, 5)
	payload["event"] = events[0]
	payload["events"] = events
	payload["tpl"] = tpl
	payload["params"] = params
	payload["sendtos"] = sendtos

	encoded, err := json.Marshal(payload)
	if err != nil {
		return nil
	}
	return encoded
}
// NotifyChannelStatistics returns the count of enabled notification channels
// and their latest update_at. When ctx.IsCenter is false the value is fetched
// through poster from "/v1/n9e/statistic?name=notify_channel"; otherwise it is
// computed with an aggregate query on the local database.
func NotifyChannelStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=notify_channel")
	}

	query := DB(ctx).Model(&NotifyChannelConfig{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Where("enable = ?", true)

	var rows []*Statistics
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}
	return rows[0], nil
}
// NotifyChannelGetsAll returns the notification channels. When ctx.IsCenter is
// false they are fetched through poster from "/v1/n9e/notify-channels";
// otherwise the enabled channels are loaded from the local database.
func NotifyChannelGetsAll(ctx *ctx.Context) ([]*NotifyChannelConfig, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*NotifyChannelConfig](ctx, "/v1/n9e/notify-channels")
	}

	var enabled []*NotifyChannelConfig
	if err := DB(ctx).Where("enable = ?", true).Find(&enabled).Error; err != nil {
		return nil, err
	}
	return enabled, nil
}
// NotifyChannelGets lists notification channels with optional filters. A
// filter is skipped when its argument has the "unset" value: id == 0,
// name == "", ident == "" or enabled == -1. The query result and error are
// returned as they come from the database layer.
func NotifyChannelGets(ctx *ctx.Context, id int64, name, ident string, enabled int) ([]*NotifyChannelConfig, error) {
	filters := []struct {
		active bool
		clause string
		value  interface{}
	}{
		{id != 0, "id = ?", id},
		{name != "", "name = ?", name},
		{ident != "", "ident = ?", ident},
		{enabled != -1, "enable = ?", enabled},
	}

	query := DB(ctx)
	for _, f := range filters {
		if f.active {
			query = query.Where(f.clause, f.value)
		}
	}

	var result []*NotifyChannelConfig
	err := query.Find(&result).Error
	return result, err
}
// GetHTTPClient builds an *http.Client for the channel from its
// HTTPRequestConfig.
//
// Timeout (milliseconds) comes from the HTTP config, but a positive timeout in
// the FlashDuty or PagerDuty config takes precedence for channels of that
// request type; a non-positive result falls back to 10 seconds. The same
// timeout is used for dialing and for the whole request. The proxy follows the
// same precedence (type-specific proxy first, then the HTTP proxy). TLS
// certificate verification is skipped when TLS.SkipVerify is set.
//
// Side effect: zero values of Concurrency, RetryTimes and RetryInterval on the
// channel's HTTP config are replaced by the defaults 5, 3 and 100. SendHTTP
// reads RetryTimes and RetryInterval from that same config.
//
// It returns an error when nc or its HTTP request config is missing, or when
// the proxy URL cannot be parsed.
func GetHTTPClient(nc *NotifyChannelConfig) (*http.Client, error) {
	if nc == nil || nc.RequestConfig == nil || nc.RequestConfig.HTTPRequestConfig == nil {
		return nil, fmt.Errorf("%+v http request config not found", nc)
	}

	httpConfig := nc.RequestConfig.HTTPRequestConfig

	// Type-specific settings win over the generic HTTP settings.
	timeout := httpConfig.Timeout
	proxy := httpConfig.Proxy
	switch nc.RequestType {
	case "flashduty":
		if c := nc.RequestConfig.FlashDutyRequestConfig; c != nil {
			if c.Timeout > 0 {
				timeout = c.Timeout
			}
			if c.Proxy != "" {
				proxy = c.Proxy
			}
		}
	case "pagerduty":
		if c := nc.RequestConfig.PagerDutyRequestConfig; c != nil {
			if c.Timeout > 0 {
				timeout = c.Timeout
			}
			if c.Proxy != "" {
				proxy = c.Proxy
			}
		}
	}

	// A negative dial timeout would put the deadline in the past and fail every
	// request, so treat anything non-positive as "use the default".
	if timeout <= 0 {
		timeout = 10000 // HTTP default: 10 seconds
	}

	if httpConfig.Concurrency == 0 {
		httpConfig.Concurrency = 5
	}
	if httpConfig.RetryTimes == 0 {
		httpConfig.RetryTimes = 3
	}
	if httpConfig.RetryInterval == 0 {
		httpConfig.RetryInterval = 100
	}

	// Configure the proxy; a nil proxy function means no proxy.
	var proxyFunc func(*http.Request) (*url.URL, error)
	if proxy != "" {
		proxyURL, err := url.Parse(proxy)
		if err != nil {
			return nil, fmt.Errorf("invalid proxy URL: %v", err)
		}
		proxyFunc = http.ProxyURL(proxyURL)
	}

	tlsConfig := &tls.Config{
		InsecureSkipVerify: httpConfig.TLS != nil && httpConfig.TLS.SkipVerify,
	}

	requestTimeout := time.Duration(timeout) * time.Millisecond
	transport := &http.Transport{
		Proxy:           proxyFunc,
		TLSClientConfig: tlsConfig,
		DialContext: (&net.Dialer{
			Timeout: requestTimeout,
		}).DialContext,
	}

	client := &http.Client{
		Transport: transport,
		Timeout:   requestTimeout,
	}

	return client, nil
}
// makeHTTPRequest builds the outgoing notification request.
//
// Tencent Cloud channels ("tx-sms", "tx-voice") get the signing headers from
// setTxHeader added to headers. Alibaba Cloud channels ("ali-sms",
// "ali-voice") are rebuilt without a body; their query string and headers come
// from getAliQuery and parameters is not appended to the query. Every other
// channel sends headers and parameters as given.
func (ncc *NotifyChannelConfig) makeHTTPRequest(httpConfig *HTTPRequestConfig, url string, headers map[string]string, parameters map[string]string, body []byte) (*http.Request, error) {
	req, err := http.NewRequest(httpConfig.Method, url, bytes.NewBuffer(body))
	if err != nil {
		logger.Errorf("failed to create request: %v", err)
		return nil, err
	}
	query := req.URL.Query()

	isAli := ncc.Ident == "ali-sms" || ncc.Ident == "ali-voice"
	isTx := ncc.Ident == "tx-sms" || ncc.Ident == "tx-voice"

	switch {
	case isTx:
		// Tencent Cloud SMS/voice: sign the payload and attach the signature headers.
		headers = ncc.setTxHeader(headers, body)
		for name, value := range headers {
			req.Header.Add(name, value)
		}
	case isAli:
		// Alibaba Cloud SMS/voice: everything travels in the query string, so
		// the request is recreated without a body.
		if req, err = http.NewRequest(httpConfig.Method, url, nil); err != nil {
			return nil, err
		}
		query, headers = ncc.getAliQuery(ncc.Ident, query, httpConfig.Request.Parameters["AccessKeyId"], httpConfig.Request.Parameters["AccessKeySecret"], parameters)
		for name, value := range headers {
			req.Header.Set(name, value)
		}
	default:
		for name, value := range headers {
			req.Header.Add(name, value)
		}
	}

	if !isAli {
		for name, value := range parameters {
			query.Add(name, value)
		}
	}
	req.URL.RawQuery = query.Encode()

	// Log the complete request.
	logger.Debugf("URL: %v, Method: %s, Headers: %+v, params: %+v, Body: %s", req.URL, req.Method, req.Header, query, string(body))

	return req, nil
}
// makeFlashDutyRequest builds the JSON POST request sent to the FlashDuty
// integration URL. A non-zero flashDutyChannelID is appended to the query
// string as "channel_id".
func (ncc *NotifyChannelConfig) makeFlashDutyRequest(url string, bodyBytes []byte, flashDutyChannelID int64) (*http.Request, error) {
	request, err := http.NewRequest("POST", url, bytes.NewBuffer(bodyBytes))
	if err != nil {
		return nil, err
	}

	params := request.URL.Query()
	if flashDutyChannelID != 0 {
		// A collaboration space (channel_id) is configured for FlashDuty: pass it along.
		channelID := strconv.FormatInt(flashDutyChannelID, 10)
		params.Add("channel_id", channelID)
	}
	request.URL.RawQuery = params.Encode()

	request.Header.Add("Content-Type", "application/json")
	return request, nil
}
// SendFlashDuty posts events as a JSON array to the channel's FlashDuty
// integration URL.
//
// Transport errors are retried up to RetryTimes additional times (default 3)
// with RetrySleep milliseconds (default one second) between attempts. As soon
// as any HTTP response arrives it is returned as
// "status_code:<code>, response:<body>" with a nil error, whatever the status
// code is. When every attempt fails, the last transport error text is returned
// together with a generic error.
//
// It returns an error without sending anything when client or the FlashDuty
// request config is missing.
func (ncc *NotifyChannelConfig) SendFlashDuty(events []*AlertCurEvent, flashDutyChannelID int64, client *http.Client) (string, error) {
	// TODO: send events in batches per channel.
	if client == nil {
		return "", fmt.Errorf("http client not found")
	}
	if ncc.RequestConfig == nil || ncc.RequestConfig.FlashDutyRequestConfig == nil {
		return "", fmt.Errorf("flashduty request config not found")
	}
	config := ncc.RequestConfig.FlashDutyRequestConfig

	body, err := json.Marshal(events)
	if err != nil {
		return "", err
	}

	url := config.IntegrationUrl

	retrySleep := time.Second
	if config.RetrySleep > 0 {
		retrySleep = time.Duration(config.RetrySleep) * time.Millisecond
	}

	retryTimes := 3
	if config.RetryTimes > 0 {
		retryTimes = config.RetryTimes
	}

	// Keep the last error so it can be returned and shown to the user in the UI.
	var lastErrorMessage string
	for i := 0; i <= retryTimes; i++ {
		req, err := ncc.makeFlashDutyRequest(url, body, flashDutyChannelID)
		if err != nil {
			logger.Errorf("send_flashduty: failed to create request. url=%s request_body=%s error=%v", url, string(body), err)
			return fmt.Sprintf("failed to create request. error: %v", err), err
		}

		// The timeout is already configured on the client.
		resp, err := client.Do(req)
		if err != nil {
			logger.Errorf("send_flashduty: http_call=fail url=%s request_body=%s error=%v times=%d", url, string(body), err, i+1)
			lastErrorMessage = err.Error()
			if i < retryTimes {
				time.Sleep(retrySleep)
			}
			continue
		}

		// Reaching this point means FlashDuty answered. The answer is not
		// judged, only stored for the user: even a 5xx is not retried, because
		// retrying could corrupt data on the server side, and for alert events
		// it is enough that the call result is visible in the UI.
		var resBody []byte
		if resp.Body != nil {
			resBody, err = io.ReadAll(resp.Body)
			resp.Body.Close()
			if err != nil {
				logger.Errorf("send_flashduty: failed to read response. request_body=%s, error=%v", string(body), err)
				resBody = []byte("failed to read response. error: " + err.Error())
			}
		}

		logger.Infof("send_flashduty: http_call=succ url=%s request_body=%s response_code=%d response_body=%s times=%d", url, string(body), resp.StatusCode, string(resBody), i+1)
		return fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(resBody)), nil
	}

	return lastErrorMessage, errors.New("failed to send request")
}
// SendPagerDuty sends every event as a separate request to the PagerDuty
// Events v2 enqueue endpoint, using routingKey as the routing key and the
// event hash as dedup key. Recovered events are sent with the "resolve"
// action, all others with "trigger". siteUrl is used to build the links back
// to the event detail and mute pages.
//
// Each event is attempted up to RetryTimes+1 times (RetryTimes defaults to 3)
// with RetrySleep milliseconds (default one second) between attempts; an
// attempt counts as successful only for status 200 or 202.
//
// The returned string joins one "event <id>: <summary>" entry per event with
// " | ". The error is non-nil when at least one event failed and joins the
// failure messages the same way.
func (ncc *NotifyChannelConfig) SendPagerDuty(events []*AlertCurEvent, routingKey, siteUrl string, client *http.Client) (string, error) {
	if client == nil {
		return "", fmt.Errorf("http client not found")
	}
	if ncc.RequestConfig == nil || ncc.RequestConfig.PagerDutyRequestConfig == nil {
		return "", fmt.Errorf("pagerduty request config not found")
	}
	retrySleep := time.Second
	if ncc.RequestConfig.PagerDutyRequestConfig.RetrySleep > 0 {
		retrySleep = time.Duration(ncc.RequestConfig.PagerDutyRequestConfig.RetrySleep) * time.Millisecond
	}
	retryTimes := 3
	if ncc.RequestConfig.PagerDutyRequestConfig.RetryTimes > 0 {
		retryTimes = ncc.RequestConfig.PagerDutyRequestConfig.RetryTimes
	}
	endpoint := "https://events.pagerduty.com/v2/enqueue"
	var failedMsgs []string
	var responses []string
	for _, event := range events {
		action := "trigger"
		if event.IsRecovered {
			action = "resolve"
		}
		// Severity mapping: 2 -> "error", 3 -> "warning", anything else -> "critical".
		severity := "critical"
		switch event.Severity {
		case 2:
			severity = "error"
		case 3:
			severity = "warning"
		}
		jsonBody := map[string]interface{}{
			"routing_key": routingKey,
			"event_action": action,
			"dedup_key": event.Hash,
			"payload": map[string]interface{}{
				"summary": event.RuleName,
				"source": event.Cluster,
				"severity": severity,
				"group": event.GroupName,
				"component": event.Target,
				"timestamp": time.Unix(event.TriggerTime, 0).Format(time.RFC3339),
				"custom_details": map[string]interface{}{
					"tags": event.TagsJSON,
					"annotations": event.AnnotationsJSON,
					"cluster": event.Cluster,
					"rule_id": event.RuleId,
					"rule_note": event.RuleNote,
					"rule_prod": event.RuleProd,
					"prom_ql": event.PromQl,
					"target_ident": event.TargetIdent,
					"target_note": event.TargetNote,
					"datasource_id": event.DatasourceId,
					"first_trigger_time": time.Unix(event.FirstTriggerTime, 0).Format(time.RFC3339),
					"prom_for_duration": event.PromForDuration,
					"runbook_url": event.RunbookUrl,
					"notify_cur_number": event.NotifyCurNumber,
					"group_id": event.GroupId,
					"cate": event.Cate,
				},
			},
			"links": []map[string]string{
				{"href": fmt.Sprintf("%s/alert-his-events/%d", siteUrl, event.Id), "text": "Event Detail"},
				{"href": fmt.Sprintf("%s/alert-mutes/add?__event_id=%d", siteUrl, event.Id), "text": "Mute this alert"},
			},
		}
		body, err := json.Marshal(jsonBody)
		if err != nil {
			logger.Errorf("send_pagerduty: failed to marshal request body. error=%v", err)
			failedMsgs = append(failedMsgs, fmt.Sprintf("event %d marshal error: %v", event.Id, err))
			// Record a placeholder response so the caller can still tell the events apart.
			responses = append(responses, fmt.Sprintf("event %d: marshal error: %v", event.Id, err))
			continue
		}
		// lastErrorMessage is empty after the loop only if an attempt succeeded;
		// lastRespSummary holds the most recent HTTP response, if there was one.
		var lastErrorMessage string
		var lastRespSummary string
		attempts := retryTimes + 1
		for i := 0; i < attempts; i++ {
			req, err := http.NewRequest("POST", endpoint, bytes.NewReader(body))
			if err != nil {
				logger.Errorf("send_pagerduty: failed to create request. url=%s request_body=%s error=%v", endpoint, string(body), err)
				lastErrorMessage = err.Error()
				if i < attempts-1 {
					time.Sleep(retrySleep)
					continue
				}
				break
			}
			req.Header.Add("Content-Type", "application/json")
			resp, err := client.Do(req)
			if err != nil {
				logger.Errorf("send_pagerduty: http_call=fail url=%s request_body=%s error=%v times=%d", endpoint, string(body), err, i+1)
				lastErrorMessage = err.Error()
				if i < attempts-1 {
					time.Sleep(retrySleep)
					continue
				}
				break
			}
			// Read the body and make sure it gets closed.
			var resBody []byte
			if resp.Body != nil {
				resBody, err = io.ReadAll(resp.Body)
				resp.Body.Close()
				if err != nil {
					logger.Errorf("send_pagerduty: failed to read response. request_body=%s, error=%v", string(body), err)
					resBody = []byte("failed to read response. error: " + err.Error())
				}
			} else {
				resBody = []byte("")
			}
			respSummary := fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(resBody))
			lastRespSummary = respSummary
			logger.Infof("send_pagerduty: http_call=succ url=%s request_body=%s response_code=%d response_body=%s times=%d", endpoint, string(body), resp.StatusCode, string(resBody), i+1)
			if resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusAccepted {
				// This event was delivered successfully.
				lastErrorMessage = ""
				break
			}
			// Any other status counts as a failure and is retried while attempts remain.
			lastErrorMessage = respSummary
			if i < attempts-1 {
				time.Sleep(retrySleep)
				continue
			}
			break
		}
		// Keep this event's response summary (success or failure) so the caller
		// can record details such as the traceId.
		if lastRespSummary == "" && lastErrorMessage != "" {
			lastRespSummary = lastErrorMessage
		}
		responses = append(responses, fmt.Sprintf("event %d: %s", event.Id, lastRespSummary))
		if lastErrorMessage != "" {
			failedMsgs = append(failedMsgs, fmt.Sprintf("event %d: %s", event.Id, lastErrorMessage))
		}
	}
	// Return every event's response summary to the caller so that details
	// returned by PagerDuty, such as the traceId, can be recorded.
	if len(failedMsgs) > 0 {
		return strings.Join(responses, " | "), errors.New(strings.Join(failedMsgs, " | "))
	}
	return strings.Join(responses, " | "), nil
}
// SendHTTP renders the channel's request body, URL, headers and query
// parameters from the events, message template, custom params and recipients,
// and sends the resulting HTTP request.
//
// The template data exposes "sendtos", "params", "tpl", "events", "event" (the
// first event) and, when there is at least one recipient, "sendto" (the first
// recipient).
//
// Transport errors are retried: RetryTimes attempts are made in total (at
// least one, even when RetryTimes was never defaulted), sleeping RetryInterval
// milliseconds between attempts. The first HTTP response ends the loop: status
// 200 is a success, any other status is returned as an error without retrying.
// In both cases the string result is "status_code:<code>, response:<body>".
// When every attempt fails on the transport level, the last error text is
// returned along with an error.
func (ncc *NotifyChannelConfig) SendHTTP(events []*AlertCurEvent, tpl map[string]interface{}, params map[string]string, sendtos []string, client *http.Client) (string, error) {
	if client == nil {
		return "", fmt.Errorf("http client not found")
	}
	if len(events) == 0 {
		return "", fmt.Errorf("events is empty")
	}
	if ncc.RequestConfig == nil || ncc.RequestConfig.HTTPRequestConfig == nil {
		return "", fmt.Errorf("http request config not found")
	}
	httpConfig := ncc.RequestConfig.HTTPRequestConfig

	// Template data shared by the body, URL, header and parameter templates.
	fullTpl := map[string]interface{}{
		"sendtos": sendtos, // recipients
		"params":  params,  // custom parameters
		"tpl":     tpl,     // message template
		"events":  events,
		"event":   events[0],
	}
	if len(sendtos) > 0 {
		fullTpl["sendto"] = sendtos[0]
	}

	// Render the message template and variable configuration into the request body.
	body, err := ncc.parseRequestBody(fullTpl)
	if err != nil {
		logger.Errorf("failed to parse request body: %v, event: %v", err, events)
		return "", err
	}

	// Substitute variables in the URL, headers and parameters.
	url, headers, parameters := ncc.replaceVariables(fullTpl)
	logger.Infof("url: %v, headers: %v, parameters: %v", url, headers, parameters)

	// RetryTimes is the total number of attempts; never make fewer than one.
	attempts := httpConfig.RetryTimes
	if attempts < 1 {
		attempts = 1
	}

	var lastErrorMessage string
	for i := 0; i < attempts; i++ {
		req, err := ncc.makeHTTPRequest(httpConfig, url, headers, parameters, body)
		if err != nil {
			logger.Errorf("send_http: failed to create request. url=%s request_body=%s error=%v", url, string(body), err)
			return fmt.Sprintf("failed to create request. error: %v", err), err
		}

		resp, err := client.Do(req)
		if err != nil {
			logger.Errorf("send_http: failed to send http notify. url=%s request_body=%s error=%v", url, string(body), err)
			lastErrorMessage = err.Error()
			// No point in sleeping after the final attempt.
			if i < attempts-1 {
				time.Sleep(time.Duration(httpConfig.RetryInterval) * time.Millisecond)
			}
			continue
		}

		// Read the response. It is kept in its own variable so the request body
		// stays available for the log line below.
		respBody, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		logger.Debugf("send http request: %+v, response: %+v, body: %+v", req, resp, string(respBody))
		if err != nil {
			logger.Errorf("send_http: failed to read response. url=%s request_body=%s error=%v", url, string(body), err)
		}

		result := fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(respBody))
		if resp.StatusCode == http.StatusOK {
			return result, nil
		}
		return result, fmt.Errorf("failed to send request, status code: %d, body: %s", resp.StatusCode, string(respBody))
	}

	return lastErrorMessage, errors.New("all retries failed, last error: " + lastErrorMessage)
}
// getAliQuery returns the query parameters and request headers for an Alibaba
// Cloud API call.
//
// The business parameters are picked from params according to ident
// ("ali-sms" or "ali-voice") and added to query; "AccessKeyId" and
// "AccessKeySecret" are removed from query. The headers carry the x-acs-*
// fields, the configured "Host" header and an ACS3-HMAC-SHA256
// "Authorization" value computed by getSignature with ak and sk. The
// x-acs-action header is chosen from the channel's own Ident.
func (ncc *NotifyChannelConfig) getAliQuery(ident string, query url.Values, ak, sk string, params map[string]string) (url.Values, map[string]string) {
	const (
		method = "POST"
		uri    = "/"
	)

	// Basic configuration.
	cfg := ncc.RequestConfig.HTTPRequestConfig

	// Business parameters, depending on the service.
	var business map[string]string
	switch ident {
	case "ali-sms":
		business = map[string]string{
			"PhoneNumbers":  params["PhoneNumbers"],
			"SignName":      params["SignName"],
			"TemplateCode":  params["TemplateCode"],
			"TemplateParam": params["TemplateParam"],
		}
	case "ali-voice":
		business = map[string]string{
			"CalledNumber":     params["CalledNumber"],
			"TtsCode":          params["TtsCode"],
			"TtsParam":         params["TtsParam"],
			"CalledShowNumber": params["CalledShowNumber"],
		}
	}

	// Base headers; the request is sent without a body, hence the hash of "".
	emptyBodySum := sha256.Sum256([]byte(""))
	headers := map[string]string{
		"host":                  cfg.Headers["Host"],
		"x-acs-version":         "2017-05-25",
		"x-acs-date":            time.Now().UTC().Format("2006-01-02T15:04:05Z"),
		"x-acs-signature-nonce": uuid.New().String(),
		"x-acs-content-sha256":  fmt.Sprintf("%x", emptyBodySum),
	}

	// Action according to the service type.
	switch ncc.Ident {
	case "ali-sms":
		headers["x-acs-action"] = "SendSms"
	case "ali-voice":
		headers["x-acs-action"] = "SingleCallByTts"
	}

	// Sign, then attach the authorization header.
	signature, signedHeaders := getSignature(sk, method, uri, headers, business, "")
	headers["Authorization"] = fmt.Sprintf("ACS3-HMAC-SHA256 Credential=%s,SignedHeaders=%s,Signature=%s",
		ak, signedHeaders, signature)

	// Business parameters go into the query string; credentials must not.
	for name, value := range business {
		query.Add(name, value)
	}
	query.Del("AccessKeyId")
	query.Del("AccessKeySecret")

	return query, headers
}
// getSignature 计算签名
func getSignature(accessKeySecret string, httpMethod, canonicalURI string, headers map[string]string, queryParams map[string]string, body string) (string, string) {
// 1. 构造规范化请求
// 处理查询参数
var sortedQueryParams []string
for k, v := range queryParams {
sortedQueryParams = append(sortedQueryParams, fmt.Sprintf("%s=%s",
percentEncode(k), percentEncode(v)))
}
sort.Strings(sortedQueryParams)
canonicalQueryString := strings.Join(sortedQueryParams, "&")
// 处理请求头
var canonicalHeaders []string
var signedHeaders []string
for k, v := range headers {
lowerK := strings.ToLower(k)
if lowerK == "host" || lowerK == "content-type" || strings.HasPrefix(lowerK, "x-acs-") {
canonicalHeaders = append(canonicalHeaders, fmt.Sprintf("%s:%s", lowerK, strings.TrimSpace(v)))
signedHeaders = append(signedHeaders, lowerK)
}
}
sort.Strings(canonicalHeaders)
sort.Strings(signedHeaders)
canonicalHeadersStr := strings.Join(canonicalHeaders, "\n") + "\n"
signedHeadersStr := strings.Join(signedHeaders, ";")
// 计算body的hash值
h := sha256.New()
h.Write([]byte(body))
bodyHash := hex.EncodeToString(h.Sum(nil))
// 构造规范化请求
canonicalRequest := fmt.Sprintf("%s\n%s\n%s\n%s\n%s\n%s",
httpMethod, canonicalURI, canonicalQueryString, canonicalHeadersStr,
signedHeadersStr, bodyHash)
// 2. 构造待签名字符串
algorithm := "ACS3-HMAC-SHA256"
h = sha256.New()
h.Write([]byte(canonicalRequest))
canonicalRequestHash := hex.EncodeToString(h.Sum(nil))
stringToSign := fmt.Sprintf("%s\n%s", algorithm, canonicalRequestHash)
// 3. 计算签名
h = hmac.New(sha256.New, []byte(accessKeySecret))
h.Write([]byte(stringToSign))
signature := hex.EncodeToString(h.Sum(nil))
return signature, signedHeadersStr
}
// percentEncode query-escapes str and then adjusts the result: "+" becomes
// "%20", "*" becomes "%2A" and "%7E" becomes "~".
func percentEncode(str string) string {
	fixups := strings.NewReplacer(
		"+", "%20",
		"*", "%2A",
		"%7E", "~",
	)
	return fixups.Replace(url.QueryEscape(str))
}
// setTxHeader adds the Tencent Cloud signing headers to headers and returns the
// same map: "X-TC-Timestamp" holds the current Unix time in seconds and
// "Authorization" the signature getTxSignature computes for the payload at
// that timestamp. headers must be non-nil because it is written to.
func (ncc *NotifyChannelConfig) setTxHeader(headers map[string]string, payloadBytes []byte) map[string]string {
	issuedAt := time.Now().Unix()
	signature := ncc.getTxSignature(string(payloadBytes), issuedAt)

	headers["X-TC-Timestamp"] = fmt.Sprintf("%d", issuedAt)
	headers["Authorization"] = signature
	return headers
}
// getTxSignature builds the TC3-HMAC-SHA256 "Authorization" header value for a
// request with the given payload and Unix timestamp.
//
// It reads the HTTP method and the "Host", "X-TC-Action", "Service",
// "Secret_ID" and "Secret_Key" entries from the channel's configured headers.
// The signed headers are content-type (fixed to application/json), host and
// x-tc-action; the canonical URI is "/" and the canonical query string is
// empty.
func (ncc *NotifyChannelConfig) getTxSignature(payloadStr string, timestamp int64) string {
	const signedHeaders = "content-type;host;x-tc-action"

	cfg := ncc.RequestConfig.HTTPRequestConfig
	service := cfg.Headers["Service"]

	// Canonical request.
	canonicalHeaders := fmt.Sprintf("content-type:application/json\nhost:%s\nx-tc-action:%s\n",
		cfg.Headers["Host"], strings.ToLower(cfg.Headers["X-TC-Action"]))
	payloadSum := sha256.Sum256([]byte(payloadStr))
	canonicalRequest := fmt.Sprintf("%s\n%s\n%s\n%s\n%s\n%s",
		cfg.Method,
		"/",
		"",
		canonicalHeaders,
		signedHeaders,
		hex.EncodeToString(payloadSum[:]))

	// 1. Date of the request (UTC).
	date := time.Unix(timestamp, 0).UTC().Format("2006-01-02")

	// 2. String to sign.
	credentialScope := fmt.Sprintf("%s/%s/tc3_request", date, service)
	requestSum := sha256.Sum256([]byte(canonicalRequest))
	stringToSign := fmt.Sprintf("TC3-HMAC-SHA256\n%d\n%s\n%s",
		timestamp,
		credentialScope,
		hex.EncodeToString(requestSum[:]))

	// 3. Derive the signing key step by step (date, service, "tc3_request"),
	// then sign.
	key := []byte("TC3" + cfg.Headers["Secret_Key"])
	for _, part := range []string{date, service, "tc3_request"} {
		key = sign(key, part)
	}
	signature := hex.EncodeToString(sign(key, stringToSign))

	// 4. Assemble the Authorization value.
	return fmt.Sprintf("TC3-HMAC-SHA256 Credential=%s/%s, SignedHeaders=%s, Signature=%s",
		cfg.Headers["Secret_ID"], credentialScope, signedHeaders, signature)
}
// sign returns the raw HMAC-SHA256 of msg keyed with key.
func sign(key []byte, msg string) []byte {
	mac := hmac.New(sha256.New, key)
	mac.Write([]byte(msg))
	digest := make([]byte, 0, sha256.Size)
	return mac.Sum(digest)
}
// parseRequestBody renders the channel's configured request body as a template
// with bodyTpl as data. The template is prefixed with definitions that expose
// $tpl, $sendto, $sendtos, $params, $events and $event. A parse error yields
// nil; an execution error is returned together with whatever was rendered up
// to that point.
func (ncc *NotifyChannelConfig) parseRequestBody(bodyTpl map[string]interface{}) ([]byte, error) {
	const prelude = "{{$tpl := .tpl}}" +
		"{{$sendto := .sendto}}" +
		"{{$sendtos := .sendtos}}" +
		"{{$params := .params}}" +
		"{{$events := .events}}" +
		"{{$event := .event}}"

	source := prelude + ncc.RequestConfig.HTTPRequestConfig.Request.Body
	parsed, err := template.New("requestBody").Funcs(tplx.TemplateFuncMap).Parse(source)
	if err != nil {
		return nil, err
	}

	var rendered bytes.Buffer
	err = parsed.Execute(&rendered, bodyTpl)
	return rendered.Bytes(), err
}
func getParsedString(name, tplStr string, tplData map[string]interface{}) string {
var defs = []string{
"{{$tpl := .tpl}}",
"{{$sendto := .sendto}}",
"{{$sendtos := .sendtos}}",
"{{$params := .params}}",
"{{$events := .events}}",
"{{$event := .event}}",
}
text := strings.Join(append(defs, tplStr), "")
tpl, err := template.New(name).Funcs(tplx.TemplateFuncMap).Parse(text)
if err != nil {
return ""
}
var body bytes.Buffer
err = tpl.Execute(&body, tplData)
if err != nil {
return fmt.Sprintf("failed to parse template: %v data: %v", err, tplData)
}
return body.String()
}
// replaceVariables returns the channel's URL, headers and query parameters
// with template expressions rendered against tpl. Values that contain no
// template syntax are passed through unchanged. The returned maps are freshly
// allocated, so the configured ones are not modified.
func (ncc *NotifyChannelConfig) replaceVariables(tpl map[string]interface{}) (string, map[string]string, map[string]string) {
	httpConfig := ncc.RequestConfig.HTTPRequestConfig

	// render expands value only when it actually contains template syntax.
	render := func(name, value string) string {
		if !needsTemplateRendering(value) {
			return value
		}
		return getParsedString(name, value, tpl)
	}

	target := httpConfig.URL
	if needsTemplateRendering(target) {
		logger.Infof("replace variables url: %s tpl: %+v", httpConfig.URL, tpl)
		target = getParsedString("url", httpConfig.URL, tpl)
	}

	headers := make(map[string]string, len(httpConfig.Headers))
	for name, value := range httpConfig.Headers {
		headers[name] = render(name, value)
	}

	parameters := make(map[string]string, len(httpConfig.Request.Parameters))
	for name, value := range httpConfig.Request.Parameters {
		parameters[name] = render(name, value)
	}

	return target, headers, parameters
}
// needsTemplateRendering reports whether s contains template syntax, i.e. both
// "{{" and "}}" occur somewhere in s (in any order).
func needsTemplateRendering(s string) bool {
	if !strings.Contains(s, "{{") {
		return false
	}
	return strings.Contains(s, "}}")
}
// SendEmail composes an HTML mail from tpl ("subject" and "content") addressed
// to sendtos and pushes it, together with notifyRuleId and events, onto ch.
// The send on ch blocks until the value is accepted.
//
// A missing SMTP config is logged and nothing is queued. A "subject" or
// "content" entry that is missing or not a string is logged and replaced by an
// empty string, so the notification still goes out instead of panicking.
func (ncc *NotifyChannelConfig) SendEmail(notifyRuleId int64, events []*AlertCurEvent, tpl map[string]interface{}, sendtos []string, ch chan *EmailContext) {
	if ncc.RequestConfig == nil || ncc.RequestConfig.SMTPRequestConfig == nil {
		logger.Errorf("email_sender: smtp request config not found, notify_rule_id=%d", notifyRuleId)
		return
	}

	subject, ok := tpl["subject"].(string)
	if !ok {
		logger.Errorf("email_sender: template has no string subject, notify_rule_id=%d", notifyRuleId)
	}
	content, ok := tpl["content"].(string)
	if !ok {
		logger.Errorf("email_sender: template has no string content, notify_rule_id=%d", notifyRuleId)
	}

	m := gomail.NewMessage()
	m.SetHeader("From", ncc.RequestConfig.SMTPRequestConfig.From)
	m.SetHeader("To", sendtos...)
	m.SetHeader("Subject", subject)
	m.SetBody("text/html", content)

	ch <- &EmailContext{NotifyRuleId: notifyRuleId, Events: events, Mail: m}
}
// SendEmailNow dials the SMTP server configured on this channel and sends one
// HTML email synchronously.
//
// The subject and body are taken from tpl["subject"] and tpl["content"]; an
// error is returned if either is missing or not a string. The recipients are
// sendtos. The events argument is not used by this method.
//
// It returns the dial error or the send error, if any. The SMTP connection is
// closed before returning.
func (ncc *NotifyChannelConfig) SendEmailNow(events []*AlertCurEvent, tpl map[string]interface{}, sendtos []string) error {
	// Validate the template data up front so that bad input yields an error
	// instead of a panic, and no connection is opened needlessly.
	subject, ok := tpl["subject"].(string)
	if !ok {
		return fmt.Errorf("email_sender: template field %q is missing or not a string", "subject")
	}
	content, ok := tpl["content"].(string)
	if !ok {
		return fmt.Errorf("email_sender: template field %q is missing or not a string", "content")
	}

	smtpCfg := ncc.RequestConfig.SMTPRequestConfig
	d := gomail.NewDialer(smtpCfg.Host, smtpCfg.Port, smtpCfg.Username, smtpCfg.Password)
	if smtpCfg.InsecureSkipVerify {
		d.TLSConfig = &tls.Config{InsecureSkipVerify: true}
	}

	s, err := d.Dial()
	if err != nil {
		logger.Errorf("email_sender: failed to dial: %s", err)
		return err
	}
	// Release the connection once the message has been sent; without this
	// every call leaks an open SMTP session.
	defer s.Close()

	m := gomail.NewMessage()
	m.SetHeader("From", smtpCfg.From)
	m.SetHeader("To", sendtos...)
	m.SetHeader("Subject", subject)
	m.SetBody("text/html", content)
	return gomail.Send(s, m)
}
// notifyChannelIdentPattern is the set of characters allowed in a channel
// identifier. It is compiled once instead of on every Verify call.
var notifyChannelIdentPattern = regexp.MustCompile("^[a-zA-Z0-9_-]+$")

// Verify checks that the channel configuration is complete and consistent.
//
// The checks run in this order and the first failure is returned:
//  1. Name and Ident are non-empty and Ident matches ^[a-zA-Z0-9_-]+$.
//  2. RequestType is one of "http", "smtp", "script", "flashduty" or
//     "pagerduty".
//  3. Every custom parameter with a key also has a display name (CName).
//  4. The request configuration matching RequestType passes its own
//     validation.
//
// It returns nil when all checks pass.
func (ncc *NotifyChannelConfig) Verify() error {
	if ncc.Name == "" {
		return errors.New("channel name cannot be empty")
	}
	if ncc.Ident == "" {
		return errors.New("channel identifier cannot be empty")
	}
	if !notifyChannelIdentPattern.MatchString(ncc.Ident) {
		return fmt.Errorf("channel identifier must be ^[a-zA-Z0-9_-]+$, current: %s", ncc.Ident)
	}

	// Pick the validator for the request type now, but run it last so the
	// order in which errors are reported stays the same as before.
	var validateRequest func() error
	switch ncc.RequestType {
	case "http":
		validateRequest = ncc.ValidateHTTPRequestConfig
	case "smtp":
		validateRequest = ncc.ValidateSMTPRequestConfig
	case "script":
		validateRequest = ncc.ValidateScriptRequestConfig
	case "flashduty":
		validateRequest = ncc.ValidateFlashDutyRequestConfig
	case "pagerduty":
		validateRequest = ncc.ValidatePagerDutyRequestConfig
	default:
		return errors.New("invalid request type, must be 'http', 'smtp', 'script', 'flashduty' or 'pagerduty'")
	}

	if ncc.ParamConfig != nil {
		for _, param := range ncc.ParamConfig.Custom.Params {
			if param.Key != "" && param.CName == "" {
				return errors.New("param items must have valid cname")
			}
		}
	}

	return validateRequest()
}
// ValidateHTTPRequestConfig checks the channel's HTTP request configuration.
//
// It returns an error when the request configuration or its HTTP section is
// nil; otherwise it returns the result of HTTPRequestConfig.Verify.
func (ncc *NotifyChannelConfig) ValidateHTTPRequestConfig() error {
	// Guard the outer pointer too: a channel submitted without any request
	// configuration must produce an error, not a nil-pointer panic.
	if ncc.RequestConfig == nil || ncc.RequestConfig.HTTPRequestConfig == nil {
		return errors.New("http request config cannot be nil")
	}
	return ncc.RequestConfig.HTTPRequestConfig.Verify()
}
// Verify checks that the HTTP request configuration has a URL and a supported
// method. The method must be exactly "GET", "POST" or "PUT" (case-sensitive).
// It returns nil when the configuration is acceptable.
func (c *HTTPRequestConfig) Verify() error {
	switch {
	case c.URL == "":
		return errors.New("http request URL cannot be empty")
	case c.Method == "":
		return errors.New("http request method cannot be empty")
	}

	switch c.Method {
	case "GET", "POST", "PUT":
		return nil
	default:
		return errors.New("http request method must be GET, POST or PUT")
	}
}
// ValidateSMTPRequestConfig checks the channel's SMTP request configuration.
//
// It returns an error when the request configuration or its SMTP section is
// nil; otherwise it returns the result of SMTPRequestConfig.Verify.
func (ncc *NotifyChannelConfig) ValidateSMTPRequestConfig() error {
	// Guard the outer pointer too: a channel submitted without any request
	// configuration must produce an error, not a nil-pointer panic.
	if ncc.RequestConfig == nil || ncc.RequestConfig.SMTPRequestConfig == nil {
		return errors.New("smtp request config cannot be nil")
	}
	return ncc.RequestConfig.SMTPRequestConfig.Verify()
}
// Verify checks that every required SMTP setting is present: host, a positive
// port, username, password and sender address. The first missing setting, in
// that order, determines the returned error. It returns nil when all are set.
func (c *SMTPRequestConfig) Verify() error {
	var problem string
	switch {
	case c.Host == "":
		problem = "smtp host cannot be empty"
	case c.Port <= 0:
		problem = "smtp port must be greater than 0"
	case c.Username == "":
		problem = "smtp username cannot be empty"
	case c.Password == "":
		problem = "smtp password cannot be empty"
	case c.From == "":
		problem = "smtp from address cannot be empty"
	default:
		return nil
	}
	return errors.New(problem)
}
// ValidateScriptRequestConfig checks the channel's script request
// configuration.
//
// It returns an error when the request configuration or its script section is
// nil, when ScriptType is neither "script" nor "path", or when both Script and
// Path are empty. It does not check that the populated field matches
// ScriptType.
func (ncc *NotifyChannelConfig) ValidateScriptRequestConfig() error {
	// Guard the outer pointer too: a channel submitted without any request
	// configuration must produce an error, not a nil-pointer panic.
	if ncc.RequestConfig == nil || ncc.RequestConfig.ScriptRequestConfig == nil {
		return errors.New("script request config cannot be nil")
	}

	script := ncc.RequestConfig.ScriptRequestConfig
	if script.ScriptType != "script" && script.ScriptType != "path" {
		return errors.New("script type must be 'script' or 'path'")
	}
	if script.Script == "" && script.Path == "" {
		return errors.New("either script content or script path must be provided")
	}
	return nil
}
// ValidateFlashDutyRequestConfig checks that the channel carries a FlashDuty
// request configuration. It returns an error when the request configuration
// or its FlashDuty section is nil; the section's contents are not inspected.
func (ncc *NotifyChannelConfig) ValidateFlashDutyRequestConfig() error {
	// Guard the outer pointer too: a channel submitted without any request
	// configuration must produce an error, not a nil-pointer panic.
	if ncc.RequestConfig == nil || ncc.RequestConfig.FlashDutyRequestConfig == nil {
		return errors.New("flashduty request config cannot be nil")
	}
	return nil
}
// ValidatePagerDutyRequestConfig checks that the channel carries a PagerDuty
// request configuration. It returns an error when the request configuration
// or its PagerDuty section is nil; the section's contents are not inspected.
func (ncc *NotifyChannelConfig) ValidatePagerDutyRequestConfig() error {
	// Guard the outer pointer too: a channel submitted without any request
	// configuration must produce an error, not a nil-pointer panic.
	if ncc.RequestConfig == nil || ncc.RequestConfig.PagerDutyRequestConfig == nil {
		return errors.New("pagerduty request config cannot be nil")
	}
	return nil
}
// Update overwrites the stored record of ncc with the values in ref.
//
// The ID and creation metadata of ncc are carried over into ref, and ref's
// UpdateAt is set to the current Unix time. ref is then validated with Verify;
// a validation error is returned without touching the database. Otherwise all
// columns are updated (Select("*")) and the database error, if any, is
// returned.
func (ncc *NotifyChannelConfig) Update(ctx *ctx.Context, ref NotifyChannelConfig) error {
	// Identity and creation metadata always come from the existing record.
	ref.ID, ref.CreateAt, ref.CreateBy = ncc.ID, ncc.CreateAt, ncc.CreateBy
	ref.UpdateAt = time.Now().Unix()

	if err := ref.Verify(); err != nil {
		return err
	}

	result := DB(ctx).Model(ncc).Select("*").Updates(ref)
	return result.Error
}
// NotifyChannelGet returns the first channel found by NotifyChannelsGet for
// the given filter. It returns (nil, nil) when no channel is found and
// (nil, err) when the lookup fails.
func NotifyChannelGet(ctx *ctx.Context, where string, args ...interface{}) (
	*NotifyChannelConfig, error) {
	channels, err := NotifyChannelsGet(ctx, where, args...)
	if err != nil {
		return nil, err
	}
	if len(channels) == 0 {
		return nil, nil
	}
	return channels[0], nil
}
// NotifyChannelsGet loads notification channels ordered by ascending weight.
//
// The where clause is applied only when it is non-empty AND at least one
// argument is supplied; a where clause without arguments is ignored and all
// channels are returned. On success the returned slice is non-nil, possibly
// empty.
func NotifyChannelsGet(ctx *ctx.Context, where string, args ...interface{}) (
	[]*NotifyChannelConfig, error) {
	query := DB(ctx)
	if hasFilter := where != "" && len(args) > 0; hasFilter {
		query = query.Where(where, args...)
	}

	channels := make([]*NotifyChannelConfig, 0)
	if err := query.Order("weight asc").Find(&channels).Error; err != nil {
		return nil, err
	}
	return channels, nil
}
// NotiChList is a list of notification channel configurations.
type NotiChList []*NotifyChannelConfig

// GetIdentSet returns the set of channel IDs in the list. Despite its name,
// the set is keyed by the numeric ID field, not by Ident.
func (c NotiChList) GetIdentSet() map[int64]struct{} {
	ids := make(map[int64]struct{}, len(c))
	for i := range c {
		ids[c[i].ID] = struct{}{}
	}
	return ids
}

// IfUsed reports whether any notify config of nr references a channel whose ID
// is in the list.
func (c NotiChList) IfUsed(nr *NotifyRule) bool {
	ids := c.GetIdentSet()
	for i := range nr.NotifyConfigs {
		_, used := ids[nr.NotifyConfigs[i].ChannelID]
		if used {
			return true
		}
	}
	return false
}
// NotiChMap lists the built-in notification channels that InitNotifyChannel
// upserts into the database. Despite its name it is a slice, not a map.
//
// Weight is used to order elements on the page: the larger the weight, the
// later the channel is sorted.
//
// Several string values below are Chinese placeholders meaning "needs to be
// changed to the actual ..."; they are runtime data and are expected to be
// replaced by the operator.
var NotiChMap = []*NotifyChannelConfig{
{
Name: "PagerDuty", Ident: "pagerduty", RequestType: "pagerduty", Weight: 19, Enable: true,
RequestConfig: &RequestConfig{
PagerDutyRequestConfig: &PagerDutyRequestConfig{
ApiKey: "pagerduty api key",
Timeout: 5000,
RetryTimes: 3,
},
},
},
{
Name: "JIRA", Ident: Jira, RequestType: "http", Weight: 18, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://{JIRA Service Account Email}:{API Token}@api.atlassian.com/ex/jira/{CloudID}/rest/api/3/issue",
Method: "POST",
Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"fields":{"project":{"key":"{{$params.project_key}}"},"issuetype":{"name":"{{if $event.IsRecovered}}Recovery{{else}}Alert{{end}}"},"summary":"{{$event.RuleName}}","description":{"type":"doc","version":1,"content":[{"type":"paragraph","content":[{"type":"text","text":"{{$tpl.content}}"}]}]},"labels":["{{join $event.TagsJSON "\",\""}}", "eventHash={{$event.Hash}}"]}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "project_key", CName: "Project Key", Type: "string"},
},
},
},
},
{
Name: "JSM Alert", Ident: JSMAlert, RequestType: "http", Weight: 17, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: `https://api.atlassian.com/jsm/ops/integration/v2/alerts{{if $event.IsRecovered}}/{{$event.Hash}}/close?identifierType=alias{{else}}{{end}}`,
Method: "POST",
Headers: map[string]string{"Content-Type": "application/json", "Authorization": "GenieKey {{$params.api_key}}"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{{if $event.IsRecovered}}{"note":"{{$tpl.content}}","source":"{{$event.Cluster}}"}{{else}}{"message":"{{$event.RuleName}}","description":"{{$tpl.content}}","alias":"{{$event.Hash}}","priority":"P{{$event.Severity}}","tags":[{{range $i, $v := $event.TagsJSON}}{{if $i}},{{end}}"{{$v}}"{{end}}],"details":{{jsonMarshal $event.AnnotationsJSON}},"entity":"{{$event.TargetIdent}}","source":"{{$event.Cluster}}"}{{end}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "api_key", CName: "API Key", Type: "string"},
},
},
},
},
{
Name: "Discord", Ident: Discord, RequestType: "http", Weight: 16, Enable: false,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "{{$params.webhook_url}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"content": "{{$tpl.content}}"}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "webhook_url", CName: "Webhook Url", Type: "string"},
},
},
},
},
{
Name: "MattermostWebhook", Ident: MattermostWebhook, RequestType: "http", Weight: 15, Enable: false,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "{{$params.webhook_url}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"text": "{{$tpl.content}}"}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "webhook_url", CName: "Webhook Url", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "MattermostBot", Ident: MattermostBot, RequestType: "http", Weight: 14, Enable: false,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "/api/v4/posts",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json", "Authorization": "Bearer "},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"channel_id": "{{$params.channel_id}}", "message": "{{$tpl.content}}"}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "channel_id", CName: "Channel ID", Type: "string"},
{Key: "channel_name", CName: "Channel Name", Type: "string"},
},
},
},
},
{
Name: "SlackWebhook", Ident: SlackWebhook, RequestType: "http", Weight: 13, Enable: false,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "{{$params.webhook_url}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"text": "{{$tpl.content}}", "mrkdwn": true}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "webhook_url", CName: "Webhook Url", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "SlackBot", Ident: SlackBot, RequestType: "http", Weight: 12, Enable: false,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://slack.com/api/chat.postMessage",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json", "Authorization": "Bearer "},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"channel": "#{{$params.channel}}", "text": "{{$tpl.content}}", "mrkdwn": true}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "channel", CName: "channel", Type: "string"},
{Key: "channel_name", CName: "Channel Name", Type: "string"},
},
},
},
},
{
Name: "Tencent SMS", Ident: "tx-sms", RequestType: "http", Weight: 11, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
Method: "POST",
URL: "https://sms.tencentcloudapi.com",
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"PhoneNumberSet":["{{ $sendto }}"],"SignName":"需要改为实际的签名","SmsSdkAppId":"需要改为实际的appid","TemplateId":"需要改为实际的模板id","TemplateParamSet":["{{$tpl.content}}"]}`,
},
Headers: map[string]string{
"Content-Type": "application/json",
"Host": "sms.tencentcloudapi.com",
"X-TC-Action": "SendSms",
"X-TC-Version": "2021-01-11",
"X-TC-Region": "需要改为实际的region",
"Service": "sms",
"Secret_ID": "需要改为实际的secret_id",
"Secret_Key": "需要改为实际的secret_key",
},
},
},
ParamConfig: &NotifyParamConfig{
UserInfo: &UserInfo{
ContactKey: "phone",
},
},
},
{
Name: "Tencent Voice", Ident: "tx-voice", RequestType: "http", Weight: 10, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
Method: "POST",
URL: "https://vms.tencentcloudapi.com",
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"CalledNumber":"+86{{ $sendto }}","TemplateId":"需要改为实际的模板id","TemplateParamSet":["{{$tpl.content}}"],"VoiceSdkAppid":"需要改为实际的appid"}`,
},
Headers: map[string]string{
"Content-Type": "application/json",
"Host": "vms.tencentcloudapi.com",
"X-TC-Action": "SendTtsVoice",
"X-TC-Version": "2020-09-02",
"X-TC-Region": "ap-beijing",
"Service": "vms",
"Secret_ID": "需要改为实际的secret_id",
"Secret_Key": "需要改为实际的secret_key",
},
},
},
ParamConfig: &NotifyParamConfig{
UserInfo: &UserInfo{
ContactKey: "phone",
},
},
},
{
Name: "Aliyun SMS", Ident: "ali-sms", RequestType: "http", Weight: 9, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
Method: "POST",
URL: "https://dysmsapi.aliyuncs.com",
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Parameters: map[string]string{
"PhoneNumbers": "{{ $sendto }}",
"SignName": "需要改为实际的签名",
"TemplateCode": "需要改为实际的模板id",
"TemplateParam": `{"incident":"故障{{$tpl.incident}},请及时处理"}`,
"AccessKeyId": "需要改为实际的access_key_id",
"AccessKeySecret": "需要改为实际的access_key_secret",
},
},
Headers: map[string]string{
"Content-Type": "application/json",
"Host": "dysmsapi.aliyuncs.com",
},
},
},
ParamConfig: &NotifyParamConfig{
UserInfo: &UserInfo{
ContactKey: "phone",
},
},
},
{
Name: "Aliyun Voice", Ident: "ali-voice", RequestType: "http", Weight: 8, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
Method: "POST",
URL: "https://dyvmsapi.aliyuncs.com",
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Parameters: map[string]string{
"TtsCode": "需要改为实际的voice_code",
"TtsParam": `{"incident":"故障{{$tpl.incident}},一键认领请按1"}`,
"CalledNumber": `{{ $sendto }}`,
"CalledShowNumber": `需要改为实际的show_number, 如果为空则不显示`,
"AccessKeyId": "需要改为实际的access_key_id",
"AccessKeySecret": "需要改为实际的access_key_secret",
},
},
Headers: map[string]string{
"Content-Type": "application/json",
"Host": "dyvmsapi.aliyuncs.com",
},
},
},
ParamConfig: &NotifyParamConfig{
UserInfo: &UserInfo{
ContactKey: "phone",
},
},
},
{
Name: "Telegram", Ident: Telegram, RequestType: "http", Weight: 7, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://api.telegram.org/bot{{$params.token}}/sendMessage",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Parameters: map[string]string{"chat_id": "{{$params.chat_id}}"},
Body: `{"text":"{{$tpl.content}}","parse_mode": "HTML"}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "token", CName: "Token", Type: "string"},
{Key: "chat_id", CName: "Chat Id", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Lark", Ident: Lark, RequestType: "http", Weight: 6, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://open.larksuite.com/open-apis/bot/v2/hook/{{$params.token}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"msg_type": "text", "content": {"text": "{{$tpl.content}}"}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "token", CName: "Token", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Lark Card", Ident: LarkCard, RequestType: "http", Weight: 6, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://open.larksuite.com/open-apis/bot/v2/hook/{{$params.token}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"msg_type": "interactive", "card": {"config": {"wide_screen_mode": true}, "header": {"title": {"content": "{{$tpl.title}}", "tag": "plain_text"}, "template": "{{if $event.IsRecovered}}green{{else}}red{{end}}"}, "elements": [{"tag": "markdown", "content": "{{$tpl.content}}"}]}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "token", CName: "Token", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Feishu", Ident: Feishu, RequestType: "http", Weight: 5, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://open.feishu.cn/open-apis/bot/v2/hook/{{$params.access_token}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"msg_type": "text", "content": {"text": "{{$tpl.content}}"}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "access_token", CName: "Access Token", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Feishu Card", Ident: FeishuCard, RequestType: "http", Weight: 5, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://open.feishu.cn/open-apis/bot/v2/hook/{{$params.access_token}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{"msg_type": "interactive", "card": {"config": {"wide_screen_mode": true}, "header": {"title": {"content": "{{$tpl.title}}", "tag": "plain_text"}, "template": "{{if $event.IsRecovered}}green{{else}}red{{end}}"}, "elements": [{"tag": "markdown", "content": "{{$tpl.content}}"}]}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "access_token", CName: "Access Token", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Wecom", Ident: Wecom, RequestType: "http", Weight: 4, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://qyapi.weixin.qq.com/cgi-bin/webhook/send",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Parameters: map[string]string{"key": "{{$params.key}}"},
Body: `{"msgtype": "markdown", "markdown": {"content": "{{$tpl.content}}"}}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "key", CName: "Key", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Dingtalk", Ident: Dingtalk, RequestType: "http", Weight: 3, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "https://oapi.dingtalk.com/robot/send", Method: "POST",
Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Parameters: map[string]string{"access_token": "{{$params.access_token}}"},
Body: `{"msgtype": "markdown", "markdown": {"title": "{{$tpl.title}}", "text": "{{$tpl.content}}\n{{batchContactsAts $sendtos}}"}, "at": {"atMobiles": {{batchContactsJsonMarshal $sendtos}} }}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "access_token", CName: "Access Token", Type: "string"},
{Key: "bot_name", CName: "Bot Name", Type: "string"},
},
},
},
},
{
Name: "Email", Ident: Email, RequestType: "smtp", Weight: 2, Enable: true,
RequestConfig: &RequestConfig{
SMTPRequestConfig: &SMTPRequestConfig{
Host: "smtp.host",
Port: 25,
Username: "your-username",
Password: "your-password",
From: "your-email",
InsecureSkipVerify: true,
},
},
ParamConfig: &NotifyParamConfig{
UserInfo: &UserInfo{
ContactKey: "email",
},
},
},
{
Name: "Callback", Ident: "callback", RequestType: "http", Weight: 2, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
URL: "{{$params.callback_url}}",
Method: "POST", Headers: map[string]string{"Content-Type": "application/json"},
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Request: RequestDetail{
Body: `{{ jsonMarshal $events }}`,
},
},
},
ParamConfig: &NotifyParamConfig{
Custom: Params{
Params: []ParamItem{
{Key: "callback_url", CName: "Callback Url", Type: "string"},
{Key: "note", CName: "Note", Type: "string"},
},
},
},
},
{
Name: "FlashDuty", Ident: "flashduty", RequestType: "flashduty", Weight: 1, Enable: true,
RequestConfig: &RequestConfig{
HTTPRequestConfig: &HTTPRequestConfig{
Timeout: 10000, Concurrency: 5, RetryTimes: 3, RetryInterval: 100,
Headers: map[string]string{
"Content-Type": "application/json",
},
},
FlashDutyRequestConfig: &FlashDutyRequestConfig{
IntegrationUrl: "flashduty integration url",
Timeout: 5000, // default timeout of 5 seconds
RetryTimes: 3, // retry 3 times by default
},
},
},
}
// InitNotifyChannel upserts every built-in channel from NotiChMap into the
// database. It does nothing unless ctx.IsCenter is set.
//
// Each entry is stamped with "system" as creator and updater and the current
// Unix time before being upserted. A failed upsert is logged as a warning and
// does not stop the remaining entries from being processed.
func InitNotifyChannel(ctx *ctx.Context) {
	if !ctx.IsCenter {
		return
	}

	for i := range NotiChMap {
		builtin := NotiChMap[i]
		builtin.CreateBy = "system"
		builtin.CreateAt = time.Now().Unix()
		builtin.UpdateBy = "system"
		builtin.UpdateAt = time.Now().Unix()

		if err := builtin.Upsert(ctx); err != nil {
			logger.Warningf("notify channel init failed to upsert notify channels %v", err)
		}
	}
}
// Upsert inserts ncc when no channel with the same name exists, and otherwise
// updates the existing channel with ncc's values.
//
// An existing channel whose UpdateBy is non-empty and not "system" is left
// untouched and nil is returned, so edits made by a user are not overwritten.
// A lookup failure is returned wrapped with additional context.
func (ncc *NotifyChannelConfig) Upsert(ctx *ctx.Context) error {
	existing, err := NotifyChannelGet(ctx, "name = ?", ncc.Name)
	switch {
	case err != nil:
		return errors.WithMessage(err, "notify channel init failed to get message tpl")
	case existing == nil:
		return Insert(ctx, ncc)
	case existing.UpdateBy != "" && existing.UpdateBy != "system":
		// Last modified by someone other than the system: keep it as is.
		return nil
	}
	return existing.Update(ctx, *ncc)
}
================================================
FILE: models/notify_channel_test.go
================================================
package models
import (
"encoding/json"
"net/http"
"os"
"strings"
"testing"
"time"
)
// events is the alert event fixture shared by the notification tests in this
// file: a single event with tags, annotations, a target and one notify group.
var events = []*AlertCurEvent{
{
Id: 1,
Hash: "test-hash",
RuleName: "测试规则",
RuleNote: "这是一个测试告警规则",
Severity: 3,
GroupId: 1,
GroupName: "测试业务组",
TriggerTime: time.Now().Unix(),
TriggerValue: "100",
TagsMap: map[string]string{
"host": "test-host",
"app": "test-app",
"service": "test-service",
"env": "test",
"instance": "127.0.0.1",
},
AnnotationsJSON: map[string]string{
"summary": "测试告警摘要",
"description": "这是一个详细的告警描述",
},
Target: &Target{
Ident: "test-target",
},
NotifyGroupsObj: []*UserGroup{
{
Name: "运维组",
},
},
FirstTriggerTime: time.Now().Unix() - 3600, // first triggered one hour ago
},
}
// TestSendDingTalkNotification is an integration test that sends a markdown
// message to the DingTalk robot endpoint through SendHTTP.
//
// It needs the local credentials file ../.env.json (keys DingTalkAccessToken
// and Phone) and is skipped when that file does not exist, so that a plain
// `go test ./...` does not fail on machines without credentials.
func TestSendDingTalkNotification(t *testing.T) {
	data, err := readKeyValueFromJsonFile("../.env.json")
	if os.IsNotExist(err) {
		t.Skipf("skipping integration test, credentials file not found: %v", err)
	}
	if err != nil {
		t.Fatalf("读取JSON文件失败: %v", err)
	}

	// Build the DingTalk notification channel configuration.
	notifyChannel := &NotifyChannelConfig{
		RequestType: "http",
		RequestConfig: &RequestConfig{
			HTTPRequestConfig: &HTTPRequestConfig{
				Method:  "POST",
				URL:     "https://oapi.dingtalk.com/robot/send",
				Timeout: 10000,
				Request: RequestDetail{
					Body: `{"msgtype": "markdown", "markdown": {"title": "{{$tpl.title}}", "text": "{{$tpl.content}}\n{{batchContactsAts $sendtos}}"}, "at": {"atMobiles": {{batchContactsJsonMarshal $sendtos}} }}`,
					Parameters: map[string]string{
						"access_token": "{{ $params.access_token }}",
					},
				},
				Headers: map[string]string{
					"Content-Type": "application/json",
				},
				RetryTimes:    2,
				RetryInterval: 1,
			},
		},
		ParamConfig: &NotifyParamConfig{
			Custom: Params{
				Params: []ParamItem{
					{
						Key: "access_token",
					},
				},
			},
		},
	}

	// Template values rendered into the request body.
	tpl := map[string]interface{}{
		"title":   "测试告警消息",
		"content": "测试告警消息",
	}

	// Channel parameters rendered into the query string.
	params := map[string]string{
		"access_token": data["DingTalkAccessToken"],
	}

	client, err := GetHTTPClient(notifyChannel)
	if err != nil {
		t.Fatalf("Failed to create HTTP client: %v", err)
	}

	resp, err := notifyChannel.SendHTTP(events, tpl, params, []string{data["Phone"]}, client)
	if err != nil {
		t.Fatalf("SendHTTP failed: %v", err)
	}

	// The response body is expected to carry an "errmsg" field.
	if !strings.Contains(resp, "errmsg") {
		t.Errorf("Response does not contain expected content, got: %s", resp)
	}
}
// TestSendTencentVoiceNotification is an integration test that sends a Tencent
// Cloud voice (TTS) request through SendHTTP.
//
// It needs the local credentials file ../.env.json (key TencentVoiceUrl) and
// is skipped when that file does not exist, so that a plain `go test ./...`
// does not fail on machines without credentials.
func TestSendTencentVoiceNotification(t *testing.T) {
	data, err := readKeyValueFromJsonFile("../.env.json")
	if os.IsNotExist(err) {
		t.Skipf("skipping integration test, credentials file not found: %v", err)
	}
	if err != nil {
		t.Fatalf("读取JSON文件失败: %v", err)
	}

	// Build the Tencent Cloud voice notification channel configuration.
	notifyChannel := &NotifyChannelConfig{
		RequestType: "http",
		RequestConfig: &RequestConfig{
			HTTPRequestConfig: &HTTPRequestConfig{
				Method:  "POST",
				URL:     data["TencentVoiceUrl"], // endpoint comes from the env file
				Timeout: 5,
				Request: RequestDetail{
					Body: `{"TemplateId":"1475778","CalledNumber":"{{ $sendto }}","VoiceSdkAppid":"1400655317","TemplateParamSet":["测试"],"PlayTimes":2}`,
				},
				Headers: map[string]string{
					"Content-Type": "application/json",
					"Host":         "vms.tencentcloudapi.com",
					"X-TC-Action":  "SendTtsVoice",
					"X-TC-Version": "2020-09-02",
					"X-TC-Region":  "ap-beijing",
					"Service":      "vms",
					"Secret_ID":    "test-id",
					"Secret_Key":   "test-key",
				},
				RetryTimes:    2,
				RetryInterval: 1,
			},
		},
		ParamConfig: &NotifyParamConfig{
			UserInfo: &UserInfo{
				ContactKey: "phone",
			},
		},
	}

	// Template values passed to SendHTTP.
	tpl := map[string]interface{}{
		"code": "123456",
	}

	client, err := GetHTTPClient(notifyChannel)
	if err != nil {
		t.Fatalf("创建HTTP客户端失败: %v", err)
	}

	// NOTE(review): the recipient number is hard-coded here, unlike the
	// sibling tests which read it from the env file — consider moving it.
	resp, err := notifyChannel.SendHTTP(events, tpl, map[string]string{}, []string{"+8618021015257"}, client)
	if err != nil {
		t.Fatalf("SendHTTP失败: %v", err)
	}

	// The response is expected to carry both a request ID and a send status.
	if !strings.Contains(resp, "RequestId") || !strings.Contains(resp, "SendStatus") {
		t.Errorf("响应不包含预期内容,得到: %s", resp)
	}
}
// TestSendTencentSMSNotification is an integration test that sends a Tencent
// Cloud SMS request through SendHTTP.
//
// It needs the local credentials file ../.env.json (key TencentSMSUrl) and is
// skipped when that file does not exist, so that a plain `go test ./...` does
// not fail on machines without credentials.
func TestSendTencentSMSNotification(t *testing.T) {
	data, err := readKeyValueFromJsonFile("../.env.json")
	if os.IsNotExist(err) {
		t.Skipf("skipping integration test, credentials file not found: %v", err)
	}
	if err != nil {
		t.Fatalf("读取JSON文件失败: %v", err)
	}

	// Build the Tencent Cloud SMS notification channel configuration.
	notifyChannel := &NotifyChannelConfig{
		RequestType: "http",
		RequestConfig: &RequestConfig{
			HTTPRequestConfig: &HTTPRequestConfig{
				Method:  "POST",
				URL:     data["TencentSMSUrl"], // endpoint comes from the env file
				Timeout: 5,
				Request: RequestDetail{
					Body: `{"PhoneNumberSet":["{{ $sendto }}"],"SignName":"测试签名","SmsSdkAppId":"1400000000","TemplateId":"1000000","TemplateParamSet":["测试"]}`,
				},
				Headers: map[string]string{
					"Content-Type": "application/json",
					"Host":         "sms.tencentcloudapi.com",
					"X-TC-Action":  "SendSms",
					"X-TC-Version": "2021-01-11",
					"X-TC-Region":  "ap-guangzhou",
					"Service":      "sms",
					"Secret_ID":    "test-id",
					"Secret_Key":   "test-key",
				},
				RetryTimes:    2,
				RetryInterval: 1,
			},
		},
		ParamConfig: &NotifyParamConfig{
			UserInfo: &UserInfo{
				ContactKey: "phone",
			},
		},
	}

	// Template values passed to SendHTTP.
	tpl := map[string]interface{}{
		"code": "123456",
	}

	client, err := GetHTTPClient(notifyChannel)
	if err != nil {
		t.Fatalf("创建HTTP客户端失败: %v", err)
	}

	// NOTE(review): the recipient number is hard-coded here, unlike the
	// sibling tests which read it from the env file — consider moving it.
	resp, err := notifyChannel.SendHTTP(events, tpl, map[string]string{}, []string{"+8618021015257"}, client)
	if err != nil {
		t.Fatalf("SendHTTP失败: %v", err)
	}

	// The response is expected to carry a request ID and a send status set.
	if !strings.Contains(resp, "RequestId") || !strings.Contains(resp, "SendStatusSet") {
		t.Errorf("响应不包含预期内容,得到: %s", resp)
	}
}
// TestSendAliYunVoiceNotification is an integration test that sends an Aliyun
// voice call request through SendHTTP.
//
// It needs the local credentials file ../.env.json (keys AccessKeyId,
// AccessKeySecret, TtsCode and Phone) and is skipped when that file does not
// exist, so that a plain `go test ./...` does not fail on machines without
// credentials.
func TestSendAliYunVoiceNotification(t *testing.T) {
	data, err := readKeyValueFromJsonFile("../.env.json")
	if os.IsNotExist(err) {
		t.Skipf("skipping integration test, credentials file not found: %v", err)
	}
	if err != nil {
		t.Fatalf("读取JSON文件失败: %v", err)
	}

	// Build the Aliyun voice notification channel configuration.
	notifyChannel := &NotifyChannelConfig{
		Ident:       "ali-voice",
		RequestType: "http",
		RequestConfig: &RequestConfig{
			HTTPRequestConfig: &HTTPRequestConfig{
				Method:  "POST",
				URL:     "http://dyvmsapi.aliyuncs.com",
				Timeout: 10,
				Request: RequestDetail{
					Parameters: map[string]string{
						"AccessKeyId":     data["AccessKeyId"],
						"AccessKeySecret": data["AccessKeySecret"],
						"TtsCode":         data["TtsCode"],
						"CalledNumber":    `{{ $sendto }}`,
						"TtsParam":        `{"alert_name":"test"}`,
					},
				},
				RetryTimes:    2,
				RetryInterval: 1,
			},
		},
		ParamConfig: &NotifyParamConfig{
			UserInfo: &UserInfo{
				ContactKey: "phone",
			},
		},
	}

	// Template values passed to SendHTTP.
	tpl := map[string]interface{}{
		"code": data["TtsCode"],
	}

	client, err := GetHTTPClient(notifyChannel)
	if err != nil {
		t.Fatalf("创建HTTP客户端失败: %v", err)
	}

	resp, err := notifyChannel.SendHTTP(events, tpl, map[string]string{}, []string{data["Phone"]}, client)
	if err != nil {
		t.Fatalf("SendHTTP失败: %v", err)
	}

	// The response is expected to carry both a request ID and a call ID.
	if !strings.Contains(resp, "RequestId") || !strings.Contains(resp, "CallId") {
		t.Errorf("响应不包含预期内容,得到: %s", resp)
	}
}
// TestSendAliYunSMSNotification is an integration test that sends an Aliyun
// SMS request through SendHTTP.
//
// It needs the local credentials file ../.env.json (keys SignName,
// TemplateCode, AccessKeyId, AccessKeySecret and Phone) and is skipped when
// that file does not exist, so that a plain `go test ./...` does not fail on
// machines without credentials.
func TestSendAliYunSMSNotification(t *testing.T) {
	data, err := readKeyValueFromJsonFile("../.env.json")
	if os.IsNotExist(err) {
		t.Skipf("skipping integration test, credentials file not found: %v", err)
	}
	if err != nil {
		t.Fatalf("读取JSON文件失败: %v", err)
	}

	// Build the Aliyun SMS notification channel configuration.
	notifyChannel := &NotifyChannelConfig{
		Ident:       "ali-sms",
		RequestType: "http",
		RequestConfig: &RequestConfig{
			HTTPRequestConfig: &HTTPRequestConfig{
				Method:  "POST",
				URL:     "https://dysmsapi.aliyuncs.com",
				Timeout: 10000,
				Request: RequestDetail{
					Parameters: map[string]string{
						"PhoneNumbers":    "{{ $sendto }}",
						"SignName":        data["SignName"],
						"TemplateCode":    data["TemplateCode"],
						"TemplateParam":   `{"name":"text","tag":"text"}`,
						"AccessKeyId":     data["AccessKeyId"],
						"AccessKeySecret": data["AccessKeySecret"],
					},
				},
				Headers: map[string]string{
					"Content-Type": "application/json",
				},
				RetryTimes:    2,
				RetryInterval: 1,
			},
		},
		ParamConfig: &NotifyParamConfig{
			UserInfo: &UserInfo{
				ContactKey: "phone",
			},
		},
	}

	// Template values passed to SendHTTP.
	tpl := map[string]interface{}{
		"code": data["TemplateCode"],
	}

	client, err := GetHTTPClient(notifyChannel)
	if err != nil {
		t.Fatalf("创建HTTP客户端失败: %v", err)
	}

	resp, err := notifyChannel.SendHTTP(events, tpl, map[string]string{}, []string{data["Phone"]}, client)
	if err != nil {
		t.Fatalf("SendHTTP失败: %v", err)
	}

	// The response is expected to carry both a business ID and a request ID.
	if !strings.Contains(resp, "BizId") || !strings.Contains(resp, "RequestId") {
		t.Errorf("响应不包含预期内容,得到: %s", resp)
	}
}
// TestSendFlashDuty is an integration test for SendFlashDuty. It covers a
// successful send, a call with a nil HTTP client and a send to an unreachable
// integration URL.
//
// It needs the local credentials file ../.env.json (key
// FlashDutyIntegrationUrl) and is skipped when that file does not exist, so
// that a plain `go test ./...` does not fail on machines without credentials.
func TestSendFlashDuty(t *testing.T) {
	data, err := readKeyValueFromJsonFile("../.env.json")
	if os.IsNotExist(err) {
		t.Skipf("skipping integration test, credentials file not found: %v", err)
	}
	if err != nil {
		t.Fatalf("读取JSON文件失败: %v", err)
	}

	// Build the FlashDuty notification channel configuration.
	notifyChannel := &NotifyChannelConfig{
		ID:          1,
		Name:        "FlashDuty测试",
		Ident:       "flashduty-test",
		RequestType: "flashduty",
		RequestConfig: &RequestConfig{
			FlashDutyRequestConfig: &FlashDutyRequestConfig{
				IntegrationUrl: data["FlashDutyIntegrationUrl"],
			},
		},
	}

	client := &http.Client{
		Timeout: 5 * time.Second,
	}

	// Successful send: no error and a response containing "success".
	flashDutyChannelID := int64(4344322009498)
	resp, err := notifyChannel.SendFlashDuty(events, flashDutyChannelID, client)
	if err != nil {
		t.Errorf("SendFlashDuty返回错误: %v", err)
	}
	if !strings.Contains(resp, "success") {
		t.Errorf("响应内容不包含预期的'success'字符串, 得到: %s", resp)
	}

	// A nil client must be rejected with "http client not found".
	_, err = notifyChannel.SendFlashDuty(events, flashDutyChannelID, nil)
	if err == nil || !strings.Contains(err.Error(), "http client not found") {
		t.Errorf("预期错误'http client not found',但得到: %v", err)
	}

	// A request to an unreachable integration URL must fail.
	invalidNotifyChannel := &NotifyChannelConfig{
		RequestType: "flashduty",
		RequestConfig: &RequestConfig{
			FlashDutyRequestConfig: &FlashDutyRequestConfig{
				IntegrationUrl: "http://invalid-url-that-does-not-exist",
			},
		},
	}
	_, err = invalidNotifyChannel.SendFlashDuty(events, flashDutyChannelID, client)
	if err == nil {
		t.Errorf("预期请求失败,但未返回错误")
	}
}
// readKeyValueFromJsonFile opens filePath and decodes its first JSON value
// into a string-to-string map.
//
// It returns (nil, err) when the file cannot be opened. When decoding fails,
// the decode error is returned together with whatever the decoder left in the
// map.
func readKeyValueFromJsonFile(filePath string) (map[string]string, error) {
	f, openErr := os.Open(filePath)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	var kv map[string]string
	decodeErr := json.NewDecoder(f).Decode(&kv)
	return kv, decodeErr
}
================================================
FILE: models/notify_config.go
================================================
package models
import (
"fmt"
"net/http"
"github.com/toolkits/pkg/str"
)
// String keys for notification-related settings.
// NOTE(review): presumably these are the keys under which each setting is
// stored in the configuration table — confirm against the callers.
const WEBHOOKKEY = "webhook"
const NOTIFYSCRIPT = "notify_script"
const NOTIFYCHANNEL = "notify_channel"
const NOTIFYCONTACT = "notify_contact"
const SMTP = "smtp_config"
const IBEX = "ibex_server"
// GlobalCallback and RuleCallback are integer callback kinds.
// NOTE(review): presumably the values stored in Webhook.Type — confirm.
var GlobalCallback = 0
var RuleCallback = 1
// Webhook describes one webhook endpoint and how to call it. All fields except
// Client are serialized to JSON under the names given in the tags.
type Webhook struct {
// Type is the webhook kind. NOTE(review): presumably GlobalCallback or RuleCallback — confirm.
Type int `json:"type"`
Enable bool `json:"enable"`
Url string `json:"url"`
BasicAuthUser string `json:"basic_auth_user"`
BasicAuthPass string `json:"basic_auth_pass"`
// Timeout has no unit visible in this file — TODO confirm against the sender.
Timeout int `json:"timeout"`
// HeaderMap is serialized as "headers"; Headers is serialized as "headers_str".
HeaderMap map[string]string `json:"headers"`
Headers []string `json:"headers_str"`
SkipVerify bool `json:"skip_verify"`
Note string `json:"note"`
RetryCount int `json:"retry_count"`
RetryInterval int `json:"retry_interval"`
Batch int `json:"batch"`
// Client is excluded from JSON and is not part of Hash.
Client *http.Client `json:"-"`
}
// Hash formats the webhook's Type, Enable, Url, basic-auth credentials,
// Timeout, HeaderMap, SkipVerify, Note, RetryCount, RetryInterval and Batch
// into one underscore-separated string and returns the result of str.MD5 on
// it. Headers and Client do not contribute to the value.
func (w *Webhook) Hash() string {
	fingerprint := fmt.Sprintf(
		"%d_%t_%s_%s_%s_%d_%v_%t_%s_%d_%d_%d",
		w.Type,
		w.Enable,
		w.Url,
		w.BasicAuthUser,
		w.BasicAuthPass,
		w.Timeout,
		w.HeaderMap,
		w.SkipVerify,
		w.Note,
		w.RetryCount,
		w.RetryInterval,
		w.Batch,
	)
	return str.MD5(fingerprint)
}
// NotifyScript holds the settings of a notification script.
type NotifyScript struct {
Enable bool `json:"enable"`
Type int `json:"type"` // 0 script 1 path
Content string `json:"content"` // NOTE(review): presumably the script text or its path, depending on Type — confirm
Timeout int `json:"timeout"` // unit is not visible here — TODO confirm
}
// NotifyChannel is one entry of the notify channel list that InitNotifyConfig
// stores as JSON under the NOTIFYCHANNEL setting. Entries created from
// DefaultChannels use the channel name for both Ident and Name and set
// BuiltIn to true.
type NotifyChannel struct {
Name string `json:"name"`
Ident string `json:"ident"`
Hide bool `json:"hide"`
BuiltIn bool `json:"built_in"`
}
// NotifyContact has the same shape as NotifyChannel (name, ident, hide and
// built-in flags).
// NOTE(review): presumably one entry of the NOTIFYCONTACT setting — confirm.
type NotifyContact struct {
Name string `json:"name"`
Ident string `json:"ident"`
Hide bool `json:"hide"`
BuiltIn bool `json:"built_in"`
}
================================================
FILE: models/notify_rule.go
================================================
package models
import (
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
)
// NotifyRule is the model stored in the "notify_rule" table. Slice and
// interface fields are persisted through GORM's JSON serializer.
type NotifyRule struct {
ID int64 `json:"id" gorm:"primarykey"`
Name string `json:"name"` // name
Description string `json:"description"` // remark
Enable bool `json:"enable"` // enabled state
UserGroupIds []int64 `json:"user_group_ids" gorm:"serializer:json"` // alert group IDs
PipelineConfigs []PipelineConfig `json:"pipeline_configs" gorm:"serializer:json"`
// notification configs
NotifyConfigs []NotifyConfig `json:"notify_configs" gorm:"serializer:json"`
ExtraConfig interface{} `json:"extra_config,omitempty" gorm:"serializer:json"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"` // not persisted (gorm:"-")
}
// PipelineConfig references a pipeline by ID together with an enable flag;
// a NotifyRule carries a list of these in PipelineConfigs.
type PipelineConfig struct {
PipelineId int64 `json:"pipeline_id"`
Enable bool `json:"enable"`
}
// TableName returns the table name for NotifyRule, "notify_rule".
func (r *NotifyRule) TableName() string {
	const table = "notify_rule"
	return table
}
// NotifyConfig is one notification entry of a NotifyRule: which channel and
// template to use, its parameters, and the filters that decide when it
// applies.
type NotifyConfig struct {
ChannelID int64 `json:"channel_id"` // notification medium (e.g. Aliyun SMS)
TemplateID int64 `json:"template_id"` // notification template
Params map[string]interface{} `json:"params"` // notification parameters
Type string `json:"type"`
Severities []int `json:"severities"` // applicable severities (level 1, level 2, level 3 alerts)
TimeRanges []TimeRanges `json:"time_ranges"` // applicable time ranges
LabelKeys []TagFilter `json:"label_keys"` // applicable labels
Attributes []TagFilter `json:"attributes"` // applicable attributes
}
// Hash returns the hex-encoded SHA-256 digest of the config's fields
// (ChannelID, TemplateID, Params, Type, Severities, TimeRanges, LabelKeys and
// Attributes) rendered back to back with fmt's default formatting.
func (n *NotifyConfig) Hash() string {
	rendered := fmt.Sprintf(
		"%d%d%v%s%v%v%v%v",
		n.ChannelID,
		n.TemplateID,
		n.Params,
		n.Type,
		n.Severities,
		n.TimeRanges,
		n.LabelKeys,
		n.Attributes,
	)
	digest := sha256.Sum256([]byte(rendered))
	return hex.EncodeToString(digest[:])
}
// CustomParams carries lists of user IDs, user group IDs and generic IDs.
// NOTE(review): presumably the decoded form of NotifyConfig.Params — confirm
// against callers.
type CustomParams struct {
UserIDs []int64 `json:"user_ids"`
UserGroupIDs []int64 `json:"user_group_ids"`
IDs []int64 `json:"ids"`
}
// TimeRanges is one applicable time window of a NotifyConfig. Verify only
// requires Start and End to be non-empty.
type TimeRanges struct {
Start string `json:"start"` // format is not validated here — TODO confirm expected layout
End string `json:"end"` // format is not validated here — TODO confirm expected layout
Week []int `json:"week"` // NOTE(review): presumably weekday numbers — confirm numbering
}
// NotifyRuleCache is a package-level variable of an empty anonymous struct
// type; it has no fields.
// NOTE(review): looks like an unused placeholder — confirm before relying on it.
var NotifyRuleCache struct {
}
// CreateNotifyRule inserts rule through DB(c).Create and returns the
// resulting error, if any.
func CreateNotifyRule(c *ctx.Context, rule *NotifyRule) error {
	result := DB(c).Create(rule)
	return result.Error
}
// GetNotifyRule loads the NotifyRule with the given primary key via
// DB(c).First. On any query error it returns nil and that error.
func GetNotifyRule(c *ctx.Context, id int64) (*NotifyRule, error) {
	found := new(NotifyRule)
	err := DB(c).First(found, id).Error
	if err != nil {
		return nil, err
	}
	return found, nil
}
// DeleteNotifyRule deletes the NotifyRule with the given primary key and
// returns the error reported by the delete statement.
func DeleteNotifyRule(c *ctx.Context, id int64) error {
	result := DB(c).Delete(&NotifyRule{}, id)
	return result.Error
}
// NotifyRuleStatistics returns the count and latest update_at of enabled
// notify rules.
//
// When ctx.IsCenter is false the value is fetched through poster.GetByUrls
// from "/v1/n9e/statistic?name=notify_rule"; otherwise it is computed with
// an aggregate query on the notify_rule model and the first result row is
// returned.
func NotifyRuleStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=notify_rule")
	}

	var rows []*Statistics
	query := DB(ctx).
		Model(&NotifyRule{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Where("enable = ?", true)
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}
	return rows[0], nil
}
// NotifyRuleGetsAll returns every enabled notify rule.
//
// When ctx.IsCenter is false the list is fetched through poster.GetByUrls
// from "/v1/n9e/notify-rules"; otherwise it is read from the database with
// the filter enable = true.
func NotifyRuleGetsAll(ctx *ctx.Context) ([]*NotifyRule, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*NotifyRule](ctx, "/v1/n9e/notify-rules")
	}

	var enabled []*NotifyRule
	if err := DB(ctx).Where("enable = ?", true).Find(&enabled).Error; err != nil {
		return nil, err
	}
	return enabled, nil
}
// Verify validates the rule: Name must be non-empty and every entry in
// NotifyConfigs must pass its own Verify. The first failure is returned.
//
// Empty UserGroupIds and empty NotifyConfigs are accepted; no check is made
// for either.
func (r *NotifyRule) Verify() error {
	if len(r.Name) == 0 {
		return errors.New("name cannot be empty")
	}

	for _, cfg := range r.NotifyConfigs {
		err := cfg.Verify()
		if err != nil {
			return err
		}
	}

	return nil
}
// Verify validates a single notify config and returns the first problem
// found:
//   - ChannelID must be positive;
//   - every severity must lie in the range 1..3;
//   - every time range and every label filter must pass its own Verify.
//
// Attributes are not validated.
func (c *NotifyConfig) Verify() error {
	if c.ChannelID < 1 {
		return errors.New("invalid channel id")
	}

	for _, level := range c.Severities {
		if !(level >= 1 && level <= 3) {
			return errors.New("invalid severity level")
		}
	}

	for _, window := range c.TimeRanges {
		err := window.Verify()
		if err != nil {
			return err
		}
	}

	for _, filter := range c.LabelKeys {
		err := filter.Verify()
		if err != nil {
			return err
		}
	}

	return nil
}
// Verify checks that both Start and End are non-empty. Start is checked
// first.
//
// TODO: further validate the time format and the plausibility of the range.
func (t *TimeRanges) Verify() error {
	switch {
	case t.Start == "":
		return errors.New("start time cannot be empty")
	case t.End == "":
		return errors.New("end time cannot be empty")
	}
	return nil
}
// Update overwrites the stored rule r with the contents of ref.
//
// ref inherits r's ID, CreateAt and CreateBy, gets UpdateAt set to the
// current Unix time, and must pass Verify. All columns are selected for the
// update; ExtraConfig is omitted when ref.ExtraConfig is nil.
func (r *NotifyRule) Update(ctx *ctx.Context, ref NotifyRule) error {
	ref.ID, ref.CreateAt, ref.CreateBy = r.ID, r.CreateAt, r.CreateBy
	ref.UpdateAt = time.Now().Unix()

	if err := ref.Verify(); err != nil {
		return err
	}

	query := DB(ctx).Model(r).Select("*")
	if ref.ExtraConfig == nil {
		query = query.Omit("ExtraConfig")
	}

	result := query.Updates(ref)
	return result.Error
}
// DB2FE replaces nil UserGroupIds and NotifyConfigs with empty, non-nil
// slices.
func (r *NotifyRule) DB2FE() {
	if r.NotifyConfigs == nil {
		r.NotifyConfigs = []NotifyConfig{}
	}
	if r.UserGroupIds == nil {
		r.UserGroupIds = []int64{}
	}
}
// NotifyRuleGet returns the first rule produced by NotifyRulesGet for the
// given condition. It returns (nil, err) on a query error and (nil, nil)
// when nothing matches.
func NotifyRuleGet(ctx *ctx.Context, where string, args ...interface{}) (*NotifyRule, error) {
	rules, err := NotifyRulesGet(ctx, where, args...)
	if err != nil {
		return nil, err
	}
	if len(rules) == 0 {
		return nil, nil
	}
	return rules[0], nil
}
// NotifyRulesGet lists notify rules ordered by name ascending and runs DB2FE
// on each result.
//
// The where condition is applied only when where is non-empty and at least
// one argument is supplied; otherwise all rules are returned. On success the
// returned slice is never nil.
func NotifyRulesGet(ctx *ctx.Context, where string, args ...interface{}) ([]*NotifyRule, error) {
	query := DB(ctx)
	if filtered := where != "" && len(args) > 0; filtered {
		query = query.Where(where, args...)
	}

	rules := []*NotifyRule{}
	if err := query.Order("name asc").Find(&rules).Error; err != nil {
		return nil, err
	}

	for i := range rules {
		rules[i].DB2FE()
	}
	return rules, nil
}
// NotifyRuleChecker decides whether a notify rule references the
// implementing object. UsedByNotifyRule calls IfUsed once per rule and
// collects the IDs of the rules for which it returns true.
type NotifyRuleChecker interface {
IfUsed(*NotifyRule) bool
}
// UsedByNotifyRule loads every notify rule and returns the IDs of those for
// which nrc.IfUsed reports true. The returned slice is non-nil even when no
// rule matches.
func UsedByNotifyRule(ctx *ctx.Context, nrc NotifyRuleChecker) ([]int64, error) {
	// An empty where clause makes NotifyRulesGet skip filtering altogether.
	all, err := NotifyRulesGet(ctx, "", nil)
	if err != nil {
		return nil, err
	}

	usedIDs := []int64{}
	for _, rule := range all {
		if !nrc.IfUsed(rule) {
			continue
		}
		usedIDs = append(usedIDs, rule.ID)
	}
	return usedIDs, nil
}
================================================
FILE: models/notify_tpl.go
================================================
package models
import (
"encoding/json"
"fmt"
"html/template"
"path"
"strings"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/tplx"
"github.com/pkg/errors"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/logger"
)
// NotifyTpl is the model stored in the "notify_tpl" table: a notification
// template body (Content) associated with a channel.
type NotifyTpl struct {
Id int64 `json:"id"`
Name string `json:"name"`
Channel string `json:"channel"` // ListTpls keys parsed templates by this value
Content string `json:"content"` // template source text
BuiltIn bool `json:"built_in" gorm:"-"` // not persisted (gorm:"-")
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"` // not persisted (gorm:"-")
}
// TableName returns the table name for NotifyTpl, "notify_tpl".
func (n *NotifyTpl) TableName() string {
	const table = "notify_tpl"
	return table
}
// Create stores n by delegating to Insert and returns its error.
func (n *NotifyTpl) Create(c *ctx.Context) error {
	err := Insert(c, n)
	return err
}
// UpdateContent writes only the content, update_at and update_by columns of
// n back to the database.
func (n *NotifyTpl) UpdateContent(c *ctx.Context) error {
	result := DB(c).
		Model(n).
		Select("content", "update_at", "update_by").
		Updates(n)
	return result.Error
}
// Update writes only the name, update_at and update_by columns of n back to
// the database.
func (n *NotifyTpl) Update(c *ctx.Context) error {
	result := DB(c).
		Model(n).
		Select("name", "update_at", "update_by").
		Updates(n)
	return result.Error
}
// CreateIfNotExists creates n only when no template exists yet for channel.
// A failure to count existing templates is returned wrapped with the message
// "failed to count notify tpls".
func (n *NotifyTpl) CreateIfNotExists(c *ctx.Context, channel string) error {
	existing, err := NotifyTplCountByChannel(c, channel)
	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to count notify tpls")
	case existing != 0:
		return nil
	}
	return n.Create(c)
}
// NotifyTplDelete deletes the template with the given id, unless its channel
// is listed in DefaultChannels. The receiver is not used.
func (n *NotifyTpl) NotifyTplDelete(ctx *ctx.Context, id int64) error {
	result := DB(ctx).
		Where("channel not in (?) and id =? ", DefaultChannels, id).
		Delete(new(NotifyTpl))
	return result.Error
}
// NotifyTplCountByChannel returns how many notify templates are stored for
// the given channel.
func NotifyTplCountByChannel(c *ctx.Context, channel string) (int64, error) {
	var total int64
	result := DB(c).Model(&NotifyTpl{}).Where("channel=?", channel).Count(&total)
	return total, result.Error
}
// NotifyTplGets returns all notify templates.
//
// When c.IsCenter is false the list is fetched through poster.GetByUrls from
// "/v1/n9e/notify-tpls"; otherwise it is read from the database. The slice
// is returned together with the query error, whatever its value.
func NotifyTplGets(c *ctx.Context) ([]*NotifyTpl, error) {
	if !c.IsCenter {
		return poster.GetByUrls[[]*NotifyTpl](c, "/v1/n9e/notify-tpls")
	}

	var tpls []*NotifyTpl
	result := DB(c).Find(&tpls)
	return tpls, result.Error
}
// ListTpls loads every notify template and parses it with html/template,
// returning the parsed templates keyed by channel.
//
// Each template body is prefixed with definitions of $labels (.TagsMap) and
// $value (.TriggerValue), and tplx.TemplateFuncMap is registered before
// parsing. The first template that fails to parse aborts the call.
func ListTpls(c *ctx.Context) (map[string]*template.Template, error) {
	stored, err := NotifyTplGets(c)
	if err != nil {
		return nil, errors.WithMessage(err, "failed to get notify tpls")
	}

	// Variables made available to every template body.
	const prelude = "{{$labels := .TagsMap}}" + "{{$value := .TriggerValue}}"

	parsed := make(map[string]*template.Template)
	for _, item := range stored {
		source := prelude + item.Content

		tpl, parseErr := template.New(item.Channel).Funcs(tplx.TemplateFuncMap).Parse(source)
		if parseErr != nil {
			return nil, fmt.Errorf("failed to parse tpl:%v %v ", item, parseErr)
		}
		parsed[item.Channel] = tpl
	}

	return parsed, nil
}
// NotifyTplGet loads the notify template with the given id. The returned
// pointer is always non-nil; when the query fails it points to a zero-value
// NotifyTpl and the error is returned alongside it.
func NotifyTplGet(c *ctx.Context, id int64) (*NotifyTpl, error) {
	found := new(NotifyTpl)
	result := DB(c).Where("id=?", id).First(found)
	return found, result.Error
}
// InitNotifyConfig seeds the notify channel list and the notify templates.
// It does nothing unless c.IsCenter is set.
//
// Channel list: the JSON stored under NOTIFYCHANNEL is loaded (an empty
// value means "no list yet"), every entry of DefaultChannels whose ident is
// not already present is appended as a built-in channel, and the list is
// written back when it did not exist before or when something was appended.
// Any failure in this phase is logged and aborts the whole initialization.
//
// Templates: for every channel/content pair returned by getNotifyTpl(tplDir)
// a template is created if that channel has none yet; failures are logged as
// warnings and do not stop the loop.
func InitNotifyConfig(c *ctx.Context, tplDir string) {
	if !c.IsCenter {
		return
	}

	// init notify channel
	cval, err := ConfigsGet(c, NOTIFYCHANNEL)
	if err != nil {
		logger.Errorf("failed to get notify channel config: %v", err)
		return
	}

	var channels []NotifyChannel
	if cval != "" {
		if err = json.Unmarshal([]byte(cval), &channels); err != nil {
			logger.Errorf("failed to unmarshal notify channel config: %v", err)
			return
		}
	}

	stored := make(map[string]struct{}, len(channels))
	for _, ch := range channels {
		stored[ch.Ident] = struct{}{}
	}

	added := 0
	for _, ident := range DefaultChannels {
		if _, ok := stored[ident]; ok {
			continue
		}
		channels = append(channels, NotifyChannel{Ident: ident, Name: ident, BuiltIn: true})
		added++
	}

	// A missing config is always written, even if nothing was appended.
	if cval == "" || added > 0 {
		data, err := json.Marshal(channels)
		if err != nil {
			logger.Errorf("failed to marshal notify channel config: %v", err)
			return
		}
		if err = ConfigsSet(c, NOTIFYCHANNEL, string(data)); err != nil {
			logger.Errorf("failed to set notify channel config: %v", err)
			return
		}
	}

	// init notify tpl
	for channel, content := range getNotifyTpl(tplDir) {
		notifyTpl := NotifyTpl{
			Name:    channel,
			Channel: channel,
			Content: content,
		}
		if err := notifyTpl.CreateIfNotExists(c, channel); err != nil {
			logger.Warningf("failed to create notify tpls %v", err)
		}
	}
}
// getNotifyTpl returns notify template bodies keyed by template name.
//
// The names come from file.FilesUnder(tplDir):
//   - if listing fails, the error is logged and nil is returned;
//   - if the listing is empty, the built-in TplMap is returned;
//   - otherwise every entry ending in ".tpl" is read and stored under its
//     name without the suffix. Entries that cannot be read are logged and
//     skipped, so the result may be empty.
func getNotifyTpl(tplDir string) map[string]string {
	filenames, err := file.FilesUnder(tplDir)
	if err != nil {
		logger.Errorf("failed to get tpl files under %s: %v", tplDir, err)
		return nil
	}

	if len(filenames) == 0 {
		logger.Debugf("no tpl files under %s, use default tpl", tplDir)
		return TplMap
	}

	tplMap := make(map[string]string, len(filenames))
	for _, filename := range filenames {
		if !strings.HasSuffix(filename, ".tpl") {
			continue
		}

		content, err := file.ToString(path.Join(tplDir, filename))
		if err != nil {
			logger.Errorf("failed to read tpl file: %s: %v", filename, err)
			continue
		}
		tplMap[strings.TrimSuffix(filename, ".tpl")] = content
	}
	return tplMap
}
// TplMap holds the built-in notify template bodies, keyed by channel. It is
// what getNotifyTpl falls back to when the template directory contains no
// files.
//
// The "$domain" placeholder in the templates is a literal that administrators
// are expected to replace with the real site address.
var TplMap = map[string]string{
	Dingtalk: `#### {{if .IsRecovered}}💚{{.RuleName}} {{else}}💔{{.RuleName}} {{end}}
---
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}
- **告警级别**: {{.Severity}}级
{{- if .RuleNote}}
- **规则备注**: {{.RuleNote}}
{{- end}}
{{- if not .IsRecovered}}
- **当次触发时值**: {{.TriggerValue}}
- **当次触发时间**: {{timeformat .TriggerTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- else}}
{{- if .AnnotationsJSON.recovery_value}}
- **恢复时值**: {{formatDecimal .AnnotationsJSON.recovery_value 4}}
{{- end}}
- **恢复时间**: {{timeformat .LastEvalTime}}
- **告警持续时长**: {{humanizeDurationInterface $time_duration}}
{{- end}}
- **告警事件标签**:
{{- range $key, $val := .TagsMap}}
{{- if ne $key "rulename" }}
- {{$key}}: {{$val}}
{{- end}}
{{- end}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?__event_id={{.Id}}){{if eq .Cate "prometheus"}}|[查看曲线]({{$domain}}/metric/explorer?__event_id={{.Id}}&mode=graph){{end}}`,
	Email: `
夜莺告警通知
{{if .IsRecovered}}
级别状态:
S{{.Severity}} Recovered
{{else}}
级别状态:
S{{.Severity}} Triggered
{{end}}
策略备注:
{{.RuleNote}}
设备备注:
{{.TargetNote}}
{{if not .IsRecovered}}
触发时值:
{{.TriggerValue}}
{{end}}
{{if .TargetIdent}}
监控对象:
{{.TargetIdent}}
{{end}}
监控指标:
{{.TagsJSON}}
{{if .IsRecovered}}
恢复时间:
{{timeformat .LastEvalTime}}
{{else}}
触发时间:
{{timeformat .TriggerTime}}
{{end}}
发送时间:
{{timestamp}}
`,
	Feishu: `级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
发送时间: {{timestamp}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
事件详情: {{$domain}}/alert-his-events/{{.Id}}
屏蔽1小时: {{$domain}}/alert-mutes/add?__event_id={{.Id}}`,
	FeishuCard: `{{ if .IsRecovered }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
**级别状态:** S{{.Severity}} Recovered
**告警名称:** {{.RuleName}}
**恢复时间:** {{timeformat .LastEvalTime}}
**告警描述:** **服务已恢复**
{{- else }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
**级别状态:** S{{.Severity}} Triggered
**告警名称:** {{.RuleName}}
**触发时间:** {{timeformat .TriggerTime}}
**发送时间:** {{timestamp}}
**触发时值:** {{.TriggerValue}}
{{if .RuleNote }}**告警描述:** **{{.RuleNote}}**{{end}}
{{- end -}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?__event_id={{.Id}}){{if eq .Cate "prometheus"}}|[查看曲线]({{$domain}}/metric/explorer?__event_id={{.Id}}&mode=graph){{end}}`,
	EmailSubject: `{{if .IsRecovered}}Recovered{{else}}Triggered{{end}}: {{.RuleName}} {{.TagsJSON}}`,
	Mm: `级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
发送时间: {{timestamp}}`,
	Telegram: `**级别状态**: {{if .IsRecovered}}S{{.Severity}} Recovered {{else}}S{{.Severity}} Triggered {{end}}
**规则标题**: {{.RuleName}}{{if .RuleNote}}
**规则备注**: {{.RuleNote}}{{end}}{{if .TargetIdent}}
**监控对象**: {{.TargetIdent}}{{end}}
**监控指标**: {{.TagsJSON}}{{if not .IsRecovered}}
**触发时值**: {{.TriggerValue}}{{end}}
{{if .IsRecovered}}**恢复时间**: {{timeformat .LastEvalTime}}{{else}}**首次触发时间**: {{timeformat .FirstTriggerTime}}{{end}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**距离首次告警**: {{humanizeDurationInterface $time_duration}}
**发送时间**: {{timestamp}}`,
	Wecom: `**级别状态**: {{if .IsRecovered}}S{{.Severity}} Recovered {{else}}S{{.Severity}} Triggered {{end}}
**规则标题**: {{.RuleName}}{{if .RuleNote}}
**规则备注**: {{.RuleNote}}{{end}}{{if .TargetIdent}}
**监控对象**: {{.TargetIdent}}{{end}}
**监控指标**: {{.TagsJSON}}{{if not .IsRecovered}}
**触发时值**: {{.TriggerValue}}{{end}}
{{if .IsRecovered}}**恢复时间**: {{timeformat .LastEvalTime}}{{else}}**首次触发时间**: {{timeformat .FirstTriggerTime}}{{end}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**距离首次告警**: {{humanizeDurationInterface $time_duration}}
**发送时间**: {{timestamp}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?__event_id={{.Id}}){{if eq .Cate "prometheus"}}|[查看曲线]({{$domain}}/metric/explorer?__event_id={{.Id}}&mode=graph){{end}}`,
	Lark: `级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
{{if .IsRecovered}}恢复时间:{{timeformat .LastEvalTime}}{{else}}触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}{{end}}
发送时间: {{timestamp}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
事件详情: {{$domain}}/alert-his-events/{{.Id}}
屏蔽1小时: {{$domain}}/alert-mutes/add?__event_id={{.Id}}`,
	LarkCard: `{{ if .IsRecovered }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
**级别状态:** S{{.Severity}} Recovered
**告警名称:** {{.RuleName}}
**恢复时间:** {{timeformat .LastEvalTime}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**持续时长**: {{humanizeDurationInterface $time_duration}}
**告警描述:** **服务已恢复**
{{- else }}
{{- if ne .Cate "host"}}
**告警集群:** {{.Cluster}}{{end}}
**级别状态:** S{{.Severity}} Triggered
**告警名称:** {{.RuleName}}
**触发时间:** {{timeformat .TriggerTime}}
**发送时间:** {{timestamp}}
**触发时值:** {{.TriggerValue}}
{{$time_duration := sub now.Unix .FirstTriggerTime }}{{if .IsRecovered}}{{$time_duration = sub .LastEvalTime .FirstTriggerTime }}{{end}}**持续时长**: {{humanizeDurationInterface $time_duration}}
{{if .RuleNote }}**告警描述:** **{{.RuleNote}}**{{end}}
{{- end -}}
{{$domain := "http://请联系管理员修改通知模板将域名替换为实际的域名" }}
[事件详情]({{$domain}}/alert-his-events/{{.Id}})|[屏蔽1小时]({{$domain}}/alert-mutes/add?__event_id={{.Id}}){{if eq .Cate "prometheus"}}|[查看曲线]({{$domain}}/metric/explorer?__event_id={{.Id}}&mode=graph){{end}}`,
}
================================================
FILE: models/prom_alert_rule.go
================================================
package models
import (
"fmt"
"strings"
"time"
"github.com/toolkits/pkg/logger"
)
// PromRule is a single rule entry of a Prometheus rule file, decodable from
// YAML or JSON. DealPromGroup converts only entries whose Alert is set.
type PromRule struct {
Alert string `yaml:"alert,omitempty" json:"alert,omitempty"` // name of the alerting rule
Record string `yaml:"record,omitempty" json:"record,omitempty"` // name of the recording rule
Expr string `yaml:"expr,omitempty" json:"expr,omitempty"` // PromQL expression
For string `yaml:"for,omitempty" json:"for,omitempty"` // how long the alert waits before firing
Annotations map[string]string `yaml:"annotations,omitempty" json:"annotations,omitempty"` // rule annotations
Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty"` // rule labels
}
// PromRuleGroup is a named group of Prometheus rules sharing one evaluation
// interval. DealPromGroup substitutes "60s" when Interval is empty.
type PromRuleGroup struct {
Name string `yaml:"name"`
Rules []PromRule `yaml:"rules"`
Interval string `yaml:"interval,omitempty"` // duration string parsed with time.ParseDuration
}
// convertInterval parses a duration string such as "30s" or "1.5m" and
// returns it as a whole number of seconds, truncated toward zero.
//
// An unparsable string (including the empty string) is logged and yields 60;
// a zero duration also yields 60.
func convertInterval(interval string) int {
	const fallbackSeconds = 60

	parsed, err := time.ParseDuration(interval)
	if err != nil {
		logger.Errorf("Error parsing interval `%s`, err: %v", interval, err)
		return fallbackSeconds
	}
	if parsed == 0 {
		return fallbackSeconds
	}
	return int(parsed.Seconds())
}
// ConvertAlert builds an AlertRule from a Prometheus alerting rule.
//
// Parameters:
//   - rule: the Prometheus rule; Alert becomes the name, Expr the PromQL,
//     For the "for" duration (via convertInterval), Annotations are copied
//     over as-is.
//   - interval: the group's evaluation interval as a duration string; it is
//     turned into an "@every <n>s" cron pattern.
//   - datasouceQueries: stored unchanged in DatasourceQueries.
//   - disabled: stored unchanged in Disabled.
//
// Labels are handled as follows: the "severity" label selects the severity
// (1 for critical/error/fatal/page/sev1 spellings, 2 for warning spellings,
// 3 for info/notice spellings; anything else keeps the default of 2). Every
// other label becomes a "key=value" append tag with all spaces removed from
// both key and value. Because Go map iteration order is unspecified, the
// order of the append tags is not stable between calls.
func ConvertAlert(rule PromRule, interval string, datasouceQueries []DatasourceQuery, disabled int) AlertRule {
	severity := 2
	// Non-nil even when there are no labels, as the caller may serialize it.
	appendTags := make([]string, 0, len(rule.Labels))

	for k, v := range rule.Labels {
		if k != "severity" {
			appendTags = append(appendTags, fmt.Sprintf("%s=%s", strings.ReplaceAll(k, " ", ""), strings.ReplaceAll(v, " ", "")))
			continue
		}

		switch v {
		case "critical", "Critical", "CRITICAL", "error", "Error", "ERROR", "fatal", "Fatal", "FATAL", "page", "Page", "PAGE", "sev1", "SEV1", "Severity1", "severity1", "SEVERITY1":
			severity = 1
		case "warning", "Warning", "WARNING", "warn", "Warn", "WARN", "sev2", "SEV2", "Severity2", "severity2", "SEVERITY2":
			severity = 2
		case "info", "Info", "INFO", "notice", "Notice", "NOTICE", "sev3", "SEV3", "Severity3", "severity3", "SEVERITY3":
			severity = 3
		}
	}

	return AlertRule{
		Name:              rule.Alert,
		Severity:          severity,
		Disabled:          disabled,
		PromForDuration:   convertInterval(rule.For),
		PromQl:            rule.Expr,
		CronPattern:       fmt.Sprintf("@every %ds", convertInterval(interval)),
		EnableInBG:        AlertRuleEnableInGlobalBG,
		NotifyRecovered:   AlertRuleNotifyRecovered,
		NotifyRepeatStep:  AlertRuleNotifyRepeatStep60Min,
		RecoverDuration:   AlertRuleRecoverDuration0Sec,
		AnnotationsJSON:   rule.Annotations,
		AppendTagsJSON:    appendTags,
		DatasourceQueries: datasouceQueries,
		NotifyVersion:     1,
		NotifyRuleIds:     []int64{},
	}
}
// DealPromGroup converts every alerting rule (a rule whose Alert is set) in
// the given groups into an AlertRule via ConvertAlert. A group without an
// interval is evaluated with "60s". The result is nil when no alerting rule
// is found.
func DealPromGroup(promRule []PromRuleGroup, dataSourceQueries []DatasourceQuery, disabled int) []AlertRule {
	var converted []AlertRule

	for _, group := range promRule {
		every := group.Interval
		if every == "" {
			every = "60s"
		}

		for _, r := range group.Rules {
			if r.Alert == "" {
				continue
			}
			converted = append(converted, ConvertAlert(r, every, dataSourceQueries, disabled))
		}
	}

	return converted
}
================================================
FILE: models/prom_alert_rule_test.go
================================================
package models_test
import (
"testing"
"github.com/ccfos/nightingale/v6/models"
"gopkg.in/yaml.v2"
)
// TestConvertAlert feeds three Prometheus alerting rules through
// models.ConvertAlert and checks the resulting evaluation schedule
// (CronPattern), "for" duration and severity.
func TestConvertAlert(t *testing.T) {
	// parse decodes a YAML list of rules and returns its first entry, failing
	// the test immediately if decoding fails or yields nothing.
	parse := func(name, doc string) models.PromRule {
		t.Helper()
		rules := []models.PromRule{}
		if err := yaml.Unmarshal([]byte(doc), &rules); err != nil {
			t.Fatalf("Failed to Unmarshal %s, err: %s", name, err)
		}
		if len(rules) == 0 {
			t.Fatalf("Failed to Unmarshal %s: no rules decoded", name)
		}
		t.Logf("%s: %+v", name, rules[0])
		return rules[0]
	}

	// check compares one converted rule against its expected values.
	check := func(name string, got models.AlertRule, wantCron string, wantFor, wantSeverity int) {
		t.Helper()
		if got.CronPattern != wantCron {
			t.Errorf("%s: CronPattern is expected to be %q, but got %q", name, wantCron, got.CronPattern)
		}
		if got.PromForDuration != wantFor {
			t.Errorf("%s: PromForDuration is expected to be %d, but got %d", name, wantFor, got.PromForDuration)
		}
		if got.Severity != wantSeverity {
			t.Errorf("%s: Severity is expected to be %d, but got %d", name, wantSeverity, got.Severity)
		}
	}

	jobMissing := parse("jobMissing", `  - alert: PrometheusJobMissing
    expr: absent(up{job="prometheus"})
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Prometheus job missing (instance {{ $labels.instance }})
      description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`)
	check("jobMissing", models.ConvertAlert(jobMissing, "30s", []models.DatasourceQuery{}, 0), "@every 30s", 60, 2)

	ruleEvaluationSlow := parse("ruleEvaluationSlow", `  - alert: PrometheusRuleEvaluationSlow
    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
    for: 180s
    labels:
      severity: info
    annotations:
      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`)
	check("ruleEvaluationSlow", models.ConvertAlert(ruleEvaluationSlow, "1m", []models.DatasourceQuery{}, 0), "@every 60s", 180, 3)

	targetMissing := parse("targetMissing", `  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 1.5m
    labels:
      severity: critical
    annotations:
      summary: Prometheus target missing (instance {{ $labels.instance }})
      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
`)
	check("targetMissing", models.ConvertAlert(targetMissing, "1h", []models.DatasourceQuery{}, 0), "@every 3600s", 90, 1)
}
================================================
FILE: models/recording_rule.go
================================================
package models
import (
"encoding/json"
"fmt"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/toolkits/pkg/logger"
)
// A RecordingRule records its vector expression into new timeseries.
//
// Several fields exist in two forms: a string stored in the database and a
// structured "...Json"/"...JSON" twin exposed to the frontend. FE2DB fills
// the stored strings from the structured fields; DB2FE does the reverse.
type RecordingRule struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"` // busi group id
DatasourceIds string `json:"-" gorm:"datasource_ids,omitempty"` // JSON-encoded form of DatasourceIdsJson
DatasourceIdsJson []int64 `json:"datasource_ids" gorm:"-"` // for open source fe
DatasourceQueries []DatasourceQuery `json:"datasource_queries,omitempty" gorm:"serializer:json"` // datasource queries
Cluster string `json:"cluster"` // take effect by cluster, separated by space
Name string `json:"name"` // new metric name
Disabled int `json:"disabled"` // 0: enabled, 1: disabled
PromQl string `json:"prom_ql"` // just one ql for promql
QueryConfigs string `json:"-" gorm:"query_configs"` // query_configs
QueryConfigsJson []QueryConfig `json:"query_configs" gorm:"-"` // query_configs for fe
PromEvalInterval int `json:"prom_eval_interval"` // unit:s
CronPattern string `json:"cron_pattern"`
AppendTags string `json:"-"` // split by space: service=n9e mod=api
AppendTagsJSON []string `json:"append_tags" gorm:"-"` // for fe
Note string `json:"note"` // note
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// QueryConfig is one entry of RecordingRule.QueryConfigsJson: a set of
// queries plus the name of the metric they produce. RecordingRule.Verify
// requires NewMetric to match the Prometheus metric-name pattern.
type QueryConfig struct {
Queries []Query `json:"queries"`
NewMetric string `json:"new_metric"`
Exp string `json:"exp"`
WriteDatasourceId int64 `json:"write_datasource_id"`
Delay int `json:"delay"` // unit is not visible here — TODO confirm
WritebackEnabled bool `json:"writeback_enabled"` // whether to write to the same datasource that was queried
}
// Query is a single query inside a QueryConfig. When DatasourceQueries is
// empty, RecordingRule.DB2FE derives it from DatasourceIds.
type Query struct {
DatasourceIds []int64 `json:"datasource_ids"`
DatasourceQueries []DatasourceQuery `json:"datasource_queries"`
Cate string `json:"cate"`
Config interface{} `json:"config"`
}
// TableName returns the table name for RecordingRule, "recording_rule".
func (re *RecordingRule) TableName() string {
	const table = "recording_rule"
	return table
}
// FE2DB fills the stored string fields from their structured twins:
// AppendTags is AppendTagsJSON joined by single spaces, while DatasourceIds
// and QueryConfigs are the JSON encodings of DatasourceIdsJson and
// QueryConfigsJson. Encoding errors are not checked.
func (re *RecordingRule) FE2DB() {
	encodedIDs, _ := json.Marshal(re.DatasourceIdsJson)
	encodedConfigs, _ := json.Marshal(re.QueryConfigsJson)

	re.AppendTags = strings.Join(re.AppendTagsJSON, " ")
	re.DatasourceIds = string(encodedIDs)
	re.QueryConfigs = string(encodedConfigs)
}
// DB2FE fills the structured fields from their stored string twins and
// applies compatibility defaults. It always returns nil.
//
// Steps, in order:
//   - AppendTagsJSON is AppendTags split on whitespace;
//   - DatasourceIdsJson is decoded from DatasourceIds, then
//     FillDatasourceQueries runs;
//   - QueryConfigsJson is decoded from QueryConfigs;
//   - every query that has no DatasourceQueries gets a single "in" query
//     built from its DatasourceIds (older rules lack the field);
//   - an empty CronPattern is derived from a non-zero PromEvalInterval.
//
// Decoding errors are not checked.
func (re *RecordingRule) DB2FE() error {
	re.AppendTagsJSON = strings.Fields(re.AppendTags)

	_ = json.Unmarshal([]byte(re.DatasourceIds), &re.DatasourceIdsJson)
	_ = re.FillDatasourceQueries()
	_ = json.Unmarshal([]byte(re.QueryConfigs), &re.QueryConfigsJson)

	for i := range re.QueryConfigsJson {
		queries := re.QueryConfigsJson[i].Queries
		for j := range queries {
			q := &queries[j]
			if len(q.DatasourceQueries) != 0 {
				continue
			}

			ids := make([]interface{}, 0, len(q.DatasourceIds))
			for _, id := range q.DatasourceIds {
				ids = append(ids, id)
			}
			q.DatasourceQueries = []DatasourceQuery{{
				MatchType: 0,
				Op:        "in",
				Values:    ids,
			}}
		}
	}

	if re.PromEvalInterval != 0 && re.CronPattern == "" {
		re.CronPattern = fmt.Sprintf("@every %ds", re.PromEvalInterval)
	}

	return nil
}
// FillDatasourceQueries derives DatasourceQueries from the legacy
// DatasourceIds string when the former is empty and the latter is not. It
// always returns nil.
//
// The derived value is a single "in" query whose Values are the decoded IDs.
// An ID of 0 stands for "all datasources": MatchType is set to 2 and the
// remaining IDs are ignored. Decoding errors are not checked.
func (re *RecordingRule) FillDatasourceQueries() error {
	if len(re.DatasourceQueries) != 0 || len(re.DatasourceIds) == 0 {
		return nil
	}

	var ids []int64
	_ = json.Unmarshal([]byte(re.DatasourceIds), &ids)

	derived := DatasourceQuery{
		MatchType: 0,
		Op:        "in",
		Values:    make([]interface{}, 0),
	}
	for _, id := range ids {
		if id == 0 {
			// 0 means all datasources
			derived.MatchType = 2
			break
		}
		derived.Values = append(derived.Values, id)
	}

	re.DatasourceQueries = []DatasourceQuery{derived}
	return nil
}
// Verify validates the rule and fills in defaults. It returns the first
// problem found:
//   - GroupId must not be negative;
//   - when PromQl is set, Name must be non-blank and match the Prometheus
//     metric-name pattern;
//   - every QueryConfigsJson entry must have a NewMetric matching that
//     pattern;
//   - every whitespace-separated AppendTags item must be exactly
//     "key=value" with a valid label name as key;
//   - QueryConfigs must not exceed 65535 bytes.
//
// Side effects: a non-positive PromEvalInterval becomes 60, an empty
// CronPattern becomes "@every 60s", and AppendTags is trimmed.
func (re *RecordingRule) Verify() error {
	if re.GroupId < 0 {
		return fmt.Errorf("GroupId(%d) invalid", re.GroupId)
	}

	if re.PromQl != "" {
		// Checked before the pattern match: the pattern rejects the empty
		// string too, which would otherwise hide this more specific error.
		if re.Name == "" {
			return errors.New("name is blank")
		}
		if !model.MetricNameRE.MatchString(re.Name) {
			return errors.New("Name has invalid chreacters")
		}
	}

	for _, queryConfig := range re.QueryConfigsJson {
		if !model.MetricNameRE.MatchString(queryConfig.NewMetric) {
			return errors.New("Metric Name has invalid chreacters")
		}
	}

	if re.PromEvalInterval <= 0 {
		re.PromEvalInterval = 60
	}

	if re.CronPattern == "" {
		re.CronPattern = "@every 60s"
	}

	re.AppendTags = strings.TrimSpace(re.AppendTags)
	for _, tag := range strings.Fields(re.AppendTags) {
		pair := strings.Split(tag, "=")
		if len(pair) != 2 || !model.LabelNameRE.MatchString(pair[0]) {
			return fmt.Errorf("AppendTags(%s) invalid", tag)
		}
	}

	// Check if query_configs length exceeds TEXT type limit (65535 bytes)
	if len(re.QueryConfigs) > 65535 {
		return fmt.Errorf("query_configs length (%d bytes) exceeds TEXT type limit (65535 bytes), please reduce the configuration size", len(re.QueryConfigs))
	}

	return nil
}
// Add verifies the rule, stamps CreateAt and UpdateAt with the current Unix
// time and inserts it.
//
// No uniqueness check is made on the name: recording rules with duplicate
// names occur in practice and are allowed.
func (re *RecordingRule) Add(ctx *ctx.Context) error {
	err := re.Verify()
	if err != nil {
		return err
	}

	stamp := time.Now().Unix()
	re.CreateAt, re.UpdateAt = stamp, stamp

	return Insert(ctx, re)
}
// Update overwrites the stored rule re with the contents of ref.
//
// ref is first converted with FE2DB, inherits re's Id, CreateAt and
// CreateBy, gets UpdateAt set to the current Unix time, and must pass
// Verify. All columns are then updated.
//
// No uniqueness check is made on the name: recording rules with duplicate
// names occur in practice and are allowed.
func (re *RecordingRule) Update(ctx *ctx.Context, ref RecordingRule) error {
	ref.FE2DB()

	ref.Id, ref.CreateAt, ref.CreateBy = re.Id, re.CreateAt, re.CreateBy
	ref.UpdateAt = time.Now().Unix()

	if err := ref.Verify(); err != nil {
		return err
	}

	result := DB(ctx).Model(re).Select("*").Updates(ref)
	return result.Error
}
// UpdateFieldsMap updates the columns named in fields on the row identified
// by re.
func (re *RecordingRule) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
	result := DB(ctx).Model(re).Updates(fields)
	return result.Error
}
// RecordingRuleDels deletes the recording rules with the given ids that
// belong to groupId, one statement per id, and stops at the first error.
func RecordingRuleDels(ctx *ctx.Context, ids []int64, groupId int64) error {
	for _, id := range ids {
		result := DB(ctx).Where("id = ? and group_id=?", id, groupId).Delete(&RecordingRule{})
		if err := result.Error; err != nil {
			return err
		}
	}
	return nil
}
// func RecordingRuleExists(ctx *ctx.Context, id, groupId int64, cluster, name string) (bool, error) {
// session := DB(ctx).Where("id <> ? and group_id = ? and name =? ", id, groupId, name)
// var lst []RecordingRule
// err := session.Find(&lst).Error
// if err != nil {
// return false, err
// }
// if len(lst) == 0 {
// return false, nil
// }
// // match cluster
// for _, r := range lst {
// if MatchCluster(r.Cluster, cluster) {
// return true, nil
// }
// }
// return false, nil
// }
// RecordingRuleGets returns the recording rules of one business group,
// ordered by name, with DB2FE applied to each. On a query error the rules
// are returned unconverted together with the error.
func RecordingRuleGets(ctx *ctx.Context, groupId int64) ([]RecordingRule, error) {
	var rules []RecordingRule
	query := DB(ctx).Where("group_id=?", groupId).Order("name")
	if err := query.Find(&rules).Error; err != nil {
		return rules, err
	}

	for i := range rules {
		rules[i].DB2FE()
	}
	return rules, nil
}
// RecordingRuleGetsByBGIds returns the recording rules of the given business
// groups, ordered by name, with DB2FE applied to each. With no group IDs it
// returns all rules without any filter or ordering. On a query error the
// rules are returned unconverted together with the error.
func RecordingRuleGetsByBGIds(ctx *ctx.Context, bgids []int64) ([]RecordingRule, error) {
	query := DB(ctx)
	if len(bgids) != 0 {
		query = query.Where("group_id in (?)", bgids).Order("name")
	}

	var rules []RecordingRule
	if err := query.Find(&rules).Error; err != nil {
		return rules, err
	}

	for i := range rules {
		rules[i].DB2FE()
	}
	return rules, nil
}
// RecordingRuleGet returns the first recording rule matching the condition,
// with DB2FE applied. It returns (nil, err) on a query error and (nil, nil)
// when nothing matches.
func RecordingRuleGet(ctx *ctx.Context, where string, regs ...interface{}) (*RecordingRule, error) {
	var matches []*RecordingRule
	if err := DB(ctx).Where(where, regs...).Find(&matches).Error; err != nil {
		return nil, err
	}
	if len(matches) == 0 {
		return nil, nil
	}

	first := matches[0]
	first.DB2FE()
	return first, nil
}
// RecordingRuleGetById looks a recording rule up by primary key via
// RecordingRuleGet; the result is (nil, nil) when the id does not exist.
func RecordingRuleGetById(ctx *ctx.Context, id int64) (*RecordingRule, error) {
	rule, err := RecordingRuleGet(ctx, "id=?", id)
	return rule, err
}
// RecordingRuleEnabledGets returns every recording rule whose disabled
// column is 0, calling DB2FE on each loaded row.
func RecordingRuleEnabledGets(ctx *ctx.Context) ([]*RecordingRule, error) {
	var enabled []*RecordingRule
	if err := DB(ctx).Where("disabled = ?", 0).Find(&enabled).Error; err != nil {
		return enabled, err
	}
	for _, rule := range enabled {
		rule.DB2FE()
	}
	return enabled, nil
}
// RecordingRuleGetsByCluster returns the enabled recording rules.
//
// On a center node the rules are read from the database (disabled = 0) and
// DB2FE is called on each. On a non-center node they are fetched through
// poster from "/v1/n9e/recording-rules" and FE2DB is called on each.
func RecordingRuleGetsByCluster(ctx *ctx.Context) ([]*RecordingRule, error) {
	if ctx.IsCenter {
		var rules []*RecordingRule
		if err := DB(ctx).Where("disabled = ?", 0).Find(&rules).Error; err != nil {
			return rules, err
		}
		for _, rule := range rules {
			rule.DB2FE()
		}
		return rules, nil
	}

	remote, err := poster.GetByUrls[[]*RecordingRule](ctx, "/v1/n9e/recording-rules")
	if err != nil {
		return nil, err
	}
	for _, rule := range remote {
		rule.FE2DB()
	}
	return remote, nil
}
// RecordingRuleStatistics reports the row count and the latest update_at
// of the recording rule table. Non-center nodes obtain the value through
// poster from the statistic endpoint instead of querying the database.
func RecordingRuleStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=recording_rule")
	}

	var rows []*Statistics
	query := DB(ctx).Model(&RecordingRule{}).Select("count(*) as total", "max(update_at) as last_updated")
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}
	// The query is an aggregate without GROUP BY; the first row is used.
	return rows[0], nil
}
// RecordingRuleUpgradeToV6 fills the datasource_ids column of every
// recording rule from its Cluster field.
//
// A Cluster of "$all" maps to the single id 0. Otherwise Cluster is split
// on whitespace and each name is looked up in dsm; names that are not in
// dsm are skipped. The resulting id list is stored as JSON.
//
// Only the initial load can fail the whole call; per-rule failures are
// logged and the loop moves on to the next rule.
func RecordingRuleUpgradeToV6(ctx *ctx.Context, dsm map[string]Datasource) error {
	var rules []*RecordingRule
	if err := DB(ctx).Find(&rules).Error; err != nil {
		return err
	}

	for _, rule := range rules {
		var ids []int64
		if rule.Cluster == "$all" {
			ids = append(ids, 0)
		} else {
			for _, name := range strings.Fields(rule.Cluster) {
				if ds, exists := dsm[name]; exists {
					ids = append(ids, ds.Id)
				}
			}
		}

		encoded, err := json.Marshal(ids)
		if err != nil {
			// Keep the best-effort behavior, but do not hide the failure.
			logger.Errorf("marshal recording rule:%d datasource ids failed, %v", rule.Id, err)
			continue
		}

		rule.DatasourceIds = string(encoded)
		err = rule.UpdateFieldsMap(ctx, map[string]interface{}{"datasource_ids": rule.DatasourceIds})
		if err != nil {
			logger.Errorf("update recording rule:%d datasource ids failed, %v", rule.Id, err)
		}
	}
	return nil
}
================================================
FILE: models/role.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/pkg/errors"
)
// Role is the model stored in the "role" table (see TableName).
type Role struct {
// Id is the primary key.
Id int64 `json:"id" gorm:"primaryKey"`
// Name is the role name; Add rejects a name that already exists.
Name string `json:"name"`
// Note is serialized as "note".
Note string `json:"note"`
}
// TableName returns the name of the table that stores Role rows.
func (Role) TableName() string {
	const table = "role"
	return table
}
// RoleGets returns all roles matching the given where clause. A database
// failure is wrapped with the message "failed to query roles".
func RoleGets(ctx *ctx.Context, where string, args ...interface{}) ([]Role, error) {
	var roles []Role
	if err := DB(ctx).Where(where, args...).Find(&roles).Error; err != nil {
		return nil, errors.WithMessage(err, "failed to query roles")
	}
	return roles, nil
}
// RoleGetsAll returns every role by calling RoleGets with an empty
// where clause.
func RoleGetsAll(ctx *ctx.Context) ([]Role, error) {
	roles, err := RoleGets(ctx, "")
	return roles, err
}
// Add inserts the role after checking that no existing role already uses
// the same name.
//
// It returns an error when the lookup fails, when a role named r.Name
// already exists, or when the insert itself fails.
func (r *Role) Add(ctx *ctx.Context) error {
	existing, err := RoleGet(ctx, "name = ?", r.Name)
	if err != nil {
		// The lookup is against the role table, so the message names it.
		return errors.WithMessage(err, "failed to query role")
	}
	if existing != nil {
		return errors.New("role name already exists")
	}
	return DB(ctx).Create(r).Error
}
// Del deletes this role through gorm's Delete and returns the database
// error, if any.
func (r *Role) Del(ctx *ctx.Context) error {
	result := DB(ctx).Delete(r)
	return result.Error
}
// Update persists the selected fields of this role. selectField and
// selectFields are passed straight to gorm's Select to choose the columns
// written by Updates.
func (r *Role) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	scoped := DB(ctx).Model(r).Select(selectField, selectFields...)
	return scoped.Updates(r).Error
}
// RoleGet returns the first role matching the where clause, or (nil, nil)
// when nothing matches.
func RoleGet(ctx *ctx.Context, where string, args ...interface{}) (*Role, error) {
	var matched []*Role
	if err := DB(ctx).Where(where, args...).Find(&matched).Error; err != nil {
		return nil, err
	}
	if len(matched) > 0 {
		return matched[0], nil
	}
	return nil, nil
}
// RoleCount returns the number of roles matching the where clause, using
// the package-level Count helper.
func RoleCount(ctx *ctx.Context, where string, args ...interface{}) (num int64, err error) {
	query := DB(ctx).Model(&Role{}).Where(where, args...)
	return Count(query)
}
================================================
FILE: models/role_operation.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/slice"
)
// RoleOperation is one row of the "role_operation" table (see TableName):
// a pairing of a role name with an operation string.
type RoleOperation struct {
// RoleName is queried as column role_name in this file.
RoleName string
// Operation is queried as column operation in this file.
Operation string
}
// TableName returns the name of the table that stores RoleOperation rows.
func (RoleOperation) TableName() string {
	const table = "role_operation"
	return table
}
// RoleHasOperation reports whether at least one of the given roles is
// bound to operation. An empty roles slice yields (false, nil) without
// touching the database.
func RoleHasOperation(ctx *ctx.Context, roles []string, operation string) (bool, error) {
	if len(roles) == 0 {
		return false, nil
	}
	query := DB(ctx).Model(&RoleOperation{}).Where("operation = ? and role_name in ?", operation, roles)
	return Exists(query)
}
// OperationsOfRole returns the distinct operations bound to the given
// roles. When roles contains AdminRole the role filter is skipped, so all
// distinct operations in the table are returned.
func OperationsOfRole(ctx *ctx.Context, roles []string) ([]string, error) {
	query := DB(ctx).Model(&RoleOperation{}).Select("distinct(operation) as operation")

	isAdmin := slice.ContainsString(roles, AdminRole)
	if !isAdmin {
		query = query.Where("role_name in ?", roles)
	}

	var operations []string
	err := query.Pluck("operation", &operations).Error
	return operations, err
}
// RoleOperationBind replaces the operations bound to roleName. Inside one
// transaction it deletes every existing binding of the role and then, if
// operation is non-empty, inserts one row per entry. Any failure rolls the
// transaction back and is returned.
func RoleOperationBind(ctx *ctx.Context, roleName string, operation []string) error {
	tx := DB(ctx).Begin()

	if err := tx.Where("role_name = ?", roleName).Delete(&RoleOperation{}).Error; err != nil {
		tx.Rollback()
		return err
	}

	if len(operation) > 0 {
		bindings := make([]RoleOperation, 0, len(operation))
		for idx := range operation {
			bindings = append(bindings, RoleOperation{RoleName: roleName, Operation: operation[idx]})
		}
		if err := tx.Create(&bindings).Error; err != nil {
			tx.Rollback()
			return err
		}
	}

	return tx.Commit().Error
}
================================================
FILE: models/saved_view.go
================================================
package models
import (
"errors"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// Sentinel errors returned by the saved-view helpers in this file.
var (
// ErrSavedViewNameEmpty is returned by Verify when the trimmed name is empty.
ErrSavedViewNameEmpty = errors.New("saved view name is blank")
// ErrSavedViewPageEmpty is returned by Verify when Page is empty.
ErrSavedViewPageEmpty = errors.New("saved view page is blank")
// ErrSavedViewNotFound is returned by UserViewFavoriteAdd when the view id does not exist.
ErrSavedViewNotFound = errors.New("saved view not found")
// ErrSavedViewNameDuplicate is returned by SavedViewCheckDuplicateName when a matching row exists.
ErrSavedViewNameDuplicate = errors.New("saved view name already exists in this page")
)
// SavedView is the model stored in the "saved_view" table (see TableName).
// Fields tagged gorm:"-" are not persisted.
type SavedView struct {
Id int64 `json:"id" gorm:"primaryKey;autoIncrement"`
// Name is trimmed and required to be non-empty by Verify.
Name string `json:"name" gorm:"type:varchar(255);not null"`
// Page is required to be non-empty by Verify.
Page string `json:"page" gorm:"type:varchar(64);not null;index"`
Filter string `json:"filter" gorm:"type:text"`
PublicCate int `json:"public_cate" gorm:"default:0"` // 0: self, 1: team, 2: all
// Gids is stored as JSON text in the gids column.
Gids []int64 `json:"gids" gorm:"column:gids;type:text;serializer:json"`
CreateAt int64 `json:"create_at" gorm:"type:bigint;not null;default:0"`
CreateBy string `json:"create_by" gorm:"type:varchar(64);index"`
UpdateAt int64 `json:"update_at" gorm:"type:bigint;not null;default:0"`
UpdateBy string `json:"update_by" gorm:"type:varchar(64)"`
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
// Fields populated at query time.
IsFavorite bool `json:"is_favorite" gorm:"-"`
}
// TableName returns the name of the table that stores SavedView rows.
func (SavedView) TableName() string {
	const table = "saved_view"
	return table
}
// Verify trims surrounding whitespace from Name (mutating the receiver)
// and checks the required fields. It returns ErrSavedViewNameEmpty when
// the trimmed name is empty and ErrSavedViewPageEmpty when Page is empty.
func (sv *SavedView) Verify() error {
	sv.Name = strings.TrimSpace(sv.Name)

	switch {
	case sv.Name == "":
		return ErrSavedViewNameEmpty
	case sv.Page == "":
		return ErrSavedViewPageEmpty
	}
	return nil
}
// SavedViewCheckDuplicateName returns ErrSavedViewNameDuplicate when a
// saved view with public_cate = 2 and the same page and name already
// exists. A positive excludeId removes that row from the check, which lets
// an update ignore the view being edited.
func SavedViewCheckDuplicateName(c *ctx.Context, page, name string, excludeId int64) error {
	query := DB(c).Model(&SavedView{}).Where("page = ? AND name = ? AND public_cate = 2", page, name)
	if excludeId > 0 {
		query = query.Where("id != ?", excludeId)
	}

	var matches int64
	if err := query.Count(&matches).Error; err != nil {
		return err
	}
	if matches == 0 {
		return nil
	}
	return ErrSavedViewNameDuplicate
}
// SavedViewAdd validates sv, stamps CreateAt and UpdateAt with the current
// Unix time and inserts it through the package-level Insert helper.
func SavedViewAdd(c *ctx.Context, sv *SavedView) error {
	err := sv.Verify()
	if err != nil {
		return err
	}

	// When PublicCate is all (2), the name must be unique within the page.
	if sv.PublicCate == 2 {
		err = SavedViewCheckDuplicateName(c, sv.Page, sv.Name, 0)
		if err != nil {
			return err
		}
	}

	stamp := time.Now().Unix()
	sv.CreateAt, sv.UpdateAt = stamp, stamp
	return Insert(c, sv)
}
// SavedViewUpdate validates sv and writes its name, filter, public_cate,
// gids, update_at and update_by columns. UpdateAt is set to the current
// Unix time and UpdateBy to username before the write.
func SavedViewUpdate(c *ctx.Context, sv *SavedView, username string) error {
	err := sv.Verify()
	if err != nil {
		return err
	}

	// When PublicCate is all (2), the name must be unique within the page
	// (the view itself is excluded from the check).
	if sv.PublicCate == 2 {
		err = SavedViewCheckDuplicateName(c, sv.Page, sv.Name, sv.Id)
		if err != nil {
			return err
		}
	}

	sv.UpdateBy = username
	sv.UpdateAt = time.Now().Unix()

	scoped := DB(c).Model(sv).Select("name", "filter", "public_cate", "gids", "update_at", "update_by")
	return scoped.Updates(sv).Error
}
// SavedViewDel removes a saved view and the favorites pointing at it.
// The two deletes are issued as separate statements, favorites first.
func SavedViewDel(c *ctx.Context, id int64) error {
	// Remove the favorite associations first.
	favErr := DB(c).Where("view_id = ?", id).Delete(&UserViewFavorite{}).Error
	if favErr != nil {
		return favErr
	}

	result := DB(c).Where("id = ?", id).Delete(&SavedView{})
	return result.Error
}
// SavedViewGetById loads a saved view by primary key using gorm's First.
// Any error from First (including the one gorm reports when no row
// matches) is returned with a nil view.
func SavedViewGetById(c *ctx.Context, id int64) (*SavedView, error) {
	view := new(SavedView)
	if err := DB(c).Where("id = ?", id).First(view).Error; err != nil {
		return nil, err
	}
	return view, nil
}
// SavedViewGets returns all saved views of a page, most recently updated
// first.
func SavedViewGets(c *ctx.Context, page string) ([]SavedView, error) {
	var result []SavedView
	err := DB(c).Where("page = ?", page).Order("update_at DESC").Find(&result).Error
	if err != nil {
		return nil, err
	}
	return result, nil
}
// SavedViewFavoriteGetByUserId returns the set of view ids the user has
// marked as favorite, as a map whose values are always true.
func SavedViewFavoriteGetByUserId(c *ctx.Context, userId int64) (map[int64]bool, error) {
	var rows []UserViewFavorite
	err := DB(c).Where("user_id = ?", userId).Find(&rows).Error
	if err != nil {
		return nil, err
	}

	favored := make(map[int64]bool)
	for idx := range rows {
		favored[rows[idx].ViewId] = true
	}
	return favored, nil
}
// UserViewFavorite is one row of the "user_view_favorite" table (see
// TableName): a record that a user marked a saved view as favorite.
type UserViewFavorite struct {
Id int64 `json:"id" gorm:"primaryKey;autoIncrement"`
// ViewId is matched against SavedView ids in UserViewFavoriteAdd.
ViewId int64 `json:"view_id" gorm:"index"`
UserId int64 `json:"user_id" gorm:"index"`
// CreateAt is set to the current Unix time by UserViewFavoriteAdd.
CreateAt int64 `json:"create_at"`
}
// TableName returns the name of the table that stores UserViewFavorite rows.
func (UserViewFavorite) TableName() string {
	const table = "user_view_favorite"
	return table
}
// UserViewFavoriteAdd marks viewId as a favorite of userId.
//
// It returns ErrSavedViewNotFound when the view does not exist and
// succeeds without inserting when the favorite is already recorded.
func UserViewFavoriteAdd(c *ctx.Context, viewId, userId int64) error {
	var views int64
	if err := DB(c).Model(&SavedView{}).Where("id = ?", viewId).Count(&views).Error; err != nil {
		return err
	}
	if views == 0 {
		return ErrSavedViewNotFound
	}

	var existing int64
	lookup := DB(c).Model(&UserViewFavorite{}).Where("view_id = ? AND user_id = ?", viewId, userId)
	if err := lookup.Count(&existing).Error; err != nil {
		return err
	}
	if existing > 0 {
		// Already a favorite: report success without inserting again.
		return nil
	}

	return DB(c).Create(&UserViewFavorite{
		ViewId:   viewId,
		UserId:   userId,
		CreateAt: time.Now().Unix(),
	}).Error
}
// UserViewFavoriteDel deletes the favorite rows matching viewId and userId.
func UserViewFavoriteDel(c *ctx.Context, viewId, userId int64) error {
	result := DB(c).Where("view_id = ? AND user_id = ?", viewId, userId).Delete(&UserViewFavorite{})
	return result.Error
}
================================================
FILE: models/source_token.go
================================================
package models
import (
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// SourceToken is the model stored in the "source_token" table (see
// TableName): a token associated with a source type and source id.
type SourceToken struct {
Id int64 `json:"id" gorm:"primaryKey"`
SourceType string `json:"source_type" gorm:"column:source_type;type:varchar(64);not null;default:''"`
SourceId string `json:"source_id" gorm:"column:source_id;type:varchar(255);not null;default:''"`
Token string `json:"token" gorm:"column:token;type:varchar(255);not null;default:''"`
// ExpireAt is compared with Unix seconds in IsExpired; 0 means the token never expires.
ExpireAt int64 `json:"expire_at" gorm:"type:bigint;not null;default:0"`
CreateAt int64 `json:"create_at" gorm:"type:bigint;not null;default:0"`
CreateBy string `json:"create_by" gorm:"type:varchar(64);not null;default:''"`
}
// TableName returns the name of the table that stores SourceToken rows.
func (SourceToken) TableName() string {
	const table = "source_token"
	return table
}
// Add inserts the token through the package-level Insert helper.
func (st *SourceToken) Add(ctx *ctx.Context) error {
	err := Insert(ctx, st)
	return err
}
// GetSourceTokenBySource loads the source token matching the given source
// type, source id and token value using gorm's First. Any error from First
// is returned with a nil token.
func GetSourceTokenBySource(ctx *ctx.Context, sourceType, sourceId, token string) (*SourceToken, error) {
	found := new(SourceToken)
	query := DB(ctx).Where("source_type = ? AND source_id = ? AND token = ?", sourceType, sourceId, token)
	if err := query.First(found).Error; err != nil {
		return nil, err
	}
	return found, nil
}
// IsExpired reports whether the current Unix time is past ExpireAt.
// An ExpireAt of 0 means the token never expires.
func (st *SourceToken) IsExpired() bool {
	neverExpires := st.ExpireAt == 0
	return !neverExpires && time.Now().Unix() > st.ExpireAt
}
// CleanupExpiredTokens deletes every token whose expire_at is positive and
// earlier than the current Unix time. It returns the number of rows
// affected together with the database error.
func CleanupExpiredTokens(ctx *ctx.Context) (int64, error) {
	cutoff := time.Now().Unix()
	deleted := DB(ctx).Where("expire_at > 0 AND expire_at < ?", cutoff).Delete(&SourceToken{})
	return deleted.RowsAffected, deleted.Error
}
================================================
FILE: models/sso_config.go
================================================
package models
import (
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// SsoConfig is the model stored in the "sso_config" table (see TableName).
type SsoConfig struct {
Id int64 `json:"id"`
// Name is the lookup key used by Query and SsoConfigCountByName.
Name string `json:"name"`
Content string `json:"content"`
// SettingJson is serialized as "setting" and is not persisted (gorm:"-").
SettingJson interface{} `json:"setting" gorm:"-"`
// UpdateAt is set to the current Unix time by Update.
UpdateAt int64 `json:"update_at"`
}
// TableName returns the name of the table that stores SsoConfig rows.
func (b *SsoConfig) TableName() string {
	const table = "sso_config"
	return table
}
// SsoConfigGets returns every row of the sso_config table.
func SsoConfigGets(c *ctx.Context) ([]SsoConfig, error) {
	var configs []SsoConfig
	result := DB(c).Find(&configs)
	return configs, result.Error
}
// Query loads the SSO config whose name equals b.Name using gorm's First.
// The returned value is the zero SsoConfig when First reports an error.
func (b *SsoConfig) Query(c *ctx.Context) (SsoConfig, error) {
	var found SsoConfig
	result := DB(c).Model(b).Where("name = ?", b.Name).First(&found)
	return found, result.Error
}
// Create inserts this SSO config through the package-level Insert helper.
func (b *SsoConfig) Create(c *ctx.Context) error {
	err := Insert(c, b)
	return err
}
// Update stamps UpdateAt with the current Unix time and writes only the
// content and update_at columns.
func (b *SsoConfig) Update(c *ctx.Context) error {
	b.UpdateAt = time.Now().Unix()
	scoped := DB(c).Model(b).Select("content", "update_at")
	return scoped.Updates(b).Error
}
// SsoConfigLastUpdateTime returns max(update_at) over the sso_config
// table, scanned into an int64.
func SsoConfigLastUpdateTime(c *ctx.Context) (int64, error) {
	row := DB(c).Model(&SsoConfig{}).Select("max(update_at)").Row()

	var latest int64
	err := row.Scan(&latest)
	return latest, err
}
// SsoConfigCountByName returns how many sso_config rows carry the given name.
func SsoConfigCountByName(c *ctx.Context, name string) (int64, error) {
	var total int64
	result := DB(c).Model(&SsoConfig{}).Where("name = ?", name).Count(&total)
	return total, result.Error
}
================================================
FILE: models/target.go
================================================
package models
import (
"context"
"encoding/json"
"log"
"sort"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/storage"
"golang.org/x/exp/slices"
"github.com/pkg/errors"
"github.com/toolkits/pkg/container/set"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/slice"
"gorm.io/gorm"
)
// TargetDeleteHookFunc is the callback TargetDel invokes inside its
// transaction, after the target rows are deleted, with the same tx and
// idents. A non-nil error aborts the transaction.
type TargetDeleteHookFunc func(tx *gorm.DB, idents []string) error
// Target is the model stored in the "target" table (see TableName).
// Fields tagged gorm:"-" are not persisted; they are filled in by helpers
// such as FillTagsMap, FillMeta, FillGroup and FillTargetsBeatTime.
type Target struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"`
GroupObjs []*BusiGroup `json:"group_objs" gorm:"-"`
Ident string `json:"ident"`
Note string `json:"note"`
Tags string `json:"-"` // user tags
TagsJSON []string `json:"tags" gorm:"-"`
TagsMap map[string]string `json:"tags_maps" gorm:"-"` // internal use, append tags to series
UpdateAt int64 `json:"update_at"`
HostIp string `json:"host_ip"` //ipv4,do not needs range select
AgentVersion string `json:"agent_version"`
EngineName string `json:"engine_name"`
OS string `json:"os" gorm:"column:os"`
HostTags []string `json:"host_tags" gorm:"serializer:json"`
BeatTime int64 `json:"beat_time" gorm:"-"` // real-time heartbeat time, fetched from Redis
UnixTime int64 `json:"unixtime" gorm:"-"`
Offset int64 `json:"offset" gorm:"-"`
TargetUp float64 `json:"target_up" gorm:"-"`
MemUtil float64 `json:"mem_util" gorm:"-"`
CpuNum int `json:"cpu_num" gorm:"-"`
CpuUtil float64 `json:"cpu_util" gorm:"-"`
Arch string `json:"arch" gorm:"-"`
RemoteAddr string `json:"remote_addr" gorm:"-"`
GroupIds []int64 `json:"group_ids" gorm:"-"`
GroupNames []string `json:"group_names" gorm:"-"`
}
// TableName returns the name of the table that stores Target rows.
func (t *Target) TableName() string {
	const table = "target"
	return table
}
// FillGroup appends the business groups of this target to GroupObjs.
//
// When GroupIds is empty it is first loaded via TargetGroupIdsGetByIdent
// and GroupObjs is reset. Each group id is resolved from cache when a
// non-nil entry exists; otherwise it is fetched with BusiGroupGetById and
// stored back into cache. Ids that resolve to no group are skipped.
// cache must be non-nil because it is written to.
func (t *Target) FillGroup(ctx *ctx.Context, cache map[int64]*BusiGroup) error {
	if len(t.GroupIds) == 0 {
		gids, err := TargetGroupIdsGetByIdent(ctx, t.Ident)
		t.GroupIds = gids
		if err != nil {
			return errors.WithMessage(err, "failed to get target gids")
		}
		t.GroupObjs = make([]*BusiGroup, 0, len(t.GroupIds))
	}

	for _, gid := range t.GroupIds {
		if cached, ok := cache[gid]; ok && cached != nil {
			t.GroupObjs = append(t.GroupObjs, cached)
			continue
		}

		group, err := BusiGroupGetById(ctx, gid)
		if err != nil {
			return errors.WithMessage(err, "failed to get busi group")
		}
		if group == nil {
			continue
		}

		cache[gid] = group
		t.GroupObjs = append(t.GroupObjs, group)
	}
	return nil
}
// MatchGroupId reports whether any of the given group ids appears in the
// target's GroupIds.
func (t *Target) MatchGroupId(gid ...int64) bool {
	for _, wanted := range gid {
		for _, owned := range t.GroupIds {
			if owned == wanted {
				return true
			}
		}
	}
	return false
}
// AfterFind is the gorm hook run after a Target is loaded; it fills
// TagsJSON and TagsMap via FillTagsMap and never returns an error.
func (t *Target) AfterFind(tx *gorm.DB) (err error) {
	t.FillTagsMap()
	return nil
}
// TargetStatistics reports the row count and the latest update_at of the
// target table. Non-center nodes obtain the value through poster from the
// statistic endpoint instead of querying the database.
func TargetStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=target")
	}

	var rows []*Statistics
	query := DB(ctx).Model(&Target{}).Select("count(*) as total", "max(update_at) as last_updated")
	if err := query.Find(&rows).Error; err != nil {
		return nil, err
	}
	// The query is an aggregate without GROUP BY; the first row is used.
	return rows[0], nil
}
// TargetDel deletes the targets with the given idents in one transaction.
// Inside the transaction it deletes the target rows, invokes deleteHook and
// finally calls TargetDeleteBgids; the first error aborts the transaction.
// An empty idents slice is rejected up front.
func TargetDel(ctx *ctx.Context, idents []string, deleteHook TargetDeleteHookFunc) error {
	if len(idents) == 0 {
		return errors.New("idents cannot be empty")
	}

	removeAll := func(tx *gorm.DB) error {
		if err := tx.Where("ident in ?", idents).Delete(new(Target)).Error; err != nil {
			return err
		}
		if err := deleteHook(tx, idents); err != nil {
			return err
		}
		if err := TargetDeleteBgids(tx, idents); err != nil {
			return err
		}
		return nil
	}
	return DB(ctx).Transaction(removeAll)
}
// BuildTargetWhereOption adds a filter to a target query session and
// returns the resulting session. buildTargetWhere applies options in order.
type BuildTargetWhereOption func(session *gorm.DB) *gorm.DB
// BuildTargetWhereWithBgids filters targets by business group id through
// the target_busi_group table:
//
//   - empty bgids: no filter;
//   - exactly [0]: targets with no target_busi_group row;
//   - a list containing 0: targets with no row, or a row in one of bgids;
//   - otherwise: targets with a row in one of bgids.
func BuildTargetWhereWithBgids(bgids []int64) BuildTargetWhereOption {
	const (
		leftJoin  = "left join target_busi_group on target.ident = target_busi_group.target_ident"
		innerJoin = "join target_busi_group on target.ident = target_busi_group.target_ident"
	)

	return func(session *gorm.DB) *gorm.DB {
		switch {
		case len(bgids) == 0:
			return session
		case len(bgids) == 1 && bgids[0] == 0:
			return session.Joins(leftJoin).Where("target_busi_group.target_ident is null")
		case slices.Contains(bgids, 0):
			return session.Joins(leftJoin).
				Where("target_busi_group.target_ident is null OR target_busi_group.group_id in (?)", bgids)
		default:
			return session.Joins(innerJoin).Where("target_busi_group.group_id in (?)", bgids)
		}
	}
}
// BuildTargetWhereWithDsIds restricts the query to rows whose
// datasource_id is in dsIds; an empty slice adds no filter.
func BuildTargetWhereWithDsIds(dsIds []int64) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if len(dsIds) == 0 {
			return session
		}
		return session.Where("datasource_id in (?)", dsIds)
	}
}
// BuildTargetWhereWithHosts restricts the query to rows whose ident or
// host_ip is in hosts; an empty slice adds no filter.
func BuildTargetWhereWithHosts(hosts []string) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if len(hosts) == 0 {
			return session
		}
		return session.Where("ident in (?) or host_ip in (?)", hosts, hosts)
	}
}
// BuildTargetWhereWithIdents restricts the query to rows whose ident is in
// idents; an empty slice adds no filter.
func BuildTargetWhereWithIdents(idents []string) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if len(idents) == 0 {
			return session
		}
		return session.Where("ident in (?)", idents)
	}
}
// BuildTargetWhereExcludeIdents restricts the query to rows whose ident is
// not in idents; an empty slice adds no filter.
func BuildTargetWhereExcludeIdents(idents []string) BuildTargetWhereOption {
	return func(session *gorm.DB) *gorm.DB {
		if len(idents) == 0 {
			return session
		}
		return session.Where("ident not in (?)", idents)
	}
}
// BuildTargetWhereWithQuery turns a free-text query into LIKE filters.
//
// The query is split on whitespace and one Where clause is added per word.
// A plain word must appear in ident, host_ip, note, tags, host_tags or os.
// A word prefixed with "-" must appear in none of them (a NULL host_tags
// counts as not containing it).
func BuildTargetWhereWithQuery(query string) BuildTargetWhereOption {
	const (
		excludeClause = "ident not like ? and host_ip not like ? and " +
			"note not like ? and tags not like ? and (host_tags not like ? or " +
			"host_tags is null) and os not like ?"
		includeClause = "ident like ? or host_ip like ? or note like ? or " +
			"tags like ? or host_tags like ? or os like ?"
	)

	return func(session *gorm.DB) *gorm.DB {
		if query == "" {
			return session
		}
		for _, word := range strings.Fields(query) {
			clause, needle := includeClause, word
			if strings.HasPrefix(word, "-") {
				clause, needle = excludeClause, word[1:]
			}
			pattern := "%" + needle + "%"
			session = session.Where(clause, pattern, pattern, pattern, pattern, pattern, pattern)
		}
		return session
	}
}
// buildTargetWhere builds a Target query restricted to the idents selected
// by a DISTINCT sub-query on which every option has been applied in order.
func buildTargetWhere(ctx *ctx.Context, options ...BuildTargetWhereOption) *gorm.DB {
	identQuery := DB(ctx).Model(&Target{}).Distinct("target.ident")
	for idx := range options {
		identQuery = options[idx](identQuery)
	}
	return DB(ctx).Model(&Target{}).Where("ident in (?)", identQuery)
}
// TargetTotal counts the targets selected by the given options.
func TargetTotal(ctx *ctx.Context, options ...BuildTargetWhereOption) (int64, error) {
	query := buildTargetWhere(ctx, options...)
	return Count(query)
}
// TargetGets returns one page of the targets selected by options.
//
// order is passed through validateOrderField with "ident" as its second
// argument, and desc chooses between descending and ascending order.
// TagsJSON is populated from Tags on every returned target.
func TargetGets(ctx *ctx.Context, limit, offset int, order string, desc bool, options ...BuildTargetWhereOption) ([]*Target, error) {
	direction := " asc"
	if desc {
		direction = " desc"
	}
	orderBy := validateOrderField(order, "ident") + direction

	var targets []*Target
	err := buildTargetWhere(ctx, options...).Order(orderBy).Limit(limit).Offset(offset).Find(&targets).Error
	if err != nil {
		return targets, err
	}
	for _, target := range targets {
		target.TagsJSON = strings.Fields(target.Tags)
	}
	return targets, nil
}
// TargetGetsByFilter queries targets by group ids, tags and hosts, using
// the session built by TargetFilterQueryBuild and ordering by ident.
// TagsJSON and the group objects are filled on every returned target.
func TargetGetsByFilter(ctx *ctx.Context, query []map[string]interface{}, limit, offset int) ([]*Target, error) {
	var targets []*Target
	err := TargetFilterQueryBuild(ctx, query, limit, offset).Order("ident").Find(&targets).Error

	groupCache := make(map[int64]*BusiGroup)
	for _, target := range targets {
		target.TagsJSON = strings.Fields(target.Tags)
		// NOTE(review): the error returned by FillGroup is discarded here.
		_ = target.FillGroup(ctx, groupCache)
	}
	return targets, err
}
// TargetCountByFilter counts the targets matching query, using the session
// built by TargetFilterQueryBuild without pagination.
func TargetCountByFilter(ctx *ctx.Context, query []map[string]interface{}) (int64, error) {
	unpaged := TargetFilterQueryBuild(ctx, query, 0, 0)
	return Count(unpaged)
}
// TargetFilterQueryBuild builds a Target query from a list of filter maps.
//
// Each map becomes one Where group on a DISTINCT ident sub-query (left
// joined with target_busi_group); the entries inside a map are combined
// with Or. A key containing more than one "?" expects its value to be a
// []interface{} that is expanded into separate arguments. limit/offset are
// applied only when limit is positive.
func TargetFilterQueryBuild(ctx *ctx.Context, query []map[string]interface{}, limit, offset int) *gorm.DB {
	identQuery := DB(ctx).Model(&Target{}).Distinct("target.ident").
		Joins("left join target_busi_group on target.ident = target_busi_group.target_ident")

	for _, group := range query {
		cond := DB(ctx).Model(&Target{})
		for expr, arg := range group {
			if strings.Count(expr, "?") > 1 {
				cond = cond.Or(expr, arg.([]interface{})...)
				continue
			}
			cond = cond.Or(expr, arg)
		}
		identQuery = identQuery.Where(cond)
	}

	result := DB(ctx).Model(&Target{}).Where("ident in (?)", identQuery)
	if limit > 0 {
		result = result.Limit(limit).Offset(offset)
	}
	return result
}
// TargetGetsAll returns every target. On a center node the targets are read
// from the database, then FillTagsMap is called and GroupIds is set from
// TargetBusiGroupsGetAll. Non-center nodes fetch the list through poster
// from "/v1/n9e/targets".
func TargetGetsAll(ctx *ctx.Context) ([]*Target, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*Target](ctx, "/v1/n9e/targets")
	}

	var targets []*Target
	if err := DB(ctx).Model(&Target{}).Find(&targets).Error; err != nil {
		return targets, err
	}

	groupIdsByIdent, err := TargetBusiGroupsGetAll(ctx)
	if err != nil {
		return targets, err
	}

	for _, target := range targets {
		target.FillTagsMap()
		target.GroupIds = groupIdsByIdent[target.Ident]
	}
	return targets, nil
}
// TargetUpdateNote sets note on all targets with the given idents and
// refreshes update_at to the current Unix time.
func TargetUpdateNote(ctx *ctx.Context, idents []string, note string) error {
	changes := map[string]interface{}{
		"note":      note,
		"update_at": time.Now().Unix(),
	}
	return DB(ctx).Model(&Target{}).Where("ident in ?", idents).Updates(changes).Error
}
// TargetUpdateBgid sets group_id on all targets with the given idents and
// refreshes update_at. When clearTags is true the tags column is emptied
// in the same update.
func TargetUpdateBgid(ctx *ctx.Context, idents []string, bgid int64, clearTags bool) error {
	changes := make(map[string]interface{}, 3)
	changes["group_id"] = bgid
	changes["update_at"] = time.Now().Unix()
	if clearTags {
		changes["tags"] = ""
	}

	scoped := DB(ctx).Model(&Target{}).Where("ident in ?", idents)
	return scoped.Updates(changes).Error
}
// TargetGet returns the first target matching the where clause with
// TagsJSON populated from Tags, or (nil, nil) when nothing matches.
func TargetGet(ctx *ctx.Context, where string, args ...interface{}) (*Target, error) {
	var matched []*Target
	if err := DB(ctx).Where(where, args...).Find(&matched).Error; err != nil {
		return nil, err
	}
	if len(matched) == 0 {
		return nil, nil
	}

	first := matched[0]
	first.TagsJSON = strings.Fields(first.Tags)
	return first, nil
}
// TargetGetById looks a target up by primary key via TargetGet; the result
// is (nil, nil) when the id does not exist.
func TargetGetById(ctx *ctx.Context, id int64) (*Target, error) {
	target, err := TargetGet(ctx, "id = ?", id)
	return target, err
}
// TargetGetByIdent looks a target up by ident via TargetGet; the result is
// (nil, nil) when the ident does not exist.
func TargetGetByIdent(ctx *ctx.Context, ident string) (*Target, error) {
	target, err := TargetGet(ctx, "ident = ?", ident)
	return target, err
}
// TargetsGetByIdents returns the targets whose ident is in idents.
func TargetsGetByIdents(ctx *ctx.Context, idents []string) ([]*Target, error) {
	var found []*Target
	result := DB(ctx).Where("ident IN ?", idents).Find(&found)
	return found, result.Error
}
// TargetsGetIdentsByIdentsAndHostIps resolves a mix of idents and host IPs
// to the idents of existing targets.
//
// The first result maps every input that matched no target to a reason
// ("Ident not found" or "HostIp not found"); the second is the set of
// resolved idents as a slice.
func TargetsGetIdentsByIdentsAndHostIps(ctx *ctx.Context, idents, hostIps []string) (map[string]string, []string, error) {
	missing := make(map[string]string)
	resolved := set.NewStringSet()

	// Resolve the inputs given as idents.
	if len(idents) > 0 {
		var existing []string
		if err := DB(ctx).Model(&Target{}).Where("ident IN ?", idents).Pluck("ident", &existing).Error; err != nil {
			return nil, nil, err
		}
		for idx := range existing {
			resolved.Add(existing[idx])
		}
		for _, ident := range idents {
			if resolved.Exists(ident) {
				continue
			}
			missing[ident] = "Ident not found"
		}
	}

	// Resolve the inputs given as host IPs to their idents.
	if len(hostIps) > 0 {
		var rows []struct {
			HostIp string
			Ident  string
		}
		if err := DB(ctx).Model(&Target{}).Select("host_ip, ident").Where("host_ip IN ?", hostIps).Scan(&rows).Error; err != nil {
			return nil, nil, err
		}

		knownIps := set.NewStringSet()
		for idx := range rows {
			knownIps.Add(rows[idx].HostIp)
			resolved.Add(rows[idx].Ident)
		}
		for _, ip := range hostIps {
			if knownIps.Exists(ip) {
				continue
			}
			missing[ip] = "HostIp not found"
		}
	}

	return missing, resolved.ToSlice(), nil
}
// TargetsGetIdsByIdentsAndHostIps resolves a mix of idents and host IPs to
// the primary keys of existing targets.
//
// The first result maps every input that matched no target to a reason
// ("Ident not found" or "HostIp not found"); the second is the set of
// resolved ids as a slice.
func TargetsGetIdsByIdentsAndHostIps(ctx *ctx.Context, idents, hostIps []string) (
	map[string]string, []int64, error) {
	missing := make(map[string]string)
	resolved := set.NewInt64Set()

	// Resolve the inputs given as idents.
	if len(idents) > 0 {
		var rows []struct {
			Ident string
			Id    int64
		}
		if err := DB(ctx).Model(&Target{}).Select("id, ident").Where("ident IN ?", idents).Scan(&rows).Error; err != nil {
			return nil, nil, err
		}

		knownIdents := set.NewStringSet()
		for idx := range rows {
			resolved.Add(rows[idx].Id)
			knownIdents.Add(rows[idx].Ident)
		}
		for _, ident := range idents {
			if knownIdents.Exists(ident) {
				continue
			}
			missing[ident] = "Ident not found"
		}
	}

	// Resolve the inputs given as host IPs.
	if len(hostIps) > 0 {
		var rows []struct {
			HostIp string
			Id     int64
		}
		if err := DB(ctx).Model(&Target{}).Select("id, host_ip").Where("host_ip IN ?", hostIps).Scan(&rows).Error; err != nil {
			return nil, nil, err
		}

		knownIps := set.NewStringSet()
		for idx := range rows {
			knownIps.Add(rows[idx].HostIp)
			resolved.Add(rows[idx].Id)
		}
		for _, ip := range hostIps {
			if knownIps.Exists(ip) {
				continue
			}
			missing[ip] = "HostIp not found"
		}
	}

	return missing, resolved.ToSlice(), nil
}
// TargetGetTags returns the de-duplicated tags of the targets with the
// given idents (all targets when idents is empty).
//
// User tags are always included; host tags are included unless
// ignoreHostTag is true. The result is sorted; when bgLabelKey is non-empty,
// tags starting with bgLabelKey come first, each group in ascending order.
func TargetGetTags(ctx *ctx.Context, idents []string, ignoreHostTag bool, bgLabelKey string) (
	[]string, error) {
	query := DB(ctx).Model(new(Target))
	if len(idents) > 0 {
		query = query.Where("ident in ?", idents)
	}

	var targets []*Target
	if err := query.Select("tags", "host_tags").Find(&targets).Error; err != nil {
		return nil, err
	}
	if len(targets) == 0 {
		return []string{}, nil
	}

	unique := make(map[string]struct{})
	for _, target := range targets {
		for _, tag := range strings.Fields(target.Tags) {
			unique[tag] = struct{}{}
		}
		if ignoreHostTag {
			continue
		}
		for _, hostTag := range target.HostTags {
			unique[hostTag] = struct{}{}
		}
	}

	result := make([]string, 0, len(unique))
	for tag := range unique {
		result = append(result, tag)
	}

	if bgLabelKey == "" {
		sort.Strings(result)
		return result, nil
	}

	sort.Slice(result, func(i, j int) bool {
		iIsBg := strings.HasPrefix(result[i], bgLabelKey)
		jIsBg := strings.HasPrefix(result[j], bgLabelKey)
		if iIsBg != jIsBg {
			// Exactly one side carries the prefix; that side sorts first.
			return iIsBg
		}
		return result[i] < result[j]
	})
	return result, nil
}
// AddTags adds the given tags to the target's user tags and persists the
// sorted, space-separated result together with a fresh update_at.
//
// Tags are compared as whole whitespace-separated tokens, so adding "c=1"
// is not suppressed by an existing "abc=1". Surrounding whitespace in an
// input tag is ignored, and a tag that is already present is not added
// again. t.Tags is set to the value that is written to the database.
func (t *Target) AddTags(ctx *ctx.Context, tags []string) error {
	current := strings.Fields(t.Tags)

	present := make(map[string]struct{}, len(current)+len(tags))
	for _, tag := range current {
		present[tag] = struct{}{}
	}

	for _, raw := range tags {
		// Fields also tolerates callers that pad the tag with spaces.
		for _, tag := range strings.Fields(raw) {
			if _, dup := present[tag]; dup {
				continue
			}
			present[tag] = struct{}{}
			current = append(current, tag)
		}
	}

	sort.Strings(current)
	// The stored format keeps a trailing space after the last tag.
	t.Tags = strings.Join(current, " ") + " "

	return DB(ctx).Model(t).Updates(map[string]interface{}{
		"tags":      t.Tags,
		"update_at": time.Now().Unix(),
	}).Error
}
// DelTags removes the given tags from the target's user tags and persists
// the result together with a fresh update_at.
//
// Tags are matched as whole whitespace-separated tokens, so deleting "c=1"
// leaves "abc=1" untouched, and the last tag is removed even when t.Tags
// has no trailing space. Empty input tags are ignored. The order of the
// remaining tags is preserved; the stored value is "" when none remain,
// otherwise the tags joined by spaces with a trailing space.
func (t *Target) DelTags(ctx *ctx.Context, tags []string) error {
	drop := make(map[string]struct{}, len(tags))
	for _, raw := range tags {
		for _, tag := range strings.Fields(raw) {
			drop[tag] = struct{}{}
		}
	}

	fields := strings.Fields(t.Tags)
	kept := fields[:0]
	for _, tag := range fields {
		if _, gone := drop[tag]; !gone {
			kept = append(kept, tag)
		}
	}

	t.Tags = ""
	if len(kept) > 0 {
		t.Tags = strings.Join(kept, " ") + " "
	}

	return DB(ctx).Model(t).Updates(map[string]interface{}{
		"tags":      t.Tags,
		"update_at": time.Now().Unix(),
	}).Error
}
// FillTagsMap rebuilds TagsJSON from the whitespace-separated Tags string
// and TagsMap from the user tags followed by HostTags. Only items that
// split on "=" into exactly two parts are stored; a host tag overrides a
// user tag with the same key because it is processed later.
func (t *Target) FillTagsMap() {
	t.TagsJSON = strings.Fields(t.Tags)

	parsed := make(map[string]string)
	for _, group := range [][]string{t.TagsJSON, t.HostTags} {
		for _, item := range group {
			if kv := strings.Split(item, "="); len(kv) == 2 {
				parsed[kv[0]] = kv[1]
			}
		}
	}
	t.TagsMap = parsed
}
// GetTagsMap parses the whitespace-separated Tags string into a key/value
// map. Items that do not split on "=" into exactly two parts are skipped.
func (t *Target) GetTagsMap() map[string]string {
	parsed := make(map[string]string)
	for _, item := range strings.Fields(t.Tags) {
		kv := strings.Split(item, "=")
		if len(kv) != 2 {
			continue
		}
		parsed[kv[0]] = kv[1]
	}
	return parsed
}
// GetHostTagsMap parses HostTags into a key/value map. Items that do not
// split on "=" into exactly two parts are skipped.
func (t *Target) GetHostTagsMap() map[string]string {
	parsed := make(map[string]string)
	for idx := range t.HostTags {
		if kv := strings.Split(t.HostTags[idx], "="); len(kv) == 2 {
			parsed[kv[0]] = kv[1]
		}
	}
	return parsed
}
// FillMeta copies the CPU, memory, clock, architecture and remote address
// fields from meta onto the target. meta must be non-nil.
func (t *Target) FillMeta(meta *HostMeta) {
	// CPU and memory figures.
	t.CpuNum = meta.CpuNum
	t.CpuUtil = meta.CpuUtil
	t.MemUtil = meta.MemUtil

	// Clock information.
	t.UnixTime = meta.UnixTime
	t.Offset = meta.Offset

	// Host identity.
	t.Arch = meta.Arch
	t.RemoteAddr = meta.RemoteAddr
}
// FetchBeatTimesFromRedis fetches heartbeat times from Redis in batches and
// returns an ident -> update time map. Keys are built with
// WrapIdentUpdateTime and fetched 100 at a time. The map is empty when
// redis is nil or idents is empty.
func FetchBeatTimesFromRedis(redis storage.Redis, idents []string) map[string]int64 {
	const batchSize = 100

	beatTimes := make(map[string]int64, len(idents))
	if redis == nil || len(idents) == 0 {
		return beatTimes
	}

	var batch []string
	for _, ident := range idents {
		batch = append(batch, WrapIdentUpdateTime(ident))
		if len(batch) == batchSize {
			fetchBeatTimeBatch(redis, batch, beatTimes)
			batch = batch[:0]
		}
	}
	// Flush the final, partially filled batch.
	if len(batch) > 0 {
		fetchBeatTimeBatch(redis, batch, beatTimes)
	}
	return beatTimes
}
// fetchBeatTimeBatch reads keys with storage.MGet, decodes each non-nil
// value as a HostUpdateTime and records its UpdateTime in result under its
// Ident. Values that fail to decode are logged and skipped.
func fetchBeatTimeBatch(redis storage.Redis, keys []string, result map[string]int64) {
	for _, raw := range storage.MGet(context.Background(), redis, keys) {
		if raw == nil {
			continue
		}

		var beat HostUpdateTime
		err := json.Unmarshal(raw, &beat)
		if err != nil {
			logger.Warningf("failed to unmarshal host update time: %v", err)
			continue
		}
		result[beat.Ident] = beat.UpdateTime
	}
}
// FillTargetsBeatTime fetches heartbeat times from Redis in bulk and sets
// BeatTime on every target for which a value was found. Targets without a
// value keep their current BeatTime.
func FillTargetsBeatTime(redis storage.Redis, targets []*Target) {
	if len(targets) == 0 {
		return
	}

	idents := make([]string, 0, len(targets))
	for _, target := range targets {
		idents = append(idents, target.Ident)
	}

	beatTimes := FetchBeatTimesFromRedis(redis, idents)
	for _, target := range targets {
		beat, found := beatTimes[target.Ident]
		if !found {
			continue
		}
		target.BeatTime = beat
	}
}
// TargetIdents returns the idents of the targets with the given ids.
// An empty ids slice yields a nil result without querying the database.
func TargetIdents(ctx *ctx.Context, ids []int64) ([]string, error) {
	var idents []string
	if len(ids) > 0 {
		err := DB(ctx).Model(&Target{}).Where("id in ?", ids).Pluck("ident", &idents).Error
		return idents, err
	}
	return idents, nil
}
// TargetIds returns the primary keys of the targets with the given idents.
// An empty idents slice yields a nil result without querying the database.
func TargetIds(ctx *ctx.Context, idents []string) ([]int64, error) {
	var ids []int64
	if len(idents) > 0 {
		err := DB(ctx).Model(&Target{}).Where("ident in ?", idents).Pluck("id", &ids).Error
		return ids, err
	}
	return ids, nil
}
// IdentsFilter returns the subset of idents whose target rows also satisfy
// the extra where clause. An empty idents slice yields a nil result without
// querying the database.
func IdentsFilter(ctx *ctx.Context, idents []string, where string, args ...interface{}) ([]string, error) {
	var kept []string
	if len(idents) > 0 {
		query := DB(ctx).Model(&Target{}).Where("ident in ?", idents).Where(where, args...)
		err := query.Pluck("ident", &kept).Error
		return kept, err
	}
	return kept, nil
}
// UpdateFieldsMap writes the given column/value pairs to the row backing
// this target and returns the database error, if any.
func (m *Target) UpdateFieldsMap(ctx *ctx.Context, fields map[string]interface{}) error {
	result := DB(ctx).Model(m).Updates(fields)
	return result.Error
}
// CanMigrateBg (step 1) decides whether the busi_group migration should run.
// It returns false when the target table is empty, when MAX(group_id) is 0
// (nothing left to migrate), or when either query fails.
func CanMigrateBg(ctx *ctx.Context) bool {
	// 1.1 Check whether the target table is empty.
	var total int64
	if err := DB(ctx).Model(&Target{}).Count(&total).Error; err != nil {
		log.Println("failed to get target table count, err:", err)
		return false
	}
	if total == 0 {
		logger.Debug("target table is empty, skip migration.")
		return false
	}

	// 1.2 Check whether the migration has already been completed.
	var largestGroupId int64
	if err := DB(ctx).Model(&Target{}).Select("MAX(group_id)").Scan(&largestGroupId).Error; err != nil {
		log.Println("failed to get max group_id from target table, err:", err)
		return false
	}
	return largestGroupId != 0
}
// MigrateBg runs DoMigrateBg and logs the outcome through the standard
// logger instead of returning the error.
func MigrateBg(ctx *ctx.Context, bgLabelKey string) {
	if err := DoMigrateBg(ctx, bgLabelKey); err != nil {
		log.Println("failed to migrate bgid, err:", err)
		return
	}
	log.Println("migration bgid has been completed")
}
// DoMigrateBg moves each target's group_id into the target/busi-group
// association table and, where applicable, adds the business group label
// as a user tag.
//
// Only loading the targets or the business groups can fail the call;
// per-target failures are logged and the loop continues.
func DoMigrateBg(ctx *ctx.Context, bgLabelKey string) error {
	// 2. Load all targets.
	targets, err := TargetGetsAll(ctx)
	if err != nil {
		return err
	}

	// 3. Load all business groups and index them by id.
	groups, err := BusiGroupGetAll(ctx)
	if err != nil {
		return err
	}
	groupById := make(map[int64]*BusiGroup, len(groups))
	for _, group := range groups {
		groupById[group.Id] = group
	}

	// 4. If a business group has a label, store it in the target's tags.
	for _, target := range targets {
		if target.GroupId == 0 {
			continue
		}

		// 4.1 Move group_id into the association table.
		if err := TargetBindBgids(ctx, []string{target.Ident}, []int64{target.GroupId}, nil); err != nil {
			logger.Errorf("migrate failed to migrate bgid %v to %v, err: %v", target.GroupId, target.Ident, err)
			continue
		}

		// 4.1.1 Reset group_id on the target row to 0.
		if err := TargetUpdateBgid(ctx, []string{target.Ident}, 0, false); err != nil {
			logger.Errorf("migrate failed to migrate ident group id to 0, ident: %v, err: %v", target.Ident, err)
			continue
		}

		// 4.2 Decide whether this host needs a new tag.
		group, known := groupById[target.GroupId]
		if !known || group.LabelEnable == 0 || strings.Contains(target.Tags, bgLabelKey+"=") {
			logger.Infof("migrate ident %v has no bg label tag, skip", target.Ident)
			continue
		}

		if err := target.AddTags(ctx, []string{" " + bgLabelKey + "=" + group.LabelValue}); err != nil {
			logger.Errorf("migrate failed to add bg label tag %v to %v, err: %v", bgLabelKey+"="+group.LabelValue, target.Ident, err)
			continue
		}
		logger.Infof("migrate add bg label tag %v to %v", bgLabelKey+"="+group.LabelValue, target.Ident)
	}
	return nil
}
// TargetNoExistIdents returns the idents that do not exist in the target
// table. The result is slice.SubString(idents, <idents found in the table>).
func TargetNoExistIdents(ctx *ctx.Context, idents []string) ([]string, error) {
	var found []string
	query := ctx.DB.Table("target").Where("ident in ?", idents)
	if err := query.Pluck("ident", &found).Error; err != nil {
		return nil, err
	}

	return slice.SubString(idents, found), nil
}
================================================
FILE: models/target_busi_group.go
================================================
package models
import (
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"gorm.io/gorm"
"gorm.io/gorm/clause"
)
// TargetBusiGroup is one row of the target_busi_group table: a binding between
// a target ident and a busi group id. The gorm tags declare a unique composite
// index (idx_target_group) over (target_ident, group_id).
type TargetBusiGroup struct {
Id int64 `json:"id" gorm:"primaryKey;type:bigint;autoIncrement"`
TargetIdent string `json:"target_ident" gorm:"type:varchar(191);not null;index:idx_target_group,unique,priority:1"`
GroupId int64 `json:"group_id" gorm:"type:bigint;not null;index:idx_target_group,unique,priority:2"`
// UpdateAt is written as a Unix timestamp in seconds by the bind helpers in this file.
UpdateAt int64 `json:"update_at" gorm:"type:bigint;not null"`
}
// TableName returns the database table name, "target_busi_group".
func (t *TargetBusiGroup) TableName() string {
return "target_busi_group"
}
// TableOptions returns the table options string (InnoDB engine, utf8mb4
// charset and utf8mb4_general_ci collation).
// NOTE(review): presumably consumed by the schema migration code — confirm the caller.
func (t *TargetBusiGroup) TableOptions() string {
return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
}
// TargetBusiGroupsGetAll loads every target_busi_group row and returns the
// bound group ids keyed by target ident.
func TargetBusiGroupsGetAll(ctx *ctx.Context) (map[string][]int64, error) {
	var rows []*TargetBusiGroup
	if err := DB(ctx).Find(&rows).Error; err != nil {
		return nil, err
	}

	byIdent := make(map[string][]int64)
	for i := range rows {
		ident := rows[i].TargetIdent
		byIdent[ident] = append(byIdent[ident], rows[i].GroupId)
	}
	return byIdent, nil
}
// TargetGroupIdsGetByIdent returns the group ids bound to the given target
// ident. The result is an empty, non-nil slice when no binding exists.
func TargetGroupIdsGetByIdent(ctx *ctx.Context, ident string) ([]int64, error) {
	var rows []*TargetBusiGroup
	if err := DB(ctx).Where("target_ident = ?", ident).Find(&rows).Error; err != nil {
		return nil, err
	}

	ids := make([]int64, len(rows))
	for i, row := range rows {
		ids[i] = row.GroupId
	}
	return ids, nil
}
// TargetGroupIdsGetByIdents returns the distinct group ids bound to any of
// the given target idents.
func TargetGroupIdsGetByIdents(ctx *ctx.Context, idents []string) ([]int64, error) {
	var ids []int64

	query := DB(ctx).Model(&TargetBusiGroup{}).Where("target_ident IN ?", idents).Distinct()
	if err := query.Pluck("group_id", &ids).Error; err != nil {
		return nil, err
	}
	return ids, nil
}
// TargetBindBgids binds every ident to every bgid (existing bindings are left
// alone by an insert-ignore clause chosen per SQL dialect), optionally adds
// tags to the affected targets, and bumps target.update_at.
//
// NOTE(review): TargetsGetByIdents and AddTags receive ctx rather than the
// transaction handle, so they may run outside the transaction — confirm.
func TargetBindBgids(ctx *ctx.Context, idents []string, bgids []int64, tags []string) error {
	now := time.Now().Unix()

	rows := make([]TargetBusiGroup, 0, len(bgids)*len(idents))
	for _, bgid := range bgids {
		for _, ident := range idents {
			rows = append(rows, TargetBusiGroup{
				TargetIdent: ident,
				GroupId:     bgid,
				UpdateAt:    now,
			})
		}
	}

	// Pick the "ignore duplicates" clause understood by the current dialect.
	var ignoreDup clause.Expression
	switch DB(ctx).Dialector.Name() {
	case "sqlite":
		ignoreDup = clause.Insert{Modifier: "or ignore"}
	case "postgres":
		ignoreDup = clause.OnConflict{DoNothing: true}
	default:
		ignoreDup = clause.Insert{Modifier: "ignore"}
	}

	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		if err := tx.Clauses(ignoreDup).CreateInBatches(&rows, 10).Error; err != nil {
			return err
		}

		targets, err := TargetsGetByIdents(ctx, idents)
		if err != nil {
			return err
		}
		if len(tags) > 0 {
			for _, t := range targets {
				if err := t.AddTags(ctx, tags); err != nil {
					return err
				}
			}
		}

		// update target.update_at so that syncTargets can detect the change and refresh GroupIds cache
		return tx.Model(&Target{}).Where("ident in ?", idents).Update("update_at", now).Error
	})
}
// TargetUnbindBgids removes, in one transaction, the bindings between the
// given idents and bgids and then bumps target.update_at for those idents.
func TargetUnbindBgids(ctx *ctx.Context, idents []string, bgids []int64) error {
	unbind := func(tx *gorm.DB) error {
		deleted := tx.Where("target_ident in ? and group_id in ?",
			idents, bgids).Delete(&TargetBusiGroup{})
		if deleted.Error != nil {
			return deleted.Error
		}

		// update target.update_at so that syncTargets can detect the change and refresh GroupIds cache
		touch := tx.Model(&Target{}).Where("ident in ?", idents)
		return touch.Update("update_at", time.Now().Unix()).Error
	}

	return DB(ctx).Transaction(unbind)
}
// TargetDeleteBgids deletes every target_busi_group row whose target_ident is
// in idents, using the supplied gorm handle tx. It does not touch
// target.update_at.
func TargetDeleteBgids(tx *gorm.DB, idents []string) error {
return tx.Where("target_ident in ?", idents).Delete(&TargetBusiGroup{}).Error
}
// TargetOverrideBgids replaces, in one transaction, all busi-group bindings of
// the given idents with bgids.
//
// When tags is non-empty and at least one binding is created, the targets'
// tags column is overwritten with the space-joined tags. target.update_at is
// always bumped so that syncTargets can detect the change and refresh its
// GroupIds cache — including when bgids is empty and the old bindings are
// simply removed.
func TargetOverrideBgids(ctx *ctx.Context, idents []string, bgids []int64, tags []string) error {
	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		// Drop the old bindings first.
		if err := tx.Where("target_ident IN ?", idents).Delete(&TargetBusiGroup{}).Error; err != nil {
			return err
		}

		// Build the new bindings.
		updateAt := time.Now().Unix()
		lst := make([]TargetBusiGroup, 0, len(bgids)*len(idents))
		for _, ident := range idents {
			for _, bgid := range bgids {
				lst = append(lst, TargetBusiGroup{
					TargetIdent: ident,
					GroupId:     bgid,
					UpdateAt:    updateAt,
				})
			}
		}

		if len(lst) == 0 {
			// Nothing to insert, but the delete above may still have changed the
			// bindings; bump update_at so the cache refresh is not missed.
			return tx.Model(&Target{}).Where("ident IN ?", idents).Update("update_at", updateAt).Error
		}

		// Insert the new bindings, ignoring duplicates in a dialect-specific way.
		var cl clause.Expression = clause.Insert{Modifier: "ignore"}
		switch tx.Dialector.Name() {
		case "sqlite":
			cl = clause.Insert{Modifier: "or ignore"}
		case "postgres":
			cl = clause.OnConflict{DoNothing: true}
		}
		if err := tx.Clauses(cl).CreateInBatches(&lst, 10).Error; err != nil {
			return err
		}

		if len(tags) == 0 {
			// update target.update_at so that syncTargets can detect the change and refresh GroupIds cache
			return tx.Model(&Target{}).Where("ident IN ?", idents).Update("update_at", updateAt).Error
		}

		return tx.Model(&Target{}).Where("ident IN ?", idents).Updates(map[string]interface{}{
			"tags":      strings.Join(tags, " ") + " ",
			"update_at": updateAt,
		}).Error
	})
}
// SeparateTargetIdents splits idents into those that have at least one row in
// target_busi_group (existing) and those that have none (nonExisting).
// nonExisting keeps the order of idents.
func SeparateTargetIdents(ctx *ctx.Context, idents []string) (existing, nonExisting []string, err error) {
	// Fetch the distinct idents that already have a binding.
	query := DB(ctx).Model(&TargetBusiGroup{}).Where("target_ident IN ?", idents).Distinct()
	if err = query.Pluck("target_ident", &existing).Error; err != nil {
		return nil, nil, err
	}

	known := make(map[string]struct{}, len(existing))
	for _, ident := range existing {
		known[ident] = struct{}{}
	}

	// Everything not seen above is reported as non-existing.
	for _, ident := range idents {
		if _, ok := known[ident]; ok {
			continue
		}
		nonExisting = append(nonExisting, ident)
	}
	return existing, nonExisting, nil
}
// TargetIndentsGetByBgids returns the distinct target idents bound to any of
// the given busi group ids.
func TargetIndentsGetByBgids(ctx *ctx.Context, bgids []int64) ([]string, error) {
	var idents []string

	query := DB(ctx).Model(&TargetBusiGroup{}).
		Where("group_id IN ?", bgids).
		Distinct("target_ident")
	err := query.Pluck("target_ident", &idents).Error

	return idents, err
}
================================================
FILE: models/task_record.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
)
// TaskRecord is one row of the task_record table. Apart from the event/group
// ids and the Ibex connection fields, its fields mirror those of TaskForm.
type TaskRecord struct {
Id int64 `json:"id" gorm:"primaryKey"`
EventId int64 `json:"event_id"`
GroupId int64 `json:"group_id"`
IbexAddress string `json:"ibex_address"`
IbexAuthUser string `json:"ibex_auth_user"`
IbexAuthPass string `json:"ibex_auth_pass"`
Title string `json:"title"`
Account string `json:"account"`
Batch int `json:"batch"`
Tolerance int `json:"tolerance"`
Timeout int `json:"timeout"`
Pause string `json:"pause"`
Script string `json:"script"`
Args string `json:"args"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
}
// TableName returns the database table name, "task_record".
func (r *TaskRecord) TableName() string {
return "task_record"
}
// Add creates the task record. When ctx.IsCenter is true the record is
// inserted locally; otherwise it is posted to "/v1/n9e/task-record-add".
func (r *TaskRecord) Add(ctx *ctx.Context) error {
	if ctx.IsCenter {
		return Insert(ctx, r)
	}

	return poster.PostByUrls(ctx, "/v1/n9e/task-record-add", r)
}
// TaskRecordTotal counts task records created after beginTime, optionally
// filtered by busi group ids, creator, and a substring of the title.
func TaskRecordTotal(ctx *ctx.Context, bgids []int64, beginTime int64, createBy, query string) (int64, error) {
	q := DB(ctx).Model(&TaskRecord{}).Where("create_at > ?", beginTime)

	if len(bgids) != 0 {
		q = q.Where("group_id in (?)", bgids)
	}
	if len(createBy) != 0 {
		q = q.Where("create_by = ?", createBy)
	}
	if len(query) != 0 {
		pattern := "%" + query + "%"
		q = q.Where("title like ?", pattern)
	}

	return Count(q)
}
// TaskRecordGets returns one page of task records created after beginTime,
// newest first, with the same optional filters as TaskRecordTotal.
func TaskRecordGets(ctx *ctx.Context, bgids []int64, beginTime int64, createBy, query string, limit, offset int) ([]*TaskRecord, error) {
	q := DB(ctx).Where("create_at > ?", beginTime).Order("create_at desc").Limit(limit).Offset(offset)

	if len(bgids) != 0 {
		q = q.Where("group_id in (?)", bgids)
	}
	if len(createBy) != 0 {
		q = q.Where("create_by = ?", createBy)
	}
	if len(query) != 0 {
		pattern := "%" + query + "%"
		q = q.Where("title like ?", pattern)
	}

	var records []*TaskRecord
	err := q.Find(&records).Error
	return records, err
}
// UpdateIsDone sets the is_done column of this record's row to isDone.
// NOTE(review): TaskRecord declares no IsDone field in this file, so this
// relies on the task_record table having an is_done column — confirm.
func (r *TaskRecord) UpdateIsDone(ctx *ctx.Context, isDone int) error {
return DB(ctx).Model(r).Update("is_done", isDone).Error
}
================================================
FILE: models/task_tpl.go
================================================
package models
import (
"errors"
"fmt"
"sort"
"strconv"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// TaskTpl is one row of the task_tpl table: a reusable task template that
// belongs to a busi group.
type TaskTpl struct {
Id int64 `json:"id" gorm:"primaryKey"`
GroupId int64 `json:"group_id"`
Title string `json:"title"`
Batch int `json:"batch"`
Tolerance int `json:"tolerance"`
Timeout int `json:"timeout"`
Pause string `json:"pause"`
Script string `json:"script"`
Args string `json:"args"`
// Tags is the stored, whitespace-separated tag string; it is hidden from JSON.
Tags string `json:"-"`
// TagsJSON is the JSON view of Tags (filled with strings.Fields(Tags) by the getters); not persisted.
TagsJSON []string `json:"tags" gorm:"-"`
Account string `json:"account"`
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
// UpdateByNickname is not persisted (gorm:"-").
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
}
// TableName returns the database table name, "task_tpl".
func (t *TaskTpl) TableName() string {
return "task_tpl"
}
// TaskTplTotal counts task templates, optionally restricted to the given busi
// groups. Every whitespace-separated word of query must appear in either the
// title or the tags.
func TaskTplTotal(ctx *ctx.Context, bgids []int64, query string) (int64, error) {
	q := DB(ctx).Model(&TaskTpl{})
	if len(bgids) > 0 {
		q = q.Where("group_id in (?)", bgids)
	}

	// An empty query yields no words, so no extra condition is added.
	for _, word := range strings.Fields(query) {
		pattern := "%" + word + "%"
		q = q.Where("title like ? or tags like ?", pattern, pattern)
	}

	return Count(q)
}
// TaskTplStatistics returns the row count and the maximum update_at of the
// task_tpl table. Non-center nodes fetch the value from
// "/v1/n9e/task-tpl/statistics" instead of querying the database.
func TaskTplStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/task-tpl/statistics")
	}

	var rows []*Statistics
	q := DB(ctx).Model(&TaskTpl{}).Select("count(*) as total", "max(update_at) as last_updated")
	if err := q.Find(&rows).Error; err != nil {
		return nil, err
	}

	return rows[0], nil
}
// TaskTplGetAll returns every task template. Center nodes read the database;
// other nodes fetch the list from "/v1/n9e/task-tpls".
func TaskTplGetAll(ctx *ctx.Context) ([]*TaskTpl, error) {
	if ctx.IsCenter {
		tpls := []*TaskTpl{}
		err := DB(ctx).Find(&tpls).Error
		return tpls, err
	}

	return poster.GetByUrls[[]*TaskTpl](ctx, "/v1/n9e/task-tpls")
}
// TaskTplGets returns one page of task templates ordered by title, using the
// same filters as TaskTplTotal, and fills TagsJSON on each returned template.
func TaskTplGets(ctx *ctx.Context, bgids []int64, query string, limit, offset int) ([]TaskTpl, error) {
	q := DB(ctx).Order("title").Limit(limit).Offset(offset)
	if len(bgids) > 0 {
		q = q.Where("group_id in (?)", bgids)
	}

	// Every word of the query must match the title or the tags.
	for _, word := range strings.Fields(query) {
		pattern := "%" + word + "%"
		q = q.Where("title like ? or tags like ?", pattern, pattern)
	}

	var tpls []TaskTpl
	if err := q.Find(&tpls).Error; err != nil {
		return tpls, err
	}

	for i := range tpls {
		tpls[i].TagsJSON = strings.Fields(tpls[i].Tags)
	}
	return tpls, nil
}
// TaskTplGetById returns the template with the given id. Center nodes query
// the database through TaskTplGet; other nodes fetch "/v1/n9e/task-tpl/<id>".
func TaskTplGetById(ctx *ctx.Context, id int64) (*TaskTpl, error) {
	if ctx.IsCenter {
		return TaskTplGet(ctx, "id = ?", id)
	}

	return poster.GetByUrls[*TaskTpl](ctx, "/v1/n9e/task-tpl/"+strconv.FormatInt(id, 10))
}
// TaskTplGet returns the first template matching the where clause with
// TagsJSON filled in, or (nil, nil) when nothing matches.
func TaskTplGet(ctx *ctx.Context, where string, args ...interface{}) (*TaskTpl, error) {
	var matches []*TaskTpl
	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}
	if len(matches) == 0 {
		return nil, nil
	}

	tpl := matches[0]
	tpl.TagsJSON = strings.Fields(tpl.Tags)
	return tpl, nil
}
// CleanFields validates and normalises the template in place.
//
// It rejects negative batch/tolerance/timeout values and timeouts longer than
// five days, defaults a zero timeout to 30, converts full-width commas
// (U+FF0C) in Pause, Args and Tags to ASCII commas, strips spaces from Pause,
// converts CRLF in Script to LF, and rejects an empty Title or Script as well
// as any Title, Args, Pause or Tags that str.Dangerous flags.
func (t *TaskTpl) CleanFields() error {
	if t.Batch < 0 {
		return errors.New("arg(batch) should be nonnegative")
	}
	if t.Tolerance < 0 {
		return errors.New("arg(tolerance) should be nonnegative")
	}
	if t.Timeout < 0 {
		return errors.New("arg(timeout) should be nonnegative")
	}
	if t.Timeout == 0 {
		t.Timeout = 30
	}
	if t.Timeout > 3600*24*5 {
		return errors.New("arg(timeout) longer than five days")
	}

	// Replacing "," with "," would be a no-op; the comma being normalised is
	// the full-width one, written as an escape so it survives re-encoding.
	const fullWidthComma = "\uff0c"
	t.Pause = strings.ReplaceAll(t.Pause, fullWidthComma, ",")
	t.Pause = strings.ReplaceAll(t.Pause, " ", "")
	t.Args = strings.ReplaceAll(t.Args, fullWidthComma, ",")
	t.Tags = strings.ReplaceAll(t.Tags, fullWidthComma, ",")

	if t.Title == "" {
		return errors.New("arg(title) is required")
	}
	if str.Dangerous(t.Title) {
		return errors.New("arg(title) is dangerous")
	}
	if t.Script == "" {
		return errors.New("arg(script) is required")
	}
	t.Script = strings.ReplaceAll(t.Script, "\r\n", "\n")

	if str.Dangerous(t.Args) {
		return errors.New("arg(args) is dangerous")
	}
	if str.Dangerous(t.Pause) {
		return errors.New("arg(pause) is dangerous")
	}
	if str.Dangerous(t.Tags) {
		return errors.New("arg(tags) is dangerous")
	}
	return nil
}
// TaskTplHost is the row shape TaskTpl.Save inserts into the task_tpl_host
// table: Id holds the owning template's id and Host one host of that template.
type TaskTplHost struct {
Id int64 `json:"id"`
Host string `json:"host"`
}
// Save validates the template, rejects a duplicate (group_id, title) pair, and
// then creates the template together with its non-blank hosts in a single
// transaction.
func (t *TaskTpl) Save(ctx *ctx.Context, hosts []string) error {
	if err := t.CleanFields(); err != nil {
		return err
	}

	dup, err := Count(DB(ctx).Model(&TaskTpl{}).Where("group_id=? and title=?", t.GroupId, t.Title))
	switch {
	case err != nil:
		return err
	case dup > 0:
		return fmt.Errorf("task template already exists")
	}

	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		if err := tx.Create(t).Error; err != nil {
			return err
		}

		for _, raw := range hosts {
			host := strings.TrimSpace(raw)
			if host == "" {
				continue
			}

			row := TaskTplHost{Id: t.Id, Host: host}
			if err := tx.Table("task_tpl_host").Create(&row).Error; err != nil {
				return err
			}
		}
		return nil
	})
}
// Hosts returns the hosts stored for this template in task_tpl_host, ordered
// by the table's "ii" column.
func (t *TaskTpl) Hosts(ctx *ctx.Context) ([]string, error) {
	var hosts []string

	q := DB(ctx).Table("task_tpl_host").Where("id=?", t.Id).Order("ii")
	err := q.Pluck("host", &hosts).Error

	return hosts, err
}
// Update validates the template, rejects a (group_id, title) pair already used
// by another template, and then, in a single transaction, rewrites the
// template's editable columns and replaces its host list with the non-blank
// entries of hosts.
func (t *TaskTpl) Update(ctx *ctx.Context, hosts []string) error {
	if err := t.CleanFields(); err != nil {
		return err
	}

	dup, err := Count(DB(ctx).Model(&TaskTpl{}).Where("group_id=? and title=? and id <> ?", t.GroupId, t.Title, t.Id))
	switch {
	case err != nil:
		return err
	case dup > 0:
		return fmt.Errorf("task template already exists")
	}

	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		columns := map[string]interface{}{
			"title":     t.Title,
			"batch":     t.Batch,
			"tolerance": t.Tolerance,
			"timeout":   t.Timeout,
			"pause":     t.Pause,
			"script":    t.Script,
			"args":      t.Args,
			"tags":      t.Tags,
			"account":   t.Account,
			"update_by": t.UpdateBy,
			"update_at": t.UpdateAt,
		}
		if err := tx.Model(t).Updates(columns).Error; err != nil {
			return err
		}

		// Replace the host list: delete everything, then re-insert.
		if err := tx.Exec("DELETE FROM task_tpl_host WHERE id = ?", t.Id).Error; err != nil {
			return err
		}
		for _, raw := range hosts {
			host := strings.TrimSpace(raw)
			if host == "" {
				continue
			}

			row := map[string]interface{}{
				"id":   t.Id,
				"host": host,
			}
			if err := tx.Table("task_tpl_host").Create(row).Error; err != nil {
				return err
			}
		}
		return nil
	})
}
// Del removes the template's hosts and then the template itself in a single
// transaction.
func (t *TaskTpl) Del(ctx *ctx.Context) error {
	remove := func(tx *gorm.DB) error {
		if err := tx.Exec("DELETE FROM task_tpl_host WHERE id=?", t.Id).Error; err != nil {
			return err
		}
		return tx.Delete(t).Error
	}

	return DB(ctx).Transaction(remove)
}
// AddTags adds the given tags to the template and persists the sorted,
// space-joined result (with a trailing space) along with update_by and
// update_at.
//
// Tags are compared as whole whitespace-separated tokens, so adding "b" to a
// template that only has "ab" really adds it; a substring test would treat
// "b " as already present inside "ab ". Blank entries in tags are ignored.
func (t *TaskTpl) AddTags(ctx *ctx.Context, tags []string, updateBy string) error {
	current := strings.Fields(t.Tags)

	present := make(map[string]struct{}, len(current)+len(tags))
	for _, tag := range current {
		present[tag] = struct{}{}
	}

	for _, raw := range tags {
		// A stored tag can never contain whitespace, so split defensively.
		for _, tag := range strings.Fields(raw) {
			if _, ok := present[tag]; ok {
				continue
			}
			present[tag] = struct{}{}
			current = append(current, tag)
		}
	}

	sort.Strings(current)
	t.Tags = strings.Join(current, " ") + " "

	return DB(ctx).Model(t).Updates(map[string]interface{}{
		"tags":      t.Tags,
		"update_by": updateBy,
		"update_at": time.Now().Unix(),
	}).Error
}
// DelTags removes the given tags from the template and persists the remaining
// tags (each followed by a single space) along with update_by and update_at.
//
// Tags are matched as whole whitespace-separated tokens. A plain substring
// replacement would turn "ab " into "a" when removing "b", and an empty entry
// in tags would strip every space from the stored string.
func (t *TaskTpl) DelTags(ctx *ctx.Context, tags []string, updateBy string) error {
	drop := make(map[string]struct{}, len(tags))
	for _, raw := range tags {
		for _, tag := range strings.Fields(raw) {
			drop[tag] = struct{}{}
		}
	}

	var kept strings.Builder
	for _, tag := range strings.Fields(t.Tags) {
		if _, ok := drop[tag]; ok {
			continue
		}
		kept.WriteString(tag)
		kept.WriteByte(' ')
	}
	t.Tags = kept.String()

	return DB(ctx).Model(t).Updates(map[string]interface{}{
		"tags":      t.Tags,
		"update_by": updateBy,
		"update_at": time.Now().Unix(),
	}).Error
}
// UpdateGroup moves the template to another busi group and records who made
// the change and when.
func (t *TaskTpl) UpdateGroup(ctx *ctx.Context, groupId int64, updateBy string) error {
	changes := map[string]interface{}{
		"group_id":  groupId,
		"update_by": updateBy,
		"update_at": time.Now().Unix(),
	}

	return DB(ctx).Model(t).Updates(changes).Error
}
// TaskForm is the JSON payload describing a task to run. Verify validates and
// normalises it; HandleFH rewrites Title.
type TaskForm struct {
Title string `json:"title"`
Account string `json:"account"`
Batch int `json:"batch"`
Tolerance int `json:"tolerance"`
Timeout int `json:"timeout"`
Pause string `json:"pause"`
Script string `json:"script"`
Args string `json:"args"`
Stdin string `json:"stdin"`
// Action must be "start" or "pause" to pass Verify.
Action string `json:"action"`
Creator string `json:"creator"`
Hosts []string `json:"hosts"`
AlertTriggered bool `json:"alert_triggered"`
}
// Verify validates and normalises the form in place.
//
// It rejects negative batch/tolerance/timeout values and timeouts longer than
// five days, defaults a zero timeout to 30, converts full-width commas
// (U+FF0C) in Pause and Args to ASCII commas, strips spaces from Pause,
// converts CRLF in Script to LF, and rejects an empty Title or Script, any
// Title/Args/Pause that str.Dangerous flags, an empty host list, and an
// Action other than "start" or "pause".
func (f *TaskForm) Verify() error {
	if f.Batch < 0 {
		return fmt.Errorf("arg(batch) should be nonnegative")
	}
	if f.Tolerance < 0 {
		return fmt.Errorf("arg(tolerance) should be nonnegative")
	}
	if f.Timeout < 0 {
		return fmt.Errorf("arg(timeout) should be nonnegative")
	}
	if f.Timeout > 3600*24*5 {
		return fmt.Errorf("arg(timeout) longer than five days")
	}
	if f.Timeout == 0 {
		f.Timeout = 30
	}

	// Replacing "," with "," would be a no-op; the comma being normalised is
	// the full-width one, written as an escape so it survives re-encoding.
	const fullWidthComma = "\uff0c"
	f.Pause = strings.ReplaceAll(f.Pause, fullWidthComma, ",")
	f.Pause = strings.ReplaceAll(f.Pause, " ", "")
	f.Args = strings.ReplaceAll(f.Args, fullWidthComma, ",")

	if f.Title == "" {
		return fmt.Errorf("arg(title) is required")
	}
	if str.Dangerous(f.Title) {
		return fmt.Errorf("arg(title) is dangerous")
	}
	if f.Script == "" {
		return fmt.Errorf("arg(script) is required")
	}
	f.Script = strings.ReplaceAll(f.Script, "\r\n", "\n")

	if str.Dangerous(f.Args) {
		return fmt.Errorf("arg(args) is dangerous")
	}
	if str.Dangerous(f.Pause) {
		return fmt.Errorf("arg(pause) is dangerous")
	}
	if len(f.Hosts) == 0 {
		return fmt.Errorf("arg(hosts) empty")
	}
	if f.Action != "start" && f.Action != "pause" {
		return fmt.Errorf("arg(action) invalid")
	}
	return nil
}
// HandleFH makes the title end with " FH: <fh>". If the title already holds a
// " FH: " marker at an index greater than zero, everything from the first
// marker onwards is dropped before the new suffix is appended.
func (f *TaskForm) HandleFH(fh string) {
	const marker = " FH: "

	base := f.Title
	if pos := strings.Index(base, marker); pos > 0 {
		base = base[:pos]
	}
	f.Title = base + marker + fh
}
================================================
FILE: models/ts.go
================================================
package models
import (
"bytes"
"fmt"
"strconv"
"github.com/prometheus/common/model"
)
// DataResp is one series of a query response.
type DataResp struct {
Ref string `json:"ref"`
Metric model.Metric `json:"metric"`
// Labels is excluded from JSON.
Labels string `json:"-"`
// Values holds the data points; Last treats an entry as valid only when it has exactly two numbers.
// NOTE(review): presumably each entry is [timestamp, value] — confirm with the producers.
Values [][]float64 `json:"values"`
Query string `json:"query"`
}
// String renders the response on a single line: Ref, Metric, Labels, every
// entry of Values in brackets, then Query.
//
// Numbers are written with strconv.FormatFloat in 'f' format at shortest
// precision, so whole numbers print without a decimal point while fractional
// values keep their value instead of being truncated by an int64 conversion
// (which is also implementation-defined for NaN and out-of-range floats).
func (d *DataResp) String() string {
	var buf bytes.Buffer

	fmt.Fprintf(&buf, "Ref: %s ", d.Ref)
	fmt.Fprintf(&buf, "Metric: %+v ", d.Metric)
	fmt.Fprintf(&buf, "Labels: %s ", d.Labels)

	buf.WriteString("Values: ")
	for _, point := range d.Values {
		buf.WriteString(" [")
		for i, num := range point {
			if i > 0 {
				buf.WriteString(", ")
			}
			buf.WriteString(strconv.FormatFloat(num, 'f', -1, 64))
		}
		buf.WriteString("] ")
	}

	fmt.Fprintf(&buf, "Query: %s ", d.Query)
	return buf.String()
}
// Last returns the two numbers of the final entry of Values and true. It
// returns (0, 0, false) when Values is empty or the final entry does not hold
// exactly two numbers.
func (d *DataResp) Last() (float64, float64, bool) {
	n := len(d.Values)
	if n == 0 {
		return 0, 0, false
	}

	if tail := d.Values[n-1]; len(tail) == 2 {
		return tail[0], tail[1], true
	}
	return 0, 0, false
}
// MetricName returns the value of the "__name__" entry of Metric as a string
// (empty when the entry is missing).
func (d *DataResp) MetricName() string {
	return string(d.Metric["__name__"])
}
// LabelsString converts the labels to a string by returning Metric.String().
func (d *DataResp) LabelsString() string {
	return d.Metric.String()
}
// RelationKey names a left key, a right key and an operator.
// NOTE(review): presumably describes how two query results are related — confirm with callers.
type RelationKey struct {
LeftKey string `json:"left_key"`
RightKey string `json:"right_key"`
OP string `json:"op"`
}
// QueryParam carries a datasource category, a datasource id and a list of
// queries. Note that Queries is serialised under the JSON key "query".
type QueryParam struct {
Cate string `json:"cate"`
DatasourceId int64 `json:"datasource_id"`
Queries []interface{} `json:"query"`
}
// Series pairs a store of DataResp values keyed by uint64 with an index that
// maps a string to a set of those uint64 keys.
// NOTE(review): the meaning of the uint64 key and the index string is not visible here — confirm.
type Series struct {
SeriesStore map[uint64]DataResp `json:"store"`
SeriesIndex map[string]map[uint64]struct{} `json:"index"`
}
================================================
FILE: models/user.go
================================================
package models
import (
"encoding/json"
"fmt"
"os"
"reflect"
"regexp"
"strconv"
"strings"
"time"
"unicode"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/ormx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/secu"
"github.com/ccfos/nightingale/v6/storage"
"github.com/redis/go-redis/v9"
"github.com/pkg/errors"
"github.com/tidwall/gjson"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/slice"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// String identifiers grouped as: channel names (Dingtalk ... JSMAlert),
// contact keys (DingtalkKey ... PagerDutyKey) and domain names
// (DingtalkDomain ... DefaultDomain).
const (
Dingtalk = "dingtalk"
Wecom = "wecom"
Feishu = "feishu"
FeishuCard = "feishucard"
Discord = "discord"
MattermostWebhook = "mattermostwebhook"
MattermostBot = "mattermostbot"
SlackWebhook = "slackwebhook"
SlackBot = "slackbot"
Mm = "mm"
Telegram = "telegram"
Email = "email"
EmailSubject = "mailsubject"
Lark = "lark"
LarkCard = "larkcard"
Phone = "phone"
Jira = "jira"
JSMAlert = "jsm_alert"
DingtalkKey = "dingtalk_robot_token"
WecomKey = "wecom_robot_token"
FeishuKey = "feishu_robot_token"
MmKey = "mm_webhook_url"
TelegramKey = "telegram_robot_token"
LarkKey = "lark_robot_token"
PagerDutyKey = "pagerduty_key"
DingtalkDomain = "oapi.dingtalk.com"
WecomDomain = "qyapi.weixin.qq.com"
FeishuDomain = "open.feishu.cn"
LarkDomain = "open.larksuite.com"
// FeishuCardDomain uses the same domain name as Feishu; the two are
// distinguished by the "card=1" query parameter.
FeishuCardDomain = "open.feishu.cn?card=1"
LarkCardDomain = "open.larksuite.com?card=1"
TelegramDomain = "api.telegram.org"
IbexDomain = "ibex"
DefaultDomain = "default"
)
// DefaultChannels lists the default channel names and DefaultContacts the
// default contact keys.
var (
DefaultChannels = []string{Dingtalk, Wecom, Feishu, Mm, Telegram, Email, FeishuCard, Lark, LarkCard}
DefaultContacts = []string{DingtalkKey, WecomKey, FeishuKey, MmKey, TelegramKey, LarkKey}
)
// User is one row of the users table plus a few derived, non-persisted fields
// (those tagged gorm:"-").
type User struct {
Id int64 `json:"id" gorm:"primaryKey"`
Username string `json:"username"`
Nickname string `json:"nickname"`
Password string `json:"-"`
Phone string `json:"phone"`
Email string `json:"email"`
Portrait string `json:"portrait"`
Roles string `json:"-"` // stored in the database
RolesLst []string `json:"roles" gorm:"-"` // exchanged with the frontend
TeamsLst []int64 `json:"-" gorm:"-"` // convenience field for mapping teams; used by neither the frontend nor the database
Contacts ormx.JSONObj `json:"contacts"` // content is a map[string]string structure
Maintainer int `json:"maintainer"` // whether to send messages to the administrator: 0 = do not send, 1 = send
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
Belong string `json:"belong"`
Admin bool `json:"admin" gorm:"-"` // convenience for the frontend
UserGroupsRes []*UserGroupRes `json:"user_groups" gorm:"-"`
BusiGroupsRes []*BusiGroupRes `json:"busi_groups" gorm:"-"`
LastActiveTime int64 `json:"last_active_time"`
}
// UserGroupRes is the id/name pair used for User.UserGroupsRes.
type UserGroupRes struct {
Id int64 `json:"id"`
Name string `json:"name"`
}
// BusiGroupRes is the id/name pair used for User.BusiGroupsRes.
type BusiGroupRes struct {
Id int64 `json:"id"`
Name string `json:"name"`
}
// TableName returns the database table name, "users".
func (u *User) TableName() string {
return "users"
}
// String returns a one-line description of the user: id, username, nickname,
// email, phone and the JSON-encoded contacts. If the contacts cannot be
// marshalled, the marshalling error text is returned instead.
func (u *User) String() string {
	bs, err := u.Contacts.MarshalJSON()
	if err != nil {
		return err.Error()
	}

	// The format string must carry one verb per argument; an empty format
	// produces only "%!(EXTRA ...)" noise and fails vet's printf check.
	return fmt.Sprintf("<id:%d username:%s nickname:%s email:%s phone:%s contacts:%s>",
		u.Id, u.Username, u.Nickname, u.Email, u.Phone, string(bs))
}
// IsAdmin reports whether RolesLst contains AdminRole.
func (u *User) IsAdmin() bool {
	for _, role := range u.RolesLst {
		if role == AdminRole {
			return true
		}
	}
	return false
}
// CheckGroupPermission returns nil when the user is an admin or when at least
// one of groupIds appears in the map returned by MyGroupIdsMap for this user;
// otherwise it returns a "forbidden" error.
func (u *User) CheckGroupPermission(ctx *ctx.Context, groupIds []int64) error {
	if u.IsAdmin() {
		return nil
	}

	mine, err := MyGroupIdsMap(ctx, u.Id)
	if err != nil {
		return err
	}

	for _, gid := range groupIds {
		if _, member := mine[gid]; member {
			return nil
		}
	}
	return errors.New("forbidden")
}
// stripInvisibleChars drops every rune that is neither printable (per
// unicode.IsPrint) nor one of space, tab, newline or carriage return. This
// removes zero-width spaces, control characters and similar invisible runes.
func stripInvisibleChars(s string) string {
	keepVisible := func(r rune) rune {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Common whitespace is always kept.
			return r
		}
		if unicode.IsPrint(r) {
			return r
		}
		// A negative result tells strings.Map to drop the rune.
		return -1
	}

	return strings.Map(keepVisible, s)
}
// stripInvisibleCharsFromContacts applies stripInvisibleChars to every value
// of the contacts JSON object. The input is returned unchanged when it is
// empty, is not a JSON object of strings, or cannot be re-encoded.
func stripInvisibleCharsFromContacts(contacts ormx.JSONObj) ormx.JSONObj {
	if len(contacts) == 0 {
		return contacts
	}

	var fields map[string]string
	if json.Unmarshal(contacts, &fields) != nil {
		return contacts
	}

	for key := range fields {
		fields[key] = stripInvisibleChars(fields[key])
	}

	cleaned, err := json.Marshal(fields)
	if err != nil {
		return contacts
	}
	return ormx.JSONObj(cleaned)
}
// Verify trims the username, validates username, nickname, phone and email,
// strips invisible characters from the contact values and, when a phone
// number is set, returns the result of EncryptPhone.
func (u *User) Verify() error {
	u.Username = strings.TrimSpace(u.Username)

	switch {
	case u.Username == "":
		return errors.New("Username is blank")
	case str.Dangerous(u.Username):
		return errors.New("Username has invalid characters")
	case str.Dangerous(u.Nickname):
		return errors.New("Nickname has invalid characters")
	case u.Phone != "" && !str.IsPhone(u.Phone):
		return errors.New("Phone invalid")
	case u.Email != "" && !str.IsMail(u.Email):
		return errors.New("Email invalid")
	}

	// Strip invisible characters from Contacts values.
	u.Contacts = stripInvisibleCharsFromContacts(u.Contacts)

	if u.Phone == "" {
		return nil
	}
	return u.EncryptPhone()
}
// UpdateSsoFields copies the non-empty nickname, phone and email onto the
// user, stamps UpdateAt with the current time, sets UpdateBy and Belong to
// sso, and returns the column names "nickname", "phone", "email",
// "update_by" and "belong".
func (u *User) UpdateSsoFields(sso string, nickname, phone, email string) []interface{} {
	u.UpdateAt = time.Now().Unix()
	u.UpdateBy, u.Belong = sso, sso

	if len(nickname) > 0 {
		u.Nickname = nickname
	}
	if len(phone) > 0 {
		u.Phone = phone
	}
	if len(email) > 0 {
		u.Email = email
	}

	return []interface{}{"nickname", "phone", "email", "update_by", "belong"}
}
// UpdateSsoFieldsWithRoles behaves like UpdateSsoFields and, when roles is
// non-empty, also replaces the user's roles and appends "roles" to the
// returned column names.
func (u *User) UpdateSsoFieldsWithRoles(sso string, nickname, phone, email string, roles []string) []interface{} {
	fields := u.UpdateSsoFields(sso, nickname, phone, email)

	if len(roles) > 0 {
		u.Roles = strings.Join(roles, " ")
		u.RolesLst = roles
		fields = append(fields, "roles")
	}
	return fields
}
// FullSsoFields fills every field of a user created through SSO: the identity
// fields from the arguments, the password placeholder "******", an empty
// portrait, the default roles, empty ("{}") contacts, and sso as creator,
// updater and owner, with both timestamps set to the current time.
func (u *User) FullSsoFields(sso, username, nickname, phone, email string, defaultRoles []string) {
	ts := time.Now().Unix()

	// Identity.
	u.Username, u.Nickname = username, nickname
	u.Phone, u.Email = phone, email
	u.Password = "******"
	u.Portrait = ""
	u.Contacts = []byte("{}")

	// Roles.
	u.RolesLst = defaultRoles
	u.Roles = strings.Join(defaultRoles, " ")

	// Bookkeeping.
	u.CreateAt, u.UpdateAt = ts, ts
	u.CreateBy, u.UpdateBy, u.Belong = sso, sso, sso
}
// FullSsoFieldsWithTeams behaves like FullSsoFields and additionally records
// the team ids in TeamsLst.
func (u *User) FullSsoFieldsWithTeams(sso, username, nickname, phone, email string, defaultRoles []string,
	teams []int64) {
	// FullSsoFields never touches TeamsLst, so the order does not matter.
	u.TeamsLst = teams
	u.FullSsoFields(sso, username, nickname, phone, email, defaultRoles)
}
// Add inserts the user after checking that the username is not taken, setting
// CreateAt and UpdateAt to the current time.
func (u *User) Add(ctx *ctx.Context) error {
	existing, err := UserGetByUsername(ctx, u.Username)
	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to query user")
	case existing != nil:
		return errors.New("Username already exists")
	}

	ts := time.Now().Unix()
	u.CreateAt, u.UpdateAt = ts, ts

	return Insert(ctx, u)
}
// Update persists the selected fields of the user. Users whose Belong is empty
// are run through Verify first; users with a non-empty Belong skip
// verification.
func (u *User) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	needsVerify := u.Belong == ""
	if needsVerify {
		if err := u.Verify(); err != nil {
			return err
		}
	}

	selected := DB(ctx).Model(u).Select(selectField, selectFields...)
	return selected.Updates(u).Error
}
// UpdateAllFields verifies the user, stamps UpdateAt with the current time and
// writes every column.
func (u *User) UpdateAllFields(ctx *ctx.Context) error {
	err := u.Verify()
	if err != nil {
		return err
	}

	u.UpdateAt = time.Now().Unix()

	allColumns := DB(ctx).Model(u).Select("*")
	return allColumns.Updates(u).Error
}
// UpdatePassword stores password as given (callers in this file pass the
// output of CryptoPass) and records who changed it and when.
func (u *User) UpdatePassword(ctx *ctx.Context, password, updateBy string) error {
	changes := map[string]interface{}{
		"password":  password,
		"update_at": time.Now().Unix(),
		"update_by": updateBy,
	}

	return DB(ctx).Model(u).Updates(changes).Error
}
// AddToUserGroups adds the user to each of the given user groups in order,
// stopping at the first error.
func (u *User) AddToUserGroups(ctx *ctx.Context, userGroupIds []int64) error {
	for _, groupId := range userGroupIds {
		if err := UserGroupMemberAdd(ctx, groupId, u.Id); err != nil {
			return err
		}
	}
	return nil
}
// UpdateUserLastActiveTime sets last_active_time for the given user id and
// bumps update_at to the current time.
func UpdateUserLastActiveTime(ctx *ctx.Context, userId int64, lastActiveTime int64) error {
	changes := map[string]interface{}{
		"last_active_time": lastActiveTime,
		"update_at":        time.Now().Unix(),
	}

	return DB(ctx).Model(&User{}).Where("id = ?", userId).Updates(changes).Error
}
// Del removes, in one transaction, the user's group memberships, the user row
// itself and the user's tokens (matched by username).
func (u *User) Del(ctx *ctx.Context) error {
	remove := func(tx *gorm.DB) error {
		if res := tx.Where("user_id=?", u.Id).Delete(&UserGroupMember{}); res.Error != nil {
			return res.Error
		}
		if res := tx.Where("id=?", u.Id).Delete(&User{}); res.Error != nil {
			return res.Error
		}
		return tx.Where("username=?", u.Username).Delete(&UserToken{}).Error
	}

	return DB(ctx).Transaction(remove)
}
// ChangePassword replaces the user's password after checking that oldpass,
// once passed through CryptoPass, matches the stored password.
func (u *User) ChangePassword(ctx *ctx.Context, oldpass, newpass string) error {
	// SSO users (ldap/oidc/cas/oauth2/dingtalk, etc.) that have no local
	// password set cannot change their password locally.
	ssoWithoutLocalPassword := u.Belong != "" && u.Password == "******"
	if ssoWithoutLocalPassword {
		return fmt.Errorf("SSO user(%s) cannot change password locally, please change password in %s", u.Username, u.Belong)
	}

	oldHash, err := CryptoPass(ctx, oldpass)
	if err != nil {
		return err
	}
	newHash, err := CryptoPass(ctx, newpass)
	if err != nil {
		return err
	}

	if oldHash != u.Password {
		return errors.New("Incorrect old password")
	}
	return u.UpdatePassword(ctx, newHash, u.Username)
}
// UserGet returns the first user matching the where clause with RolesLst and
// Admin filled in and DecryptPhone applied, or (nil, nil) when nothing
// matches.
func UserGet(ctx *ctx.Context, where string, args ...interface{}) (*User, error) {
	var users []*User
	if err := DB(ctx).Where(where, args...).Find(&users).Error; err != nil {
		return nil, err
	}
	if len(users) == 0 {
		return nil, nil
	}

	first := users[0]
	first.RolesLst = strings.Fields(first.Roles)
	first.Admin = first.IsAdmin()
	first.DecryptPhone() // decrypt the phone number
	return first, nil
}
// UsersGet returns every user matching the where clause, each with RolesLst
// and Admin filled in and DecryptPhone applied.
func UsersGet(ctx *ctx.Context, where string, args ...interface{}) ([]*User, error) {
	var users []*User
	if err := DB(ctx).Where(where, args...).Find(&users).Error; err != nil {
		return nil, err
	}

	for i := range users {
		u := users[i]
		u.RolesLst = strings.Fields(u.Roles)
		u.Admin = u.IsAdmin()
		u.DecryptPhone() // decrypt the phone number
	}
	return users, nil
}
// UserMapGet returns the users matching the where clause keyed by username.
// A query error is logged and nil is returned.
func UserMapGet(ctx *ctx.Context, where string, args ...interface{}) map[string]*User {
	users, err := UsersGet(ctx, where, args...)
	if err != nil {
		logger.Errorf("UsersGet err: %v", err)
		return nil
	}

	byName := make(map[string]*User, len(users))
	for i := range users {
		byName[users[i].Username] = users[i]
	}
	return byName
}
// UserNicknameMap returns a username -> nickname map for the given names.
// Empty and repeated names are ignored; names with no matching user are
// absent from the result. The result is never nil.
func UserNicknameMap(ctx *ctx.Context, names []string) map[string]string {
	result := make(map[string]string)

	// Collect the distinct, non-empty names in first-seen order.
	visited := make(map[string]struct{}, len(names))
	distinct := make([]string, 0, len(names))
	for _, name := range names {
		if name == "" {
			continue
		}
		if _, dup := visited[name]; dup {
			continue
		}
		visited[name] = struct{}{}
		distinct = append(distinct, name)
	}
	if len(distinct) == 0 {
		return result
	}

	for username, user := range UserMapGet(ctx, "username in (?)", distinct) {
		result[username] = user.Nickname
	}
	return result
}
// FillUpdateByNicknames fills the UpdateByNickname field of each element in
// items by looking up the nickname of the user named in its UpdateBy field.
// It supports both []T and []*T slices; nil pointers are skipped.
//
// The call is a no-op when items is empty, when the element type is not a
// struct (or pointer to struct), or when it lacks string fields named
// UpdateBy and UpdateByNickname. Without these checks reflect would panic:
// Type.FieldByName on a non-struct type, Value.SetString on a non-string
// field.
func FillUpdateByNicknames[T any](ctx *ctx.Context, items []T) {
	if len(items) == 0 {
		return
	}

	elemType := reflect.TypeOf(items).Elem()
	isPtr := elemType.Kind() == reflect.Ptr
	if isPtr {
		elemType = elemType.Elem()
	}
	if elemType.Kind() != reflect.Struct {
		return
	}

	updateByField, ok1 := elemType.FieldByName("UpdateBy")
	nicknameField, ok2 := elemType.FieldByName("UpdateByNickname")
	if !ok1 || !ok2 {
		return
	}
	if updateByField.Type.Kind() != reflect.String || nicknameField.Type.Kind() != reflect.String {
		return
	}

	// structAt returns the addressable struct value behind items[i]; ok is
	// false for a nil pointer element.
	structAt := func(i int) (v reflect.Value, ok bool) {
		v = reflect.ValueOf(&items[i]).Elem()
		if isPtr {
			if v.IsNil() {
				return reflect.Value{}, false
			}
			v = v.Elem()
		}
		return v, true
	}

	names := make([]string, 0, len(items))
	for i := range items {
		if v, ok := structAt(i); ok {
			names = append(names, v.FieldByIndex(updateByField.Index).String())
		}
	}

	nicknames := UserNicknameMap(ctx, names)

	for i := range items {
		v, ok := structAt(i)
		if !ok {
			continue
		}
		updateBy := v.FieldByIndex(updateByField.Index).String()
		v.FieldByIndex(nicknameField.Index).SetString(nicknames[updateBy])
	}
}
// UserGetByUsername returns the user with the given username via UserGet, or
// (nil, nil) when no such user exists.
func UserGetByUsername(ctx *ctx.Context, username string) (*User, error) {
return UserGet(ctx, "username=?", username)
}
// UserGetById returns the user with the given id via UserGet, or (nil, nil)
// when no such user exists.
func UserGetById(ctx *ctx.Context, id int64) (*User, error) {
return UserGet(ctx, "id=?", id)
}
// CountAdminUsers counts the users whose roles column contains AdminRole as a
// substring.
func CountAdminUsers(ctx *ctx.Context) (int64, error) {
	var total int64

	pattern := "%" + AdminRole + "%"
	err := DB(ctx).Model(&User{}).Where("roles LIKE ?", pattern).Count(&total).Error

	return total, err
}
// UsersGetByGroupIds returns the users that are members of any of the given
// groups. It returns (nil, nil) when groupIds is empty.
func UsersGetByGroupIds(ctx *ctx.Context, groupIds []int64) ([]User, error) {
	if len(groupIds) == 0 {
		return nil, nil
	}

	memberIds, err := GroupsMemberIds(ctx, groupIds)
	if err != nil {
		return nil, err
	}

	members, err := UserGetsByIds(ctx, memberIds)
	if err != nil {
		return nil, err
	}
	return members, nil
}
// InitRoot replaces the root user's stored password with its CryptoPass form
// the first time it runs. It returns true when the password was rewritten and
// false when there is no root user or the stored password is already longer
// than 31 characters. Any database or crypto failure prints a message and
// exits the process with status 1.
func InitRoot(ctx *ctx.Context) bool {
	// die prints the message with the error and terminates the process.
	die := func(msg string, err error) {
		fmt.Println(msg, err)
		os.Exit(1)
	}

	root, err := UserGetByUsername(ctx, "root")
	if err != nil {
		die("failed to query user root:", err)
	}
	if root == nil {
		return false
	}
	if len(root.Password) > 31 {
		// already done before
		return false
	}

	// Count the users.
	total, err := Count(DB(ctx).Model(&User{}))
	if err != nil {
		die("failed to count user:", err)
	}
	if total == 1 {
		// The database holds only the root user and its password is not yet
		// encrypted, so the salt has to be initialised.
		InitSalt(ctx)
	}

	hashed, err := CryptoPass(ctx, root.Password)
	if err != nil {
		die("failed to crypto pass:", err)
	}

	if err = DB(ctx).Model(root).Update("password", hashed).Error; err != nil {
		die("failed to update root password:", err)
	}

	fmt.Println("root password init done")
	return true
}
// reachLoginFailCount reports whether the failure counter stored in Redis for
// username is greater than or equal to count. A missing key counts as "not
// reached"; any other Redis error or a non-integer value is returned as an
// error.
func reachLoginFailCount(ctx *ctx.Context, redisObj storage.Redis, username string, count int64) (bool, error) {
	raw, err := redisObj.Get(ctx.GetContext(), "/userlogin/errorcount/"+username).Result()
	switch {
	case err == redis.Nil:
		// No counter recorded for this user.
		return false, nil
	case err != nil:
		return false, err
	}

	failures, err := strconv.ParseInt(raw, 10, 64)
	if err != nil {
		return false, err
	}
	return failures >= count, nil
}
// incrLoginFailCount bumps the failure counter stored in Redis for username
// and (re)applies an expiry of the given number of seconds. When the key is
// missing, cannot be read, or does not hold an integer, the counter is written
// as "1"; the latter two cases are logged as warnings.
func incrLoginFailCount(ctx *ctx.Context, redisObj storage.Redis, username string, seconds int64) {
	key := "/userlogin/errorcount/" + username
	ttl := time.Duration(seconds) * time.Second

	// Value to store; stays at "1" unless a valid previous count is found.
	next := "1"

	current, err := redisObj.Get(ctx.GetContext(), key).Result()
	switch {
	case err == redis.Nil:
		// First failure inside the window: start from 1.
	case err != nil:
		logx.Warningf(ctx.Ctx, "login_fail_count: failed to get redis value. key:%s, error:%s", key, err)
	default:
		n, perr := strconv.ParseInt(current, 10, 64)
		if perr != nil {
			logx.Warningf(ctx.Ctx, "login_fail_count: failed to parse int64. key:%s, error:%s", key, perr)
		} else {
			next = fmt.Sprintf("%d", n+1)
		}
	}

	redisObj.Set(ctx.GetContext(), key, next, ttl)
}
// PassLogin authenticates username with the plain-text password pass.
//
// When the "login_fail_count" config is set to "<seconds> <count>" (for
// example "300 5"), failed attempts are counted in Redis and further attempts
// are rejected once the counter reaches <count>. A malformed config value is
// logged and the limit is skipped.
//
// It returns the user on success; otherwise a nil user and an error.
func PassLogin(ctx *ctx.Context, redis storage.Redis, username, pass string) (*User, error) {
	val, err := ConfigsGet(ctx, "login_fail_count")
	if err != nil {
		return nil, err
	}

	var seconds, count int64
	// A non-empty value in the DB means the feature is switched on.
	needCheck := val != ""
	if needCheck {
		if pair := strings.Fields(val); len(pair) == 2 {
			if seconds, err = strconv.ParseInt(pair[0], 10, 64); err != nil {
				logx.Warningf(ctx.Ctx, "login_fail_count seconds invalid: %s", pair[0])
				needCheck = false
			}
			if count, err = strconv.ParseInt(pair[1], 10, 64); err != nil {
				logx.Warningf(ctx.Ctx, "login_fail_count count invalid: %s", pair[1])
				needCheck = false
			}
		} else {
			logx.Warningf(ctx.Ctx, "login_fail_count config invalid: %s", val)
			needCheck = false
		}
	}

	if needCheck {
		reach, err := reachLoginFailCount(ctx, redis, username, count)
		if err != nil {
			return nil, err
		}
		if reach {
			return nil, fmt.Errorf("reach login fail count")
		}
	}

	// reject records the failed attempt (when limiting is active) and builds
	// the error shared by the unknown-user and wrong-password cases.
	reject := func() (*User, error) {
		if needCheck {
			incrLoginFailCount(ctx, redis, username, seconds)
		}
		return nil, fmt.Errorf("Username or password invalid")
	}

	user, err := UserGetByUsername(ctx, username)
	if err != nil {
		return nil, err
	}
	if user == nil {
		return reject()
	}

	loginPass, err := CryptoPass(ctx, pass)
	if err != nil {
		return nil, err
	}
	if loginPass != user.Password {
		return reject()
	}

	return user, nil
}
// UserTotal counts users. When both stime and etime are non-zero the count is
// restricted to last_active_time within [stime, etime]; a non-empty query adds
// a substring match against username, nickname, phone or email.
func UserTotal(ctx *ctx.Context, query string, stime, etime int64) (num int64, err error) {
	session := DB(ctx).Model(&User{})

	if stime != 0 && etime != 0 {
		session = session.Where("last_active_time between ? and ?", stime, etime)
	}

	if query != "" {
		like := "%" + query + "%"
		session = session.Where("username like ? or nickname like ? or phone like ? or email like ?", like, like, like, like)
	}

	if num, err = Count(session); err != nil {
		return num, errors.WithMessage(err, "failed to count user")
	}
	return num, nil
}
var (
// Regular expressions compiled once at package level so they are not
// recompiled on every call.
// whitespaceRegex matches one or more whitespace characters.
whitespaceRegex = regexp.MustCompile(`\s+`)
// validOrderRegex matches an identifier, optionally followed by a dot and a
// second identifier (e.g. "column" or "table.column").
validOrderRegex = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)?$`)
)
// validateOrderField sanitises a caller-supplied ORDER BY column. It returns
// order with all whitespace removed when the result looks like a plain
// "column" or "table.column" identifier, and defaultField in every other case
// (empty, longer than 64 bytes, containing suspicious characters, or not
// matching validOrderRegex). Rejected non-empty values are logged.
func validateOrderField(order string, defaultField string) string {
	if order == "" {
		return defaultField
	}

	// The length limit applies to the raw input, before whitespace removal.
	if n := len(order); n > 64 {
		logger.Warningf("SQL injection attempt detected: order field too long (%d chars)", n)
		return defaultField
	}

	// Strip every whitespace character.
	order = whitespaceRegex.ReplaceAllString(order, "")
	if order == "" {
		return defaultField
	}

	// Reject SQL punctuation and anything that contains "0x" or "0b".
	lowered := strings.ToLower(order)
	hasDangerousChar := strings.ContainsAny(order, "();,'\"` --/*\\=+-*/><|&^~")
	hasNumericPrefix := strings.Contains(lowered, "0x") || strings.Contains(lowered, "0b")
	if hasDangerousChar || hasNumericPrefix {
		logger.Warningf("SQL injection attempt detected: contains dangerous characters")
		return defaultField
	}

	// Finally require the identifier shape: a name that starts with a letter
	// or underscore, optionally qualified by a table name.
	if validOrderRegex.MatchString(order) {
		return order
	}
	logger.Warningf("SQL injection attempt detected: invalid order field format")
	return defaultField
}
// UserGets returns one page of users together with their user groups and
// business groups.
//
// Filters: last_active_time within [stime, etime] when both are non-zero;
// exact matches on usernames, phones and emails when those slices are
// non-empty; a substring match of query against username, nickname, phone or
// email. order is validated with validateOrderField (default "username") and
// desc selects the direction. The Password field is blanked and the phone
// number is passed through DecryptPhone for every returned user.
func UserGets(ctx *ctx.Context, query string, limit, offset int, stime, etime int64,
	order string, desc bool, usernames, phones, emails []string) ([]User, error) {
	db := DB(ctx)
	if stime != 0 && etime != 0 {
		db = db.Where("last_active_time between ? and ?", stime, etime)
	}

	// The sort column comes from the caller, so validate it before use.
	direction := " asc"
	if desc {
		direction = " desc"
	}
	db = db.Order(validateOrderField(order, "username") + direction)

	if len(usernames) > 0 {
		db = db.Where("username in (?)", usernames)
	}
	if len(phones) > 0 {
		db = db.Where("phone in (?)", phones)
	}
	if len(emails) > 0 {
		db = db.Where("email in (?)", emails)
	}
	if query != "" {
		like := "%" + query + "%"
		db = db.Where("username like ? or nickname like ? or phone like ? or email like ?", like, like, like, like)
	}

	var users []User
	if err := db.Limit(limit).Offset(offset).Find(&users).Error; err != nil {
		return users, errors.WithMessage(err, "failed to query user")
	}

	for i := range users {
		u := &users[i]
		u.RolesLst = strings.Fields(u.Roles)
		u.Admin = u.IsAdmin()
		u.Password = ""
		u.DecryptPhone() // decrypt the phone number

		// User groups this user belongs to.
		teamIDs, err := MyGroupIds(ctx, u.Id)
		if err != nil {
			return users, errors.WithMessage(err, "failed to query group_ids")
		}
		err = DB(ctx).Table("user_group").Where("id IN (?)", teamIDs).Find(&u.UserGroupsRes).Error
		if err != nil {
			return users, errors.WithMessage(err, "failed to query user_groups")
		}

		// Business groups reachable through those user groups.
		busiIDs, err := BusiGroupIds(ctx, teamIDs)
		if err != nil {
			return users, errors.WithMessage(err, "failed to query busi_group_id")
		}
		err = DB(ctx).Table("busi_group").Where("id IN (?)", busiIDs).Find(&u.BusiGroupsRes).Error
		if err != nil {
			return users, errors.WithMessage(err, "failed to query busi_groups")
		}
	}

	return users, nil
}
// UserGetAll returns every user. When ctx.IsCenter is false the list is
// fetched from "/v1/n9e/users" via poster; otherwise it is read from the
// database and RolesLst, Admin and the decrypted phone are filled in.
func UserGetAll(ctx *ctx.Context) ([]*User, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[[]*User](ctx, "/v1/n9e/users")
	}

	var all []*User
	if err := DB(ctx).Find(&all).Error; err != nil {
		return all, err
	}

	for _, u := range all {
		u.RolesLst = strings.Fields(u.Roles)
		u.Admin = u.IsAdmin()
		u.DecryptPhone() // decrypt the phone number
	}
	return all, nil
}
// UserGetsByIds loads the users with the given ids, ordered by username, and
// fills in RolesLst, Admin and the decrypted phone. An empty ids yields an
// empty, non-nil slice without querying the database.
func UserGetsByIds(ctx *ctx.Context, ids []int64) ([]User, error) {
	if len(ids) == 0 {
		return []User{}, nil
	}

	var found []User
	if err := DB(ctx).Where("id in ?", ids).Order("username").Find(&found).Error; err != nil {
		return found, err
	}

	for i := range found {
		u := &found[i]
		u.RolesLst = strings.Fields(u.Roles)
		u.Admin = u.IsAdmin()
		u.DecryptPhone() // decrypt the phone number
	}
	return found, nil
}
// UserGetsBySso returns the users whose belong column equals sso, keyed by
// username. The map values point into the slice loaded from the database.
func UserGetsBySso(ctx *ctx.Context, sso string) (map[string]*User, error) {
	var users []User
	if err := DB(ctx).Where("belong=?", sso).Order("username").Find(&users).Error; err != nil {
		return nil, errors.WithMessage(err, "failed to query user")
	}

	byName := make(map[string]*User, len(users))
	for i := range users {
		byName[users[i].Username] = &users[i]
	}
	return byName, nil
}
// UserDelByIds deletes the given users and their user-group memberships in one
// database transaction.
func UserDelByIds(ctx *ctx.Context, userIds []int64) error {
	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		// Memberships go first, then the user rows themselves.
		if err := tx.Where("user_id in ?", userIds).Delete(&UserGroupMember{}).Error; err != nil {
			return err
		}
		return tx.Where("id in ?", userIds).Delete(&User{}).Error
	})
}
// CanModifyUserGroup reports whether u may modify ug: admins, the creator of
// the group and members of the group are all allowed.
func (u *User) CanModifyUserGroup(ctx *ctx.Context, ug *UserGroup) (bool, error) {
	// Admins and the group's creator need no membership lookup.
	if u.IsAdmin() || ug.CreateBy == u.Username {
		return true, nil
	}

	// Otherwise being a member is enough.
	memberships, err := UserGroupMemberCount(ctx, "user_id=? and group_id=?", u.Id, ug.Id)
	if err != nil {
		return false, err
	}
	return memberships > 0, nil
}
// CanDoBusiGroup reports whether u may act on business group bg. Admins always
// may; other users need to be a member of at least one of the user groups
// returned by UserGroupIdsOfBusiGroup for bg and the optional permFlag.
func (u *User) CanDoBusiGroup(ctx *ctx.Context, bg *BusiGroup, permFlag ...string) (bool, error) {
	if u.IsAdmin() {
		return true, nil
	}

	teamIds, err := UserGroupIdsOfBusiGroup(ctx, bg.Id, permFlag...)
	switch {
	case err != nil:
		return false, err
	case len(teamIds) == 0:
		return false, nil
	}

	memberships, err := UserGroupMemberCount(ctx, "user_id = ? and group_id in ?", u.Id, teamIds)
	return memberships > 0, err
}
// CheckPerm reports whether u may perform operation. Admins always may; for
// everyone else the decision is delegated to RoleHasOperation with the user's
// role list.
func (u *User) CheckPerm(ctx *ctx.Context, operation string) (bool, error) {
	if !u.IsAdmin() {
		return RoleHasOperation(ctx, u.RolesLst, operation)
	}
	return true, nil
}
// UserStatistics returns the row count and the maximum update_at of the user
// table. When ctx.IsCenter is false the value is fetched from
// "/v1/n9e/statistic?name=user" via poster instead.
func UserStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=user")
	}

	var rows []*Statistics
	err := DB(ctx).Model(&User{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Find(&rows).Error
	if err != nil {
		return nil, err
	}
	return rows[0], nil
}
// NopriIdents returns the subset of idents that u has no "rw" permission on.
// Admins get an empty list. A user without user groups, or whose user groups
// map to no business group with the "rw" flag, gets idents back unchanged.
// Otherwise the idents of targets bound to those business groups are removed
// from idents with slice.SubString.
func (u *User) NopriIdents(ctx *ctx.Context, idents []string) ([]string, error) {
	if u.IsAdmin() {
		return []string{}, nil
	}

	teamIds, err := MyGroupIds(ctx, u.Id)
	if err != nil {
		return []string{}, err
	}
	if len(teamIds) == 0 {
		return idents, nil
	}

	busiIds, err := BusiGroupIds(ctx, teamIds, "rw")
	if err != nil {
		return []string{}, err
	}
	if len(busiIds) == 0 {
		return idents, nil
	}

	// Sub-query: distinct idents bound to any of the writable business groups.
	bound := DB(ctx).Model(&Target{}).Distinct("target.ident").
		Joins("join target_busi_group on target.ident = target_busi_group.target_ident").
		Where("target_busi_group.group_id in (?)", busiIds)

	var allowedIdents []string
	if err = DB(ctx).Model(&Target{}).Where("ident in (?)", bound).Pluck("ident", &allowedIdents).Error; err != nil {
		return []string{}, err
	}

	return slice.SubString(idents, allowedIdents), nil
}
// BusiGroups lists up to limit business groups, ordered by name, whose name
// contains query.
//
// Admins, and callers passing all[0] == true, search every business group;
// other users only search the business groups reachable through their user
// groups. When the name search finds nothing and query is non-empty, query is
// retried as a target ident and the business groups of that target are
// returned (for non-admins only if the target matches one of their business
// groups).
func (u *User) BusiGroups(ctx *ctx.Context, limit int, query string, all ...bool) ([]BusiGroup, error) {
	var (
		lst      []BusiGroup
		session  = DB(ctx).Order("name").Limit(limit)
		nameLike = "%" + query + "%"
		seeAll   = u.IsAdmin() || (len(all) > 0 && all[0])
	)

	if seeAll {
		err := session.Where("name like ?", nameLike).Find(&lst).Error
		if err != nil || len(lst) > 0 || len(query) == 0 {
			return lst, err
		}

		// Hidden feature: query may be a target ident rather than a group
		// name, so try it as an ident when the name search came back empty.
		target, err := TargetGet(ctx, "ident=?", query)
		if err != nil || target == nil {
			return lst, err
		}

		target.GroupIds, err = TargetGroupIdsGetByIdent(ctx, target.Ident)
		if err != nil {
			return nil, err
		}

		err = DB(ctx).Order("name").Limit(limit).Where("id in ?", target.GroupIds).Find(&lst).Error
		return lst, err
	}

	teamIds, err := MyGroupIds(ctx, u.Id)
	if err != nil {
		return nil, errors.WithMessage(err, "failed to get MyGroupIds")
	}

	allowed, err := BusiGroupIds(ctx, teamIds)
	if err != nil {
		return nil, errors.WithMessage(err, "failed to get BusiGroupIds")
	}
	if len(allowed) == 0 {
		return lst, nil
	}

	err = session.Where("id in ?", allowed).Where("name like ?", nameLike).Find(&lst).Error
	if err != nil {
		return nil, err
	}
	if len(lst) > 0 || len(query) == 0 {
		return lst, err
	}

	// Same ident fallback as above, restricted to targets the user can reach.
	target, err := TargetGet(ctx, "ident=?", query)
	if err != nil || target == nil {
		return lst, err
	}

	target.GroupIds, err = TargetGroupIdsGetByIdent(ctx, target.Ident)
	if err != nil {
		return nil, err
	}

	if target.MatchGroupId(allowed...) {
		err = DB(ctx).Order("name").Limit(limit).Where("id in ?", target.GroupIds).Find(&lst).Error
	}
	return lst, err
}
// UserGroups lists up to limit user groups, ordered by name.
//
// Admins search all groups by name substring; if nothing matches and query is
// non-empty, query is retried as a username and that user's groups are
// returned. Other users see the groups they belong to or created, optionally
// narrowed by the name substring.
func (u *User) UserGroups(ctx *ctx.Context, limit int, query string) ([]UserGroup, error) {
	session := DB(ctx).Order("name").Limit(limit)
	var lst []UserGroup

	if u.IsAdmin() {
		err := session.Where("name like ?", "%"+query+"%").Find(&lst).Error
		if err != nil || len(lst) > 0 || len(query) == 0 {
			return lst, err
		}

		// Hidden feature: query may be a username rather than a group name,
		// so try it as a user when the name search came back empty.
		owner, err := UserGetByUsername(ctx, query)
		if owner == nil {
			return lst, err
		}

		ids, err := MyGroupIds(ctx, owner.Id)
		if err != nil || len(ids) == 0 {
			return lst, err
		}
		return UserGroupGetByIds(ctx, ids)
	}

	ids, err := MyGroupIds(ctx, u.Id)
	if err != nil {
		return nil, errors.WithMessage(err, "failed to get MyGroupIds")
	}

	if len(ids) == 0 {
		session = session.Where("create_by = ?", u.Username)
	} else {
		session = session.Where("id in ? or create_by = ?", ids, u.Username)
	}

	if len(query) > 0 {
		session = session.Where("name like ?", "%"+query+"%")
	}

	err = session.Find(&lst).Error
	return lst, err
}
// ExtractToken returns the contact value of u for the notification channel
// key, and whether such a value exists. Email and Phone come from the
// corresponding struct fields; every other key is looked up in the JSON form
// of u.Contacts, using the channel's dedicated path constant when there is one
// and key itself otherwise. If Contacts cannot be marshalled the error is
// logged and ("", false) is returned for any key.
func (u *User) ExtractToken(key string) (string, bool) {
	bs, err := u.Contacts.MarshalJSON()
	if err != nil {
		logger.Errorf("handle_notice: failed to marshal contacts: %v", err)
		return "", false
	}

	// JSON path to read; unknown channels use their own name.
	path := key
	switch key {
	case Email:
		return u.Email, u.Email != ""
	case Phone:
		return u.Phone, u.Phone != ""
	case Dingtalk:
		path = DingtalkKey
	case Wecom:
		path = WecomKey
	case Feishu, FeishuCard:
		path = FeishuKey
	case Mm:
		path = MmKey
	case Telegram:
		path = TelegramKey
	case Lark, LarkCard:
		path = LarkKey
	}

	ret := gjson.GetBytes(bs, path)
	return ret.String(), ret.Exists()
}
// FindSameContact names the contact field of u that equals the given value:
// "email" when u.Email is non-empty and equals email, else "phone" when
// u.Phone is non-empty and equals phone, else "".
func (u *User) FindSameContact(email, phone string) string {
	switch {
	case u.Email != "" && u.Email == email:
		return "email"
	case u.Phone != "" && u.Phone == phone:
		return "phone"
	default:
		return ""
	}
}
// AddUserAndGroups creates the user and then syncs its group memberships via
// UserGroupMemberSyncByUser, passing coverTeams through.
//
// NOTE(review): the two steps are separate calls; nothing in this function
// wraps them in a database transaction, so a failure in the second step leaves
// the user created. Confirm whether callers rely on atomicity.
func (u *User) AddUserAndGroups(ctx *ctx.Context, coverTeams bool) error {
	// Step 1: insert the user.
	err := u.Add(ctx)
	if err != nil {
		return errors.WithMessage(err, "failed to add user")
	}

	// Step 2: attach the user to its groups.
	err = UserGroupMemberSyncByUser(ctx, u, coverTeams)
	if err != nil {
		return errors.WithMessage(err, "failed to add user to groups")
	}

	return nil
}
// EncryptPhone replaces u.Phone with its encrypted form when phone encryption
// is enabled in the cached configuration. It is a no-op when the cache is not
// loaded (logged), encryption is disabled, the phone is empty, or the phone
// already carries the "enc:" prefix. Encryption failures are logged and the
// phone is left unchanged; the returned error is always nil.
func (u *User) EncryptPhone() (err error) {
	// Read the phone-encryption settings from the cache.
	enabled, publicKey, _, _, loaded := GetPhoneEncryptionConfigFromCache()
	if !loaded {
		// Cache not ready: log it but do not block saving the user.
		logger.Infof("Phone encryption config cache not loaded, user: %s", u.Username)
		return nil
	}

	if !enabled || u.Phone == "" {
		return nil
	}

	// Already encrypted: skip to avoid encrypting twice.
	if len(u.Phone) > 4 && u.Phone[:4] == "enc:" {
		return nil
	}

	cipher, encErr := secu.EncryptValue(u.Phone, publicKey)
	if encErr != nil {
		logger.Warningf("Failed to encrypt phone: %v, user: %s", encErr, u.Username)
		return nil
	}

	u.Phone = cipher
	return nil
}
// DecryptPhone decrypts u.Phone in place when it carries the "enc:" prefix and
// phone encryption is enabled in the loaded cached configuration. In every
// other case, including a decryption failure (which is logged), the phone is
// left as it is.
func (u *User) DecryptPhone() {
	const prefix = "enc:"

	// Only values longer than the prefix and starting with it are treated as
	// encrypted; this also covers the empty phone.
	phone := u.Phone
	if len(phone) <= len(prefix) || phone[:len(prefix)] != prefix {
		return
	}

	// Read the phone-encryption settings from the cache.
	enabled, _, privateKey, password, loaded := GetPhoneEncryptionConfigFromCache()
	if !(loaded && enabled) {
		return
	}

	plain, err := secu.Decrypt(phone, privateKey, password)
	if err != nil {
		// Keep the stored value when decryption fails.
		logger.Warningf("Failed to decrypt phone for user %s: %v", u.Username, err)
		return
	}

	u.Phone = plain
}
================================================
FILE: models/user_group.go
================================================
package models
import (
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/pkg/errors"
"github.com/toolkits/pkg/str"
"gorm.io/gorm"
)
// UserGroup is a team of users, stored in the table named by TableName.
// Fields tagged gorm:"-" are not persisted and are filled in by callers.
type UserGroup struct {
Id int64 `json:"id" gorm:"primaryKey"`
Name string `json:"name"`
Note string `json:"note"`
// CreateAt and UpdateAt are Unix timestamps in seconds (see Add).
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`
UpdateBy string `json:"update_by"`
// UpdateByNickname is not stored in the database.
UpdateByNickname string `json:"update_by_nickname" gorm:"-"`
// UserIds is excluded from both JSON and the database.
UserIds []int64 `json:"-" gorm:"-"`
// Users and BusiGroups are serialized to JSON but not stored in the database.
Users []User `json:"users" gorm:"-"`
BusiGroups []*BusiGroup `json:"busi_groups" gorm:"-"`
}
// TableName returns the database table that stores UserGroup rows.
func (ug *UserGroup) TableName() string {
	const table = "user_group"
	return table
}
// Verify rejects a group whose Name or Note is flagged by str.Dangerous. Name
// is checked first.
func (ug *UserGroup) Verify() error {
	switch {
	case str.Dangerous(ug.Name):
		return errors.New("Name has invalid characters")
	case str.Dangerous(ug.Note):
		return errors.New("Note has invalid characters")
	}
	return nil
}
// Update validates ug with Verify and then writes the selected fields of ug
// to the database. selectField and selectFields are passed to gorm's Select.
func (ug *UserGroup) Update(ctx *ctx.Context, selectField interface{}, selectFields ...interface{}) error {
	err := ug.Verify()
	if err == nil {
		err = DB(ctx).Model(ug).Select(selectField, selectFields...).Updates(ug).Error
	}
	return err
}
// UserGroupCount counts the user groups matching the given where clause and
// arguments.
func UserGroupCount(ctx *ctx.Context, where string, args ...interface{}) (num int64, err error) {
	filtered := DB(ctx).Model(&UserGroup{}).Where(where, args...)
	return Count(filtered)
}
// Add validates ug, refuses a name that is already taken, stamps CreateAt and
// UpdateAt with the current Unix time and inserts the group.
func (ug *UserGroup) Add(ctx *ctx.Context) error {
	if err := ug.Verify(); err != nil {
		return err
	}

	existing, err := UserGroupCount(ctx, "name=?", ug.Name)
	switch {
	case err != nil:
		return errors.WithMessage(err, "failed to count user-groups")
	case existing > 0:
		return errors.New("UserGroup already exists")
	}

	ts := time.Now().Unix()
	ug.CreateAt, ug.UpdateAt = ts, ts
	return Insert(ctx, ug)
}
// Del removes the group and all of its memberships in one database
// transaction.
func (ug *UserGroup) Del(ctx *ctx.Context) error {
	return DB(ctx).Transaction(func(tx *gorm.DB) error {
		// Memberships go first, then the group row itself.
		if err := tx.Where("group_id=?", ug.Id).Delete(&UserGroupMember{}).Error; err != nil {
			return err
		}
		return tx.Where("id=?", ug.Id).Delete(&UserGroup{}).Error
	})
}
// UserGroupGet returns the first user group matching the where clause, or
// (nil, nil) when nothing matches.
func UserGroupGet(ctx *ctx.Context, where string, args ...interface{}) (*UserGroup, error) {
	var matches []*UserGroup
	if err := DB(ctx).Where(where, args...).Find(&matches).Error; err != nil {
		return nil, err
	}
	if len(matches) > 0 {
		return matches[0], nil
	}
	return nil, nil
}
// UserGroupGetById fetches a single user group by primary key through
// UserGroupGet.
func UserGroupGetById(ctx *ctx.Context, id int64) (*UserGroup, error) {
	const byID = "id = ?"
	return UserGroupGet(ctx, byID, id)
}
// UserGroupGetByIds loads the user groups with the given ids, ordered by
// name. An empty ids yields a nil slice without querying the database.
func UserGroupGetByIds(ctx *ctx.Context, ids []int64) ([]UserGroup, error) {
	if len(ids) == 0 {
		return nil, nil
	}

	var groups []UserGroup
	err := DB(ctx).Where("id in ?", ids).Order("name").Find(&groups).Error
	return groups, err
}
// UserGroupIdAndNameMap returns an id-to-name map for the user groups with
// the given ids.
func UserGroupIdAndNameMap(ctx *ctx.Context, ids []int64) (map[int64]string, error) {
	groups, err := UserGroupGetByIds(ctx, ids)
	if err != nil {
		return nil, err
	}

	names := make(map[int64]string)
	for i := range groups {
		names[groups[i].Id] = groups[i].Name
	}
	return names, nil
}
// UserGroupGetAll returns every user group. When ctx.IsCenter is false the
// list is fetched from "/v1/n9e/user-groups" via poster; otherwise it is read
// from the database.
func UserGroupGetAll(ctx *ctx.Context) ([]*UserGroup, error) {
	if ctx.IsCenter {
		var groups []*UserGroup
		err := DB(ctx).Find(&groups).Error
		return groups, err
	}
	return poster.GetByUrls[[]*UserGroup](ctx, "/v1/n9e/user-groups")
}
// AddMembers adds each listed user to the group. Ids that do not resolve to
// an existing user are skipped; the first error aborts the loop and is
// returned.
func (ug *UserGroup) AddMembers(ctx *ctx.Context, userIds []int64) error {
	for _, uid := range userIds {
		member, err := UserGetById(ctx, uid)
		if err != nil {
			return err
		}
		if member == nil {
			// Unknown user id: nothing to add.
			continue
		}
		if err = UserGroupMemberAdd(ctx, ug.Id, member.Id); err != nil {
			return err
		}
	}
	return nil
}
// DelMembers removes the listed users from the group by delegating to
// UserGroupMemberDel.
func (ug *UserGroup) DelMembers(ctx *ctx.Context, userIds []int64) error {
	groupID := ug.Id
	return UserGroupMemberDel(ctx, groupID, userIds)
}
// UserGroupStatistics returns the row count and the maximum update_at of the
// user group table. When ctx.IsCenter is false the value is fetched from
// "/v1/n9e/statistic?name=user_group" via poster instead.
func UserGroupStatistics(ctx *ctx.Context) (*Statistics, error) {
	if !ctx.IsCenter {
		return poster.GetByUrls[*Statistics](ctx, "/v1/n9e/statistic?name=user_group")
	}

	var rows []*Statistics
	err := DB(ctx).Model(&UserGroup{}).
		Select("count(*) as total", "max(update_at) as last_updated").
		Find(&rows).Error
	if err != nil {
		return nil, err
	}
	return rows[0], nil
}
================================================
FILE: models/user_group_member.go
================================================
package models
import (
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/toolkits/pkg/logger"
)
// UserGroupMember is one membership row linking a user to a user group,
// stored in the table named by TableName.
type UserGroupMember struct {
GroupId int64
UserId int64
}
// TableName returns the database table that stores UserGroupMember rows.
func (UserGroupMember) TableName() string {
	const table = "user_group_member"
	return table
}
// MyGroupIds returns the ids of the user groups that userId belongs to.
func MyGroupIds(ctx *ctx.Context, userId int64) ([]int64, error) {
	var groupIds []int64
	memberships := DB(ctx).Model(&UserGroupMember{}).Where("user_id=?", userId)
	err := memberships.Pluck("group_id", &groupIds).Error
	return groupIds, err
}
// MyGroupIdsMap returns the ids of the user groups that userId belongs to as
// a set.
func MyGroupIdsMap(ctx *ctx.Context, userId int64) (map[int64]struct{}, error) {
	groupIds, err := MyGroupIds(ctx, userId)
	if err != nil {
		return nil, err
	}

	set := make(map[int64]struct{}, len(groupIds))
	for i := range groupIds {
		set[groupIds[i]] = struct{}{}
	}
	return set, nil
}
// MyBusiGroupIds returns the ids of the business groups reachable through the
// user groups that userId belongs to. A failure to load the user groups
// yields an empty, non-nil slice together with the error.
func MyBusiGroupIds(ctx *ctx.Context, userId int64) ([]int64, error) {
	teamIds, err := MyGroupIds(ctx, userId)
	if err == nil {
		return BusiGroupIds(ctx, teamIds)
	}
	return []int64{}, err
}
// MemberIds returns the ids of the users that belong to groupId.
func MemberIds(ctx *ctx.Context, groupId int64) ([]int64, error) {
	var userIds []int64
	memberships := DB(ctx).Model(&UserGroupMember{}).Where("group_id=?", groupId)
	err := memberships.Pluck("user_id", &userIds).Error
	return userIds, err
}
// GroupsMemberIds returns the user ids found in the membership rows of any of
// the given groups. The result is not de-duplicated here.
func GroupsMemberIds(ctx *ctx.Context, groupIds []int64) ([]int64, error) {
	var userIds []int64
	memberships := DB(ctx).Model(&UserGroupMember{}).Where("group_id in ?", groupIds)
	err := memberships.Pluck("user_id", &userIds).Error
	return userIds, err
}
// UserGroupMemberCount counts the membership rows matching the given where
// clause and arguments.
func UserGroupMemberCount(ctx *ctx.Context, where string, args ...interface{}) (int64, error) {
	filtered := DB(ctx).Model(&UserGroupMember{}).Where(where, args...)
	return Count(filtered)
}
// UserGroupMemberAdd makes userId a member of groupId. It is a no-op when the
// membership row already exists.
func UserGroupMemberAdd(ctx *ctx.Context, groupId, userId int64) error {
	existing, err := UserGroupMemberCount(ctx, "user_id=? and group_id=?", userId, groupId)
	if err != nil || existing > 0 {
		// Either the lookup failed (err set) or the row is already there
		// (err nil).
		return err
	}

	return Insert(ctx, UserGroupMember{GroupId: groupId, UserId: userId})
}
// UserGroupMemberDel removes the listed users from groupId. An empty userIds
// is a no-op.
func UserGroupMemberDel(ctx *ctx.Context, groupId int64, userIds []int64) error {
	if len(userIds) > 0 {
		return DB(ctx).Where("group_id = ? and user_id in ?", groupId, userIds).Delete(&UserGroupMember{}).Error
	}
	return nil
}
// UserGroupMemberGetAll returns every membership row. When ctx.IsCenter is
// false the list is fetched from "/v1/n9e/user-group-members" via poster;
// otherwise it is read from the database.
func UserGroupMemberGetAll(ctx *ctx.Context) ([]*UserGroupMember, error) {
	if ctx.IsCenter {
		var members []*UserGroupMember
		err := DB(ctx).Find(&members).Error
		return members, err
	}
	return poster.GetByUrls[[]*UserGroupMember](ctx, "/v1/n9e/user-group-members")
}
// UserGroupMemberSync brings the group memberships of userId in line with
// ldapGids.
//
// Groups in ldapGids that the user is not yet in are inserted. When coverTeams
// is true, memberships that are not in ldapGids are deleted as well (all of
// them when ldapGids is empty); when it is false existing memberships are
// never removed. A failed insert is only logged, and the function then
// continues with the deletion step.
func UserGroupMemberSync(ctx *ctx.Context, ldapGids []int64, userId int64, coverTeams bool) error {
	if len(ldapGids) == 0 {
		if coverTeams {
			// The user should be in no group at all: drop every membership.
			return DB(ctx).Where("user_id = ?", userId).Delete(&UserGroupMember{}).Error
		}
		return nil
	}

	// Groups the user is currently in.
	curGids, err := MyGroupIds(ctx, userId)
	if err != nil {
		return err
	}

	curGidSet := slice2Set(curGids)
	// At most one row per requested group can be inserted, so size the buffer
	// from ldapGids rather than from the current memberships.
	toInsert := make([]UserGroupMember, 0, len(ldapGids))
	for _, gid := range ldapGids {
		if !curGidSet[gid] {
			toInsert = append(toInsert, UserGroupMember{GroupId: gid, UserId: userId})
			// Mark as present so duplicates in ldapGids are inserted once.
			curGidSet[gid] = true
		}
	}

	if len(toInsert) > 0 {
		err = DB(ctx).CreateInBatches(toInsert, 10).Error
		if err != nil {
			logger.Warningf("failed to insert user(%d) group member err: %+v", userId, err)
		}
	}

	if !coverTeams || len(curGids) == 0 {
		return nil
	}

	// Remove the groups the user has here but no longer has in ldapGids.
	ldapGidSet := slice2Set(ldapGids)
	toDeleteIds := make([]int64, 0, len(curGids))
	for _, gid := range curGids {
		if !ldapGidSet[gid] {
			toDeleteIds = append(toDeleteIds, gid)
			// Mark as handled so duplicates in curGids are listed once.
			ldapGidSet[gid] = true
		}
	}

	if len(toDeleteIds) == 0 {
		return nil
	}

	return DB(ctx).Where("user_id = ? AND group_id IN ?", userId, toDeleteIds).
		Delete(&UserGroupMember{}).Error
}
// UserGroupMemberSyncByUser syncs the memberships of user to user.TeamsLst via
// UserGroupMemberSync. A nil user is a no-op.
func UserGroupMemberSyncByUser(ctx *ctx.Context, user *User, coverTeams bool) error {
	if user != nil {
		return UserGroupMemberSync(ctx, user.TeamsLst, user.Id, coverTeams)
	}
	return nil
}
// slice2Set builds a membership set from s: every element of s maps to true.
// The result is never nil, even for a nil or empty slice.
func slice2Set[T comparable](s []T) map[T]bool {
	set := make(map[T]bool, len(s))
	for i := range s {
		set[s[i]] = true
	}
	return set
}
================================================
FILE: models/user_token.go
================================================
package models
import (
"time"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// UserToken is a named token owned by a user, stored in the table named by
// TableName.
type UserToken struct {
Id int64 `json:"id" gorm:"primaryKey"`
Username string `json:"username" gorm:"type:varchar(255); not null; default ''"`
TokenName string `json:"token_name" gorm:"type:varchar(255); not null; default ''"`
Token string `json:"token" gorm:"type:varchar(255); not null; default ''"`
// CreateAt is set to the Unix time in seconds by AddToken.
CreateAt int64 `json:"create_at" gorm:"type:bigint; not null; default 0"`
// LastUsed is written by UserTokenUpdateLastUsedTime.
LastUsed int64 `json:"last_used" gorm:"type:bigint; not null; default 0"`
}
// TableName returns the database table that stores UserToken rows.
func (UserToken) TableName() string {
	const table = "user_token"
	return table
}
// CountToken counts the tokens owned by username.
func CountToken(ctx *ctx.Context, username string) (int64, error) {
	var total int64
	owned := DB(ctx).Model(&UserToken{}).Where("username = ?", username)
	err := owned.Count(&total).Error
	return total, err
}
// AddToken stores a new token for username, stamped with the current Unix
// time. The token record is returned even when the insert fails, alongside the
// error.
func AddToken(ctx *ctx.Context, username, token, tokenName string) (*UserToken, error) {
	record := &UserToken{
		TokenName: tokenName,
		Username:  username,
		Token:     token,
		CreateAt:  time.Now().Unix(),
	}
	err := Insert(ctx, record)
	return record, err
}
// DeleteToken deletes the token with the given id.
func DeleteToken(ctx *ctx.Context, id int64) error {
	return DB(ctx).Where("id = ?", id).Delete(&UserToken{}).Error
}
// GetTokensByUsername returns all tokens owned by username.
func GetTokensByUsername(ctx *ctx.Context, username string) ([]UserToken, error) {
	var owned []UserToken
	result := DB(ctx).Where("username = ?", username).Find(&owned)
	return owned, result.Error
}
// UserTokenGetAll returns every stored user token.
func UserTokenGetAll(ctx *ctx.Context) ([]*UserToken, error) {
	var tokens []*UserToken
	result := DB(ctx).Find(&tokens)
	return tokens, result.Error
}
// UserTokenTotal counts all stored user tokens.
func UserTokenTotal(ctx *ctx.Context) (int64, error) {
	var n int64
	result := DB(ctx).Model(&UserToken{}).Count(&n)
	return n, result.Error
}
// UserTokenUpdateLastUsedTime sets last_used to lastUsedTime on the rows whose
// token column equals token.
func UserTokenUpdateLastUsedTime(ctx *ctx.Context, token string, lastUsedTime int64) error {
	matching := DB(ctx).Model(&UserToken{}).Where("token = ?", token)
	return matching.Update("last_used", lastUsedTime).Error
}
================================================
FILE: models/workflow.go
================================================
package models
// WorkflowNode is a single node of a workflow.
type WorkflowNode struct {
ID string `json:"id"` // Unique node ID
Name string `json:"name"` // Display name
Type string `json:"type"` // Node type (matches a Processor typ)
Position []float64 `json:"position,omitempty"` // [x, y] position in the UI
Config interface{} `json:"config"` // Node configuration
// Execution control
Disabled bool `json:"disabled,omitempty"`
ContinueOnFail bool `json:"continue_on_fail,omitempty"`
RetryOnFail bool `json:"retry_on_fail,omitempty"`
MaxRetries int `json:"max_retries,omitempty"`
RetryInterval int `json:"retry_interval,omitempty"` // Seconds
}
// Connections describes how nodes are wired together: map[source node ID]NodeConnections.
type Connections map[string]NodeConnections
// NodeConnections holds the outgoing connections of a single node.
type NodeConnections struct {
// Main holds the connections of the node's output ports:
// Main[outputIndex] = []ConnectionTarget
Main [][]ConnectionTarget `json:"main"`
}
// ConnectionTarget is the destination end of a connection.
type ConnectionTarget struct {
Node string `json:"node"` // Target node ID
Type string `json:"type"` // Input type, usually "main"
Index int `json:"index"` // Input port index on the target node
}
// InputVariable is an input parameter of a workflow.
type InputVariable struct {
Key string `json:"key"` // Variable name
Value string `json:"value"` // Default value
Description string `json:"description,omitempty"` // Description
}
// NodeOutput is the output produced by executing a node.
type NodeOutput struct {
WfCtx *WorkflowContext `json:"wf_ctx"` // Workflow context after processing
Message string `json:"message"` // Processing message
Terminate bool `json:"terminate"` // Whether to terminate the flow
BranchIndex *int `json:"branch_index,omitempty"` // Branch index (used by condition nodes)
// Streaming output support
Stream bool `json:"stream,omitempty"` // Whether the output is streamed
StreamChan chan *StreamChunk `json:"-"` // Channel carrying streamed data (not serialized)
}
// WorkflowResult is the result of executing a workflow.
type WorkflowResult struct {
Event *AlertCurEvent `json:"event"` // Final event
Status string `json:"status"` // success, failed, streaming
Message string `json:"message"` // Summary message
NodeResults []*NodeExecutionResult `json:"node_results"` // Execution result of each node
ErrorNode string `json:"error_node,omitempty"`
// Streaming output support
Stream bool `json:"stream,omitempty"` // Whether the output is streamed
StreamChan chan *StreamChunk `json:"-"` // Channel carrying streamed data (not serialized)
}
// NodeExecutionResult is the execution result of a single node.
type NodeExecutionResult struct {
NodeID string `json:"node_id"`
NodeName string `json:"node_name"`
NodeType string `json:"node_type"`
Status string `json:"status"` // success, failed, skipped
Message string `json:"message"`
StartedAt int64 `json:"started_at"`
FinishedAt int64 `json:"finished_at"`
DurationMs int64 `json:"duration_ms"`
Error string `json:"error,omitempty"`
BranchIndex *int `json:"branch_index,omitempty"` // Branch chosen by a condition node
}
// Trigger mode constants.
const (
TriggerModeEvent = "event" // Triggered by an alert event
TriggerModeAPI = "api" // Triggered through the API
TriggerModeCron = "cron" // Triggered on a schedule (to be supported later)
)
// Use case identifiers.
// NOTE(review): UseCaseEventSummary carries the value "firemap", which does
// not mirror its name — confirm this is intentional.
const (
UseCaseEventPipeline = "event_pipeline"
UseCaseEventSummary = "firemap"
)
// WorkflowTriggerContext describes how a workflow run was triggered.
type WorkflowTriggerContext struct {
// Trigger mode
Mode string `json:"mode"`
// Who triggered the run
TriggerBy string `json:"trigger_by"`
// Request ID (used by API/Cron triggers)
RequestID string `json:"request_id"`
// Overrides for the input parameters
InputsOverrides map[string]string `json:"inputs_overrides"`
// Streaming output (chosen dynamically per API call)
Stream bool `json:"stream"`
// Cron related (for later use)
CronJobID string `json:"cron_job_id,omitempty"`
CronExpr string `json:"cron_expr,omitempty"`
ScheduledAt int64 `json:"scheduled_at,omitempty"`
}
// WorkflowContext is the state carried through a workflow run.
type WorkflowContext struct {
Event *AlertCurEvent `json:"event"` // Current event
Inputs map[string]string `json:"inputs"` // Input parameters (static, configured by the user)
Vars map[string]interface{} `json:"vars"` // Data passed between nodes (dynamic, produced at run time)
Metadata map[string]string `json:"metadata"` // Execution metadata (request_id, start_time, etc.)
Output map[string]interface{} `json:"output,omitempty"` // Output result (used in non-alert scenarios)
// Streaming output support
Stream bool `json:"-"` // Whether streaming output is enabled (not serialized)
StreamChan chan *StreamChunk `json:"-"` // Channel carrying streamed data (not serialized)
}
// StreamChunk type constants.
const (
StreamTypeThinking = "thinking" // AI reasoning step (ReAct Thought)
StreamTypeToolCall = "tool_call" // Tool invocation
StreamTypeToolResult = "tool_result" // Result of a tool execution
StreamTypeText = "text" // LLM text output
StreamTypeDone = "done" // Finished
StreamTypeError = "error" // Error
)
// StreamChunk is one piece of streamed output.
type StreamChunk struct {
Type string `json:"type"` // thinking / tool_call / tool_result / text / done / error
Content string `json:"content"` // Full content (accumulated)
Delta string `json:"delta,omitempty"` // Incremental content
NodeID string `json:"node_id,omitempty"` // Current node ID
RequestID string `json:"request_id,omitempty"` // Request tracing ID
Metadata interface{} `json:"metadata,omitempty"` // Extra metadata (e.g. tool call arguments)
Done bool `json:"done"` // Whether the stream has ended
Error string `json:"error,omitempty"` // Error message
Timestamp int64 `json:"timestamp"` // Timestamp (milliseconds)
}
================================================
FILE: pkg/aop/log.go
================================================
package aop
import (
"bytes"
"compress/gzip"
"fmt"
"io"
"net/http"
"os"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/mattn/go-isatty"
"github.com/toolkits/pkg/logger"
)
// consoleColorModeValue selects how IsOutputColor decides on coloured output.
type consoleColorModeValue int
const (
// autoColor colours output only when the params report a terminal.
autoColor consoleColorModeValue = iota
// disableColor never colours output.
disableColor
// forceColor always colours output.
forceColor
)
// ANSI escape sequences used to colour log output. Each colour is ESC '['
// followed by a foreground code, ';', a background code and 'm'; for example
// green is "\x1b[97;42m". reset is "\x1b[0m".
var (
green = string([]byte{27, 91, 57, 55, 59, 52, 50, 109})
white = string([]byte{27, 91, 57, 48, 59, 52, 55, 109})
yellow = string([]byte{27, 91, 57, 48, 59, 52, 51, 109})
red = string([]byte{27, 91, 57, 55, 59, 52, 49, 109})
blue = string([]byte{27, 91, 57, 55, 59, 52, 52, 109})
magenta = string([]byte{27, 91, 57, 55, 59, 52, 53, 109})
cyan = string([]byte{27, 91, 57, 55, 59, 52, 54, 109})
reset = string([]byte{27, 91, 48, 109})
// consoleColorMode is the colour mode consulted by IsOutputColor.
consoleColorMode = autoColor
)
// LoggerConfig defines the config for Logger middleware.
type LoggerConfig struct {
// Optional. Default value is gin.defaultLogFormatter
Formatter LogFormatter
// Output is a writer where logs are written.
// Optional. Default value is gin.DefaultWriter.
Output io.Writer
// PrintAccessLog presumably reports whether access logs should be printed;
// it is not used in the code visible here — TODO confirm against the middleware.
PrintAccessLog func() bool
// PrintBodyPaths returns the set of paths consulted by ContainsPath.
PrintBodyPaths func() map[string]struct{}
// SkipPaths is a url path array which logs are not written.
// Optional.
SkipPaths []string
}
// ContainsPath reports whether path, with any query string removed, is one of
// the paths returned by PrintBodyPaths. It returns false when the config or
// its PrintBodyPaths function is nil, instead of panicking on the nil call.
func (c *LoggerConfig) ContainsPath(path string) bool {
	if c == nil || c.PrintBodyPaths == nil {
		return false
	}

	// Keep only the part before the first '?'.
	path, _, _ = strings.Cut(path, "?")
	_, exist := c.PrintBodyPaths()[path]
	return exist
}
// LogFormatter is the signature of the formatter function passed to
// LoggerWithFormatter. It turns the collected request parameters into the
// log line to write.
type LogFormatter func(params LogFormatterParams) string
// LogFormatterParams is the structure handed to any formatter when it is time
// to log a request.
type LogFormatterParams struct {
Request *http.Request
// TimeStamp shows the time after the server returns a response.
TimeStamp time.Time
// StatusCode is HTTP response code.
StatusCode int
// Latency is how much time the server cost to process a certain request.
Latency time.Duration
// ClientIP equals Context's ClientIP method.
ClientIP string
// Method is the HTTP method given to the request.
Method string
// Path is a path the client requests.
Path string
// ErrorMessage is set if error has occurred in processing the request.
ErrorMessage string
// isTerm shows whether gin's output descriptor refers to a terminal.
isTerm bool
// BodySize is the size of the Response Body
BodySize int
// Keys are the keys set on the request's context.
Keys map[string]interface{}
}
// StatusCodeColor picks the ANSI colour used for the status-code column:
// green for 2xx, white for 3xx, yellow for 4xx and red for everything else
// (including codes below 200).
func (p *LogFormatterParams) StatusCodeColor() string {
	status := p.StatusCode
	if status < http.StatusOK || status >= http.StatusInternalServerError {
		return red
	}
	if status < http.StatusMultipleChoices {
		return green
	}
	if status < http.StatusBadRequest {
		return white
	}
	return yellow
}
// MethodColor picks the ANSI colour used for the HTTP-method column. Methods
// without a dedicated colour get the reset sequence.
func (p *LogFormatterParams) MethodColor() string {
	switch p.Method {
	case http.MethodGet:
		return blue
	case http.MethodPost:
		return cyan
	case http.MethodPut:
		return yellow
	case http.MethodDelete:
		return red
	case http.MethodPatch:
		return green
	case http.MethodHead:
		return magenta
	case http.MethodOptions:
		return white
	}
	return reset
}
// ResetColor returns the ANSI sequence that clears every colour attribute set
// by StatusCodeColor or MethodColor.
func (p *LogFormatterParams) ResetColor() string {
	clearSeq := reset
	return clearSeq
}
// IsOutputColor reports whether colour sequences should be written: always
// when colour is forced, only for terminals in automatic mode, never when
// colour is disabled.
func (p *LogFormatterParams) IsOutputColor() bool {
	switch consoleColorMode {
	case forceColor:
		return true
	case autoColor:
		return p.isTerm
	default:
		return false
	}
}
// defaultLogFormatter renders the "[GIN] | status | latency | client | method path"
// line used when no custom Formatter is configured. Colours are added only
// when param.IsOutputColor() allows them, and latencies above one minute are
// truncated to whole seconds to keep the column readable.
var defaultLogFormatter = func(param LogFormatterParams) string {
	statusColor, methodColor, resetColor := "", "", ""
	if param.IsOutputColor() {
		statusColor, methodColor, resetColor = param.StatusCodeColor(), param.MethodColor(), param.ResetColor()
	}

	latency := param.Latency
	if latency > time.Minute {
		latency = latency.Truncate(time.Second)
	}

	return fmt.Sprintf("[GIN] |%s %3d %s| %13v | %15s |%s %-7s %s %s\n%s",
		statusColor, param.StatusCode, resetColor,
		latency,
		param.ClientIP,
		methodColor, param.Method, resetColor,
		param.Path,
		param.ErrorMessage,
	)
}
// DisableConsoleColor switches the package-wide colour mode to "never", so
// IsOutputColor reports false regardless of the output device.
func DisableConsoleColor() {
	consoleColorMode = disableColor
}
// ForceConsoleColor switches the package-wide colour mode to "always", so
// IsOutputColor reports true even when the output is not a terminal.
func ForceConsoleColor() {
	consoleColorMode = forceColor
}
// ErrorLogger returns a middleware that reports errors of every type; it is
// ErrorLoggerT with gin.ErrorTypeAny.
func ErrorLogger() gin.HandlerFunc {
	handler := ErrorLoggerT(gin.ErrorTypeAny)
	return handler
}
// ErrorLoggerT returns a middleware that runs the remaining handlers and then,
// if the context collected any errors of type typ, writes them as JSON with
// status -1.
func ErrorLoggerT(typ gin.ErrorType) gin.HandlerFunc {
	return func(c *gin.Context) {
		c.Next()

		if matched := c.Errors.ByType(typ); len(matched) > 0 {
			c.JSON(-1, matched)
		}
	}
}
// Logger builds the access-log middleware. The first element of conf, if
// present, is used as configuration; otherwise the zero LoggerConfig is used.
// Extra elements are ignored.
func Logger(conf ...LoggerConfig) gin.HandlerFunc {
	if len(conf) == 0 {
		return LoggerWithConfig(LoggerConfig{})
	}
	return LoggerWithConfig(conf[0])
}
// LoggerWithFormatter builds the access-log middleware with f as the line
// formatter and every other option left at its zero value.
func LoggerWithFormatter(f LogFormatter) gin.HandlerFunc {
	var cfg LoggerConfig
	cfg.Formatter = f
	return LoggerWithConfig(cfg)
}
// LoggerWithWriter builds the access-log middleware with out as the configured
// output and notlogged as the list of paths that are never logged.
// Example outputs: os.Stdout, a file opened in write mode, a socket...
func LoggerWithWriter(out io.Writer, notlogged ...string) gin.HandlerFunc {
	var cfg LoggerConfig
	cfg.Output = out
	cfg.SkipPaths = notlogged
	return LoggerWithConfig(cfg)
}
// CustomResponseWriter wraps a gin.ResponseWriter and keeps a copy of every
// byte written through Write or WriteString, so the response body can be
// logged after the handler has finished.
type CustomResponseWriter struct {
	gin.ResponseWriter
	body *bytes.Buffer
}

// Write records data in the capture buffer and then forwards it to the
// wrapped writer, returning the wrapped writer's result.
func (w CustomResponseWriter) Write(data []byte) (int, error) {
	w.body.Write(data)
	n, err := w.ResponseWriter.Write(data)
	return n, err
}

// WriteString records s in the capture buffer and then forwards it to the
// wrapped writer, returning the wrapped writer's result.
func (w CustomResponseWriter) WriteString(s string) (int, error) {
	w.body.WriteString(s)
	n, err := w.ResponseWriter.WriteString(s)
	return n, err
}

// Unwrap exposes the wrapped writer.
func (w CustomResponseWriter) Unwrap() http.ResponseWriter {
	var inner http.ResponseWriter = w.ResponseWriter
	return inner
}
// LoggerWithConfig returns the access-log middleware described by conf.
//
//   - conf.PrintAccessLog is consulted on every request; when it returns false
//     the request is passed through untouched. A nil PrintAccessLog is treated
//     as "enabled" rather than panicking, so Logger(), LoggerWithFormatter and
//     LoggerWithWriter produce a usable middleware.
//   - Requests whose URL path is listed in conf.SkipPaths are served but not
//     logged.
//   - For paths selected by conf.ContainsPath the request and response bodies
//     are captured and written at debug level.
//
// The summary line is rendered by conf.Formatter (defaultLogFormatter when nil)
// and emitted through the package logger, prefixed with the "trace_id" context
// value when one is set. conf.Output is only inspected to decide whether the
// formatter may use terminal colours.
func LoggerWithConfig(conf LoggerConfig) gin.HandlerFunc {
	formatter := conf.Formatter
	if formatter == nil {
		formatter = defaultLogFormatter
	}

	out := conf.Output
	if out == nil {
		out = os.Stdout
	}

	// Colours are only sensible when the output is a real (non-dumb) terminal.
	isTerm := true
	if w, ok := out.(*os.File); !ok || os.Getenv("TERM") == "dumb" ||
		(!isatty.IsTerminal(w.Fd()) && !isatty.IsCygwinTerminal(w.Fd())) {
		isTerm = false
	}

	skip := make(map[string]struct{}, len(conf.SkipPaths))
	for _, p := range conf.SkipPaths {
		skip[p] = struct{}{}
	}

	return func(c *gin.Context) {
		if conf.PrintAccessLog != nil && !conf.PrintAccessLog() {
			c.Next()
			return
		}

		// Start timer
		start := time.Now()
		path := c.Request.URL.Path
		raw := c.Request.URL.RawQuery

		// Decide once per request whether bodies are captured. The path set is
		// produced by a callback and may change while the handler runs; asking
		// again after c.Next() could request a body dump that was never
		// captured.
		printBody := conf.PrintBodyPaths != nil && conf.ContainsPath(c.Request.RequestURI)

		var (
			reqBody    []byte
			bodyWriter *CustomResponseWriter
		)
		if printBody {
			// Only buffer the response when it will actually be printed.
			bodyWriter = &CustomResponseWriter{
				ResponseWriter: c.Writer,
				body:           bytes.NewBuffer(nil),
			}
			c.Writer = bodyWriter

			if c.Request.Body != nil {
				// Best effort: a read error leaves whatever was read so far,
				// which is also what downstream handlers will see.
				reqBody, _ = io.ReadAll(c.Request.Body)
				c.Request.Body = io.NopCloser(bytes.NewReader(reqBody))
			}
		}

		// Process request
		c.Next()

		// Log only when path is not being skipped
		if _, skipped := skip[path]; skipped {
			return
		}

		param := LogFormatterParams{
			Request: c.Request,
			isTerm:  isTerm,
			Keys:    c.Keys,
		}

		// Stop timer
		param.TimeStamp = time.Now()
		param.Latency = param.TimeStamp.Sub(start)
		param.ClientIP = c.ClientIP()
		param.Method = c.Request.Method
		param.StatusCode = c.Writer.Status()
		param.ErrorMessage = c.Errors.ByType(gin.ErrorTypePrivate).String()
		param.BodySize = c.Writer.Size()

		if raw != "" {
			path = path + "?" + raw
		}
		param.Path = path

		traceId := c.GetString("trace_id")
		if traceId != "" {
			logger.Infof("trace_id=%s %s", traceId, formatter(param))
		} else {
			logger.Info(formatter(param))
		}

		if !printBody {
			return
		}

		respBody := readBody(bytes.NewReader(bodyWriter.body.Bytes()), c.Writer.Header().Get("Content-Encoding"))
		reqBodyText := readBody(bytes.NewReader(reqBody), c.Request.Header.Get("Content-Encoding"))
		if traceId != "" {
			logger.Debugf("trace_id=%s path:%s req body:%s resp:%s", traceId, path, reqBodyText, respBody)
		} else {
			logger.Debugf("path:%s req body:%s resp:%s", path, reqBodyText, respBody)
		}
	}
}
// readBody drains reader and returns its content as a string, transparently
// decompressing it when encoding names gzip (compared case-insensitively,
// ignoring surrounding spaces). Any other encoding is returned as-is.
//
// The function never fails: a nil reader yields "", and read or decompression
// errors are reported as a human-readable message in the returned string,
// because the result is only used for log output.
func readBody(reader io.Reader, encoding string) string {
	if reader == nil {
		return ""
	}

	if !strings.EqualFold(strings.TrimSpace(encoding), "gzip") {
		plain, err := io.ReadAll(reader)
		if err != nil {
			return fmt.Sprintf("failed to read response body: %v", err)
		}
		return string(plain)
	}

	gzipReader, err := gzip.NewReader(reader)
	if err != nil {
		return fmt.Sprintf("failed to create gzip reader: %v", err)
	}
	defer gzipReader.Close()

	unzipped, err := io.ReadAll(gzipReader)
	if err != nil {
		return fmt.Sprintf("failed to read gzip response body: %v", err)
	}
	return string(unzipped)
}
================================================
FILE: pkg/aop/rec.go
================================================
package aop
// Copyright 2014 Manu Martinez-Almeida. All rights reserved.
// Use of this source code is governed by a MIT style
// license that can be found in the LICENSE file.
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"log"
"net"
"net/http"
"net/http/httputil"
"os"
"runtime"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/errorx"
"github.com/toolkits/pkg/i18n"
)
// Byte constants used by the stack-trace helpers below.
var (
// dunno is returned when a source line or function name cannot be resolved.
dunno = []byte("???")
// centerDot is replaced by dot when function names are cleaned up.
centerDot = []byte("·")
dot = []byte(".")
slash = []byte("/")
)
// Recovery returns the panic-recovery middleware writing its diagnostics to
// gin.DefaultErrorWriter; see RecoveryWithWriter for the behaviour.
func Recovery() gin.HandlerFunc {
	handler := RecoveryWithWriter(gin.DefaultErrorWriter)
	return handler
}
// RecoveryWithWriter returns a middleware that recovers from panics raised by
// later handlers.
//
//   - A panic carrying an errorx.PageError is a deliberate "abort with message":
//     the translated message (language taken from the X-Language header) is
//     sent as plain text with e.Code, or, when e.Code is 200, as a JSON object
//     with "err" and "request_id" fields. Nothing is logged.
//   - A broken pipe / connection reset is logged without a stack trace and the
//     request is aborted without writing a status, since the peer is gone.
//   - Any other panic is logged with a stack trace (plus the request headers in
//     gin debug mode) and answered with 500.
//
// Diagnostics go to out; pass nil to disable logging. The Authorization header
// is masked in every log line that includes the request dump.
func RecoveryWithWriter(out io.Writer) gin.HandlerFunc {
	var logger *log.Logger
	if out != nil {
		logger = log.New(out, "\n\n\x1b[31m", log.LstdFlags)
	}
	return func(c *gin.Context) {
		defer func() {
			err := recover()
			if err == nil {
				return
			}

			// custom error
			if e, ok := err.(errorx.PageError); ok {
				msg := i18n.Sprintf(c.GetHeader("X-Language"), e.Message)
				if e.Code != 200 {
					c.String(e.Code, msg)
				} else {
					c.JSON(e.Code, gin.H{
						"err":        msg,
						"request_id": c.GetString("trace_id"),
					})
				}
				c.Abort()
				return
			}

			// Check for a broken connection, as it is not really a
			// condition that warrants a panic stack trace.
			var brokenPipe bool
			if ne, ok := err.(*net.OpError); ok {
				if se, ok := ne.Err.(*os.SyscallError); ok {
					lowered := strings.ToLower(se.Error())
					brokenPipe = strings.Contains(lowered, "broken pipe") ||
						strings.Contains(lowered, "connection reset by peer")
				}
			}

			if logger != nil {
				httpRequest, _ := httputil.DumpRequest(c.Request, false)
				headers := strings.Split(string(httpRequest), "\r\n")
				for idx, header := range headers {
					current := strings.Split(header, ":")
					if current[0] == "Authorization" {
						headers[idx] = current[0] + ": *"
					}
				}
				// Always log the masked dump, never the raw one, so credentials
				// do not end up in the log on the broken-pipe path either.
				maskedRequest := strings.Join(headers, "\r\n")

				// stack must be called from this closure: its skip count is
				// relative to this frame.
				switch {
				case brokenPipe:
					logger.Printf("%s\n%s%s", err, maskedRequest, reset)
				case gin.IsDebugging():
					logger.Printf("[Recovery] %s panic recovered:\n%s\n%s\n%s%s",
						timeFormat(time.Now()), maskedRequest, err, stack(3), reset)
				default:
					logger.Printf("[Recovery] %s panic recovered:\n%s\n%s%s",
						timeFormat(time.Now()), err, stack(3), reset)
				}
			}

			// If the connection is dead, we can't write a status to it.
			if brokenPipe {
				c.Error(err.(error)) // nolint: errcheck
				c.Abort()
			} else {
				c.AbortWithStatus(http.StatusInternalServerError)
			}
		}()
		c.Next()
	}
}
// stack renders the call stack of the current goroutine, starting skip frames
// above stack itself. Each frame produces a "file:line (0xpc)" line followed,
// when the source file can be read, by a tab-indented "function: source line".
func stack(skip int) []byte {
	var out bytes.Buffer

	// Source of the most recently loaded file, reused while consecutive frames
	// stay in the same file.
	var (
		cachedLines [][]byte
		cachedFile  string
	)

	for depth := skip; ; depth++ {
		pc, file, line, ok := runtime.Caller(depth)
		if !ok {
			break
		}

		// Print this much at least. If we can't find the source, it won't show.
		fmt.Fprintf(&out, "%s:%d (0x%x)\n", file, line, pc)

		if file != cachedFile {
			content, err := ioutil.ReadFile(file)
			if err != nil {
				continue
			}
			cachedLines = bytes.Split(content, []byte{'\n'})
			cachedFile = file
		}
		fmt.Fprintf(&out, "\t%s: %s\n", function(pc), source(cachedLines, line))
	}

	return out.Bytes()
}
// source returns line n (1-indexed, as reported in stack traces) of lines with
// surrounding whitespace removed, or dunno when n is out of range.
func source(lines [][]byte, n int) []byte {
	idx := n - 1 // lines is 0-indexed
	if idx >= 0 && idx < len(lines) {
		return bytes.TrimSpace(lines[idx])
	}
	return dunno
}
// function returns a short name for the function containing pc, or dunno when
// the runtime has no function information for it.
//
// The runtime name includes the full package path, which is redundant next to
// the file name, so everything up to the last slash and then up to the first
// dot (the package name) is dropped; any center dots are turned into ordinary
// dots. For example
//
//	runtime/debug.*T·ptrmethod
//
// becomes
//
//	*T.ptrmethod
func function(pc uintptr) []byte {
	fn := runtime.FuncForPC(pc)
	if fn == nil {
		return dunno
	}

	short := []byte(fn.Name())
	// The package path may itself contain dots (e.g. code.google.com/...), so
	// strip the path before looking for the package separator.
	if i := bytes.LastIndex(short, slash); i >= 0 {
		short = short[i+1:]
	}
	if i := bytes.Index(short, dot); i >= 0 {
		short = short[i+1:]
	}
	return bytes.ReplaceAll(short, centerDot, dot)
}
// timeFormat renders t as "YYYY/MM/DD - hh:mm:ss", the timestamp layout used
// in recovery log lines.
func timeFormat(t time.Time) string {
	const layout = "2006/01/02 - 15:04:05"
	return t.Format(layout)
}
================================================
FILE: pkg/cas/cas.go
================================================
package cas
import (
"bytes"
"context"
"crypto/tls"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/storage"
"github.com/google/uuid"
"github.com/toolkits/pkg/cas"
"github.com/toolkits/pkg/logger"
)
// Config is the CAS single-sign-on configuration consumed by New and
// SsoClient.Reload.
type Config struct {
// Enable turns the CAS integration on; when false New returns an empty client.
Enable bool
// RedirectURL is sent to the CAS server as the "service" parameter and used
// as the service URL when a ticket is validated.
RedirectURL string
// SsoAddr is the base address of the CAS server.
SsoAddr string
SsoLogoutAddr string
// LoginPath is appended to SsoAddr to build the login URL. When empty,
// "/login" is used if SsoAddr contains "p3", otherwise "/cas/login".
LoginPath string
DisplayName string
CoverAttributes bool
// SkipTlsVerify makes the client use an HTTP client that does not verify
// the server's TLS certificate.
SkipTlsVerify bool
// Attributes names the CAS attributes read from a validated ticket.
Attributes struct {
UserName string
Nickname string
Phone string
Email string
}
DefaultRoles []string
}
// SsoClient is the runtime state of the CAS integration. Its fields are copied
// from a Config by New and Reload; the embedded RWMutex guards them against
// concurrent Reload calls.
type SsoClient struct {
Enable bool
// Config is the configuration the client was last loaded from.
Config Config
SsoAddr string
SsoLogoutAddr string
// CallbackAddr is Config.RedirectURL.
CallbackAddr string
DisplayName string
Attributes struct {
UserName string
Nickname string
Phone string
Email string
}
DefaultRoles []string
CoverAttributes bool
// HTTPClient, when non-nil, is handed to the CAS REST client for ticket
// validation. It is only set when Config.SkipTlsVerify is true.
HTTPClient *http.Client
sync.RWMutex
}
// New builds a CAS client from cf. When cf.Enable is false an empty, disabled
// client is returned and the rest of cf is ignored; otherwise the client is
// populated exactly as Reload would populate it.
func New(cf Config) *SsoClient {
	cli := &SsoClient{}
	if cf.Enable {
		cli.Reload(cf)
	}
	return cli
}
// Reload replaces the client's settings with cf under the write lock.
//
// When cf.Enable is false only the Enable flag is cleared; all other fields
// keep their previous values. Otherwise every field is copied from cf and the
// HTTP client is rebuilt to match cf.SkipTlsVerify: an insecure client when
// verification is skipped, none (the CAS library default) when it is not. The
// reset matters when a reload turns SkipTlsVerify off again: without it the
// previously installed insecure client would stay in use.
func (s *SsoClient) Reload(cf Config) {
	s.Lock()
	defer s.Unlock()

	s.Enable = cf.Enable
	if !cf.Enable {
		return
	}

	s.Config = cf
	s.SsoAddr = cf.SsoAddr
	s.SsoLogoutAddr = cf.SsoLogoutAddr
	s.CallbackAddr = cf.RedirectURL
	s.DisplayName = cf.DisplayName
	s.Attributes.UserName = cf.Attributes.UserName
	s.Attributes.Nickname = cf.Attributes.Nickname
	s.Attributes.Phone = cf.Attributes.Phone
	s.Attributes.Email = cf.Attributes.Email
	s.DefaultRoles = cf.DefaultRoles
	s.CoverAttributes = cf.CoverAttributes

	if cf.SkipTlsVerify {
		transport := &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
		s.HTTPClient = &http.Client{Transport: transport}
	} else {
		s.HTTPClient = nil
	}
}
// GetDisplayName returns the configured display name, or "" while the client
// is disabled. It takes the read lock.
func (s *SsoClient) GetDisplayName() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.DisplayName
	}
	return ""
}
// GetSsoLogoutAddr returns the configured logout address, or "" while the
// client is disabled. It takes the read lock.
func (s *SsoClient) GetSsoLogoutAddr() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.SsoLogoutAddr
	}
	return ""
}
// Authorize starts a CAS login: it generates a random state, stores redirect
// in redis under that state for 300 seconds, and returns the CAS login URL
// together with the state.
func (s *SsoClient) Authorize(redis storage.Redis, redirect string) (string, string, error) {
	const stateTTL = 300 * time.Second

	state := uuid.New().String()
	if err := redis.Set(context.Background(), wrapStateKey(state), redirect, stateTTL).Err(); err != nil {
		return "", "", err
	}
	return s.genRedirectURL(state), state, nil
}
// fetchRedirect reads the redirect target stored for state from redis.
func fetchRedirect(ctx context.Context, state string, redis storage.Redis) (string, error) {
	key := wrapStateKey(state)
	return redis.Get(ctx, key).Result()
}
// deleteRedirect removes the redirect target stored for state from redis.
func deleteRedirect(ctx context.Context, state string, redis storage.Redis) error {
	key := wrapStateKey(state)
	return redis.Del(ctx, key).Err()
}
// wrapStateKey namespaces a login state so it can be used as a redis key.
func wrapStateKey(key string) string {
	const prefix = "n9e_cas_"
	return prefix + key
}
// genRedirectURL builds the CAS login URL: the configured SSO address with the
// login path appended and the callback address passed as the "service" query
// parameter. The state argument is not part of the URL.
//
// When Config.LoginPath is empty the path defaults to "/login" if the SSO
// address contains "p3" and to "/cas/login" otherwise. An unparsable SSO
// address is logged and yields "".
func (s *SsoClient) genRedirectURL(state string) string {
	var buf bytes.Buffer

	s.RLock()
	defer s.RUnlock()

	loginURL, err := url.Parse(s.Config.SsoAddr)
	if err != nil {
		logger.Error(err)
		return buf.String()
	}

	switch {
	case s.Config.LoginPath != "":
		loginURL.Path += s.Config.LoginPath
	case strings.Contains(s.Config.SsoAddr, "p3"):
		loginURL.Path += "/login"
	default:
		loginURL.Path += "/cas/login"
	}
	buf.WriteString(loginURL.String())

	// Continue an existing query string instead of starting a second one.
	separator := byte('?')
	if strings.Contains(s.SsoAddr, "?") {
		separator = '&'
	}
	buf.WriteByte(separator)

	params := url.Values{"service": {s.CallbackAddr}}
	buf.WriteString(params.Encode())

	return buf.String()
}
// CallbackOutput is the result of a CAS callback. ValidateServiceTicket fills
// Username, Nickname, Email, Phone and Redirect; the remaining fields are left
// for the caller.
// NOTE(review): Phone and Email carry yaml tags while the other fields carry
// json tags — confirm whether that is intentional.
type CallbackOutput struct {
Redirect string `json:"redirect"`
Msg string `json:"msg"`
AccessToken string `json:"accessToken"`
Username string `json:"username"`
Nickname string `json:"Nickname"`
Phone string `yaml:"Phone"`
Email string `yaml:"Email"`
}
// ValidateServiceTicket validates a CAS service ticket against the configured
// CAS server and returns the user attributes it carries.
//
// On success ret holds the user name and the nickname/email/phone attributes
// (looked up under the configured attribute names) plus the redirect target
// that Authorize stored for state. The stored redirect is deleted afterwards.
//
// A failure to parse the configured URLs or to validate the ticket is logged
// and returned with a nil ret. A failure to read the redirect is only logged.
// NOTE(review): a failure to delete the redirect is logged and also returned
// as err together with a populated ret — confirm callers expect that.
func (s *SsoClient) ValidateServiceTicket(ctx context.Context, ticket, state string, redis storage.Redis) (ret *CallbackOutput, err error) {
	s.RLock()
	defer s.RUnlock()

	casUrl, err := url.Parse(s.Config.SsoAddr)
	if err != nil {
		logger.Error(err)
		return nil, err
	}
	serviceUrl, err := url.Parse(s.CallbackAddr)
	if err != nil {
		logger.Error(err)
		return nil, err
	}

	resOptions := &cas.RestOptions{
		CasURL:     casUrl,
		ServiceURL: serviceUrl,
	}
	// Only override the library's default HTTP client when one was configured.
	if s.HTTPClient != nil {
		resOptions.Client = s.HTTPClient
	}
	resCli := cas.NewRestClient(resOptions)

	authRet, err := resCli.ValidateServiceTicket(cas.ServiceTicket(ticket))
	if err != nil {
		logger.Errorf("Ticket Validating Failed: %s", err)
		return nil, err
	}

	ret = &CallbackOutput{}
	ret.Username = authRet.User
	ret.Nickname = authRet.Attributes.Get(s.Attributes.Nickname)
	ret.Email = authRet.Attributes.Get(s.Attributes.Email)
	ret.Phone = authRet.Attributes.Get(s.Attributes.Phone)

	ret.Redirect, err = fetchRedirect(ctx, state, redis)
	if err != nil {
		// Arguments follow the order of the verbs: error first, then state.
		logger.Debugf("get redirect err:%s state:%s", err, state)
	}
	err = deleteRedirect(ctx, state, redis)
	if err != nil {
		logger.Debugf("delete redirect err:%s state:%s", err, state)
	}
	return ret, err
}
================================================
FILE: pkg/cfg/cfg.go
================================================
package cfg
import (
"bytes"
"fmt"
"path"
"strings"
"github.com/koding/multiconfig"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/runner"
)
func LoadConfigByDir(configDir string, configPtr interface{}) error {
var (
tBuf []byte
)
loaders := []multiconfig.Loader{
&multiconfig.TagLoader{},
&multiconfig.EnvironmentLoader{},
}
if !file.IsExist(configDir) {
return fmt.Errorf("config directory: %s not exist. working directory: %s", configDir, runner.Cwd)
}
files, err := file.FilesUnder(configDir)
if err != nil {
return fmt.Errorf("failed to list files under: %s : %v", configDir, err)
}
found := false
s := NewFileScanner()
for _, fpath := range files {
switch {
case strings.HasSuffix(fpath, ".toml"):
found = true
s.Read(path.Join(configDir, fpath))
tBuf = append(tBuf, s.Data()...)
tBuf = append(tBuf, []byte("\n")...)
case strings.HasSuffix(fpath, ".json"):
found = true
loaders = append(loaders, &multiconfig.JSONLoader{Path: path.Join(configDir, fpath)})
case strings.HasSuffix(fpath, ".yaml") || strings.HasSuffix(fpath, ".yml"):
found = true
loaders = append(loaders, &multiconfig.YAMLLoader{Path: path.Join(configDir, fpath)})
}
if s.Err() != nil {
return s.Err()
}
}
if !found {
return fmt.Errorf("fail to found config file, config dir path: %v", configDir)
}
if len(tBuf) != 0 {
loaders = append(loaders, &multiconfig.TOMLLoader{Reader: bytes.NewReader(tBuf)})
}
m := multiconfig.DefaultLoader{
Loader: multiconfig.MultiLoader(loaders...),
Validator: multiconfig.MultiValidator(&multiconfig.RequiredValidator{}),
}
return m.Load(configPtr)
}
================================================
FILE: pkg/cfg/scan.go
================================================
package cfg
import (
"os"
)
// scanner reads whole files while remembering the first error it hit, so a
// caller can issue several reads and check for failure afterwards.
type scanner struct {
	data []byte // content of the most recent read
	err  error  // first read error; once set, further reads are skipped
}

// NewFileScanner returns a scanner with no data and no error.
func NewFileScanner() *scanner {
	return new(scanner)
}

// Err returns the first error encountered by Read, if any.
func (s *scanner) Err() error {
	return s.err
}

// Data returns the content produced by the most recent Read.
func (s *scanner) Data() []byte {
	return s.data
}

// Read loads file into the scanner. It does nothing when an earlier Read has
// already failed.
func (s *scanner) Read(file string) {
	if s.err != nil {
		return
	}
	s.data, s.err = os.ReadFile(file)
}
================================================
FILE: pkg/choice/choice.go
================================================
// Package choice provides basic functions for working with
// plugin options that must be one of several values.
package choice
import (
"fmt"
"strings"
)
// Contains reports whether choice is equal to one of the entries in choices.
func Contains(choice string, choices []string) bool {
	for i := range choices {
		if choices[i] == choice {
			return true
		}
	}
	return false
}
// ContainsPrefix reports whether choice starts with at least one of the
// entries in choices.
func ContainsPrefix(choice string, choices []string) bool {
	for i := range choices {
		if strings.HasPrefix(choice, choices[i]) {
			return true
		}
	}
	return false
}
// Check returns nil when choice is one of available and an
// "unknown choice" error otherwise.
func Check(choice string, available []string) error {
	if Contains(choice, available) {
		return nil
	}
	return fmt.Errorf("unknown choice %s", choice)
}
// CheckSlice verifies that every entry of choices is one of available. It
// returns the error for the first entry that is not, or nil when all are.
func CheckSlice(choices, available []string) error {
	for i := range choices {
		if err := Check(choices[i], available); err != nil {
			return err
		}
	}
	return nil
}
================================================
FILE: pkg/cmdx/cmd_notwindows.go
================================================
//go:build !windows
// +build !windows
package cmdx
import (
"os/exec"
"syscall"
"time"
)
// CmdWait waits for an already started cmd to exit, for at most timeout.
//
// It returns (cmd.Wait() error, false) when the command finishes in time. On
// timeout it sends SIGKILL to the command's whole process group and returns
// (kill error, true) without waiting for the process to be reaped.
//
// IMPORTANT: the command must have been started with
// cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} (see CmdStart),
// otherwise the negative pid does not address its process group.
func CmdWait(cmd *exec.Cmd, timeout time.Duration) (error, bool) {
	// Buffered so the waiter can always deliver its result and exit, even when
	// nobody is receiving any more after a timeout.
	done := make(chan error, 1)
	go func() {
		done <- cmd.Wait()
	}()

	// An explicit timer is released as soon as the command finishes instead of
	// lingering until the full timeout has elapsed.
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-timer.C:
		return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL), true
	case err := <-done:
		return err, false
	}
}
// CmdStart starts cmd in its own process group so that CmdWait can kill the
// command together with any children it spawns.
//
// Attributes the caller already placed in cmd.SysProcAttr are kept; only
// Setpgid is forced to true.
func CmdStart(cmd *exec.Cmd) error {
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &syscall.SysProcAttr{}
	}
	cmd.SysProcAttr.Setpgid = true
	return cmd.Start()
}
================================================
FILE: pkg/cmdx/cmd_windows.go
================================================
//go:build windows
// +build windows
package cmdx
import (
"os/exec"
"syscall"
"time"
)
// CmdWait waits for an already started cmd to exit, for at most timeout.
//
// It returns (cmd.Wait() error, false) when the command finishes in time. On
// timeout it signals the process with SIGKILL and returns (signal error, true)
// without waiting for the process to be reaped.
func CmdWait(cmd *exec.Cmd, timeout time.Duration) (error, bool) {
	// Buffered so the waiter can always deliver its result and exit, even when
	// nobody is receiving any more after a timeout.
	done := make(chan error, 1)
	go func() {
		done <- cmd.Wait()
	}()

	// An explicit timer is released as soon as the command finishes instead of
	// lingering until the full timeout has elapsed.
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-timer.C:
		return cmd.Process.Signal(syscall.SIGKILL), true
	case err := <-done:
		return err, false
	}
}
// CmdStart starts cmd. On Windows no process attributes are adjusted.
func CmdStart(cmd *exec.Cmd) error {
	err := cmd.Start()
	return err
}
================================================
FILE: pkg/cmdx/cmdx.go
================================================
package cmdx
import (
"os/exec"
"time"
)
// RunTimeout starts cmd and waits for it for at most timeout.
//
// It returns (start error, false) when the command cannot be started;
// otherwise it returns whatever CmdWait reports, where the boolean tells
// whether the timeout was hit.
func RunTimeout(cmd *exec.Cmd, timeout time.Duration) (error, bool) {
	if err := CmdStart(cmd); err != nil {
		return err, false
	}
	return CmdWait(cmd, timeout)
}
================================================
FILE: pkg/ctx/ctx.go
================================================
package ctx
import (
"context"
"github.com/ccfos/nightingale/v6/conf"
"gorm.io/gorm"
)
// Context bundles the dependencies passed around the application: the
// database handle, the center API settings, a standard context and a flag
// telling whether this process is the center.
type Context struct {
	DB        *gorm.DB
	CenterApi conf.CenterApi
	Ctx       context.Context
	IsCenter  bool
}

// NewContext builds a Context. Only the first element of centerApis is used;
// when none is given CenterApi keeps its zero value.
func NewContext(ctx context.Context, db *gorm.DB, isCenter bool, centerApis ...conf.CenterApi) *Context {
	c := &Context{
		Ctx:      ctx,
		DB:       db,
		IsCenter: isCenter,
	}
	if len(centerApis) > 0 {
		c.CenterApi = centerApis[0]
	}
	return c
}

// SetDB replaces the database handle.
func (c *Context) SetDB(db *gorm.DB) {
	c.DB = db
}

// GetContext returns the standard context.
func (c *Context) GetContext() context.Context {
	return c.Ctx
}

// GetDB returns the database handle.
func (c *Context) GetDB() *gorm.DB {
	return c.DB
}

// WithContext returns a shallow copy with a different standard context.
// Useful for carrying per-request values (e.g. traceId) without mutating the
// global instance.
func (c *Context) WithContext(stdCtx context.Context) *Context {
	derived := *c
	derived.Ctx = stdCtx
	return &derived
}
================================================
FILE: pkg/dingtalk/dingtalk.go
================================================
package dingtalk
import (
"bytes"
"context"
"fmt"
"net/url"
"strings"
"sync"
"time"
dingtalkUserClient "github.com/ccfos/nightingale/v6/pkg/dingtalk/user"
"github.com/ccfos/nightingale/v6/storage"
openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client"
"github.com/alibabacloud-go/dingtalk/contact_1_0"
dingtalkoauth2 "github.com/alibabacloud-go/dingtalk/oauth2_1_0"
util "github.com/alibabacloud-go/tea-utils/v2/service"
"github.com/alibabacloud-go/tea/tea"
"github.com/google/uuid"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// defaultAuthURL is the DingTalk OAuth2 authorization endpoint used by
// AuthCodeURL when Config.AuthURL is empty.
const defaultAuthURL = "https://login.dingtalk.com/oauth2/auth"
// SsoTypeName is the identifier of the DingTalk SSO type.
const SsoTypeName = "dingtalk"
// SsoClient is the runtime state of the DingTalk SSO integration. Enable and
// DingTalkConfig are replaced together by Reload under the embedded RWMutex.
type SsoClient struct {
Enable bool
// DingTalkConfig is nil until Reload has been called.
DingTalkConfig *Config `json:"-"`
Ctx context.Context
sync.RWMutex
}
// Config is the DingTalk SSO configuration.
type Config struct {
Enable bool `json:"enable"`
// AuthURL overrides the default DingTalk authorization endpoint when set.
AuthURL string `json:"auth_url"`
DisplayName string `json:"display_name"`
// CorpId specifies the organization the user has to select; it is only
// meaningful when the scope contains corpid.
CorpId string `json:"corpId"`
ClientID string `json:"client_id"`
ClientSecret string `json:"client_secret"`
// RedirectURL is sent as redirect_uri; AuthCodeURL fails when it is empty.
RedirectURL string `json:"redirect_url"`
// UsernameField selects which profile value becomes the user name:
// "name", "email", or (any other value) the phone number.
UsernameField string `json:"username_field"`
// Endpoint is the API endpoint for the OAuth and contact clients;
// DingTalkAPI is the endpoint for the user-detail client.
Endpoint string `json:"endpoint"`
DingTalkAPI string `json:"dingtalk_api"`
UseMemberInfo bool `json:"use_member_info"` // whether to query user details; requires the qyapi_get_member permission
Proxy string `json:"proxy"`
SkipTlsVerify bool `json:"skip_tls_verify"`
CoverAttributes bool `json:"cover_attributes"`
DefaultRoles []string `json:"default_roles"`
}
// CallbackOutput is the result of a DingTalk OAuth callback. Callback fills
// Redirect, Username, Nickname, Email and Phone.
// NOTE(review): Phone and Email carry yaml tags while the other fields carry
// json tags — confirm whether that is intentional.
type CallbackOutput struct {
Redirect string `json:"redirect"`
Msg string `json:"msg"`
AccessToken string `json:"accessToken"`
Username string `json:"Username"`
Nickname string `json:"Nickname"`
Phone string `yaml:"Phone"`
Email string `yaml:"Email"`
}
// wrapStateKey namespaces an OAuth state so it can be used as a redis key.
func wrapStateKey(key string) string {
	const prefix = "n9e_dingtalk_oauth_"
	return prefix + key
}
// CreateClient builds the DingTalk OAuth2 client used to obtain tokens.
// The client talks HTTPS in region "central" unless Endpoint overrides the
// protocol, and honours the configured proxy.
func (c *Config) CreateClient() (*dingtalkoauth2.Client, error) {
	config := &openapi.Config{
		Protocol: tea.String("https"),
		RegionId: tea.String("central"),
	}
	if err := c.setProxy(config); err != nil {
		return nil, err
	}
	if err := c.setEndpoint(config, c.Endpoint); err != nil {
		return nil, err
	}
	return dingtalkoauth2.NewClient(config)
}
// ContactClient builds the DingTalk contact client used to read the profile
// of the authorizing user. It is configured like CreateClient: HTTPS, region
// "central", the configured proxy and the Endpoint override.
func (c *Config) ContactClient() (*contact_1_0.Client, error) {
	config := &openapi.Config{
		Protocol: tea.String("https"), // request protocol
		RegionId: tea.String("central"),
	}
	if err := c.setProxy(config); err != nil {
		return nil, err
	}
	if err := c.setEndpoint(config, c.Endpoint); err != nil {
		return nil, err
	}
	return contact_1_0.NewClient(config)
}
// UserClient builds the client used to query user details. Unlike the other
// clients its endpoint override comes from DingTalkAPI rather than Endpoint.
func (c *Config) UserClient() (*dingtalkUserClient.Client, error) {
	config := &openapi.Config{
		Protocol: tea.String("https"), // request protocol
		RegionId: tea.String("central"),
	}
	if err := c.setProxy(config); err != nil {
		return nil, err
	}
	if err := c.setEndpoint(config, c.DingTalkAPI); err != nil {
		return nil, err
	}
	return dingtalkUserClient.NewClient(config)
}
// setEndpoint applies an endpoint override to config. An empty endpoint leaves
// config untouched. Otherwise the protocol follows the endpoint's scheme
// (http or https, defaulting to https) and the endpoint is stored with the
// first "http://" or "https://" occurrence removed.
func (c *Config) setEndpoint(config *openapi.Config, endpoint string) error {
	if endpoint == "" {
		return nil
	}
	parsed, err := url.Parse(endpoint)
	if err != nil {
		return err
	}

	protocol, host := "https", endpoint
	switch parsed.Scheme {
	case "http":
		protocol = "http"
		host = strings.Replace(endpoint, "http://", "", 1)
	case "https":
		host = strings.Replace(endpoint, "https://", "", 1)
	}

	config.SetProtocol(protocol)
	config.Endpoint = tea.String(host)
	return nil
}
// setProxy installs the configured proxy on config. The proxy URL's scheme
// decides the slot: "https" sets the HTTPS proxy, anything else (including an
// empty Proxy) sets the HTTP proxy.
func (c *Config) setProxy(config *openapi.Config) error {
	parsed, err := url.Parse(c.Proxy)
	if err != nil {
		return err
	}
	if parsed.Scheme == "https" {
		config.SetHttpsProxy(c.Proxy)
	} else {
		config.SetHttpProxy(c.Proxy)
	}
	return nil
}
// New builds a DingTalk SSO client. A disabled configuration yields an empty
// client (with a nil DingTalkConfig); otherwise cf is loaded through Reload.
func New(cf Config) *SsoClient {
	s := &SsoClient{}
	if cf.Enable {
		s.Reload(cf)
	}
	return s
}
// AuthCodeURL builds the DingTalk authorization URL for the given state.
//
// The base is Config.AuthURL, or the default DingTalk endpoint when that is
// empty. The query always carries response_type=code, client_id,
// redirect_uri, prompt=consent and state. The scope is "openid", or
// "openid corpid" together with corpId when a CorpId is configured.
// An empty RedirectURL is rejected with an error.
func (s *SsoClient) AuthCodeURL(state string) (string, error) {
	cfg := s.DingTalkConfig

	authURL := defaultAuthURL
	if cfg.AuthURL != "" {
		authURL = cfg.AuthURL
	}

	if cfg.RedirectURL == "" {
		return "", errors.New("DingTalk OAuth RedirectURL is empty")
	}

	params := url.Values{}
	params.Set("response_type", "code")
	params.Set("client_id", cfg.ClientID)
	params.Set("redirect_uri", cfg.RedirectURL)
	params.Set("prompt", "consent")
	params.Set("state", state)

	// Only two scopes are supported:
	//   openid - grants the user's userid / openid;
	//   corpid - additionally grants the id of the organization the user
	//            picked during login (space separated, URL-encoded).
	// Passing corpId requires the scope to be "openid corpid".
	if cfg.CorpId == "" {
		params.Set("scope", "openid")
	} else {
		params.Set("scope", "openid corpid")
		params.Set("corpId", cfg.CorpId)
	}

	var buf bytes.Buffer
	buf.WriteString(authURL)
	// Continue an existing query string instead of starting a second one.
	if strings.Contains(authURL, "?") {
		buf.WriteByte('&')
	} else {
		buf.WriteByte('?')
	}
	buf.WriteString(params.Encode())
	return buf.String(), nil
}
// GetUserToken exchanges the authorization code received on the OAuth
// callback for the user's access token.
//
// It returns an error when the OAuth client cannot be created, when the
// token request fails, or when the response carries no body.
func (s *SsoClient) GetUserToken(code string) (string, error) {
	authClient, err := s.DingTalkConfig.CreateClient()
	if err != nil {
		// Without this check a failed client creation would be dereferenced
		// as a nil client below.
		return "", errors.New("dingTalk sso create client error: " + err.Error())
	}

	getUserTokenRequest := &dingtalkoauth2.GetUserTokenRequest{
		ClientId:     tea.String(s.DingTalkConfig.ClientID),
		ClientSecret: tea.String(s.DingTalkConfig.ClientSecret),
		Code:         tea.String(code),
		RefreshToken: tea.String(code),
		GrantType:    tea.String("authorization_code"),
	}
	resp, err := authClient.GetUserToken(getUserTokenRequest)
	if err != nil {
		return "", errors.New("dingTalk sso get token error: " + err.Error())
	}
	if resp == nil || resp.Body == nil {
		return "", errors.New("dingTalk sso get token error: empty response body")
	}
	return tea.StringValue(resp.Body.AccessToken), nil
}
// GetAccessToken requests an application access token using the configured
// client id and secret as app key and app secret.
//
// It returns an error when the OAuth client cannot be created, when the
// token request fails, or when the response carries no body.
func (s *SsoClient) GetAccessToken() (string, error) {
	authClient, err := s.DingTalkConfig.CreateClient()
	if err != nil {
		// Without this check a failed client creation would be dereferenced
		// as a nil client below.
		return "", errors.New("dingTalk sso create client error: " + err.Error())
	}

	getAccessTokenRequest := &dingtalkoauth2.GetAccessTokenRequest{
		AppKey:    tea.String(s.DingTalkConfig.ClientID),
		AppSecret: tea.String(s.DingTalkConfig.ClientSecret),
	}
	resp, err := authClient.GetAccessToken(getAccessTokenRequest)
	if err != nil {
		return "", errors.New("dingTalk sso get token error: " + err.Error())
	}
	if resp == nil || resp.Body == nil {
		return "", errors.New("dingTalk sso get token error: empty response body")
	}
	return tea.StringValue(resp.Body.AccessToken), nil
}
// Reload replaces the client's configuration and enable flag under the write
// lock. The client keeps a pointer to its own copy of dingTalkConfig.
func (s *SsoClient) Reload(dingTalkConfig Config) {
	s.Lock()
	defer s.Unlock()

	s.DingTalkConfig, s.Enable = &dingTalkConfig, dingTalkConfig.Enable
}
// GetDisplayName returns the configured display name, or "" while the client
// is disabled. It takes the read lock.
func (s *SsoClient) GetDisplayName() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.DingTalkConfig.DisplayName
	}
	return ""
}
// Authorize starts a DingTalk login: it generates a random state, stores
// redirect in redis under that state for 300 seconds, and returns the
// authorization URL built for the state (under the read lock).
func (s *SsoClient) Authorize(redis storage.Redis, redirect string) (string, error) {
	const stateTTL = 300 * time.Second

	state := uuid.New().String()
	if err := redis.Set(context.Background(), wrapStateKey(state), redirect, stateTTL).Err(); err != nil {
		return "", err
	}

	s.RLock()
	defer s.RUnlock()
	return s.AuthCodeURL(state)
}
// GetUserInfo resolves unionid to a DingTalk user id and returns that user's
// details. accessToken is sent as the query token on both requests.
//
// An error is returned when the user client cannot be created, when either
// request fails, or when a response lacks the expected body/result.
func (s *SsoClient) GetUserInfo(accessToken string, unionid string) (*dingtalkUserClient.GetUserResult, error) {
	userClient, err := s.DingTalkConfig.UserClient()
	if err != nil {
		return nil, fmt.Errorf("CreateClient error: %s", err)
	}

	query := &dingtalkUserClient.GetUserQuery{AccessToken: accessToken}

	// Step 1: unionid -> userid.
	unionReq := &dingtalkUserClient.GetUnionIdRequest{
		UnionID: unionid,
	}
	uid, err := userClient.GetByUnionId(unionReq, query)
	if err != nil {
		return nil, err
	}
	if uid.Body == nil {
		return nil, errors.Errorf("dingTalk get userid fail status code : %d", tea.Int32Value(uid.StatusCode))
	}
	if uid.Body.Result == nil {
		return nil, errors.Errorf("dingTalk get userid body: %s", uid.Body.String())
	}

	// Step 2: userid -> user details.
	req := &dingtalkUserClient.GetUserRequest{
		UserID: tea.StringValue(uid.Body.Result.UserId),
	}
	userInfo, err := userClient.GetUser(req, query)
	if err != nil {
		// Previously ignored: a failed request would have been dereferenced
		// as a nil response below.
		return nil, err
	}
	if userInfo == nil {
		return nil, errors.New("dingTalk get userinfo returned an empty response")
	}
	if userInfo.Body == nil {
		return nil, errors.Errorf("dingTalk get userinfo status code: %d", tea.Int32Value(userInfo.StatusCode))
	}

	logger.Debugf("dingTalk get userinfo RequestID %s UserID %s ", tea.StringValue(userInfo.Body.RequestID), req.UserID)
	return userInfo.Body.Result, nil
}
// Callback completes a DingTalk OAuth login.
//
// Steps:
//  1. exchange code for the user's access token;
//  2. read the authorizing user's profile ("me") from the contact API;
//  3. look up and delete the redirect target stored for state (skipped when
//     redis is nil; failures are only logged and the redirect defaults to "/");
//  4. when UseMemberInfo is enabled, replace the profile values with the
//     member details fetched with an application access token (taken from the
//     cache when available, otherwise requested and cached);
//  5. choose the user name according to UsernameField: "name", "email", or
//     the phone number for any other value. An empty chosen value is an error.
func (s *SsoClient) Callback(redis storage.Redis, ctx context.Context, code, state string) (*CallbackOutput, error) {
	userAccessToken, err := s.GetUserToken(code)
	if err != nil {
		return nil, fmt.Errorf("dingTalk GetUserToken error: %s", err)
	}

	// Fetch the profile of the user who just authorized.
	contactClient, err := s.DingTalkConfig.ContactClient()
	if err != nil {
		return nil, fmt.Errorf("dingTalk New ContactClient error: %s", err)
	}
	getUserHeaders := &contact_1_0.GetUserHeaders{}
	getUserHeaders.XAcsDingtalkAccessToken = tea.String(userAccessToken)
	me, err := contactClient.GetUserWithOptions(tea.String("me"), getUserHeaders, &util.RuntimeOptions{})
	if err != nil {
		return nil, fmt.Errorf("dingTalk GetUser me error: %s", err)
	}

	redirect := ""
	if redis != nil {
		redirect, err = fetchRedirect(redis, ctx, state)
		if err != nil {
			logger.Errorf("get redirect err:%v code:%s state:%s", err, code, state)
		}
		// The delete sits under the same nil guard as the fetch: without a
		// redis client there is nothing to delete.
		err = deleteRedirect(redis, ctx, state)
		if err != nil {
			logger.Errorf("delete redirect err:%v code:%s state:%s", err, code, state)
		}
	}
	if redirect == "" {
		redirect = "/"
	}

	if me == nil {
		return nil, errors.New("dingTalk GetUser failed, empty response")
	}
	if me.Body == nil {
		// StatusCode is a pointer; dereference it so the code, not an
		// address, is reported.
		return nil, fmt.Errorf("dingTalk GetUser failed, status code:%d", tea.Int32Value(me.StatusCode))
	}
	logger.Debugf("dingTalk get contact %+v", me)

	username := tea.StringValue(me.Body.Nick)
	nickname := tea.StringValue(me.Body.Nick)
	phone := tea.StringValue(me.Body.Mobile)
	email := tea.StringValue(me.Body.Email)

	if s.DingTalkConfig.UseMemberInfo {
		unionID := tea.StringValue(me.Body.UnionId)

		accessToken, err := dingTalkAccessTokenCacheGet(redis, ctx)
		if err != nil {
			logger.Warningf("dingTalk get accessToken cache fail %s", err.Error())
		}
		if accessToken == "" {
			accessToken, err = s.GetAccessToken()
			if err != nil {
				return nil, err
			}
			err = dingTalkAccessTokenCacheSet(redis, ctx, accessToken)
			if err != nil {
				logger.Warningf("dingTalk set accessToken cache fail %s", err.Error())
			}
		}

		user, err := s.GetUserInfo(accessToken, unionID)
		if err != nil {
			return nil, err
		}
		if user == nil {
			return nil, fmt.Errorf("dingTalk GetUserInfo unionid %s username %s is nil", unionID, username)
		}
		// The application access token is a credential and is deliberately
		// kept out of the log.
		logger.Debugf("dingTalk get user info unionID %s result %+v", unionID, user)

		username = tea.StringValue(user.Name)
		nickname = tea.StringValue(user.Name)
		phone = tea.StringValue(user.Mobile)
		email = tea.StringValue(user.Email)
	}

	var callbackOutput CallbackOutput
	callbackOutput.Redirect = redirect

	switch s.DingTalkConfig.UsernameField {
	case "name":
		if username == "" {
			return nil, errors.New("dingTalk user name is empty")
		}
		callbackOutput.Username = username
	case "email":
		if email == "" {
			return nil, errors.New("dingTalk user email is empty")
		}
		callbackOutput.Username = email
	default:
		if phone == "" {
			return nil, errors.New("dingTalk user mobile is empty")
		}
		callbackOutput.Username = phone
	}

	callbackOutput.Nickname = nickname
	callbackOutput.Email = email
	callbackOutput.Phone = phone
	return &callbackOutput, nil
}
// dingTalkAccessTokenCacheSet stores accessToken in redis under the DingTalk
// access-token key.
//
// Per the original author's note: the token is valid for 7200 seconds
// (2 hours); fetching it again while valid returns the same token and renews
// it, and the token endpoint is rate limited, so it must not be called too
// often. The cache entry therefore lives 90 minutes, half an hour less than
// the official lifetime.
func dingTalkAccessTokenCacheSet(redis storage.Redis, ctx context.Context, accessToken string) error {
	const cacheTTL = 5400 * time.Second
	cacheKey := wrapStateKey("dingtalk_access_token")
	return redis.Set(ctx, cacheKey, accessToken, cacheTTL).Err()
}
// dingTalkAccessTokenCacheGet reads the cached DingTalk access token from
// redis and returns the value together with any error from the lookup.
func dingTalkAccessTokenCacheGet(redis storage.Redis, ctx context.Context) (string, error) {
	cacheKey := wrapStateKey("dingtalk_access_token")
	return redis.Get(ctx, cacheKey).Result()
}
// fetchRedirect returns the value stored in redis under the wrapped state key.
func fetchRedirect(redis storage.Redis, ctx context.Context, state string) (string, error) {
	stateKey := wrapStateKey(state)
	return redis.Get(ctx, stateKey).Result()
}
// deleteRedirect removes the entry stored in redis under the wrapped state key.
func deleteRedirect(redis storage.Redis, ctx context.Context, state string) error {
	stateKey := wrapStateKey(state)
	return redis.Del(ctx, stateKey).Err()
}
================================================
FILE: pkg/dingtalk/user/client.go
================================================
package user
import (
openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client"
gatewayclient "github.com/alibabacloud-go/gateway-dingtalk/client"
openapiutil "github.com/alibabacloud-go/openapi-util/service"
util "github.com/alibabacloud-go/tea-utils/v2/service"
"github.com/alibabacloud-go/tea/tea"
)
// GetUserQuery carries the query-string parameters shared by the user API
// calls in this package. AccessToken is sent as the "access_token" query
// parameter by GetUser and GetByUnionId.
type GetUserQuery struct {
AccessToken string `json:"access_token" xml:"access_token"`
}
// Client is a DingTalk user API client. It embeds openapi.Client, so all of
// that client's fields and methods are available on it.
type Client struct {
openapi.Client
}
// NewClient allocates a Client and initializes it with config.
//
// The client pointer is returned even when initialization fails, alongside
// the initialization error.
func NewClient(config *openapi.Config) (*Client, error) {
	c := &Client{}
	if err := c.Init(config); err != nil {
		return c, err
	}
	return c, nil
}
// Init initializes the embedded openapi.Client with config, installs a
// DingTalk gateway client as its Spi, clears the endpoint rule, and falls
// back to the "oapi.dingtalk.com" endpoint when none is configured.
func (client *Client) Init(config *openapi.Config) (err error) {
	if err = client.Client.Init(config); err != nil {
		return err
	}

	spi, err := gatewayclient.NewClient()
	if err != nil {
		return err
	}
	client.Spi = spi
	client.EndpointRule = tea.String("")

	endpointMissing := tea.BoolValue(util.Empty(client.Endpoint))
	if endpointMissing {
		client.Endpoint = tea.String("oapi.dingtalk.com")
	}
	return nil
}
// GetUser fetches the detailed information of a user.
//
// request supplies the "userid" and "language" body fields; query supplies
// the "access_token" query parameter. The call is a POST to
// /topapi/v2/user/get.
//
// A non-nil *GetUserResponse is always returned, even when err is non-nil.
func (client *Client) GetUser(request *GetUserRequest, query *GetUserQuery) (result *GetUserResponse, err error) {
	result = &GetUserResponse{}

	queryParams := map[string]*string{}
	if !tea.BoolValue(util.IsUnset(query.AccessToken)) {
		queryParams["access_token"] = tea.String(query.AccessToken)
	}

	payload := make(map[string]interface{})
	if !tea.BoolValue(util.IsUnset(request.UserID)) {
		payload["userid"] = request.UserID
	}
	if !tea.BoolValue(util.IsUnset(request.Language)) {
		payload["language"] = request.Language
	}

	apiRequest := &openapi.OpenApiRequest{
		Query: queryParams,
		Body:  openapiutil.ParseToMap(payload),
	}
	apiParams := &openapi.Params{
		Action:      tea.String("GetUser"),
		Version:     tea.String("contact_1.0"),
		Protocol:    tea.String("HTTPS"),
		Pathname:    tea.String("/topapi/v2/user/get"),
		Method:      tea.String("POST"),
		AuthType:    tea.String("AK"),
		Style:       tea.String("ROA"),
		ReqBodyType: tea.String("none"),
		BodyType:    tea.String("json"),
	}

	raw, err := client.Execute(apiParams, apiRequest, &util.RuntimeOptions{})
	if err != nil {
		return result, err
	}
	err = tea.Convert(raw, &result)
	return result, err
}
// GetUserRequest is the input of Client.GetUser. UserID is sent as the
// "userid" body field and Language as the "language" body field.
type GetUserRequest struct {
UserID string `json:"user_id" xml:"user_id"`
Language string `json:"language" xml:"language"`
}
// GetUserResult is the "result" object of a GetUser response. Every field is
// a pointer and is omitted from JSON/XML output when nil.
type GetUserResult struct {
	AvatarUrl *string `json:"avatarUrl,omitempty" xml:"avatarUrl,omitempty"`
	Email     *string `json:"email,omitempty" xml:"email,omitempty"`
	Mobile    *string `json:"mobile,omitempty" xml:"mobile,omitempty"`
	Name      *string `json:"name,omitempty" xml:"name,omitempty"`
	JobNumber *string `json:"job_number,omitempty" xml:"job_number,omitempty"`
	StateCode *string `json:"stateCode,omitempty" xml:"stateCode,omitempty"`
	UnionId   *string `json:"unionid,omitempty" xml:"unionid,omitempty"`
	UserId    *string `json:"userid,omitempty" xml:"userid,omitempty"`
	Visitor   *bool   `json:"visitor,omitempty" xml:"visitor,omitempty"`
}

// String renders the result with tea.Prettify.
func (s GetUserResult) String() string {
	return tea.Prettify(s)
}

// GoString returns the same text as String.
func (s GetUserResult) GoString() string {
	return tea.Prettify(s)
}
// GetUserResponseBody is the decoded body of a GetUser response: the user
// result plus the request id and the errmsg/errcode pair.
type GetUserResponseBody struct {
	Result    *GetUserResult `json:"result,omitempty" xml:"result,omitempty"`
	RequestID *string        `json:"request_id,omitempty" xml:"request_id,omitempty"`
	ErrMsg    *string        `json:"errmsg,omitempty" xml:"errmsg,omitempty"`
	ErrCode   *int           `json:"errcode,omitempty" xml:"errcode,omitempty"`
}

// String renders the body with tea.Prettify.
func (s GetUserResponseBody) String() string {
	return tea.Prettify(s)
}

// GoString returns the same text as String.
func (s GetUserResponseBody) GoString() string {
	return tea.Prettify(s)
}
// GetUserResponse is the full response of Client.GetUser: response headers,
// status code and decoded body.
type GetUserResponse struct {
	Headers    map[string]*string   `json:"headers,omitempty" xml:"headers,omitempty"`
	StatusCode *int32               `json:"statusCode,omitempty" xml:"statusCode,omitempty"`
	Body       *GetUserResponseBody `json:"body,omitempty" xml:"body,omitempty"`
}

// String renders the response with tea.Prettify.
func (s GetUserResponse) String() string {
	return tea.Prettify(s)
}

// GoString returns the same text as String.
func (s GetUserResponse) GoString() string {
	return tea.Prettify(s)
}
// GetByUnionId looks up a user ID by unionid.
//
// request supplies the "unionid" body field; query supplies the
// "access_token" query parameter. The call is a POST to
// /topapi/user/getbyunionid.
//
// A non-nil *GetUserIDResponse is always returned, even when err is non-nil.
func (client *Client) GetByUnionId(request *GetUnionIdRequest, query *GetUserQuery) (result *GetUserIDResponse, err error) {
	result = &GetUserIDResponse{}

	queryParams := map[string]*string{}
	if !tea.BoolValue(util.IsUnset(query.AccessToken)) {
		queryParams["access_token"] = tea.String(query.AccessToken)
	}

	payload := make(map[string]interface{})
	if !tea.BoolValue(util.IsUnset(request.UnionID)) {
		payload["unionid"] = request.UnionID
	}

	apiRequest := &openapi.OpenApiRequest{
		Query: queryParams,
		Body:  openapiutil.ParseToMap(payload),
	}
	apiParams := &openapi.Params{
		Action:      tea.String("GetUserID"),
		Version:     tea.String("contact_1.0"),
		Protocol:    tea.String("HTTPS"),
		Pathname:    tea.String("/topapi/user/getbyunionid"),
		Method:      tea.String("POST"),
		AuthType:    tea.String("AK"),
		Style:       tea.String("ROA"),
		ReqBodyType: tea.String("none"),
		BodyType:    tea.String("json"),
	}

	raw, err := client.Execute(apiParams, apiRequest, &util.RuntimeOptions{})
	if err != nil {
		return result, err
	}
	err = tea.Convert(raw, &result)
	return result, err
}
// GetUnionIdRequest is the input of Client.GetByUnionId. UnionID is sent as
// the "unionid" body field.
type GetUnionIdRequest struct {
UnionID string `json:"union_id" xml:"union_id"`
}
// GetUserIDResult is the "result" object of a GetByUnionId response.
type GetUserIDResult struct {
	UserId      *string `json:"userid,omitempty" xml:"userid,omitempty"`
	ContactType *bool   `json:"contact_type,omitempty" xml:"contact_type,omitempty"`
}

// String renders the result with tea.Prettify.
func (s GetUserIDResult) String() string {
	return tea.Prettify(s)
}

// GoString returns the same text as String.
func (s GetUserIDResult) GoString() string {
	return tea.Prettify(s)
}
// GetUserIDResponseBody is the decoded body of a GetByUnionId response: the
// lookup result plus the request id and the errmsg/errcode pair.
type GetUserIDResponseBody struct {
	Result    *GetUserIDResult `json:"result,omitempty" xml:"result,omitempty"`
	RequestID *string          `json:"request_id,omitempty" xml:"request_id,omitempty"`
	ErrMsg    *string          `json:"errmsg,omitempty" xml:"errmsg,omitempty"`
	ErrCode   *int             `json:"errcode,omitempty" xml:"errcode,omitempty"`
}

// String renders the body with tea.Prettify.
func (s GetUserIDResponseBody) String() string {
	return tea.Prettify(s)
}

// GoString returns the same text as String.
func (s GetUserIDResponseBody) GoString() string {
	return tea.Prettify(s)
}
// GetUserIDResponse is the full response of Client.GetByUnionId: response
// headers, status code and decoded body.
type GetUserIDResponse struct {
	Headers    map[string]*string     `json:"headers,omitempty" xml:"headers,omitempty"`
	StatusCode *int32                 `json:"statusCode,omitempty" xml:"statusCode,omitempty"`
	Body       *GetUserIDResponseBody `json:"body,omitempty" xml:"body,omitempty"`
}

// String renders the response with tea.Prettify.
func (s GetUserIDResponse) String() string {
	return tea.Prettify(s)
}

// GoString returns the same text as String.
func (s GetUserIDResponse) GoString() string {
	return tea.Prettify(s)
}
================================================
FILE: pkg/fasttime/fasttime.go
================================================
package fasttime
import (
"sync/atomic"
"time"
)
// init starts a background goroutine that refreshes currentTimestamp once per
// second from a ticker. The goroutine runs for the lifetime of the process.
func init() {
	go func() {
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for tm := range ticker.C {
			t := uint64(tm.Unix())
			atomic.StoreUint64(&currentTimestamp, t)
		}
	}()
}

// currentTimestamp caches the current unix time in seconds. It is seeded at
// package initialization and must only be accessed through sync/atomic.
var currentTimestamp = uint64(time.Now().Unix())

// UnixTimestamp returns the current unix timestamp in seconds.
//
// It is faster than time.Now().Unix(): it only loads the cached value, which
// the background ticker refreshes once per second.
func UnixTimestamp() uint64 {
	return atomic.LoadUint64(&currentTimestamp)
}
// UnixDate returns the number of whole days contained in the current unix
// timestamp.
//
// It is computed by dividing UnixTimestamp by the number of seconds in a day
// (24*3600).
func UnixDate() uint64 {
	const secondsPerDay = 24 * 3600
	now := UnixTimestamp()
	return now / secondsPerDay
}
// UnixHour returns the number of whole hours contained in the current unix
// timestamp.
//
// It is computed by dividing UnixTimestamp by 3600.
func UnixHour() uint64 {
	const secondsPerHour = 3600
	now := UnixTimestamp()
	return now / secondsPerHour
}
================================================
FILE: pkg/feishu/feishu.go
================================================
package feishu
import (
"bytes"
"context"
"fmt"
"net/url"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/storage"
"github.com/google/uuid"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
lark "github.com/larksuite/oapi-sdk-go/v3"
larkcore "github.com/larksuite/oapi-sdk-go/v3/core"
larkauthen "github.com/larksuite/oapi-sdk-go/v3/service/authen/v1"
larkcontact "github.com/larksuite/oapi-sdk-go/v3/service/contact/v3"
)
// defaultAuthURL is the authorization endpoint used by AuthCodeURL when
// Config.AuthURL is empty.
const defaultAuthURL = "https://accounts.feishu.cn/open-apis/authen/v1/authorize"
// SsoTypeName is the identifier of the Feishu SSO type.
const SsoTypeName = "feishu"
// SsoClient holds the Feishu SSO state: the enable flag, the active
// configuration and the SDK client built from it. The embedded RWMutex is
// taken by Reload (write) and by GetDisplayName/Authorize (read).
type SsoClient struct {
Enable bool
// FeiShuConfig is the configuration most recently passed to Reload; it is
// excluded from JSON output.
FeiShuConfig *Config `json:"-"`
Ctx context.Context
// client is the SDK client created by Reload; it stays nil until a
// configuration with Enable, AppID and AppSecret set has been loaded.
client *lark.Client
sync.RWMutex
}
// Config is the Feishu SSO configuration.
type Config struct {
Enable bool `json:"enable"`
// AuthURL overrides defaultAuthURL when non-empty.
AuthURL string `json:"auth_url"`
DisplayName string `json:"display_name"`
AppID string `json:"app_id"`
AppSecret string `json:"app_secret"`
// RedirectURL is sent as redirect_uri; AuthCodeURL fails when it is empty.
RedirectURL string `json:"redirect_url"`
UsernameField string `json:"username_field"` // name, email, phone
FeiShuEndpoint string `json:"feishu_endpoint"` // Feishu API endpoint, defaults to open.feishu.cn
Proxy string `json:"proxy"`
CoverAttributes bool `json:"cover_attributes"`
DefaultRoles []string `json:"default_roles"`
DefaultUserGroups []int64 `json:"default_user_groups"`
}
// CallbackOutput is the result of a successful SSO callback: where to
// redirect the browser and the identity attributes of the logged-in user.
//
// NOTE(review): Phone and Email carry yaml tags while the other fields carry
// json tags; with encoding/json they serialize under their Go field names.
// Confirm whether that is intentional.
type CallbackOutput struct {
Redirect string `json:"redirect"`
Msg string `json:"msg"`
AccessToken string `json:"accessToken"`
Username string `json:"Username"`
Nickname string `json:"Nickname"`
Phone string `yaml:"Phone"`
Email string `yaml:"Email"`
}
// wrapStateKey namespaces key with the Feishu OAuth prefix so it can be used
// as a redis key.
func wrapStateKey(key string) string {
	const prefix = "n9e_feishu_oauth_"
	return prefix + key
}
// createClient builds a Feishu SDK (v3) client from the configuration.
//
// The client logs at info level and has the SDK token cache enabled. When
// FeiShuEndpoint is set it is applied to this client only, through the
// WithOpenBaseUrl option, instead of overwriting the package-level
// lark.FeishuBaseUrl default (which would leak into every client created
// afterwards and survive a later reload with an empty endpoint).
//
// The returned error is currently always nil.
func (c *Config) createClient() (*lark.Client, error) {
	opts := []lark.ClientOptionFunc{
		lark.WithLogLevel(larkcore.LogLevelInfo),
		lark.WithEnableTokenCache(true), // enable the token cache
	}
	if c.FeiShuEndpoint != "" {
		opts = append(opts, lark.WithOpenBaseUrl(c.FeiShuEndpoint))
	}

	// Create the client (v3).
	client := lark.NewClient(
		c.AppID,
		c.AppSecret,
		opts...,
	)
	return client, nil
}
// New returns an SsoClient for cf. When cf.Enable is false the client is
// returned empty; otherwise it is populated through Reload.
func New(cf Config) *SsoClient {
	client := &SsoClient{}
	if cf.Enable {
		client.Reload(cf)
	}
	return client
}
// AuthCodeURL builds the Feishu authorization URL for state.
//
// The base is Config.AuthURL, or defaultAuthURL when that is empty; app_id,
// state and redirect_uri are appended as query parameters (joined with '&'
// when the base already contains a '?'). An empty RedirectURL is an error.
func (s *SsoClient) AuthCodeURL(state string) (string, error) {
	cfg := s.FeiShuConfig

	base := cfg.AuthURL
	if base == "" {
		base = defaultAuthURL
	}

	if cfg.RedirectURL == "" {
		return "", errors.New("FeiShu OAuth RedirectURL is empty")
	}

	params := url.Values{}
	params.Set("app_id", cfg.AppID)
	params.Set("state", state)
	params.Set("redirect_uri", cfg.RedirectURL)

	separator := byte('?')
	if strings.Contains(base, "?") {
		separator = '&'
	}

	var out bytes.Buffer
	out.WriteString(base)
	out.WriteByte(separator)
	out.WriteString(params.Encode())
	return out.String(), nil
}
// GetUserToken exchanges an authorization code for the user's access token
// and user_id through the SDK v3 authen service.
//
// It returns (accessToken, userID, nil) on success. An uninitialized client,
// a transport error, an unsuccessful API response, missing data, or an empty
// user_id / access_token all produce an error.
func (s *SsoClient) GetUserToken(code string) (string, string, error) {
	if s.client == nil {
		return "", "", errors.New("feishu client is not initialized")
	}

	// Ask the SDK v3 authen service for an access token.
	body := larkauthen.NewCreateAccessTokenReqBodyBuilder().
		GrantType("authorization_code").
		Code(code).
		Build()
	req := larkauthen.NewCreateAccessTokenReqBuilder().Body(body).Build()

	resp, err := s.client.Authen.AccessToken.Create(context.Background(), req)
	if err != nil {
		return "", "", fmt.Errorf("feishu get access token error: %w", err)
	}

	// Validate the response.
	switch {
	case !resp.Success():
		return "", "", fmt.Errorf("feishu api error: code=%d, msg=%s", resp.Code, resp.Msg)
	case resp.Data == nil:
		return "", "", errors.New("feishu api returned empty data")
	}

	var userID string
	if p := resp.Data.UserId; p != nil {
		userID = *p
	}
	if userID == "" {
		return "", "", errors.New("feishu api returned empty user_id")
	}

	var accessToken string
	if p := resp.Data.AccessToken; p != nil {
		accessToken = *p
	}
	if accessToken == "" {
		return "", "", errors.New("feishu api returned empty access_token")
	}

	return accessToken, userID, nil
}
// GetUserInfo fetches a user's details by user_id through the SDK v3 contact
// service.
//
// Per the original author's note, the SDK manages the token internally, so
// no access token is passed in. An uninitialized client, a transport error,
// an unsuccessful API response, or missing user data all produce an error.
func (s *SsoClient) GetUserInfo(userID string) (*larkcontact.GetUserRespData, error) {
	if s.client == nil {
		return nil, errors.New("feishu client is not initialized")
	}

	// Query the SDK v3 contact service for the user's details.
	builder := larkcontact.NewGetUserReqBuilder()
	builder = builder.UserId(userID)
	builder = builder.UserIdType(larkcontact.UserIdTypeUserId)
	req := builder.Build()

	resp, err := s.client.Contact.User.Get(context.Background(), req)
	if err != nil {
		return nil, fmt.Errorf("feishu get user detail error: %w", err)
	}

	// Validate the response.
	if !resp.Success() {
		return nil, fmt.Errorf("feishu api error: code=%d, msg=%s", resp.Code, resp.Msg)
	}
	if data := resp.Data; data != nil && data.User != nil {
		return data, nil
	}
	return nil, errors.New("feishu api returned empty user data")
}
// Reload replaces the client's configuration under the write lock.
//
// Enable and FeiShuConfig are always updated. The SDK client is rebuilt only
// when the new configuration is enabled and has both AppID and AppSecret;
// otherwise the previously built client (if any) is left in place.
func (s *SsoClient) Reload(feishuConfig Config) {
	s.Lock()
	defer s.Unlock()

	s.Enable = feishuConfig.Enable
	s.FeiShuConfig = &feishuConfig

	// Rebuild the SDK client only for a usable configuration.
	usable := feishuConfig.Enable && feishuConfig.AppID != "" && feishuConfig.AppSecret != ""
	if !usable {
		return
	}
	client, err := feishuConfig.createClient()
	if err != nil {
		logger.Errorf("create feishu client error: %v", err)
		return
	}
	s.client = client
}
// GetDisplayName returns the configured display name under the read lock, or
// an empty string when the client is not enabled.
func (s *SsoClient) GetDisplayName() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.FeiShuConfig.DisplayName
	}
	return ""
}
// Authorize starts an OAuth login: it generates a random state, stores
// redirect in redis under that state for 300 seconds, and returns the
// authorization URL for the state (built under the read lock).
func (s *SsoClient) Authorize(redis storage.Redis, redirect string) (string, error) {
	const stateTTL = 300 * time.Second

	state := uuid.New().String()
	stored := redis.Set(context.Background(), wrapStateKey(state), redirect, stateTTL)
	if err := stored.Err(); err != nil {
		return "", err
	}

	s.RLock()
	defer s.RUnlock()
	return s.AuthCodeURL(state)
}
// Callback completes the Feishu OAuth login for an authorization code.
//
// It exchanges code for an access token and user_id, loads the user's
// details, resolves the redirect stored under state (defaulting to "/"), and
// fills a CallbackOutput. The output Username is chosen by
// FeiShuConfig.UsernameField: "name", "phone", or (default) the email, where
// the enterprise email is used when the plain email is empty. An empty
// user_id, or an empty value for the selected field, is an error.
//
// redis may be nil: in that case no redirect lookup or cleanup is attempted.
func (s *SsoClient) Callback(redis storage.Redis, ctx context.Context, code, state string) (*CallbackOutput, error) {
	// Exchange the code for an access token and user_id.
	accessToken, userID, err := s.GetUserToken(code)
	if err != nil {
		return nil, fmt.Errorf("feishu GetUserToken error: %s", err)
	}

	// Load the user's details.
	userData, err := s.GetUserInfo(userID)
	if err != nil {
		return nil, fmt.Errorf("feishu GetUserInfo error: %s", err)
	}

	// Resolve the redirect URL. Redirect bookkeeping is best-effort: failures
	// are logged, not returned. Both the lookup and the cleanup need a usable
	// redis handle.
	redirect := ""
	if redis != nil {
		redirect, err = fetchRedirect(redis, ctx, state)
		if err != nil {
			logger.Errorf("get redirect err:%v code:%s state:%s", err, code, state)
		}
		if err = deleteRedirect(redis, ctx, state); err != nil {
			logger.Errorf("delete redirect err:%v code:%s state:%s", err, code, state)
		}
	}
	if redirect == "" {
		redirect = "/"
	}

	var callbackOutput CallbackOutput
	if userData == nil || userData.User == nil {
		return nil, fmt.Errorf("feishu GetUserInfo failed, user data is nil")
	}

	user := userData.User
	logger.Debugf("feishu get user info userID %s result %+v", userID, user)

	// Extract the user attributes; every SDK field is an optional pointer.
	username := ""
	if user.UserId != nil {
		username = *user.UserId
	}
	if username == "" {
		return nil, errors.New("feishu user_id is empty")
	}

	nickname := ""
	if user.Name != nil {
		nickname = *user.Name
	}

	phone := ""
	if user.Mobile != nil {
		phone = *user.Mobile
	}

	email := ""
	if user.Email != nil {
		email = *user.Email
	}
	if email == "" && user.EnterpriseEmail != nil {
		email = *user.EnterpriseEmail
	}

	callbackOutput.Redirect = redirect
	callbackOutput.AccessToken = accessToken

	// Pick the login name according to the UsernameField setting.
	switch s.FeiShuConfig.UsernameField {
	case "name":
		if nickname == "" {
			return nil, errors.New("feishu user name is empty")
		}
		callbackOutput.Username = nickname
	case "phone":
		if phone == "" {
			return nil, errors.New("feishu user phone is empty")
		}
		callbackOutput.Username = phone
	default:
		if email == "" {
			return nil, errors.New("feishu user email is empty")
		}
		callbackOutput.Username = email
	}

	callbackOutput.Nickname = nickname
	callbackOutput.Email = email
	callbackOutput.Phone = phone
	return &callbackOutput, nil
}
// fetchRedirect returns the value stored in redis under the wrapped state key.
func fetchRedirect(redis storage.Redis, ctx context.Context, state string) (string, error) {
	stateKey := wrapStateKey(state)
	return redis.Get(ctx, stateKey).Result()
}
// deleteRedirect removes the entry stored in redis under the wrapped state key.
func deleteRedirect(redis storage.Redis, ctx context.Context, state string) error {
	stateKey := wrapStateKey(state)
	return redis.Del(ctx, stateKey).Err()
}
================================================
FILE: pkg/flashduty/post.go
================================================
package flashduty
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"time"
"github.com/ccfos/nightingale/v6/center/cconf"
"github.com/toolkits/pkg/logger"
)
// Package-level FlashDuty client settings, populated by Init.
var (
// Api is the base URL of the FlashDuty API; when empty,
// PostFlashDutyWithResp falls back to https://api.flashcat.cloud.
Api string
// Headers are extra HTTP headers added to every request.
Headers map[string]string
// Timeout is the HTTP client timeout used for every request.
Timeout time.Duration
)
// Init copies the FlashDuty settings from fdConf into the package-level Api,
// Headers and Timeout variables.
//
// A zero fdConf.Timeout selects a 5 second timeout; any other value is
// multiplied by time.Millisecond.
func Init(fdConf cconf.FlashDuty) {
	Api = fdConf.Api
	Headers = fdConf.Headers

	switch fdConf.Timeout {
	case 0:
		Timeout = 5 * time.Second
	default:
		Timeout = fdConf.Timeout * time.Millisecond
	}
}
// dutyResp is the JSON envelope decoded from every FlashDuty response: the
// request id, a payload of type T, and an error object whose Message is
// non-empty when the call failed.
type dutyResp[T any] struct {
RequestId string `json:"request_id"`
Data T `json:"data"`
Error struct {
Code string `json:"code"`
Message string `json:"message"`
} `json:"error"`
}
// TeamInfo is the payload decoded from the /team/info response (see
// CheckTeam, which only reads TeamID).
type TeamInfo struct {
TeamID int64 `json:"team_id"`
TeamName string `json:"team_name"`
Description string `json:"description"`
CreatedAt int64 `json:"created_at"`
UpdatedAt int64 `json:"updated_at"`
UpdatedBy int64 `json:"updated_by"`
UpdatedByName string `json:"updated_by_name"`
CreatorID int64 `json:"creator_id"`
RefID string `json:"ref_id"`
PersonIDs []int64 `json:"person_ids"`
}
// Data is one page of a member listing as decoded from /member/list: the
// page number, the page size, the total member count and the page's items.
type Data struct {
P int `json:"p"`
Limit int `json:"limit"`
Total int `json:"total"`
Items []Item `json:"items"`
}
// Item is a single FlashDuty member entry in a member listing. RefID is
// parsed as a base-10 integer user id by SyncUsersChange.
type Item struct {
MemberID int `json:"member_id"`
MemberName string `json:"member_name"`
Phone string `json:"phone"`
Email string `json:"email"`
EmailVerified string `json:"email_verified"`
RefID string `json:"ref_id"`
}
// PostFlashDuty posts body to the FlashDuty API at path and returns only the
// error, discarding the decoded response payload.
func PostFlashDuty(path string, appKey string, body interface{}) error {
	if _, err := PostFlashDutyWithResp[Data](path, appKey, body); err != nil {
		return err
	}
	return nil
}
// PostFlashDutyWithResp posts body as JSON to the FlashDuty API at path,
// authenticating with appKey as the app_key query parameter, and decodes the
// "data" field of the response into T.
//
// The base URL is Api, or https://api.flashcat.cloud when Api is empty.
// Errors are reported, in order of precedence, for:
//   - a transport failure from PostJSON;
//   - a response whose error.message is non-empty ("flashduty post error: <message>");
//   - a non-2xx status code that carried no parseable error message.
//
// A 2xx response whose body cannot be decoded is still treated as success and
// yields the zero (or partially decoded) T, as before.
func PostFlashDutyWithResp[T any](path string, appKey string, body interface{}) (T, error) {
	urlParams := url.Values{}
	urlParams.Add("app_key", appKey)

	base := Api
	if base == "" {
		base = "https://api.flashcat.cloud"
	}
	// Named endpoint (not url) so the net/url package is not shadowed.
	endpoint := fmt.Sprintf("%s%s?%s", base, path, urlParams.Encode())

	response, code, err := PostJSON(endpoint, Timeout, Headers, body)
	// The marshal error is deliberately ignored: req only feeds the log line,
	// and PostJSON has already reported a marshal failure through err.
	req, _ := json.Marshal(body)
	logger.Infof("flashduty post: url=%s, req=%s; response=%s, code=%d", endpoint, string(req), string(response), code)

	var resp dutyResp[T]
	if err != nil {
		return resp.Data, err
	}

	parseErr := json.Unmarshal(response, &resp)
	if parseErr == nil && resp.Error.Message != "" {
		return resp.Data, fmt.Errorf("flashduty post error: %s", resp.Error.Message)
	}
	// A failed HTTP status must not be reported as success just because the
	// body had no (or an unparseable) error object.
	if code < 200 || code >= 300 {
		return resp.Data, fmt.Errorf("flashduty post error: unexpected status code %d", code)
	}
	return resp.Data, nil
}
// PostJSON marshals v to JSON and POSTs it to url with the given timeout and
// extra headers, returning the response body and status code.
//
// retries is optional; when given, retries[0] is the maximum number of
// attempts. Failed attempts are logged and followed by a 200ms pause before
// the next one. Without retries, or with a value below 1, exactly one attempt
// is made.
//
// A fresh request with a fresh body reader is built for every attempt: an
// http.Request whose body was already consumed by a failed attempt cannot be
// sent again as-is.
func PostJSON(url string, timeout time.Duration, headers map[string]string, v interface{}, retries ...int) (response []byte, code int, err error) {
	bs, err := json.Marshal(v)
	if err != nil {
		return nil, 0, err
	}

	client := http.Client{
		Timeout: timeout,
	}

	retrying := len(retries) > 0
	attempts := 1
	if retrying && retries[0] > 1 {
		attempts = retries[0]
	}

	var resp *http.Response
	for i := 0; i < attempts; i++ {
		var req *http.Request
		req, err = http.NewRequest("POST", url, bytes.NewReader(bs))
		if err != nil {
			return nil, 0, err
		}
		req.Header.Set("Content-Type", "application/json")
		for k, val := range headers {
			req.Header.Set(k, val)
		}

		resp, err = client.Do(req)
		if err == nil {
			break
		}
		if !retrying {
			// Single-shot mode reports the error to the caller without logging.
			break
		}

		last := i+1 >= attempts
		tryagain := ""
		if !last {
			tryagain = " try again"
		}
		logger.Warningf("failed to curl %s error: %s"+tryagain, url, err)
		if !last {
			time.Sleep(time.Millisecond * 200)
		}
	}
	if err != nil {
		return nil, 0, err
	}

	code = resp.StatusCode
	if resp.Body != nil {
		defer resp.Body.Close()
		response, err = io.ReadAll(resp.Body)
	}
	return response, code, err
}
================================================
FILE: pkg/flashduty/sync_user.go
================================================
package flashduty
import (
"errors"
"strconv"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// SyncUsersChange reconciles FlashDuty members with dbUsers. It is a no-op
// when ctx is not the center instance.
//
// It lists all FlashDuty members in pages of 100, maps every member that has
// a numeric ref_id to a user id, and then:
//   - deletes members whose id is not in dbUsers (failures are only logged);
//   - invites dbUsers that are not yet members (a failure is returned);
//   - updates members whose email, phone or name differs (failures are only logged).
//
// Members with an empty or non-numeric ref_id are ignored: they cannot be
// matched to a database user.
func SyncUsersChange(ctx *ctx.Context, dbUsers []*models.User) error {
	if !ctx.IsCenter {
		return nil
	}

	appKey, err := models.ConfigsGetFlashDutyAppKey(ctx)
	if err != nil {
		return err
	}

	// The first request is only used to learn the total member count.
	req := make(map[string]interface{})
	req["limit"] = 100
	userList, err := PostFlashDutyWithResp[Data]("/member/list", appKey, req)
	if err != nil {
		return err
	}
	total := userList.Total

	// NOTE(review): pages are requested starting at p=0; confirm against the
	// FlashDuty API whether page numbering is zero- or one-based.
	items := []Item{}
	for i := 0; i < total/100+1; i++ {
		req["p"] = i
		req["limit"] = 100
		resp, err := PostFlashDutyWithResp[Data]("/member/list", appKey, req)
		if err != nil {
			return err
		}
		items = append(items, resp.Items...)
	}

	dutyUsers := make(map[int64]*models.User, len(items))
	for i := range items {
		if items[i].RefID == "" {
			continue
		}
		id, err := strconv.ParseInt(items[i].RefID, 10, 64)
		if err != nil {
			// Without this check every unparsable ref_id collapsed onto id 0
			// and triggered a bogus delete for ref_id "0".
			logger.Warningf("skip flashduty member %s with non-numeric ref_id %s: %v", items[i].MemberName, items[i].RefID, err)
			continue
		}
		dutyUsers[id] = &models.User{
			Username: items[i].MemberName,
			Email:    items[i].Email,
			Phone:    items[i].Phone,
			Id:       id,
		}
	}

	dbUsersHas := sliceToMap(dbUsers)

	delUsers := diffMap(dutyUsers, dbUsersHas)
	fdDelUsers(appKey, delUsers)

	addUsers := diffMap(dbUsersHas, dutyUsers)
	if err := fdAddUsers(appKey, addUsers); err != nil {
		return err
	}

	updateUser(appKey, dbUsersHas, dutyUsers)
	return nil
}
// sliceToMap indexes dbUsers by user id. When several users share an id, the
// last one in the slice wins.
func sliceToMap(dbUsers []*models.User) map[int64]*models.User {
	byID := make(map[int64]*models.User, len(dbUsers))
	for i := range dbUsers {
		u := dbUsers[i]
		byID[u.Id] = u
	}
	return byID
}
// diffMap returns copies of the users whose id is a key of m1 but not of m2.
// The result is nil when there are none; its order follows map iteration and
// is therefore unspecified.
func diffMap(m1, m2 map[int64]*models.User) []models.User {
	var onlyInFirst []models.User
	for id, user := range m1 {
		if _, found := m2[id]; found {
			continue
		}
		onlyInFirst = append(onlyInFirst, *user)
	}
	return onlyInFirst
}
// updateUser pushes the m1 version of every user present in both maps to
// FlashDuty when its email, phone (compared with PhoneIsSame) or username
// differs from the m2 version. Update failures are logged and do not stop
// the loop.
func updateUser(appKey string, m1, m2 map[int64]*models.User) {
	for id, local := range m1 {
		remote, exists := m2[id]
		if !exists {
			continue
		}

		unchanged := local.Email == remote.Email &&
			PhoneIsSame(local.Phone, remote.Phone) &&
			local.Username == remote.Username
		if unchanged {
			continue
		}

		refID := strconv.FormatInt(local.Id, 10)
		member := User{
			RefID: refID,
			Updates: Updates{
				Phone:      local.Phone,
				Email:      local.Email,
				MemberName: local.Username,
				RefID:      refID,
			},
		}
		if err := member.UpdateMember(appKey); err != nil {
			logger.Errorf("failed to update user: %v", err)
		}
	}
}
// PhoneIsSame reports whether phone1 and phone2 denote the same number.
//
// Both inputs are normalized first: surrounding whitespace, inner spaces,
// hyphens and one leading '+' are removed. This tolerates country/region
// prefixes such as +86, +1 or +44 and formats containing spaces or hyphens.
// Two normalized numbers are the same when they are equal, or when the longer
// one ends with the shorter one and is at most 3 characters longer (i.e. they
// differ only by a country code of up to 3 digits).
//
// An empty number only matches another empty number; it is never treated as
// the "suffix" of a short non-empty one.
func PhoneIsSame(phone1, phone2 string) bool {
	normalize := func(p string) string {
		p = strings.TrimSpace(p)
		p = strings.ReplaceAll(p, " ", "")
		p = strings.ReplaceAll(p, "-", "")
		p = strings.TrimPrefix(p, "+")
		return p
	}

	p1 := normalize(phone1)
	p2 := normalize(phone2)
	if p1 == p2 {
		return true
	}
	// Every string has the empty suffix, so guard against "" matching "123".
	if p1 == "" || p2 == "" {
		return false
	}

	longer, shorter := p1, p2
	if len(shorter) > len(longer) {
		longer, shorter = shorter, longer
	}
	return len(longer)-len(shorter) <= 3 && strings.HasSuffix(longer, shorter)
}
// User is the FlashDuty member payload used for invite, delete and update
// requests. For updates, RefID identifies the member and Updates carries the
// new attribute values.
//
// Note: omitempty has no effect on the struct-typed Updates field, so an
// "updates" object is always present in the encoded JSON.
type User struct {
Email string `json:"email,omitempty"`
Phone string `json:"phone,omitempty"`
MemberName string `json:"member_name,omitempty"`
RefID string `json:"ref_id,omitempty"`
Updates Updates `json:"updates,omitempty"`
}
// Updates holds the new attribute values sent in a member update request;
// empty fields are omitted from the encoded JSON.
type Updates struct {
RefID string `json:"ref_id,omitempty"`
Email string `json:"email,omitempty"`
Phone string `json:"phone,omitempty"`
MemberName string `json:"member_name,omitempty"`
CountryCode string `json:"country_code,omitempty"`
}
// delMember deletes the FlashDuty member identified by user.RefID. Only the
// ref id is sent; an empty RefID is rejected without calling the API.
func (user *User) delMember(appKey string) error {
	if user.RefID != "" {
		target := &User{RefID: user.RefID}
		return PostFlashDuty("/member/delete", appKey, target)
	}
	return errors.New("refID must not be empty")
}
// UpdateMember posts user to the /member/info/reset endpoint.
func (user *User) UpdateMember(appKey string) error {
	err := PostFlashDuty("/member/info/reset", appKey, user)
	return err
}
// Members is the request body of a member invite: a list of users encoded
// under the "members" key.
type Members struct {
Users []User `json:"members"`
}
// addMembers invites the users in m to FlashDuty.
//
// Users without a RefID, or with neither phone nor email, are logged and
// dropped. m.Users is replaced by the remaining users before the request is
// sent; nothing is sent (and nil is returned) when none remain.
func (m *Members) addMembers(appKey string) error {
	if len(m.Users) == 0 {
		return nil
	}

	accepted := make([]User, 0, len(m.Users))
	for _, candidate := range m.Users {
		hasContact := candidate.Phone != "" || candidate.Email != ""
		if candidate.RefID != "" && hasContact {
			accepted = append(accepted, candidate)
			continue
		}
		logger.Errorf("user(%+v) refID must not be none, Email or Phone can not be none", candidate)
	}

	if len(accepted) == 0 {
		return nil
	}
	m.Users = accepted
	return PostFlashDuty("/member/invite", appKey, m)
}
// fdAddUsers converts users to FlashDuty members and invites them.
func fdAddUsers(appKey string, users []models.User) error {
	batch := Members{Users: usersToFdUsers(users)}
	return (&batch).addMembers(appKey)
}
// fdDelUsers deletes each of users from FlashDuty. Failures are logged and
// do not stop the remaining deletions.
func fdDelUsers(appKey string, users []models.User) {
	members := usersToFdUsers(users)
	for i := range members {
		member := members[i]
		err := member.delMember(appKey)
		if err == nil {
			continue
		}
		logger.Errorf("failed to delete user: %v", err)
	}
}
// usersToFdUsers maps database users to FlashDuty member payloads: the id
// becomes the decimal RefID, and phone, email and username are copied. The
// result is non-nil and has one entry per input user, in the same order.
func usersToFdUsers(users []models.User) []User {
	converted := make([]User, len(users))
	for i := range users {
		src := &users[i]
		converted[i] = User{
			RefID:      strconv.FormatInt(src.Id, 10),
			Phone:      src.Phone,
			Email:      src.Email,
			MemberName: src.Username,
		}
	}
	return converted
}
// UpdateUser pushes target's new email and phone to FlashDuty.
//
// Nothing is sent when target has no id or when both email and phone are
// empty. The member identified by target.Id is updated first; if FlashDuty
// answers that no such member exists, the user is invited instead. All
// failures are logged; the function returns nothing.
//
// The invite uses the same {"members": [...]} envelope as addMembers, which
// posts to the same /member/invite endpoint.
func UpdateUser(ctx *ctx.Context, target models.User, email, phone string) {
	if target.Id == 0 {
		logger.Errorf("user not found: %s", target.Username)
		return
	}
	if email == "" && phone == "" {
		logger.Errorf("email and phone are both empty: %s", target.Username)
		return
	}

	refID := strconv.FormatInt(target.Id, 10)
	flashdutyUser := User{
		RefID: refID,
		Updates: Updates{
			Phone:      phone,
			Email:      email,
			MemberName: target.Username,
			RefID:      refID,
		},
	}

	appKey, err := models.ConfigsGetFlashDutyAppKey(ctx)
	if err != nil {
		logger.Errorf("failed to get flashduty app key: %v", err)
		return
	}

	err = flashdutyUser.UpdateMember(appKey)
	if err != nil && strings.Contains(err.Error(), "no member found") {
		// The member does not exist yet, so create it through an invite.
		invite := &Members{
			Users: []User{{
				Phone:      phone,
				Email:      email,
				MemberName: target.Username,
				RefID:      refID,
			}},
		}
		if err = PostFlashDuty("/member/invite", appKey, invite); err != nil {
			logger.Errorf("failed to update user: %v", err)
		}
		return
	}
	if err != nil {
		logger.Errorf("failed to update user: %v", err)
	}
}
================================================
FILE: pkg/flashduty/sync_user_group.go
================================================
package flashduty
import (
"errors"
"strconv"
"strings"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// UserGroupSyncer synchronizes one user group with its FlashDuty team.
type UserGroupSyncer struct {
ctx *ctx.Context
// ug is the user group being synchronized.
ug *models.UserGroup
// appKey is the FlashDuty app key loaded by NewUserGroupSyncer.
appKey string
// teamID caches the FlashDuty team id found by SyncUGPut; zero means it
// has not been resolved.
teamID int64
}
// NewUserGroupSyncer loads the FlashDuty app key and returns a syncer for ug.
// It fails when the app key cannot be loaded.
func NewUserGroupSyncer(ctx *ctx.Context, ug *models.UserGroup) (*UserGroupSyncer, error) {
	key, err := models.ConfigsGetFlashDutyAppKey(ctx)
	if err != nil {
		return nil, err
	}

	syncer := &UserGroupSyncer{ctx: ctx, ug: ug, appKey: key}
	return syncer, nil
}
// SyncUGAdd creates the FlashDuty team for the user group (team name and ref
// id only, no members yet) and then synchronizes the group's members.
func (ugs *UserGroupSyncer) SyncUGAdd() error {
	// Create the team first: it has a name but no users.
	team := Team{
		TeamName: ugs.ug.Name,
		RefID:    strconv.FormatInt(ugs.ug.Id, 10),
	}
	if err := team.UpdateTeam(ugs.appKey); err != nil {
		return err
	}
	return ugs.syncTeamMember()
}
// SyncUGPut pushes the user group's current name and member contacts to its
// FlashDuty team and then synchronizes the team members.
//
// The team is looked up by ref id (the group id). When FlashDuty reports that
// no team exists for the ref id, the team is created with AddTeam; when the
// lookup succeeds, the team is updated with UpdateTeam using the team id
// found. Any other lookup error is returned as-is.
//
// For each user of the group, the email is used when present, otherwise the
// phone; users with neither are logged and skipped.
func (ugs *UserGroupSyncer) SyncUGPut() error {
	// Look the team up by ref_id.
	refID := strconv.FormatInt(ugs.ug.Id, 10)
	teamID, err := ugs.CheckTeam(refID)
	ugs.teamID = teamID

	// A "not found" answer means the team has to be created.
	teamMissing := err != nil && strings.Contains(err.Error(), "no team found by ref_id")
	if err != nil && !teamMissing {
		return err
	}

	emails := make([]string, 0)
	phones := make([]string, 0)
	for _, member := range ugs.ug.Users {
		switch {
		case member.Email != "":
			emails = append(emails, member.Email)
		case member.Phone != "":
			phones = append(phones, member.Phone)
		default:
			logger.Warningf("The user %s has no email and phone, and failed to sync to flashduty's team", member.Username)
		}
	}

	// Describe the team as it should look in FlashDuty.
	team := Team{
		RefID:    refID,
		TeamName: ugs.ug.Name,
		Emails:   emails,
		Phones:   phones,
	}

	if teamMissing {
		if err := team.AddTeam(ugs.appKey); err != nil {
			return err
		}
	} else {
		team.TeamID = teamID
		if err := team.UpdateTeam(ugs.appKey); err != nil {
			return err
		}
	}

	return ugs.syncTeamMember()
}
// SyncUGDel deletes the FlashDuty team whose ref id is the user group's id.
func (ugs *UserGroupSyncer) SyncUGDel() error {
	team := Team{RefID: strconv.FormatInt(ugs.ug.Id, 10)}
	return team.DelTeam(ugs.appKey)
}
// SyncMembersAdd re-synchronizes the team's members after members were added
// to the user group.
func (ugs *UserGroupSyncer) SyncMembersAdd() error {
	err := ugs.syncTeamMember()
	return err
}
// SyncMembersDel re-synchronizes the team's members after members were
// removed from the user group.
func (ugs *UserGroupSyncer) SyncMembersDel() error {
	err := ugs.syncTeamMember()
	return err
}
// syncTeamMember loads the user group's members from the database and pushes
// them to the FlashDuty team.
//
// Database errors are returned. A failure while pushing to FlashDuty is only
// logged as a warning; the function still returns nil in that case.
func (ugs *UserGroupSyncer) syncTeamMember() error {
	memberIDs, err := models.MemberIds(ugs.ctx, ugs.ug.Id)
	if err != nil {
		return err
	}

	users, err := models.UserGetsByIds(ugs.ctx, memberIDs)
	if err != nil {
		return err
	}

	if toDutyErr := ugs.addMemberToFDTeam(users); toDutyErr != nil {
		logger.Warningf("failed to sync user group %s %v to flashduty's team: %v", ugs.ug.Name, users, toDutyErr)
	}
	return nil
}
// addMemberToFDTeam invites users to FlashDuty and then upserts the team with
// their contacts.
//
// For each user the email is used when present, otherwise the phone; users
// with neither are logged and skipped. The team id cached on the syncer is
// used when non-zero; otherwise it is looked up by ref id, and a failed
// lookup is only logged (the upsert then proceeds with whatever id the lookup
// returned).
func (ugs *UserGroupSyncer) addMemberToFDTeam(users []models.User) error {
	if err := fdAddUsers(ugs.appKey, users); err != nil {
		return err
	}

	emails := make([]string, 0)
	phones := make([]string, 0)
	for i := range users {
		member := users[i]
		switch {
		case member.Email != "":
			emails = append(emails, member.Email)
		case member.Phone != "":
			phones = append(phones, member.Phone)
		default:
			logger.Warningf("The user %s has no email and phone, and failed to sync to flashduty's team", member.Username)
		}
	}

	refID := strconv.FormatInt(ugs.ug.Id, 10)
	teamID := ugs.teamID
	if teamID == 0 {
		var lookupErr error
		teamID, lookupErr = ugs.CheckTeam(refID)
		if lookupErr != nil {
			logger.Warningf("CheckTeam failed for refID=%v: %v", refID, lookupErr)
		}
	}

	team := Team{
		TeamID:   teamID,
		TeamName: ugs.ug.Name,
		Emails:   emails,
		Phones:   phones,
		RefID:    refID,
	}
	return team.UpdateTeam(ugs.appKey)
}
// Team is the request body of the FlashDuty team upsert and delete calls.
type Team struct {
TeamID int64 `json:"team_id"`
// TeamName must be non-empty for AddTeam/UpdateTeam.
TeamName string `json:"team_name"`
// ResetIfNameExist is set to true by UpdateTeam before the upsert.
ResetIfNameExist bool `json:"reset_if_name_exist"`
Description string `json:"description"`
Emails []string `json:"emails"`
Phones []string `json:"phones"`
RefID string `json:"ref_id"`
}
// AddTeam posts t to the /team/upsert endpoint. An empty TeamName is rejected
// without calling the API.
func (t *Team) AddTeam(appKey string) error {
	if t.TeamName != "" {
		return PostFlashDuty("/team/upsert", appKey, t)
	}
	return errors.New("team_name must be set")
}
// UpdateTeam sets ResetIfNameExist on t and then upserts it through AddTeam.
func (t *Team) UpdateTeam(appKey string) error {
	t.ResetIfNameExist = true
	return t.AddTeam(appKey)
}
// DelTeam posts t to the /team/delete endpoint.
func (t *Team) DelTeam(appKey string) error {
	return PostFlashDuty("/team/delete", appKey, t)
}
// NeedSyncTeam reports whether team synchronisation to FlashDuty is enabled.
//
// It is true only when the first "flashduty_sync_team" config row has the
// value "true". A query failure is logged as a warning and treated as
// disabled.
func NeedSyncTeam(ctx *ctx.Context) bool {
	rows, err := models.ConfigsSelectByCkey(ctx, "flashduty_sync_team")
	if err != nil {
		logger.Warningf("failed to query flashduty_sync_team: %v", err)
		return false
	}

	return len(rows) > 0 && rows[0].Cval == "true"
}
// NeedSyncUser reports whether a FlashDuty app key is configured.
//
// It is true when the first "flashduty_app_key" config row has a non-empty
// value. A query failure is logged as a warning and treated as "not
// configured".
func NeedSyncUser(ctx *ctx.Context) bool {
	rows, err := models.ConfigsSelectByCkey(ctx, "flashduty_app_key")
	if err != nil {
		logger.Warningf("failed to query flashduty_app_key: %v", err)
		return false
	}

	return len(rows) > 0 && rows[0].Cval != ""
}
// CheckTeam checks whether a FlashDuty team exists for the given ref_id.
//
// It queries "/team/info" by ref_id and returns the reported team ID. When
// the request fails it returns 0 together with the error; when the response
// carries no team ID it returns 0 and a nil error.
func (ugs *UserGroupSyncer) CheckTeam(ref_id string) (int64, error) {
	// The team is looked up by its external reference, not by its name.
	query := map[string]interface{}{
		"ref_id": ref_id,
	}

	info, err := PostFlashDutyWithResp[TeamInfo]("/team/info", ugs.appKey, query)
	if err != nil {
		return 0, err
	}
	return info.TeamID, nil
}
================================================
FILE: pkg/flashduty/sync_user_test.go
================================================
package flashduty
import "testing"
// TestPhoneIsSame checks that PhoneIsSame treats a number with a country
// prefix, spaces or hyphens as equal to its bare form, and still tells
// genuinely different numbers apart.
func TestPhoneIsSame(t *testing.T) {
	type phoneCase struct {
		desc  string
		left  string
		right string
		want  bool
	}

	cases := []phoneCase{
		{"blank", "", "", true},
		{"China +86 prefix", "+8613812345678", "13812345678", true},
		{"China +86 with spaces and hyphens", "+86 138-1234-5678", "13812345678", true},
		{"USA +1 prefix", "+1 234-567-8900", "2345678900", true},
		{"UK +44 prefix", "+442078765432", "2078765432", true},
		{"India +91 prefix", "+919876543210", "9876543210", true},
		{"Germany +49 prefix", "+4915123456789", "15123456789", true},
		{"Different numbers", "+8613812345678", "13812345679", false},
	}

	for i := range cases {
		tc := cases[i]
		got := PhoneIsSame(tc.left, tc.right)
		if got == tc.want {
			continue
		}
		t.Errorf("%s: expected %v, got %v", tc.desc, tc.want, got)
	}
}
================================================
FILE: pkg/ginx/auth.go
================================================
// Copyright 2014 Manu Martinez-Almeida. All rights reserved.
// Use of this source code is governed by a MIT style
// license that can be found in the LICENSE file.
package ginx
import (
"crypto/subtle"
"encoding/base64"
"net/http"
"strconv"
"github.com/gin-gonic/gin"
)
// AuthUserKey is the gin context key under which the basic-auth middleware
// stores the name of the authenticated user.
const AuthUserKey = "user"
// Accounts is the list of user/password pairs that are allowed to log in
// through basic auth.
type Accounts []Account

// Account is a single basic-auth credential.
type Account struct {
	User     string
	Password string
}
// authPair ties a precomputed "Authorization" header value to the user name
// it belongs to.
type authPair struct {
	value string // full header value, as produced by authorizationHeader
	user  string
}

// authPairs is the credential table searched by searchCredential.
type authPairs []authPair
// searchCredential looks up the user whose precomputed header value equals
// authValue. It returns the user name and true on a match, or "" and false
// when authValue is empty or matches nothing. Each candidate is compared with
// subtle.ConstantTimeCompare.
func (a authPairs) searchCredential(authValue string) (string, bool) {
	if len(authValue) == 0 {
		return "", false
	}

	presented := StringToBytes(authValue)
	for i := range a {
		if subtle.ConstantTimeCompare(StringToBytes(a[i].value), presented) == 1 {
			return a[i].user, true
		}
	}
	return "", false
}
// BasicAuthForRealm returns a Basic HTTP Authorization middleware. It takes
// the list of authorized accounts and the name of the realm. If the realm is
// empty, "Authorization Required" is used by default.
// (see http://tools.ietf.org/html/rfc2617#section-1.2)
//
// The account list is validated once, when the middleware is built;
// processAccounts panics on an empty list or an empty user name.
func BasicAuthForRealm(accounts Accounts, realm string) gin.HandlerFunc {
	if realm == "" {
		realm = "Authorization Required"
	}
	challenge := "Basic realm=" + strconv.Quote(realm)
	credentials := processAccounts(accounts)

	return func(c *gin.Context) {
		user, ok := credentials.searchCredential(c.Request.Header.Get("Authorization"))
		if ok {
			// Expose the authenticated user to later handlers; it can be read
			// back with c.MustGet(AuthUserKey).
			c.Set(AuthUserKey, user)
			return
		}

		// No matching credentials: answer 401 and abort the handler chain.
		c.Header("WWW-Authenticate", challenge)
		c.AbortWithStatus(http.StatusUnauthorized)
	}
}
// BasicAuth returns a Basic HTTP Authorization middleware for the given list
// of authorized accounts. It is BasicAuthForRealm with an empty realm, so the
// default realm name is used.
func BasicAuth(accounts Accounts) gin.HandlerFunc {
	return BasicAuthForRealm(accounts, "")
}
// processAccounts converts the account list into a table of precomputed
// "Authorization" header values, in the same order as the input. It panics
// when the list is empty or when any account has an empty user name.
func processAccounts(accounts Accounts) authPairs {
	assert1(len(accounts) > 0, "Empty list of authorized credentials")

	table := make(authPairs, len(accounts))
	for i, acct := range accounts {
		assert1(acct.User != "", "User can not be empty")
		table[i] = authPair{
			value: authorizationHeader(acct.User, acct.Password),
			user:  acct.User,
		}
	}
	return table
}
// authorizationHeader builds the "Authorization" header value for basic auth:
// "Basic " followed by the standard base64 encoding of "user:password".
func authorizationHeader(user, password string) string {
	credentials := StringToBytes(user + ":" + password)
	return "Basic " + base64.StdEncoding.EncodeToString(credentials)
}
// assert1 panics with text when guard is false and does nothing otherwise.
func assert1(guard bool, text string) {
	if guard {
		return
	}
	panic(text)
}
================================================
FILE: pkg/ginx/bytesconv.go
================================================
// Copyright 2023 Gin Core Team. All rights reserved.
// Use of this source code is governed by a MIT style
// license that can be found in the LICENSE file.
//go:build go1.20
package ginx
import (
"unsafe"
)
// StringToBytes converts string to byte slice without a memory allocation.
// The returned slice is built over the string's own bytes, so it must not be
// modified.
// For more details, see https://github.com/golang/go/issues/53003#issuecomment-1140276077.
func StringToBytes(s string) []byte {
	data := unsafe.StringData(s)
	return unsafe.Slice(data, len(s))
}
// BytesToString converts byte slice to string without a memory allocation.
// The returned string is built over the slice's own bytes, so the slice must
// not be modified afterwards.
// For more details, see https://github.com/golang/go/issues/53003#issuecomment-1140276077.
func BytesToString(b []byte) string {
	data := unsafe.SliceData(b)
	return unsafe.String(data, len(b))
}
================================================
FILE: pkg/ginx/errorx.go
================================================
package ginx
import "github.com/toolkits/pkg/errorx"
// Bomb forwards its arguments unchanged to errorx.Bomb.
// NOTE(review): presumably this aborts the current request by panicking with
// an error that carries code and the formatted message — confirm against
// github.com/toolkits/pkg/errorx.
func Bomb(code int, format string, a ...interface{}) {
	errorx.Bomb(code, format, a...)
}
// Dangerous forwards its arguments unchanged to errorx.Dangerous.
// NOTE(review): presumably this is a no-op for a nil v and aborts the request
// otherwise, with an optional status code — confirm against
// github.com/toolkits/pkg/errorx.
func Dangerous(v interface{}, code ...int) {
	errorx.Dangerous(v, code...)
}
================================================
FILE: pkg/ginx/funcs.go
================================================
package ginx
import (
"github.com/gin-gonic/gin"
)
// Offset computes the row offset for a paginated query as (page-1)*limit.
//
// The page number is read from the query parameter "p", or from the name in
// pagenoVarName[0] when given, and defaults to 1 when the parameter is
// absent. A non-positive limit is replaced by 10.
func Offset(c *gin.Context, limit int, pagenoVarName ...string) int {
	if limit < 1 {
		limit = 10
	}

	param := "p"
	if len(pagenoVarName) != 0 {
		param = pagenoVarName[0]
	}

	return (QueryInt(c, param, 1) - 1) * limit
}
================================================
FILE: pkg/ginx/param.go
================================================
package ginx
import (
"net/http"
"strconv"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/errorx"
)
// BindJSON decodes the JSON request body into ptr using c.ShouldBindJSON. A
// decoding failure is reported through errorx.Bomb with status 400.
func BindJSON(c *gin.Context, ptr interface{}) {
	if err := c.ShouldBindJSON(ptr); err != nil {
		errorx.Bomb(http.StatusBadRequest, "json body invalid: %v", err)
	}
}
// UrlParamStr returns the URL path parameter named field. A blank value is
// reported through errorx.Bomb with status 400.
func UrlParamStr(c *gin.Context, field string) string {
	value := c.Param(field)
	if len(value) == 0 {
		errorx.Bomb(http.StatusBadRequest, "url param[%s] is blank", field)
	}
	return value
}
// UrlParamInt64 returns the URL path parameter named field parsed as a
// base-10 int64. A blank value (see UrlParamStr) or one that does not parse
// is reported through errorx.Bomb with status 400.
func UrlParamInt64(c *gin.Context, field string) int64 {
	raw := UrlParamStr(c, field)

	parsed, parseErr := strconv.ParseInt(raw, 10, 64)
	if parseErr != nil {
		errorx.Bomb(http.StatusBadRequest, "cannot convert %s to int64", raw)
	}
	return parsed
}
// UrlParamInt returns the URL path parameter named field as an int. The value
// is parsed by UrlParamInt64 and then converted with a plain int(...)
// conversion, so where int is 32 bits wide, values outside that range are
// truncated rather than rejected.
func UrlParamInt(c *gin.Context, field string) int {
	return int(UrlParamInt64(c, field))
}
// QueryStr returns the query-string parameter key. When the parameter is
// missing or empty it falls back to defaultVal[0]; with no default supplied,
// the missing parameter is reported through errorx.Bomb with status 400.
func QueryStr(c *gin.Context, key string, defaultVal ...string) string {
	if got := c.Query(key); got != "" {
		return got
	}

	if len(defaultVal) == 0 {
		errorx.Bomb(http.StatusBadRequest, "query param[%s] is necessary", key)
	}
	return defaultVal[0]
}
// QueryInt returns the query-string parameter key parsed as an int. When the
// parameter is missing or empty it falls back to defaultVal[0]. A value that
// does not parse, or a missing parameter with no default, is reported through
// errorx.Bomb with status 400.
func QueryInt(c *gin.Context, key string, defaultVal ...int) int {
	raw := c.Query(key)
	if raw == "" {
		if len(defaultVal) == 0 {
			errorx.Bomb(http.StatusBadRequest, "query param[%s] is necessary", key)
		}
		return defaultVal[0]
	}

	parsed, parseErr := strconv.Atoi(raw)
	if parseErr != nil {
		errorx.Bomb(http.StatusBadRequest, "cannot convert [%s] to int", raw)
	}
	return parsed
}
// QueryInt64 returns the query-string parameter key parsed as a base-10
// int64. When the parameter is missing or empty it falls back to
// defaultVal[0]. A value that does not parse, or a missing parameter with no
// default, is reported through errorx.Bomb with status 400.
func QueryInt64(c *gin.Context, key string, defaultVal ...int64) int64 {
	raw := c.Query(key)
	if raw == "" {
		if len(defaultVal) == 0 {
			errorx.Bomb(http.StatusBadRequest, "query param[%s] is necessary", key)
		}
		return defaultVal[0]
	}

	parsed, parseErr := strconv.ParseInt(raw, 10, 64)
	if parseErr != nil {
		errorx.Bomb(http.StatusBadRequest, "cannot convert [%s] to int64", raw)
	}
	return parsed
}
// QueryBool returns the query-string parameter key interpreted as a boolean.
//
// "true", "1", "on", "checked", "yes" and "Y" mean true; "false", "0", "off",
// "no" and "N" mean false. Any other non-empty value is reported through
// errorx.Bomb with status 400. When the parameter is missing or empty it
// falls back to defaultVal[0]; with no default supplied, that is reported
// through errorx.Bomb as well.
func QueryBool(c *gin.Context, key string, defaultVal ...bool) bool {
	raw := c.Query(key)
	switch raw {
	case "":
		// Not supplied: handled by the default logic below.
	case "true", "1", "on", "checked", "yes", "Y":
		return true
	case "false", "0", "off", "no", "N":
		return false
	default:
		errorx.Bomb(http.StatusBadRequest, "unknown arg[%s] value: %s", key, raw)
	}

	if len(defaultVal) == 0 {
		errorx.Bomb(http.StatusBadRequest, "arg[%s] is necessary", key)
	}
	return defaultVal[0]
}
================================================
FILE: pkg/ginx/render.go
================================================
package ginx
import (
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/i18n"
)
// Render writes API responses on a gin context using a fixed HTTP status
// code. Create it with NewRender.
type Render struct {
	code int          // HTTP status used for every response written
	ctx  *gin.Context // request context the response is written to
}
// NewRender returns a Render bound to c. The response status is code[0] when
// given and 200 otherwise.
func NewRender(c *gin.Context, code ...int) Render {
	status := 200
	if len(code) > 0 {
		status = code[0]
	}
	return Render{code: status, ctx: c}
}
// Message writes an error-style response for v.
//
// With status 200 the response is a JSON object holding "err" and
// "request_id" (the context's "trace_id" value); with any other status it is
// a plain-text body. A nil v produces an empty message. A string, or the
// Error() text of an error, is used as the format and passed through
// i18n.Sprintf with the request's "X-Language" header and the arguments a.
// Any other type of v writes nothing.
func (r Render) Message(v interface{}, a ...interface{}) {
	requestID := r.ctx.GetString("trace_id")
	asJSON := r.code == 200

	if v == nil {
		if asJSON {
			r.ctx.JSON(r.code, gin.H{"err": "", "request_id": requestID})
		} else {
			r.ctx.String(r.code, "")
		}
		return
	}

	var format string
	switch val := v.(type) {
	case string:
		format = val
	case error:
		format = val.Error()
	default:
		// Unsupported types are silently ignored.
		return
	}

	msg := i18n.Sprintf(r.ctx.GetHeader("X-Language"), format, a...)
	if asJSON {
		r.ctx.JSON(r.code, gin.H{"err": msg, "request_id": requestID})
		return
	}
	r.ctx.String(r.code, msg)
}
// Data writes data as a successful JSON response ("dat", an empty "err" and
// "request_id") when err is nil. Otherwise it delegates to Message with err
// and the arguments a.
func (r Render) Data(data interface{}, err interface{}, a ...interface{}) {
	if err != nil {
		r.Message(err, a...)
		return
	}

	r.ctx.JSON(r.code, gin.H{"dat": data, "err": "", "request_id": r.ctx.GetString("trace_id")})
}
// ZeroPage writes a successful response describing an empty page: an empty
// "list" and a "total" of 0.
func (r Render) ZeroPage() {
	emptyPage := gin.H{
		"list":  []int{},
		"total": 0,
	}
	r.Data(emptyPage, nil)
}
================================================
FILE: pkg/hash/hash.go
================================================
package hash
import (
"sort"
"strings"
prommodel "github.com/prometheus/common/model"
"github.com/spaolacci/murmur3"
)
// GetHash returns the murmur3 64-bit hash of a metric's label set combined
// with ref.
//
// The hashed text is "/name/value" for every label, in ascending label-name
// order, followed by "/" and ref. Sorting the names keeps the result
// independent of map iteration order.
func GetHash(m prommodel.Metric, ref string) uint64 {
	names := make([]string, 0, len(m))
	for name := range m {
		names = append(names, string(name))
	}
	sort.Strings(names)

	// A builder avoids re-copying the whole string on every append.
	var b strings.Builder
	for _, name := range names {
		b.WriteString("/")
		b.WriteString(name)
		b.WriteString("/")
		b.WriteString(string(m[prommodel.LabelName(name)]))
	}
	b.WriteString("/")
	b.WriteString(ref)

	return murmur3.Sum64([]byte(b.String()))
}
// GetTagHash returns the murmur3 64-bit hash of a metric's labels, ignoring
// the "__name__" label.
//
// The hashed text is "/name/value" for every remaining label, in ascending
// label-name order. Sorting the names keeps the result independent of map
// iteration order.
func GetTagHash(m prommodel.Metric) uint64 {
	names := make([]string, 0, len(m))
	for name := range m {
		if name == "__name__" {
			continue
		}
		names = append(names, string(name))
	}
	sort.Strings(names)

	// A builder avoids re-copying the whole string on every append.
	var b strings.Builder
	for _, name := range names {
		b.WriteString("/")
		b.WriteString(name)
		b.WriteString("/")
		b.WriteString(string(m[prommodel.LabelName(name)]))
	}

	return murmur3.Sum64([]byte(b.String()))
}
// GetTargetTagHash returns the murmur3 64-bit hash of the labels named in
// target, taken in the order given. Each name contributes "/name/value"; a
// name that is not present in m contributes an empty value.
func GetTargetTagHash(m prommodel.Metric, target []string) uint64 {
	var b strings.Builder
	for i := range target {
		name := target[i]
		value := string(m[prommodel.LabelName(name)])
		b.WriteString("/")
		b.WriteString(name)
		b.WriteString("/")
		b.WriteString(value)
	}
	text := b.String()
	return murmur3.Sum64([]byte(text))
}
================================================
FILE: pkg/hash/hash_fnv.go
================================================
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package hash
import (
"hash"
"github.com/davecgh/go-spew/spew"
)
// DeepHashObject writes specified object to hash using the spew library
// which follows pointers and prints actual values of the nested objects
// ensuring the hash does not change when a pointer changes.
//
// The hasher is reset first, so anything written to it earlier is discarded.
func DeepHashObject(hasher hash.Hash, objectToWrite interface{}) {
	hasher.Reset()

	var dumper spew.ConfigState
	dumper.Indent = " "
	dumper.SortKeys = true
	dumper.DisableMethods = true
	dumper.SpewKeys = true

	dumper.Fprintf(hasher, "%#v", objectToWrite)
}
================================================
FILE: pkg/hash/hash_md5.go
================================================
package hash
import (
	"sort"
	"strings"

	prommodel "github.com/prometheus/common/model"
	"github.com/toolkits/pkg/str"
)
// GetHash2 returns the MD5 digest (via str.MD5) of a metric's label set
// combined with ref.
//
// The hashed text is "/name/value" for every label, in ascending label-name
// order, followed by "/" and ref. The names are sorted because Go does not
// define map iteration order; hashing in iteration order could give
// different digests for the same metric from one call to the next.
func GetHash2(m prommodel.Metric, ref string) string {
	names := make([]string, 0, len(m))
	for name := range m {
		names = append(names, string(name))
	}
	sort.Strings(names)

	var b strings.Builder
	for _, name := range names {
		b.WriteString("/")
		b.WriteString(name)
		b.WriteString("/")
		b.WriteString(string(m[prommodel.LabelName(name)]))
	}
	b.WriteString("/")
	b.WriteString(ref)

	return str.MD5(b.String())
}
// GetTagHash2 returns the MD5 digest (via str.MD5) of a metric's labels,
// ignoring the "__name__" label.
//
// The hashed text is "/name/value" for every remaining label, in ascending
// label-name order. The names are sorted because Go does not define map
// iteration order; hashing in iteration order could give different digests
// for the same metric from one call to the next.
func GetTagHash2(m prommodel.Metric) string {
	names := make([]string, 0, len(m))
	for name := range m {
		if name == "__name__" {
			continue
		}
		names = append(names, string(name))
	}
	sort.Strings(names)

	var b strings.Builder
	for _, name := range names {
		b.WriteString("/")
		b.WriteString(name)
		b.WriteString("/")
		b.WriteString(string(m[prommodel.LabelName(name)]))
	}

	return str.MD5(b.String())
}
================================================
FILE: pkg/httpx/httpx.go
================================================
package httpx
import (
"context"
"crypto/tls"
"fmt"
"net/http"
"os"
"strings"
"time"
"github.com/ccfos/nightingale/v6/pkg/aop"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pkg/version"
"github.com/gin-contrib/pprof"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Config holds the settings of the HTTP server built by GinEngine and started
// by Init, together with the authentication settings carried alongside them.
type Config struct {
	Host string // listen host; joined with Port to form the listen address
	Port int
	// CertFile and KeyFile enable TLS; Init serves TLS only when both are set.
	CertFile string
	KeyFile  string
	PProf    bool // register the pprof handlers under /api/debug/pprof
	// NOTE(review): PrintAccessLog, PrintBody and MaxContentLength are not
	// read in this file — confirm where they are consumed.
	PrintAccessLog   bool
	PrintBody        bool
	ExposeMetrics    bool // expose the Prometheus handler on /metrics
	ShutdownTimeout  int  // seconds allowed for graceful shutdown
	MaxContentLength int64
	ReadTimeout      int // seconds; http.Server.ReadTimeout
	WriteTimeout     int // seconds; http.Server.WriteTimeout
	IdleTimeout      int // seconds; http.Server.IdleTimeout
	JWTAuth          JWTAuth
	ProxyAuth        ProxyAuth
	ShowCaptcha      ShowCaptcha
	APIForAgent      BasicAuths
	APIForService    BasicAuths
	RSA              RSAConfig
	TokenAuth        TokenAuth
}
// RSAConfig groups the RSA-related settings of the HTTP server.
// NOTE(review): these fields are not read in this file. By their names they
// hold an on/off switch, a key pair as raw bytes and as file paths, and a
// passphrase — confirm against the code that consumes them.
type RSAConfig struct {
	OpenRSA           bool
	RSAPublicKey      []byte
	RSAPublicKeyPath  string
	RSAPrivateKey     []byte
	RSAPrivateKeyPath string
	RSAPassWord       string
}
// ShowCaptcha is the captcha switch of the HTTP configuration.
// NOTE(review): not read in this file; presumably toggles the login captcha —
// confirm against the caller.
type ShowCaptcha struct {
	Enable bool
}
// BasicAuths holds a set of basic-auth accounts and a switch for them. Config
// uses it for APIForAgent and APIForService.
// NOTE(review): not read in this file; presumably Enable decides whether
// BasicAuth is enforced on the corresponding API group — confirm.
type BasicAuths struct {
	BasicAuth gin.Accounts
	Enable    bool
}
// ProxyAuth holds the settings for authentication delegated to a fronting
// proxy.
// NOTE(review): not read in this file; by the field names the user name comes
// from the header HeaderUserNameKey and new users get DefaultRoles — confirm.
type ProxyAuth struct {
	Enable            bool
	HeaderUserNameKey string
	DefaultRoles      []string
}
// JWTAuth holds the settings for JWT-based authentication.
// NOTE(review): not read in this file; the units of AccessExpired and
// RefreshExpired and the exact meaning of SingleLogin are not visible here —
// confirm against the code that consumes them.
type JWTAuth struct {
	SigningKey     string
	AccessExpired  int64
	RefreshExpired int64
	RedisKeyPrefix string
	SingleLogin    bool
}
// TokenAuth holds the settings for header-token authentication.
// NOTE(review): not read in this file; presumably the token is taken from the
// request header named by HeaderUserTokenKey — confirm.
type TokenAuth struct {
	Enable             bool
	HeaderUserTokenKey string
}
// GinEngine builds the gin engine shared by the HTTP services.
//
// It sets the gin mode, installs the trace-id, recovery and access-log
// middlewares in that order, and registers the built-in endpoints /ping,
// /pid, /ppid, /addr and /api/n9e/version. The pprof handlers are mounted
// under /api/debug/pprof when cfg.PProf is set, and /metrics is exposed when
// cfg.ExposeMetrics is set. Console colours are disabled in "release" mode
// (matched case-insensitively).
//
// printBodyPaths and printAccessLog are passed to the access-log middleware
// as callbacks.
func GinEngine(mode string, cfg Config, printBodyPaths func() map[string]struct{},
	printAccessLog func() bool) *gin.Engine {
	gin.SetMode(mode)

	accessLog := aop.Logger(aop.LoggerConfig{
		PrintAccessLog: printAccessLog,
		PrintBodyPaths: printBodyPaths,
	})
	recovery := aop.Recovery()

	if strings.ToLower(mode) == "release" {
		aop.DisableConsoleColor()
	}

	engine := gin.New()
	engine.Use(traceIdMid(), recovery, accessLog)

	if cfg.PProf {
		pprof.Register(engine, "/api/debug/pprof")
	}

	engine.GET("/ping", func(c *gin.Context) {
		c.String(200, "pong")
	})
	engine.GET("/pid", func(c *gin.Context) {
		pid := fmt.Sprintf("%d", os.Getpid())
		c.String(200, pid)
	})
	engine.GET("/ppid", func(c *gin.Context) {
		ppid := fmt.Sprintf("%d", os.Getppid())
		c.String(200, ppid)
	})
	engine.GET("/addr", func(c *gin.Context) {
		c.String(200, c.Request.RemoteAddr)
	})
	engine.GET("/api/n9e/version", func(c *gin.Context) {
		c.String(200, version.Version)
	})

	if cfg.ExposeMetrics {
		engine.GET("/metrics", gin.WrapH(promhttp.Handler()))
	}

	return engine
}
// traceIdMid returns a middleware that attaches a trace id to every request.
//
// The id comes from the "X-Trace-Id" request header when isValidTraceId
// accepts it; otherwise a fresh UUID is generated. The id is stored in the
// gin context under "trace_id", attached to the request's context through
// logx.NewTraceContext, and echoed back in the "X-Trace-Id" response header
// before the remaining handlers run.
func traceIdMid() gin.HandlerFunc {
	return func(c *gin.Context) {
		traceID := c.GetHeader("X-Trace-Id")
		if !isValidTraceId(traceID) {
			traceID = uuid.New().String()
		}

		c.Set("trace_id", traceID)
		c.Request = c.Request.WithContext(logx.NewTraceContext(c.Request.Context(), traceID))
		c.Header("X-Trace-Id", traceID)

		c.Next()
	}
}
// isValidTraceId reports whether id may be used as a trace id: it must be 1
// to 64 bytes long and consist only of ASCII letters, digits, '-' and '_'.
func isValidTraceId(id string) bool {
	if n := len(id); n == 0 || n > 64 {
		return false
	}

	for _, ch := range id {
		switch {
		case 'a' <= ch && ch <= 'z':
		case 'A' <= ch && ch <= 'Z':
		case '0' <= ch && ch <= '9':
		case ch == '-', ch == '_':
		default:
			return false
		}
	}
	return true
}
// Init starts an HTTP server for handler on cfg.Host:cfg.Port in a background
// goroutine and returns a function that shuts it down.
//
// The server is served over TLS (minimum version TLS 1.2) when both
// cfg.CertFile and cfg.KeyFile are set, and over plain HTTP otherwise. The
// read, write and idle timeouts come from cfg and are in seconds. Any serve
// error other than http.ErrServerClosed panics in the background goroutine.
//
// The returned function disables keep-alives and shuts the server down
// gracefully, waiting at most cfg.ShutdownTimeout seconds.
func Init(cfg Config, handler http.Handler) func() {
	listenAddr := fmt.Sprintf("%s:%d", cfg.Host, cfg.Port)

	server := &http.Server{
		Addr:         listenAddr,
		Handler:      handler,
		ReadTimeout:  time.Duration(cfg.ReadTimeout) * time.Second,
		WriteTimeout: time.Duration(cfg.WriteTimeout) * time.Second,
		IdleTimeout:  time.Duration(cfg.IdleTimeout) * time.Second,
	}

	useTLS := cfg.CertFile != "" && cfg.KeyFile != ""

	go func() {
		fmt.Println("http server listening on:", listenAddr)

		var serveErr error
		if useTLS {
			server.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
			serveErr = server.ListenAndServeTLS(cfg.CertFile, cfg.KeyFile)
		} else {
			serveErr = server.ListenAndServe()
		}

		// ErrServerClosed is the normal result of a shutdown.
		if serveErr != nil && serveErr != http.ErrServerClosed {
			panic(serveErr)
		}
	}()

	return func() {
		grace := time.Second * time.Duration(cfg.ShutdownTimeout)
		ctx, cancel := context.WithTimeout(context.Background(), grace)
		defer cancel()

		server.SetKeepAlivesEnabled(false)
		if err := server.Shutdown(ctx); err != nil {
			fmt.Println("cannot shutdown http server:", err)
		}

		// Report whether the grace period ran out before shutdown completed.
		select {
		case <-ctx.Done():
			fmt.Println("http exiting")
		default:
			fmt.Println("http server stopped")
		}
	}
}
================================================
FILE: pkg/i18nx/i18n.go
================================================
package i18nx
import (
"encoding/json"
"path"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/i18n"
"github.com/toolkits/pkg/logger"
)
// Init loads the translation dictionaries and registers them with the i18n
// package.
//
// The built-in dictionary (I18N) is always parsed. When configDir contains an
// i18n.json file, that file is the primary dictionary and the built-in
// entries only fill in the languages and keys it does not define; entries
// from the file win on conflict. Without the file, the built-in dictionary is
// registered as is.
//
// Expected shape of i18n.json:
//
//	{
//	  "zh": {
//	    "username": "用户名"
//	  },
//	  "fr": {
//	    "username": "nom d'utilisateur"
//	  }
//	}
//
// On any read or parse failure the error is logged and nothing is registered.
func Init(configDir string) {
	filePath := path.Join(configDir, "i18n.json")

	builtInConf := make(map[string]map[string]string)
	if err := json.Unmarshal([]byte(I18N), &builtInConf); err != nil {
		// The external file has not been touched yet; this failure concerns
		// the built-in dictionary only.
		logger.Errorf("parse built-in i18n config fail: %s\n", err)
		return
	}

	if !file.IsExist(filePath) {
		i18n.DictRegister(builtInConf)
		return
	}

	content, err := file.ToTrimString(filePath)
	if err != nil {
		logger.Errorf("read i18n config file %s fail: %s\n", filePath, err)
		return
	}

	m := make(map[string]map[string]string)
	if err = json.Unmarshal([]byte(content), &m); err != nil {
		logger.Errorf("parse i18n config file %s fail: %s\n", filePath, err)
		return
	}
	// A file containing just `null` resets m to a nil map; writing to it
	// below would panic.
	if m == nil {
		m = make(map[string]map[string]string)
	}

	for lang, builtIn := range builtInConf {
		custom := m[lang]
		// A language that is missing, or present as JSON null, yields a nil
		// map here. Adopt the built-in dictionary instead of writing to nil.
		if custom == nil {
			m[lang] = builtIn
			continue
		}
		for key, text := range builtIn {
			if _, overridden := custom[key]; !overridden {
				custom[key] = text
			}
		}
	}

	i18n.DictRegister(m)
}
================================================
FILE: pkg/i18nx/var.go
================================================
package i18nx
var I18N = `{
"zh_CN": {
"Username or password invalid": "用户名或密码错误",
"incorrect verification code": "验证码错误",
"roles empty": "角色不能为空",
"Username already exists": "此用户名已存在 请使用其他用户名",
"failed to count user-groups": "校验数据失败 请重试",
"UserGroup already exists": "组名已存在 请使用其他名称",
"members empty": "成员不能为空",
"At least one team have rw permission": "至少需要有一个团队有读写权限",
"Failed to create BusiGroup(%s)": "[%s]创建失败 请重试",
"business group id invalid": "业务组 id 不正确",
"idents empty": "监控对象不能为空",
"invalid tag(%s)": "tag不合法[%s]",
"invalid tagkey(%s): cannot contains . ": "tagkey[%s]不能包含.",
"invalid tagkey(%s): cannot contains _ ": "tagkey[%s]不能包含_",
"invalid tagkey(%s)": "tagkey不合法[%s]",
"duplicate tagkey(%s)": "tagkey(%s)重复了",
"name is empty": "名称不能为空",
"Ident duplicate": "仪表盘唯一标识已存在",
"No such dashboard": "仪表盘不存在",
"Name has invalid characters": "名称包含非法字符",
"Name is blank": "名称不能为空",
"forbidden": "没有权限",
"builtin alerts is empty, file: %s": "内置告警模板为空 %s",
"input json is empty": "提交内容不能为空",
"fields empty": "选择字段不能为空",
"No such AlertRule": "无此告警规则",
"GroupId(%d) invalid": "业务组id无效",
"No such recording rule": "无此记录规则",
"tags is blank": "标签不能为空",
"oops... etime(%d) <= btime(%d)": "开始时间,不能大于结束时间",
"group_id invalid": "业务组无效",
"No such AlertMute": "无此屏蔽规则",
"rule_id and tags are both blank": "告警规则和标签不能同时为空",
"rule is blank": "规则不能为空",
"rule invalid": "规则无效 请检查是否正确",
"unsupported field: %s": "不支持字段 %s",
"arg(batch) should be nonnegative": "batch 不能为负数",
"arg(tolerance) should be nonnegative": "tolerance 不能为负数",
"arg(timeout) should be nonnegative": "timeout 不能为负数",
"arg(timeout) longer than five days": "timeout 时间不能超过5天",
"arg(title) is required": "title 为必填项",
"created task.id is zero": "任务id为零",
"invalid ibex address: %s": "ibex %s 地址无效",
"url path invalid": "url非法",
"no such server": "无此实例",
"admin role can not be modified": "管理员角色不允许修改",
"builtin payload already exists": "内置模板已存在",
"This functionality has not been enabled. Please contact the system administrator to activate it.": "此功能尚未启用。请联系系统管理员启用",
"targets not exist: %s": "有些机器不存在: %s",
"mute is disabled": "屏蔽规则已禁用",
"datasource id not match": "数据源ID不匹配",
"event trigger time not within mute time range": "事件触发时间不在屏蔽时间范围内",
"event trigger time not within periodic mute range": "事件触发时间不在周期性屏蔽时间范围内",
"mute time type invalid": "屏蔽时间类型无效",
"event severity not match mute severity": "事件严重程度与屏蔽严重程度不匹配",
"event tags not match mute tags": "事件标签与屏蔽标签不匹配",
"event datasource not match": "事件数据源不匹配",
"event rule id not match": "事件告警规则ID不匹配",
"event tags not match": "事件标签不匹配",
"event group name not match": "事件业务组名称不匹配",
"event severity not match": "事件严重程度不匹配",
"subscribe notify rule not found: %v": "订阅通知规则未找到: %v",
"notify rule send error: %v": "通知规则发送错误: %v",
"event match subscribe and notification test ok": "事件匹配订阅规则,通知测试成功",
"no notify rules selected": "未选择通知规则",
"no notify channels selected": "未选择通知渠道",
"no notify groups selected": "未选择通知组",
"all users missing notify channel configurations: %v": "所有用户缺少通知渠道配置: %v",
"event match subscribe and notify settings ok": "事件匹配订阅规则,通知设置正常",
"/loki suffix is miss, please add /loki to the url: %s": "缺少/loki后缀,请在URL中添加/loki:%s",
"event time not match time filter": "事件时间不匹配时间过滤器",
"event severity not match severity filter": "事件等级不匹配等级过滤器",
"event tag not match tag filter": "事件标签不匹配标签过滤器",
"event attributes not match attributes filter": "事件属性不匹配属性过滤器",
"failed to parse tag filter: %v": "解析标签过滤器失败: %v",
"event is dropped": "事件已被丢弃,不会进行通知",
"drop event success": "丢弃事件成功",
"drop event failed": "丢弃事件失败",
"callback success": "回调成功",
"Infrastructure": "基础设施",
"Host - View": "机器 - 查看",
"Host - Modify": "机器 - 修改",
"Host - Delete": "机器 - 删除",
"Host - Bind Uncategorized": "机器 - 绑定未归组机器到某个业务组",
"Explorer": "数据查询",
"Metrics Explorer": "指标查询",
"Quick View": "快捷视图",
"Built-in Metric - View": "内置指标 - 查看",
"Built-in Metric - Add": "内置指标 - 新增",
"Built-in Metric - Modify": "内置指标 - 修改",
"Built-in Metric - Delete": "内置指标 - 删除",
"Recording Rule - View": "记录规则 - 查看",
"Recording Rule - Add": "记录规则 - 新增",
"Recording Rule - Modify": "记录规则 - 修改",
"Recording Rule - Delete": "记录规则 - 删除",
"Logs Explorer": "日志查询",
"Index Pattern - View": "索引模式 - 查看",
"Index Pattern - Add": "索引模式 - 新增",
"Index Pattern - Modify": "索引模式 - 修改",
"Index Pattern - Delete": "索引模式 - 删除",
"Dashboard - View": "仪表盘 - 查看",
"Dashboard - Add": "仪表盘 - 新增",
"Dashboard - Modify": "仪表盘 - 修改",
"Dashboard - Delete": "仪表盘 - 删除",
"Dashboard - View Public": "仪表盘 - 查看公开仪表盘",
"Alerting": "告警",
"Alerting Rule - View": "告警规则 - 查看",
"Alerting Rule - Add": "告警规则 - 新增",
"Alerting Rule - Modify": "告警规则 - 修改",
"Alerting Rule - Delete": "告警规则 - 删除",
"Mutting Rule - View": "屏蔽规则 - 查看",
"Mutting Rule - Add": "屏蔽规则 - 新增",
"Mutting Rule - Modify": "屏蔽规则 - 修改",
"Mutting Rule - Delete": "屏蔽规则 - 删除",
"Subscribing Rule - View": "订阅规则 - 查看",
"Subscribing Rule - Add": "订阅规则 - 新增",
"Subscribing Rule - Modify": "订阅规则 - 修改",
"Subscribing Rule - Delete": "订阅规则 - 删除",
"Self-healing-Script - View": "自愈脚本 - 查看",
"Self-healing-Script - Add": "自愈脚本 - 新增",
"Self-healing-Script - Modify": "自愈脚本 - 修改",
"Self-healing-Script - Delete": "自愈脚本 - 删除",
"Self-healing-Job - View": "自愈任务 - 查看",
"Self-healing-Job - Add": "自愈任务 - 新增",
"Self-healing-Job - Modify": "自愈任务 - 修改",
"Active Event - View": "活跃事件 - 查看",
"Active Event - Delete": "活跃事件 - 删除",
"Historical Event - View": "历史事件 - 查看",
"Notification": "通知",
"Notification Rule - View": "通知规则 - 查看",
"Notification Rule - Add": "通知规则 - 新增",
"Notification Rule - Modify": "通知规则 - 修改",
"Notification Rule - Delete": "通知规则 - 删除",
"Media Type - View": "通知媒介 - 查看",
"Media Type - Add": "通知媒介 - 新增",
"Media Type - Modify": "通知媒介 - 修改",
"Media Type - Delete": "通知媒介 - 删除",
"Message Template - View": "消息模板 - 查看",
"Message Template - Add": "消息模板 - 新增",
"Message Template - Modify": "消息模板 - 修改",
"Message Template - Delete": "消息模板 - 删除",
"Event Pipeline - View": "事件管道 - 查看",
"Event Pipeline - Add": "事件管道 - 新增",
"Event Pipeline - Modify": "事件管道 - 修改",
"Event Pipeline - Delete": "事件管道 - 删除",
"Notification Settings - View": "老版本通知设置 - 查看",
"Notification Templates - View": "老版本消息模板 - 查看",
"Integrations": "集成中心",
"Data Source - View": "数据源 - 查看",
"Component - View": "组件 - 查看",
"Component - Add": "组件 - 新增",
"Component - Modify": "组件 - 修改",
"Component - Delete": "组件 - 删除",
"Embedded Product - View": "系统集成 - 查看",
"Embedded Product - Add": "系统集成 - 新增",
"Embedded Product - Modify": "系统集成 - 修改",
"Embedded Product - Delete": "系统集成 - 删除",
"Organization": "人员组织",
"User - View": "用户 - 查看",
"User - Add": "用户 - 新增",
"User - Modify": "用户 - 修改",
"User - Delete": "用户 - 删除",
"Team - View": "团队 - 查看",
"Team - Add": "团队 - 新增",
"Team - Modify": "团队 - 修改",
"Team - Delete": "团队 - 删除",
"Business Group - View": "业务组 - 查看",
"Business Group - Add": "业务组 - 新增",
"Business Group - Modify": "业务组 - 修改",
"Business Group - Delete": "业务组 - 删除",
"Role - View": "角色 - 查看",
"Role - Add": "角色 - 新增",
"Role - Modify": "角色 - 修改",
"Role - Delete": "角色 - 删除",
"System Settings": "系统配置",
"View Site Settings": "查看站点设置",
"View Variable Settings": "查看变量配置",
"View SSO Settings": "查看单点登录配置",
"View Alerting Engines": "查看告警引擎列表",
"View Product Version": "查看产品版本",
"Some alert rules still in the BusiGroup": "业务组中仍有告警规则",
"Some alert mutes still in the BusiGroup": "业务组中仍有屏蔽规则",
"Some alert subscribes still in the BusiGroup": "业务组中仍有订阅规则",
"Some Board still in the BusiGroup": "业务组中仍有仪表盘",
"Some targets still in the BusiGroup": "业务组中仍有监控对象",
"Some recording rules still in the BusiGroup": "业务组中仍有记录规则",
"Some recovery scripts still in the BusiGroup": "业务组中仍有自愈脚本",
"Some target busigroups still in the BusiGroup": "业务组中仍有监控对象",
"saved view not found": "保存的视图不存在",
"saved view name is blank": "视图名称不能为空",
"saved view page is blank": "视图页面不能为空",
"saved view name already exists in this page": "该页面下已存在同名的公开视图",
"---------zh_CN--------": "---------zh_CN--------"
},
"zh_HK": {
"Username or password invalid": "用戶名或密碼錯誤",
"incorrect verification code": "驗證碼錯誤",
"roles empty": "角色不能為空",
"Username already exists": "此用戶名已存在 請使用其他用戶名",
"failed to count user-groups": "校驗數據失敗 請重試",
"UserGroup already exists": "組名已存在 請使用其他名稱",
"members empty": "成員不能為空",
"At least one team have rw permission": "至少需要有一個團隊有讀寫權限",
"Failed to create BusiGroup(%s)": "[%s]創建失敗 請重試",
"business group id invalid": "業務組 id 不正確",
"idents empty": "監控對象不能為空",
"invalid tag(%s)": "tag不合法[%s]",
"invalid tagkey(%s): cannot contains . ": "tagkey[%s]不能包含.",
"invalid tagkey(%s): cannot contains _ ": "tagkey[%s]不能包含_",
"invalid tagkey(%s)": "tagkey不合法[%s]",
"duplicate tagkey(%s)": "tagkey(%s)重複了",
"name is empty": "名稱不能為空",
"Ident duplicate": "儀表板唯一標識已存在",
"Name duplicate": "儀表板名稱已存在",
"No such dashboard": "儀表板不存在",
"Name has invalid characters": "名稱包含非法字符",
"Name is blank": "名稱不能為空",
"forbidden": "沒有權限",
"builtin alerts is empty, file: %s": "內置告警模板為空 %s",
"input json is empty": "提交內容不能為空",
"fields empty": "選擇字段不能為空",
"No such AlertRule": "無此告警規則",
"GroupId(%d) invalid": "業務組id無效",
"No such recording rule": "無此記錄規則",
"tags is blank": "標籤不能為空",
"oops... etime(%d) <= btime(%d)": "開始時間,不能大於結束時間",
"group_id invalid": "業務組無效",
"No such AlertMute": "無此屏蔽規則",
"rule_id and tags are both blank": "告警規則和標籤不能同時為空",
"rule is blank": "規則不能為空",
"rule invalid": "規則無效 請檢查是否正確",
"unsupported field: %s": "不支持字段 %s",
"arg(batch) should be nonnegative": "batch 不能為負數",
"arg(tolerance) should be nonnegative": "tolerance 不能為負數",
"arg(timeout) should be nonnegative": "timeout 不能為負數",
"arg(timeout) longer than five days": "timeout 時間不能超過5天",
"arg(title) is required": "title 為必填項",
"created task.id is zero": "任務id為零",
"invalid ibex address: %s": "ibex %s 地址無效",
"url path invalid": "url非法",
"no such server": "無此實例",
"admin role can not be modified": "管理員角色不允許修改",
"builtin payload already exists": "內置模板已存在",
"builtin metric already exists": "內置指標已存在",
"AlertRule already exists": "告警規則已存在",
"This functionality has not been enabled. Please contact the system administrator to activate it.": "此功能尚未啟用。請聯繫系統管理員啟用",
"targets not exist: %s": "有些機器不存在: %s",
"mute is disabled": "屏蔽規則已禁用",
"datasource id not match": "數據源ID不匹配",
"event trigger time not within mute time range": "事件觸發時間不在屏蔽時間範圍內",
"event trigger time not within periodic mute range": "事件觸發時間不在週期性屏蔽時間範圍內",
"mute time type invalid": "屏蔽時間類型無效",
"event severity not match mute severity": "事件嚴重程度與屏蔽嚴重程度不匹配",
"event tags not match mute tags": "事件標籤與屏蔽標籤不匹配",
"event datasource not match": "事件數據源不匹配",
"event rule id not match": "事件告警規則ID不匹配",
"event tags not match": "事件標籤不匹配",
"event group name not match": "事件業務組名稱不匹配",
"event severity not match": "事件嚴重程度不匹配",
"subscribe notify rule not found: %v": "訂閱通知規則未找到: %v",
"notify rule send error: %v": "通知規則發送錯誤: %v",
"event match subscribe and notification test ok": "事件匹配訂閱規則,通知測試成功",
"no notify rules selected": "未選擇通知規則",
"no notify channels selected": "未選擇通知渠道",
"no notify groups selected": "未選擇通知組",
"all users missing notify channel configurations: %v": "所有用戶缺少通知渠道配置: %v",
"event match subscribe and notify settings ok": "事件匹配訂閱規則,通知設置正常",
"/loki suffix is miss, please add /loki to the url: %s": "缺少/loki後綴,請在URL中添加/loki:%s",
"event time not match time filter": "事件時間不匹配時間過濾器",
"event severity not match severity filter": "事件等級不匹配等級過濾器",
"event tag not match tag filter": "事件標籤不匹配標籤過濾器",
"event attributes not match attributes filter": "事件屬性不匹配屬性過濾器",
"failed to parse tag filter: %v": "解析標籤過濾器失敗: %v",
"event is dropped": "事件已被丟棄,不會進行通知",
"drop event success": "丟棄事件成功",
"drop event failed": "丟棄事件失敗",
"callback success": "回調成功",
"Infrastructure": "基礎設施",
"Host - View": "機器 - 查看",
"Host - Modify": "機器 - 修改",
"Host - Delete": "機器 - 删除",
"Host - Bind Uncategorized": "機器 - 綁定未歸組機器到某個業務組",
"Explorer": "數據查詢",
"Metrics Explorer": "指標查詢",
"Quick View": "快捷視圖",
"Built-in Metric - View": "內置指標 - 查看",
"Built-in Metric - Add": "內置指標 - 新增",
"Built-in Metric - Modify": "內置指標 - 修改",
"Built-in Metric - Delete": "內置指標 - 删除",
"Recording Rule - View": "記錄規則 - 查看",
"Recording Rule - Add": "記錄規則 - 新增",
"Recording Rule - Modify": "記錄規則 - 修改",
"Recording Rule - Delete": "記錄規則 - 删除",
"Logs Explorer": "日誌查詢",
"Index Pattern - View": "索引模式 - 查看",
"Index Pattern - Add": "索引模式 - 新增",
"Index Pattern - Modify": "索引模式 - 修改",
"Index Pattern - Delete": "索引模式 - 删除",
"Dashboard - View": "儀表板 - 查看",
"Dashboard - Add": "儀表板 - 新增",
"Dashboard - Modify": "儀表板 - 修改",
"Dashboard - Delete": "儀表板 - 删除",
"Dashboard - View Public": "儀表板 - 查看公開儀表板",
"Alerting": "告警",
"Alerting Rule - View": "告警規則 - 查看",
"Alerting Rule - Add": "告警規則 - 新增",
"Alerting Rule - Modify": "告警規則 - 修改",
"Alerting Rule - Delete": "告警規則 - 删除",
"Mutting Rule - View": "屏蔽規則 - 查看",
"Mutting Rule - Add": "屏蔽規則 - 新增",
"Mutting Rule - Modify": "屏蔽規則 - 修改",
"Mutting Rule - Delete": "屏蔽規則 - 删除",
"Subscribing Rule - View": "訂閱規則 - 查看",
"Subscribing Rule - Add": "訂閱規則 - 新增",
"Subscribing Rule - Modify": "訂閱規則 - 修改",
"Subscribing Rule - Delete": "訂閱規則 - 删除",
"Self-healing-Script - View": "自愈腳本 - 查看",
"Self-healing-Script - Add": "自愈腳本 - 新增",
"Self-healing-Script - Modify": "自愈腳本 - 修改",
"Self-healing-Script - Delete": "自愈腳本 - 删除",
"Self-healing-Job - View": "自愈任務 - 查看",
"Self-healing-Job - Add": "自愈任務 - 新增",
"Self-healing-Job - Modify": "自愈任務 - 修改",
"Active Event - View": "活躍事件 - 查看",
"Active Event - Delete": "活躍事件 - 删除",
"Historical Event - View": "歷史事件 - 查看",
"Notification": "通知",
"Notification Rule - View": "通知規則 - 查看",
"Notification Rule - Add": "通知規則 - 新增",
"Notification Rule - Modify": "通知規則 - 修改",
"Notification Rule - Delete": "通知規則 - 删除",
"Media Type - View": "通知媒介 - 查看",
"Media Type - Add": "通知媒介 - 新增",
"Media Type - Modify": "通知媒介 - 修改",
"Media Type - Delete": "通知媒介 - 删除",
"Message Template - View": "訊息範本 - 查看",
"Message Template - Add": "訊息範本 - 新增",
"Message Template - Modify": "訊息範本 - 修改",
"Message Template - Delete": "訊息範本 - 删除",
"Event Pipeline - View": "事件管線 - 查看",
"Event Pipeline - Add": "事件管線 - 新增",
"Event Pipeline - Modify": "事件管線 - 修改",
"Event Pipeline - Delete": "事件管線 - 删除",
"Notification Settings - View": "老版本通知设置 - 查看",
"Notification Templates - View": "老版本訊息範本 - 查看",
"Integrations": "集成中心",
"Data Source - View": "資料源 - 查看",
"Component - View": "組件 - 查看",
"Component - Add": "組件 - 新增",
"Component - Modify": "組件 - 修改",
"Component - Delete": "組件 - 刪除",
"Embedded Product - View": "系統集成 - 查看",
"Embedded Product - Add": "系統集成 - 新增",
"Embedded Product - Modify": "系統集成 - 修改",
"Embedded Product - Delete": "系統集成 - 刪除",
"Organization": "人員組織",
"User - View": "用戶 - 查看",
"User - Add": "用戶 - 新增",
"User - Modify": "用戶 - 修改",
"User - Delete": "用戶 - 刪除",
"Team - View": "團隊 - 查看",
"Team - Add": "團隊 - 新增",
"Team - Modify": "團隊 - 修改",
"Team - Delete": "團隊 - 刪除",
"Business Group - View": "業務組 - 查看",
"Business Group - Add": "業務組 - 新增",
"Business Group - Modify": "業務組 - 修改",
"Business Group - Delete": "業務組 - 删除",
"Role - View": "角色 - 查看",
"Role - Add": "角色 - 新增",
"Role - Modify": "角色 - 修改",
"Role - Delete": "角色 - 删除",
"System Settings": "系統配置",
"View Site Settings": "查看站點設置",
"View Variable Settings": "查看變量配置",
"View SSO Settings": "查看單點登錄配置",
"View Alerting Engines": "查看告警引擎列表",
"View Product Version": "查看產品版本",
"Some alert rules still in the BusiGroup": "業務組中仍有告警規則",
"Some alert mutes still in the BusiGroup": "業務組中仍有屏蔽規則",
"Some alert subscribes still in the BusiGroup": "業務組中仍有訂閱規則",
"Some Board still in the BusiGroup": "業務組中仍有儀表板",
"Some targets still in the BusiGroup": "業務組中仍有監控對象",
"Some recording rules still in the BusiGroup": "業務組中仍有記錄規則",
"Some recovery scripts still in the BusiGroup": "業務組中仍有自愈腳本",
"Some target busigroups still in the BusiGroup": "業務組中仍有監控對象",
"saved view not found": "保存的視圖不存在",
"saved view name is blank": "視圖名稱不能為空",
"saved view page is blank": "視圖頁面不能為空",
"saved view name already exists in this page": "該頁面下已存在同名的公開視圖",
"---------zh_HK--------": "---------zh_HK--------"
},
"ja_JP": {
"Username or password invalid": "ユーザー名またはパスワードが無効です",
"incorrect verification code": "認証コードが正しくありません",
"roles empty": "役割を空にすることはできません",
"Username already exists": "このユーザー名は既に存在します。別のユーザー名を使用してください",
"failed to count user-groups": "データの検証に失敗しました。もう一度お試しください",
"UserGroup already exists": "グループ名は既に存在します。別の名前を使用してください",
"members empty": "メンバーを空にすることはできません",
"At least one team have rw permission": "少なくとも1つのチームに読み書き権限が必要です",
"Failed to create BusiGroup(%s)": "[%s]の作成に失敗しました。もう一度お試しください",
"business group id invalid": "ビジネスグループIDが正しくありません",
"idents empty": "監視対象を空にすることはできません",
"invalid tag(%s)": "タグ[%s]が無効です",
"invalid tagkey(%s): cannot contains . ": "タグキー[%s]にドット(.)を含めることはできません",
"invalid tagkey(%s): cannot contains _ ": "タグキー[%s]にアンダースコア(_)を含めることはできません",
"invalid tagkey(%s)": "タグキー[%s]が無効です",
"duplicate tagkey(%s)": "タグキー(%s)が重複しています",
"name is empty": "名前を空にすることはできません",
"Ident duplicate": "ダッシュボードの一意の識別子が既に存在します",
"No such dashboard": "ダッシュボードが存在しません",
"Name has invalid characters": "名前に無効な文字が含まれています",
"Name is blank": "名前を空白にすることはできません",
"forbidden": "権限がありません",
"builtin alerts is empty, file: %s": "ビルトインアラートテンプレートが空です %s",
"input json is empty": "提出内容を空にすることはできません",
"fields empty": "選択フィールドを空にすることはできません",
"No such AlertRule": "そのようなアラートルールはありません",
"GroupId(%d) invalid": "ビジネスグループIDが無効です",
"No such recording rule": "そのような記録ルールはありません",
"tags is blank": "タグを空白にすることはできません",
"oops... etime(%d) <= btime(%d)": "開始時間は終了時間より大きくすることはできません",
"group_id invalid": "ビジネスグループが無効です",
"No such AlertMute": "そのようなアラートミュートルールはありません",
"rule_id and tags are both blank": "アラートルールとタグを同時に空にすることはできません",
"rule is blank": "ルールを空にすることはできません",
"rule invalid": "ルールが無効です。正しいかどうか確認してください",
"unsupported field: %s": "フィールド %s はサポートされていません",
"arg(batch) should be nonnegative": "batchは負の数にできません",
"arg(tolerance) should be nonnegative": "toleranceは負の数にできません",
"arg(timeout) should be nonnegative": "timeoutは負の数にできません",
"arg(timeout) longer than five days": "timeoutは5日を超えることはできません",
"arg(title) is required": "titleは必須項目です",
"created task.id is zero": "作成されたタスクIDがゼロです",
"invalid ibex address: %s": "ibex %s のアドレスが無効です",
"url path invalid": "URLパスが無効です",
"no such server": "そのようなインスタンスはありません",
"admin role can not be modified": "管理者ロールは変更できません",
"builtin payload already exists": "ビルトインテンプレートは既に存在します",
"This functionality has not been enabled. Please contact the system administrator to activate it.": "この機能はまだ有効になっていません。システム管理者に連絡して有効にしてください",
"targets not exist: %s": "いくつかのマシンが存在しません: %s",
"mute is disabled": "ミュートルールが無効になっています",
"datasource id not match": "データソースIDが一致しません",
"event trigger time not within mute time range": "イベントトリガー時間がミュート時間範囲内にありません",
"event trigger time not within periodic mute range": "イベントトリガー時間が周期的ミュート時間範囲内にありません",
"mute time type invalid": "ミュート時間タイプが無効です",
"event severity not match mute severity": "イベントの重要度がミュートの重要度と一致しません",
"event tags not match mute tags": "イベントタグがミュートタグと一致しません",
"event datasource not match": "イベントデータソースが一致しません",
"event rule id not match": "イベントアラートルールIDが一致しません",
"event tags not match": "イベントタグが一致しません",
"event group name not match": "イベントビジネスグループ名が一致しません",
"event severity not match": "イベントの重要度が一致しません",
"subscribe notify rule not found: %v": "サブスクライブ通知ルールが見つかりません: %v",
"notify rule send error: %v": "通知ルール送信エラー: %v",
"event match subscribe and notification test ok": "イベントがサブスクライブルールに一致し、通知テストが成功しました",
"no notify rules selected": "通知ルールが選択されていません",
"no notify channels selected": "通知チャンネルが選択されていません",
"no notify groups selected": "通知グループが選択されていません",
"all users missing notify channel configurations: %v": "すべてのユーザーに通知チャンネル設定がありません: %v",
"event match subscribe and notify settings ok": "イベントがサブスクライブルールに一致し、通知設定が正常です",
"/loki suffix is miss, please add /loki to the url: %s": "/lokiサフィックスがありません。URLに/lokiを追加してください: %s",
"event time not match time filter": "イベント時間が時間フィルタと一致しません",
"event severity not match severity filter": "イベント等級が等級フィルタと一致しません",
"event tag not match tag filter": "イベントタグがタグフィルタと一致しません",
"event attributes not match attributes filter": "イベント属性が属性フィルタと一致しません",
"failed to parse tag filter: %v": "タグフィルタの解析に失敗しました: %v",
"event is dropped": "イベントが破棄されました,通知は行われません",
"drop event success": "イベント破棄成功",
"drop event failed": "イベント破棄失敗",
"callback success": "コールバック成功",
"Infrastructure": "インフラストラクチャ",
"Host - View": "機器 - 閲覧",
"Host - Modify": "機器 - 修正",
"Host - Delete": "機器 - 削除",
"Host - Bind Uncategorized": "機器 - グループ未所属の機器をある業務グループにバインドする",
"Explorer": "データ検索",
"Metrics Explorer": "メトリクス エクスプローラー",
"Quick View": "クイック ビュー",
"Built-in Metric - View": "組み込みメトリクス - 閲覧",
"Built-in Metric - Add": "組み込みメトリクス - 追加",
"Built-in Metric - Modify": "組み込みメトリクス - 修正",
"Built-in Metric - Delete": "組み込みメトリクス - 削除",
"Recording Rule - View": "記録ルール - 閲覧",
"Recording Rule - Add": "記録ルール - 追加",
"Recording Rule - Modify": "記録ルール - 修正",
"Recording Rule - Delete": "記録ルール - 削除",
"Logs Explorer": "ログ エクスプローラー",
"Index Pattern - View": "インデックス パターン - 閲覧",
"Index Pattern - Add": "インデックス パターン - 追加",
"Index Pattern - Modify": "インデックス パターン - 修正",
"Index Pattern - Delete": "インデックス パターン - 削除",
"Dashboard - View": "ダッシュボード - 閲覧",
"Dashboard - Add": "ダッシュボード - 追加",
"Dashboard - Modify": "ダッシュボード - 修正",
"Dashboard - Delete": "ダッシュボード - 削除",
"Dashboard - View Public": "ダッシュボード - 公開されたダッシュボードを見る",
"Alerting": "アラート",
"Alerting Rule - View": "アラートルール - 閲覧",
"Alerting Rule - Add": "アラートルール - 追加",
"Alerting Rule - Modify": "アラートルール - 修正",
"Alerting Rule - Delete": "アラートルール - 削除",
"Mutting Rule - View": "抑制ルール - 閲覧",
"Mutting Rule - Add": "抑制ルール - 追加",
"Mutting Rule - Modify": "抑制ルール - 修正",
"Mutting Rule - Delete": "抑制ルール - 削除",
"Subscribing Rule - View": "購読ルール - 閲覧",
"Subscribing Rule - Add": "購読ルール - 追加",
"Subscribing Rule - Modify": "購読ルール - 修正",
"Subscribing Rule - Delete": "購読ルール - 削除",
"Self-healing-Script - View": "タスクテンプレート - 閲覧",
"Self-healing-Script - Add": "タスクテンプレート - 追加",
"Self-healing-Script - Modify": "タスクテンプレート - 修正",
"Self-healing-Script - Delete": "タスクテンプレート - 削除",
"Self-healing-Job - View": "一時的なタスク - 閲覧",
"Self-healing-Job - Add": "一時的なタスク - 追加",
"Self-healing-Job - Modify": "一時的なタスク - 修正",
"Active Event - View": "アクティブアラート - 閲覧",
"Active Event - Delete": "アクティブアラート - 削除",
"Historical Event - View": "過去のアラート - 閲覧",
"Notification": "通知",
"Notification Rule - View": "通知ルール - 閲覧",
"Notification Rule - Add": "通知ルール - 追加",
"Notification Rule - Modify": "通知ルール - 修正",
"Notification Rule - Delete": "通知ルール - 削除",
"Media Type - View": "通知メディア - 閲覧",
"Media Type - Add": "通知メディア - 追加",
"Media Type - Modify": "通知メディア - 修正",
"Media Type - Delete": "通知メディア - 削除",
"Message Template - View": "メッセージテンプレート - 閲覧",
"Message Template - Add": "メッセージテンプレート - 追加",
"Message Template - Modify": "メッセージテンプレート - 修正",
"Message Template - Delete": "メッセージテンプレート - 削除",
"Event Pipeline - View": "イベント パイプライン - 閲覧",
"Event Pipeline - Add": "イベント パイプライン - 追加",
"Event Pipeline - Modify": "イベント パイプライン - 修正",
"Event Pipeline - Delete": "イベント パイプライン - 削除",
"Notification Settings - View": "旧バージョンの通知設定 - 閲覧",
"Notification Templates - View": "旧バージョンのメッセージテンプレート - 閲覧",
"Integrations": "統合センター",
"Data Source - View": "データソース - 閲覧",
"Component - View": "コンポーネント - 閲覧",
"Component - Add": "コンポーネント - 追加",
"Component - Modify": "コンポーネント - 修正",
"Component - Delete": "コンポーネント - 削除",
"Embedded Product - View": "システム統合 - 閲覧",
"Embedded Product - Add": "システム統合 - 追加",
"Embedded Product - Modify": "システム統合 - 修正",
"Embedded Product - Delete": "システム統合 - 削除",
"Organization": "組織",
"User - View": "ユーザー - 閲覧",
"User - Add": "ユーザー - 追加",
"User - Modify": "ユーザー - 修正",
"User - Delete": "ユーザー - 削除",
"Team - View": "チーム - 閲覧",
"Team - Add": "チーム - 追加",
"Team - Modify": "チーム - 修正",
"Team - Delete": "チーム - 削除",
"Business Group - View": "業務グループ - 閲覧",
"Business Group - Add": "業務グループ - 追加",
"Business Group - Modify": "業務グループ - 修正",
"Business Group - Delete": "業務グループ - 削除",
"Role - View": "役割 - 閲覧",
"Role - Add": "役割 - 追加",
"Role - Modify": "役割 - 修正",
"Role - Delete": "役割 - 削除",
"System Settings": "システム設定",
"View Site Settings": "サイト設定の表示",
"View Variable Settings": "変数設定の表示",
"View SSO Settings": "シングルサインオン設定の表示",
"View Alerting Engines": "アラートエンジンの表示",
"View Product Version": "製品のバージョンを見る",
"Some alert rules still in the BusiGroup": "ビジネスグループにまだアラートルールがあります",
"Some alert mutes still in the BusiGroup": "ビジネスグループにまだミュートルールがあります",
"Some alert subscribes still in the BusiGroup": "ビジネスグループにまだサブスクライブルールがあります",
"Some Board still in the BusiGroup": "ビジネスグループにまだダッシュボードがあります",
"Some targets still in the BusiGroup": "ビジネスグループにまだ監視対象があります",
"Some recording rules still in the BusiGroup": "ビジネスグループにまだ記録ルールがあります",
"Some recovery scripts still in the BusiGroup": "ビジネスグループにまだ自己回復スクリプトがあります",
"Some target busigroups still in the BusiGroup": "ビジネスグループにまだ監視対象があります",
"saved view not found": "保存されたビューが見つかりません",
"saved view name is blank": "ビュー名を空にすることはできません",
"saved view page is blank": "ビューページを空にすることはできません",
"saved view name already exists in this page": "このページには同名の公開ビューが既に存在します",
"---------ja_JP--------": "---------ja_JP--------"
},
"ru_RU": {
"Username or password invalid": "Неверное имя пользователя или пароль",
"incorrect verification code": "Неверный код подтверждения",
"roles empty": "Роли не могут быть пустыми",
"Username already exists": "Это имя пользователя уже существует. Пожалуйста, используйте другое",
"failed to count user-groups": "Ошибка проверки данных. Пожалуйста, повторите попытку",
"UserGroup already exists": "Имя группы уже существует. Пожалуйста, используйте другое",
"members empty": "Участники не могут быть пустыми",
"At least one team have rw permission": "По крайней мере одна команда должна иметь права на чтение и запись",
"Failed to create BusiGroup(%s)": "Не удалось создать бизнес-группу [%s]. Пожалуйста, повторите попытку",
"business group id invalid": "Неверный идентификатор бизнес-группы",
"idents empty": "Объекты мониторинга не могут быть пустыми",
"invalid tag(%s)": "Тег [%s] недействителен",
"invalid tagkey(%s): cannot contains . ": "Ключ тега [%s] не может содержать точку (.)",
"invalid tagkey(%s): cannot contains _ ": "Ключ тега [%s] не может содержать подчеркивание (_)",
"invalid tagkey(%s)": "Ключ тега [%s] недействителен",
"duplicate tagkey(%s)": "Ключ тега (%s) дублируется",
"name is empty": "Имя не может быть пустым",
"Ident duplicate": "Уникальный идентификатор панели мониторинга уже существует",
"No such dashboard": "Панель мониторинга не найдена",
"Name has invalid characters": "Имя содержит недопустимые символы",
"Name is blank": "Имя не может быть пустым",
"forbidden": "Нет доступа",
"builtin alerts is empty, file: %s": "Встроенный шаблон оповещений пуст %s",
"input json is empty": "Предоставленные данные не могут быть пустыми",
"fields empty": "Выбранные поля не могут быть пустыми",
"No such AlertRule": "Правило оповещения не найдено",
"GroupId(%d) invalid": "Неверный идентификатор бизнес-группы",
"No such recording rule": "Правило записи не найдено",
"tags is blank": "Теги не могут быть пустыми",
"oops... etime(%d) <= btime(%d)": "Время начала не может быть позже времени окончания",
"group_id invalid": "Бизнес-группа недействительна",
"No such AlertMute": "Правило отключения оповещений не найдено",
"rule_id and tags are both blank": "Правило оповещения и теги не могут быть пустыми одновременно",
"rule is blank": "Правило не может быть пустым",
"rule invalid": "Правило недействительно. Проверьте правильность ввода",
"unsupported field: %s": "Поле %s не поддерживается",
"arg(batch) should be nonnegative": "Параметр 'batch' должен быть неотрицательным",
"arg(tolerance) should be nonnegative": "Параметр 'tolerance' должен быть неотрицательным",
"arg(timeout) should be nonnegative": "Параметр 'timeout' должен быть неотрицательным",
"arg(timeout) longer than five days": "Параметр 'timeout' не может превышать 5 дней",
"arg(title) is required": "Параметр 'title' является обязательным",
"created task.id is zero": "Идентификатор задачи равен нулю",
"invalid ibex address: %s": "Неверный адрес ibex %s",
"url path invalid": "Неверный URL-путь",
"no such server": "Экземпляр не найден",
"admin role can not be modified": "Роль администратора не может быть изменена",
"builtin payload already exists": "Встроенный шаблон уже существует",
"This functionality has not been enabled. Please contact the system administrator to activate it.": "Эта функция не активирована. Пожалуйста, обратитесь к системному администратору для активации",
"targets not exist: %s": "Некоторые машины не существуют: %s",
"mute is disabled": "Правило отключения оповещений деактивировано",
"datasource id not match": "Идентификатор источника данных не совпадает",
"event trigger time not within mute time range": "Время срабатывания события не входит в диапазон времени отключения оповещений",
"event trigger time not within periodic mute range": "Время срабатывания события не входит в периодический диапазон отключения оповещений",
"mute time type invalid": "Недопустимый тип времени отключения оповещений",
"event severity not match mute severity": "Уровень важности события не соответствует уровню важности отключения оповещений",
"event tags not match mute tags": "Теги события не соответствуют тегам отключения оповещений",
"event datasource not match": "Источник данных события не соответствует",
"event rule id not match": "Идентификатор правила оповещения события не соответствует",
"event tags not match": "Теги события не соответствуют",
"event group name not match": "Название бизнес-группы события не соответствует",
"event severity not match": "Уровень важности события не соответствует",
"subscribe notify rule not found: %v": "Правило уведомления подписки не найдено: %v",
"notify rule send error: %v": "Ошибка отправки правила уведомления: %v",
"event match subscribe and notification test ok": "Событие соответствует правилу подписки, тест уведомления успешен",
"no notify rules selected": "Правила уведомлений не выбраны",
"no notify channels selected": "Каналы уведомлений не выбраны",
"no notify groups selected": "Группы уведомлений не выбраны",
"all users missing notify channel configurations: %v": "У всех пользователей отсутствуют настройки каналов уведомлений: %v",
"event match subscribe and notify settings ok": "Событие соответствует правилу подписки, настройки уведомлений в порядке",
"/loki suffix is miss, please add /loki to the url: %s": "Отсутствует суффикс /loki, пожалуйста, добавьте /loki к URL: %s",
"event time not match time filter": "Время события не соответствует временному фильтру",
"event severity not match severity filter": "Уровень события не соответствует фильтру уровня",
"event tag not match tag filter": "Теги события не соответствуют фильтру тегов",
"event attributes not match attributes filter": "Атрибуты события не соответствуют фильтру атрибутов",
"failed to parse tag filter: %v": "Не удалось разобрать фильтр тегов: %v",
"event is dropped": "Событие отброшено, уведомление не будет отправлено",
"drop event success": "Событие успешно отброшено",
"drop event failed": "Не удалось отбросить событие",
"callback success": "Обратный вызов успешен",
"Infrastructure": "Инфраструктура",
"Host - View": "Хост - Просмотр",
"Host - Modify": "Хост - Изменить",
"Host - Delete": "Хост - Удалить",
"Host - Bind Uncategorized": "Хост - Привязать неразмеченные хосты к бизнес-группе",
"Explorer": "Поиск данных",
"Metrics Explorer": "Поиск метрик",
"Quick View": "Быстрый просмотр",
"Built-in Metric - View": "Встроенные метрики - Просмотр",
"Built-in Metric - Add": "Встроенные метрики - Добавить",
"Built-in Metric - Modify": "Встроенные метрики - Изменить",
"Built-in Metric - Delete": "Встроенные метрики - Удалить",
"Recording Rule - View": "Правила записи - Просмотр",
"Recording Rule - Add": "Правила записи - Добавить",
"Recording Rule - Modify": "Правила записи - Изменить",
"Recording Rule - Delete": "Правила записи - Удалить",
"Logs Explorer": "Поиск логов",
"Index Pattern - View": "Шаблоны индексов - Просмотр",
"Index Pattern - Add": "Шаблоны индексов - Добавить",
"Index Pattern - Modify": "Шаблоны индексов - Изменить",
"Index Pattern - Delete": "Шаблоны индексов - Удалить",
"Dashboard - View": "Панель мониторинга - Просмотр",
"Dashboard - Add": "Панель мониторинга - Добавить",
"Dashboard - Modify": "Панель мониторинга - Изменить",
"Dashboard - Delete": "Панель мониторинга - Удалить",
"Dashboard - View Public": "Панель мониторинга - Просмотр публичных панелей",
"Alerting": "Оповещения",
"Alerting Rule - View": "Правила оповещений - Просмотр",
"Alerting Rule - Add": "Правила оповещений - Добавить",
"Alerting Rule - Modify": "Правила оповещений - Изменить",
"Alerting Rule - Delete": "Правила оповещений - Удалить",
"Mutting Rule - View": "Правила отключения оповещений - Просмотр",
"Mutting Rule - Add": "Правила отключения оповещений - Добавить",
"Mutting Rule - Modify": "Правила отключения оповещений - Изменить",
"Mutting Rule - Delete": "Правила отключения оповещений - Удалить",
"Subscribing Rule - View": "Правила подписки - Просмотр",
"Subscribing Rule - Add": "Правила подписки - Добавить",
"Subscribing Rule - Modify": "Правила подписки - Изменить",
"Subscribing Rule - Delete": "Правила подписки - Удалить",
"Self-healing-Script - View": "Скрипты самоисцеления - Просмотр",
"Self-healing-Script - Add": "Скрипты самоисцеления - Добавить",
"Self-healing-Script - Modify": "Скрипты самоисцеления - Изменить",
"Self-healing-Script - Delete": "Скрипты самоисцеления - Удалить",
"Self-healing-Job - View": "Задачи самоисцеления - Просмотр",
"Self-healing-Job - Add": "Задачи самоисцеления - Добавить",
"Self-healing-Job - Modify": "Задачи самоисцеления - Изменить",
"Active Event - View": "Активные события - Просмотр",
"Active Event - Delete": "Активные события - Удалить",
"Historical Event - View": "Исторические события - Просмотр",
"Notification": "Уведомления",
"Notification Rule - View": "Правила уведомлений - Просмотр",
"Notification Rule - Add": "Правила уведомлений - Добавить",
"Notification Rule - Modify": "Правила уведомлений - Изменить",
"Notification Rule - Delete": "Правила уведомлений - Удалить",
"Media Type - View": "Типы уведомлений - Просмотр",
"Media Type - Add": "Типы уведомлений - Добавить",
"Media Type - Modify": "Типы уведомлений - Изменить",
"Media Type - Delete": "Типы уведомлений - Удалить",
"Message Template - View": "Шаблоны сообщений - Просмотр",
"Message Template - Add": "Шаблоны сообщений - Добавить",
"Message Template - Modify": "Шаблоны сообщений - Изменить",
"Message Template - Delete": "Шаблоны сообщений - Удалить",
"Event Pipeline - View": "Конвейер событий - Просмотр",
"Event Pipeline - Add": "Конвейер событий - Добавить",
"Event Pipeline - Modify": "Конвейер событий - Изменить",
"Event Pipeline - Delete": "Конвейер событий - Удалить",
"Notification Settings - View": "Настройки уведомлений (старый вариант) - Просмотр",
"Notification Templates - View": "Шаблоны уведомлений (старый вариант) - Просмотр",
"Integrations": "Центр интеграций",
"Data Source - View": "Источники данных - Просмотр",
"Component - View": "Компоненты - Просмотр",
"Component - Add": "Компоненты - Добавить",
"Component - Modify": "Компоненты - Изменить",
"Component - Delete": "Компоненты - Удалить",
"Embedded Product - View": "Встроенные продукты - Просмотр",
"Embedded Product - Add": "Встроенные продукты - Добавить",
"Embedded Product - Modify": "Встроенные продукты - Изменить",
"Embedded Product - Delete": "Встроенные продукты - Удалить",
"Organization": "Организация",
"User - View": "Пользователи - Просмотр",
"User - Add": "Пользователи - Добавить",
"User - Modify": "Пользователи - Изменить",
"User - Delete": "Пользователи - Удалить",
"Team - View": "Команды - Просмотр",
"Team - Add": "Команды - Добавить",
"Team - Modify": "Команды - Изменить",
"Team - Delete": "Команды - Удалить",
"Business Group - View": "Бизнес-группы - Просмотр",
"Business Group - Add": "Бизнес-группы - Добавить",
"Business Group - Modify": "Бизнес-группы - Изменить",
"Business Group - Delete": "Бизнес-группы - Удалить",
"Role - View": "Роли - Просмотр",
"Role - Add": "Роли - Добавить",
"Role - Modify": "Роли - Изменить",
"Role - Delete": "Роли - Удалить",
"System Settings": "Настройки системы",
"View Site Settings": "Просмотр настроек сайта",
"View Variable Settings": "Просмотр переменных",
"View SSO Settings": "Просмотр настроек единого входа",
"View Alerting Engines": "Просмотр списка алертинг-инженеров",
"View Product Version": "Просмотр версии продукта",
"Some alert rules still in the BusiGroup": "В бизнес-группе еще есть правила оповещений",
"Some alert mutes still in the BusiGroup": "В бизнес-группе еще есть правила отключения оповещений",
"Some alert subscribes still in the BusiGroup": "В бизнес-группе еще есть правила подписки",
"Some Board still in the BusiGroup": "В бизнес-группе еще есть панели мониторинга",
"Some targets still in the BusiGroup": "В бизнес-группе еще есть объекты мониторинга",
"Some recording rules still in the BusiGroup": "В бизнес-группе еще есть правила записи",
"Some recovery scripts still in the BusiGroup": "В бизнес-группе еще есть скрипты самоисцеления",
"Some target busigroups still in the BusiGroup": "В бизнес-группе еще есть объекты мониторинга",
"saved view not found": "Сохраненный вид не найден",
"saved view name is blank": "Название вида не может быть пустым",
"saved view page is blank": "Страница вида не может быть пустой",
"saved view name already exists in this page": "На этой странице уже существует публичный вид с таким названием",
"---------ru_RU--------": "---------ru_RU--------"
}
}`
================================================
FILE: pkg/ibex/ibex.go
================================================
package ibex
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"time"
)
// Ibex is a small fluent HTTP client. A request is described with the chained
// setters (Path, In, Out, Header, QueryString) and sent with one of the verb
// methods (GET, POST, PUT, DELETE, PATCH).
type Ibex struct {
// address is the base URL of the server; New adds an "http://" scheme when the caller omits one.
address string
// authUser and authPass are sent as HTTP basic auth when authUser is non-empty.
authUser string
authPass string
// timeout bounds the whole request; New builds it from a millisecond count.
timeout time.Duration
// method is the upper-cased HTTP verb stored by Method.
method string
// urlPath is appended to address; buildUrl also appends the encoded query parameters to it.
urlPath string
// inValue, when non-nil, is JSON-encoded and sent as the request body.
inValue interface{}
// outPtr is the destination the JSON response body is decoded into.
outPtr interface{}
// headers are copied onto the outgoing request.
headers map[string]string
// queries holds the query-string values per parameter name, filled by QueryString.
queries map[string][]string
}
// New returns an Ibex client for the server at addr.
//
// addr may be given with or without a scheme: unless it starts with "http://"
// or "https://", "http://" is prepended. The full scheme is checked (not just
// the "http" prefix) so that bare hosts such as "httpd.internal:10090" still
// get a scheme. user and pass are used for HTTP basic auth (skipped by do when
// user is empty) and timeout is the request timeout in milliseconds.
func New(addr, user, pass string, timeout int64) *Ibex {
	if !strings.HasPrefix(addr, "http://") && !strings.HasPrefix(addr, "https://") {
		addr = "http://" + addr
	}

	return &Ibex{
		address:  addr,
		authUser: user,
		authPass: pass,
		timeout:  time.Duration(timeout) * time.Millisecond,
		headers:  make(map[string]string),
		queries:  make(map[string][]string),
	}
}
// In sets the value that do JSON-encodes as the request body; leaving it nil
// sends the request without a body. It returns the receiver for chaining.
func (i *Ibex) In(v interface{}) *Ibex {
i.inValue = v
return i
}
// Out sets the destination that do passes to json.Unmarshal for the response
// body, so ptr should be a non-nil pointer. It returns the receiver for chaining.
func (i *Ibex) Out(ptr interface{}) *Ibex {
i.outPtr = ptr
return i
}
// Path sets the URL path that is appended to the client's base address.
// It returns the receiver for chaining.
func (i *Ibex) Path(p string) *Ibex {
i.urlPath = p
return i
}
// Method stores the HTTP verb to use, upper-cased so that "post" and "POST"
// are equivalent. It returns the receiver for chaining.
func (i *Ibex) Method(m string) *Ibex {
i.method = strings.ToUpper(m)
return i
}
// Header records a request header, replacing any value previously recorded
// for key. Note that do overwrites Content-Type with application/json for
// every method except GET. It returns the receiver for chaining.
func (i *Ibex) Header(key, value string) *Ibex {
i.headers[key] = value
return i
}
// QueryString adds value to the values sent for query parameter key. Calling
// it again with the same key keeps the earlier values, producing a repeated
// parameter. It returns the receiver for chaining.
func (i *Ibex) QueryString(key, value string) *Ibex {
	// Appending to a missing entry starts from a nil slice, so no separate
	// "first value" branch is needed.
	i.queries[key] = append(i.queries[key], value)
	return i
}
// buildUrl appends the collected query parameters to urlPath.
//
// Keys and values are escaped with url.QueryEscape and joined as
// "k=v&k=v…". The result is attached with "?" or, when urlPath already
// contains a "?", with "&". Values of one key keep their insertion order;
// the order between different keys follows map iteration and is therefore
// unspecified.
//
// The separator is written before each pair instead of trimming a trailing
// "&" afterwards, so a key whose value list is empty simply contributes
// nothing rather than triggering an out-of-range slice on an empty buffer.
//
// Note that urlPath is modified in place: calling this twice appends the
// parameters twice.
func (i *Ibex) buildUrl() {
	var buf bytes.Buffer
	for k, values := range i.queries {
		key := url.QueryEscape(k)
		for _, v := range values {
			if buf.Len() > 0 {
				buf.WriteByte('&')
			}
			buf.WriteString(key)
			buf.WriteByte('=')
			buf.WriteString(url.QueryEscape(v))
		}
	}
	if buf.Len() == 0 {
		return
	}

	sep := "?"
	if strings.Contains(i.urlPath, "?") {
		sep = "&"
	}
	i.urlPath += sep + buf.String()
}
// do sends the request described by the receiver and decodes the JSON reply.
//
// It appends the query parameters to the path (buildUrl), JSON-encodes
// inValue as the body when one was set, copies the custom headers, adds basic
// auth when authUser is non-empty and forces Content-Type: application/json
// for every method except GET. A response status other than 200 is returned
// as an error; otherwise the body is decoded into outPtr, so Out must have
// been given a non-nil pointer.
//
// The response body is closed on every return path — including the non-200
// one — so the underlying connection is released instead of leaked.
func (i *Ibex) do() error {
	i.buildUrl()

	var (
		req *http.Request
		err error
	)
	target := i.address + i.urlPath
	if i.inValue != nil {
		bs, merr := json.Marshal(i.inValue)
		if merr != nil {
			return merr
		}
		req, err = http.NewRequest(i.method, target, bytes.NewBuffer(bs))
	} else {
		// Pass a literal nil so the request really has no body.
		req, err = http.NewRequest(i.method, target, nil)
	}
	if err != nil {
		return err
	}

	for key, value := range i.headers {
		req.Header.Set(key, value)
	}
	if i.authUser != "" {
		req.SetBasicAuth(i.authUser, i.authPass)
	}
	if i.method != http.MethodGet {
		req.Header.Set("Content-Type", "application/json")
	}

	client := http.Client{Timeout: i.timeout}
	res, err := client.Do(req)
	if err != nil {
		return err
	}
	// On a nil error the client guarantees a non-nil Body; close it before any
	// early return below.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return fmt.Errorf("url(%s) response code: %v", i.urlPath, res.StatusCode)
	}

	payload, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return err
	}
	return json.Unmarshal(payload, i.outPtr)
}
// GET sends the configured request with the GET method and returns the error
// reported by do, if any.
func (i *Ibex) GET() error {
i.Method(http.MethodGet)
return i.do()
}
// POST sends the configured request with the POST method and returns the
// error reported by do, if any.
func (i *Ibex) POST() error {
i.Method(http.MethodPost)
return i.do()
}
// PUT sends the configured request with the PUT method and returns the error
// reported by do, if any.
func (i *Ibex) PUT() error {
i.Method(http.MethodPut)
return i.do()
}
// DELETE sends the configured request with the DELETE method and returns the
// error reported by do, if any.
func (i *Ibex) DELETE() error {
i.Method(http.MethodDelete)
return i.do()
}
// PATCH sends the configured request with the PATCH method and returns the
// error reported by do, if any.
func (i *Ibex) PATCH() error {
i.Method(http.MethodPatch)
return i.do()
}
================================================
FILE: pkg/ldapx/ldapx.go
================================================
package ldapx
import (
"crypto/tls"
"fmt"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/go-ldap/ldap/v3"
"github.com/pkg/errors"
"github.com/toolkits/pkg/container/set"
"github.com/toolkits/pkg/logger"
)
// Config is the LDAP single-sign-on configuration handed to New and Reload,
// which copy it into an SsoClient.
type Config struct {
// Enable switches the LDAP integration on; when false, Reload applies none of the other fields.
Enable bool
// Host and Port form the "host:port" address that is dialed.
Host string
Port int
// BaseDn holds one or more search bases separated by "|".
BaseDn string
// BindUser and BindPass are the credentials used to bind before searching;
// an empty BindUser means the search is done anonymously.
BindUser string
BindPass string
// SyncAddUsers and SyncDelUsers are stored as SyncAdd/SyncDel on the client.
// NOTE(review): presumably they control whether the periodic sync creates/removes users — consumer not visible here.
SyncAddUsers bool
SyncDelUsers bool
// SyncInterval is multiplied by time.Second in Reload, i.e. it is treated as
// a plain count of seconds rather than a ready-made duration.
SyncInterval time.Duration
// UserFilter is presumably the filter used by the user sync — TODO confirm against its caller.
UserFilter string
// AuthFilter is a fmt format string; LoginCheck fills it with the login name to build the search filter.
AuthFilter string
// Attributes names the LDAP attributes requested for each user.
Attributes LdapAttributes
// CoverAttributes and CoverTeams presumably decide whether LDAP data overwrites
// locally stored attributes/teams — consumer not visible here.
CoverAttributes bool
CoverTeams bool
// TLS dials the server over TLS.
TLS bool
// StartTLS upgrades a plain connection with StartTLS; it is ignored when TLS is set.
StartTLS bool
// DefaultRoles and DefaultTeams are presumably assigned to users without a matching mapping — TODO confirm.
DefaultRoles []string
DefaultTeams []int64
// RoleTeamMapping lists the DN-to-roles/teams rules; Reload indexes them by DN.
RoleTeamMapping []RoleTeamMapping
}
// SsoClient is the runtime LDAP client. Its settings are populated from a
// Config by Reload; readers take a snapshot with Copy.
type SsoClient struct {
// Enable mirrors Config.Enable.
Enable bool
// Host and Port form the "host:port" address that newLdapConn dials.
Host string
Port int
// BaseDn is the raw "|"-separated search-base string from the config.
BaseDn string
// BaseDns is BaseDn split on "|"; ldapReq searches each entry (after trimming spaces).
BaseDns []string
// BindUser and BindPass are used to bind before searching; an empty BindUser skips the bind.
BindUser string
BindPass string
// SyncAdd and SyncDel mirror Config.SyncAddUsers and Config.SyncDelUsers.
SyncAdd bool
SyncDel bool
// SyncInterval mirrors Config.SyncInterval; Reload multiplies it by time.Second when resetting Ticker.
SyncInterval time.Duration
// UserFilter mirrors Config.UserFilter (presumably used by the user sync — consumer not visible here).
UserFilter string
// AuthFilter is the fmt format string LoginCheck uses to build the user search filter.
AuthFilter string
// Attributes names the LDAP attributes requested in searches.
Attributes LdapAttributes
CoverAttributes bool
CoverTeams bool
// TLS dials over TLS; StartTLS upgrades a plain connection and is ignored when TLS is set.
TLS bool
StartTLS bool
DefaultRoles []string
DefaultTeams []int64
// RoleTeamMapping is keyed by DN; GetUserRolesAndTeams looks up the user's DN and group DNs in it.
RoleTeamMapping map[string]RoleTeamMapping
// Ticker is created by New with a 24h period and reset by Reload when SyncInterval > 0.
Ticker *time.Ticker
// The embedded lock guards the fields above: Reload holds the write lock, Copy the read lock.
sync.RWMutex
}
// LdapAttributes names the LDAP attributes that carry each piece of user
// information. The non-empty names are what searches ask the server for.
type LdapAttributes struct {
// Username is the attribute holding the login name; "uid" is requested when it is empty.
Username string
// Nickname, Phone and Email are optional; empty names are not requested.
Nickname string
Phone string
Email string
// Group is the attribute listing the groups a user belongs to. Reload always
// forces it to "memberOf", whatever the configuration says.
Group string
}
// RoleTeamMapping associates an LDAP DN with the roles and team IDs it grants.
// It is used both as a configuration rule (keyed by DN in SsoClient) and as
// the result returned by GetUserRolesAndTeams for a single user.
type RoleTeamMapping struct {
// DN is the distinguished name the rule applies to, or the user's own DN in a result.
DN string
// Roles are the role names granted.
Roles []string
// Teams are the team IDs granted.
Teams []int64
}
// New builds an SsoClient for the given LDAP configuration.
//
// The client always receives a sync ticker, initially with a 24-hour period.
// The remaining settings are applied through Reload, and only when cf.Enable
// is set; a disabled configuration yields a client that carries nothing but
// the ticker.
func New(cf Config) *SsoClient {
	client := &SsoClient{Ticker: time.NewTicker(24 * time.Hour)}
	if cf.Enable {
		client.Reload(cf)
	}
	return client
}
// Reload replaces the client's settings with cf while holding the write lock.
//
// A disabled configuration only clears the Enable flag; every other field
// keeps its previous value. Otherwise all settings are copied over, the
// role/team rules are indexed by DN, the base-DN string is split on "|" and
// the sync ticker is reset when a positive interval is configured.
func (s *SsoClient) Reload(cf Config) {
	s.Lock()
	defer s.Unlock()

	s.Enable = cf.Enable
	if !cf.Enable {
		return
	}

	// Connection and bind settings.
	s.Host, s.Port = cf.Host, cf.Port
	s.BindUser, s.BindPass = cf.BindUser, cf.BindPass
	s.TLS, s.StartTLS = cf.TLS, cf.StartTLS

	// Search settings; several base DNs may be given separated by "|".
	s.BaseDn = cf.BaseDn
	s.BaseDns = strings.Split(cf.BaseDn, "|")
	s.AuthFilter, s.UserFilter = cf.AuthFilter, cf.UserFilter

	// The user's LDAP groups are needed for the role/team mapping, so the
	// group attribute is always "memberOf" regardless of the configured value.
	s.Attributes = cf.Attributes
	s.Attributes.Group = "memberOf"

	s.CoverAttributes, s.CoverTeams = cf.CoverAttributes, cf.CoverTeams
	s.DefaultRoles, s.DefaultTeams = cf.DefaultRoles, cf.DefaultTeams

	// Index the role/team rules by DN for direct lookup.
	byDN := make(map[string]RoleTeamMapping, len(cf.RoleTeamMapping))
	for _, rule := range cf.RoleTeamMapping {
		byDN[rule.DN] = rule
	}
	s.RoleTeamMapping = byDN

	// Periodic sync settings. SyncInterval is scaled by time.Second here, so
	// it is interpreted as a number of seconds.
	s.SyncAdd, s.SyncDel = cf.SyncAddUsers, cf.SyncDelUsers
	s.SyncInterval = cf.SyncInterval
	if s.SyncInterval > 0 {
		s.Ticker.Reset(s.SyncInterval * time.Second)
	}
}
// Copy returns a snapshot of the client taken under the read lock.
//
// The snapshot is assembled field by field instead of dereferencing the
// receiver: a struct copy would also duplicate the embedded sync.RWMutex
// while it is read-locked, leaving the copy with a lock that can never be
// acquired for writing. The returned client therefore has a fresh, unlocked
// mutex.
//
// DefaultRoles and DefaultTeams are duplicated so the snapshot owns its own
// slices. BaseDns, RoleTeamMapping and Ticker are shared with the receiver,
// as before. When a field is added to SsoClient it must be added here too.
func (s *SsoClient) Copy() *SsoClient {
	s.RLock()
	defer s.RUnlock()

	roles := make([]string, len(s.DefaultRoles))
	copy(roles, s.DefaultRoles)
	teams := make([]int64, len(s.DefaultTeams))
	copy(teams, s.DefaultTeams)

	return &SsoClient{
		Enable:          s.Enable,
		Host:            s.Host,
		Port:            s.Port,
		BaseDn:          s.BaseDn,
		BaseDns:         s.BaseDns,
		BindUser:        s.BindUser,
		BindPass:        s.BindPass,
		SyncAdd:         s.SyncAdd,
		SyncDel:         s.SyncDel,
		SyncInterval:    s.SyncInterval,
		UserFilter:      s.UserFilter,
		AuthFilter:      s.AuthFilter,
		Attributes:      s.Attributes,
		CoverAttributes: s.CoverAttributes,
		CoverTeams:      s.CoverTeams,
		TLS:             s.TLS,
		StartTLS:        s.StartTLS,
		DefaultRoles:    roles,
		DefaultTeams:    teams,
		RoleTeamMapping: s.RoleTeamMapping,
		Ticker:          s.Ticker,
	}
}
// LoginCheck authenticates user against LDAP and returns the search result
// holding the user's entry.
//
// It works on a snapshot of the client: a connection is opened (and bound with
// the configured search account, if any), the user is looked up under every
// base DN with AuthFilter, and the first base DN that yields an entry is used
// to verify the password by binding as that entry's DN.
//
// An error is returned when the connection cannot be set up, when one base DN
// yields more than one entry for the user, or — reported as "username or
// password invalid" — when no entry is found or the password bind fails.
func (s *SsoClient) LoginCheck(user, pass string) (*ldap.SearchResult, error) {
	lc := s.Copy()

	conn, err := lc.newLdapConn()
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	// The login name comes straight from the login form. Escape it before it
	// is substituted into the filter so characters such as "*", "(", ")" and
	// "\" are matched literally and cannot alter the filter's structure.
	srs, err := lc.ldapReq(conn, lc.AuthFilter, ldap.EscapeFilter(user))
	if err != nil {
		return nil, fmt.Errorf("ldap.error: ldap search fail: %v", err)
	}

	var sr *ldap.SearchResult
	for i := range srs {
		if srs[i] == nil || len(srs[i].Entries) == 0 {
			continue
		}

		// Across several base DNs, account uniqueness is left to LDAP itself;
		// within one result, more than one entry is ambiguous and rejected.
		if len(srs[i].Entries) > 1 {
			return nil, fmt.Errorf("ldap.error: search user(%s), multi entries found", user)
		}

		sr = srs[i]
		entry := sr.Entries[0]

		// Verify the password by binding as the user that was found.
		if err := conn.Bind(entry.DN, pass); err != nil {
			return nil, fmt.Errorf("username or password invalid")
		}

		for _, info := range entry.Attributes {
			logger.Infof("ldap.info: user(%s) info: %+v", user, info)
		}
		break
	}

	if sr == nil {
		return nil, fmt.Errorf("username or password invalid")
	}
	return sr, nil
}
// newLdapConn dials the configured LDAP server and prepares the connection
// for searching.
//
// The server is dialed over TLS when TLS is set, otherwise in plain text with
// an optional StartTLS upgrade. When BindUser is non-empty the connection is
// bound with BindUser/BindPass; otherwise it is left unbound for anonymous
// search. Dialing and subsequent requests use a 10-second timeout.
//
// On success the caller owns the connection and must close it. On any failure
// after the dial succeeded, the connection is closed here before the error is
// returned so it does not leak.
func (s *SsoClient) newLdapConn() (*ldap.Conn, error) {
	addr := fmt.Sprintf("%s:%d", s.Host, s.Port)

	// NOTE: DefaultTimeout is a package-level variable of the ldap library, so
	// this assignment is process-wide, not per connection.
	ldap.DefaultTimeout = time.Second * 10

	var (
		conn *ldap.Conn
		err  error
	)
	if s.TLS {
		// NOTE(review): certificate verification is disabled here and for
		// StartTLS below; the server's identity is not checked.
		conn, err = ldap.DialTLS("tcp", addr, &tls.Config{InsecureSkipVerify: true})
	} else {
		conn, err = ldap.Dial("tcp", addr)
	}
	if err != nil {
		return nil, fmt.Errorf("ldap.error: cannot dial ldap(%s): %v", addr, err)
	}

	conn.SetTimeout(time.Second * 10)

	if !s.TLS && s.StartTLS {
		if err := conn.StartTLS(&tls.Config{InsecureSkipVerify: true}); err != nil {
			conn.Close()
			return nil, fmt.Errorf("ldap.error: conn startTLS fail: %v", err)
		}
	}

	// An empty BindUser means anonymous search; otherwise bind with the
	// configured search account.
	if s.BindUser != "" {
		if err := conn.Bind(s.BindUser, s.BindPass); err != nil {
			conn.Close()
			return nil, fmt.Errorf("ldap.error: bind ldap fail: %v, use user(%s) to bind", err, s.BindUser)
		}
	}

	return conn, nil
}
// ldapReq runs one subtree search per configured base DN and returns the
// results of the searches that succeeded.
//
// filter is a printf-style LDAP filter; values are substituted into it.
// String values are escaped with ldap.EscapeFilter first, so characters such
// as '*', '(' or ')' in a user name cannot change the structure of the
// filter. A search that fails for a single base DN is logged and skipped.
// When no search succeeded at all, the last search error is returned so that
// callers do not mistake an LDAP failure for "no entries found".
func (s *SsoClient) ldapReq(conn *ldap.Conn, filter string, values ...interface{}) ([]*ldap.SearchResult, error) {
	escaped := make([]interface{}, len(values))
	for i, v := range values {
		if str, ok := v.(string); ok {
			escaped[i] = ldap.EscapeFilter(str)
		} else {
			escaped[i] = v
		}
	}

	srs := make([]*ldap.SearchResult, 0, len(s.BaseDns))
	var lastErr error

	for i := range s.BaseDns {
		searchRequest := ldap.NewSearchRequest(
			strings.TrimSpace(s.BaseDns[i]), // The base dn to search
			ldap.ScopeWholeSubtree, ldap.NeverDerefAliases, 0, 0, false,
			fmt.Sprintf(filter, escaped...), // The filter to apply
			s.genLdapAttributeSearchList(),  // A list attributes to retrieve
			nil,
		)

		sr, err := conn.Search(searchRequest)
		if err != nil {
			logger.Errorf("ldap.error: ldap search fail: %v", err)
			lastErr = err
			continue
		}

		srs = append(srs, sr)
	}

	// Every base DN failed: surface the failure instead of an empty result.
	if len(srs) == 0 && lastErr != nil {
		return nil, lastErr
	}

	return srs, nil
}
// GetUserRolesAndTeams resolves the roles and teams mapped to an LDAP entry.
//
// The lookup keys are the values of the entry's group attribute followed by
// the entry's own DN; every key present in RoleTeamMapping contributes its
// roles and teams. Duplicates are removed through sets. The returned mapping
// carries the entry DN plus the collected roles and teams.
func (s *SsoClient) GetUserRolesAndTeams(entry *ldap.Entry) *RoleTeamMapping {
	lc := s.Copy()

	roles := set.NewStringSet()
	teams := set.NewInt64Set()

	// Group values first, then the entry's own DN.
	keys := append(entry.GetAttributeValues(lc.Attributes.Group), entry.DN)

	for _, key := range keys {
		rt, ok := lc.RoleTeamMapping[key]
		if !ok {
			continue
		}
		for _, role := range rt.Roles {
			roles.Add(role)
		}
		for _, team := range rt.Teams {
			teams.Add(team)
		}
	}

	return &RoleTeamMapping{
		DN:    entry.DN,
		Roles: roles.ToSlice(),
		Teams: teams.ToSlice(),
	}
}
// genLdapAttributeSearchList builds the list of attributes requested from
// LDAP searches: the username attribute (falling back to "uid" when none is
// configured) followed by the nickname, email, phone and group attributes
// that are configured (non-empty), in that order.
func (s *SsoClient) genLdapAttributeSearchList() []string {
	attrs := s.Attributes

	username := attrs.Username
	if username == "" {
		username = "uid"
	}

	list := []string{username}
	for _, name := range []string{attrs.Nickname, attrs.Email, attrs.Phone, attrs.Group} {
		if name != "" {
			list = append(list, name)
		}
	}

	return list
}
// LdapLogin authenticates username/pass against LDAP and returns the matching
// local user, creating it on first login.
//
// Existing user: when CoverAttributes is set, nickname/phone/email/roles are
// overwritten from LDAP; team membership is then synchronized (using
// defaultTeams when LDAP maps no team), honoring CoverTeams.
// New user: the user is created with the mapped roles (or defaultRoles when
// none are mapped) and added to the mapped teams (or defaultTeams).
//
// Team synchronization failures are logged and do not fail the login.
// Log messages identify the user by username only; the full user record is
// deliberately not formatted into logs.
func LdapLogin(ctx *ctx.Context, username, pass string, defaultRoles []string, defaultTeams []int64, ldap *SsoClient) (*models.User, error) {
	sr, err := ldap.LoginCheck(username, pass)
	if err != nil {
		return nil, err
	}

	// Snapshot the settings needed below under the read lock.
	ldap.RLock()
	attrs := ldap.Attributes
	coverAttributes := ldap.CoverAttributes
	coverTeams := ldap.CoverTeams
	ldap.RUnlock()

	var nickname, email, phone string
	if attrs.Nickname != "" {
		nickname = sr.Entries[0].GetAttributeValue(attrs.Nickname)
	}
	if attrs.Email != "" {
		email = sr.Entries[0].GetAttributeValue(attrs.Email)
	}
	if attrs.Phone != "" {
		// Phone numbers are stored without embedded spaces.
		phone = strings.Replace(sr.Entries[0].GetAttributeValue(attrs.Phone), " ", "", -1)
	}

	// Gets the roles and teams for this entry
	roleTeamMapping := ldap.GetUserRolesAndTeams(sr.Entries[0])

	user, err := models.UserGetByUsername(ctx, username)
	if err != nil {
		return nil, err
	}

	if user != nil && user.Id > 0 {
		if coverAttributes {
			// need to override the user's basic properties
			updatedFields := user.UpdateSsoFieldsWithRoles("ldap", nickname, phone, email, roleTeamMapping.Roles)
			if err = user.Update(ctx, "update_at", updatedFields...); err != nil {
				return nil, errors.WithMessage(err, "failed to update user")
			}
		}

		if len(roleTeamMapping.Teams) == 0 {
			roleTeamMapping.Teams = defaultTeams
		}

		// Synchronize group information
		if err = models.UserGroupMemberSync(ctx, roleTeamMapping.Teams, user.Id, coverTeams); err != nil {
			logger.Errorf("ldap.error: failed to update user(%s) group member err: %+v", username, err)
		}
	} else {
		user = new(models.User)
		if len(roleTeamMapping.Roles) == 0 {
			// No role mapping is configured, the configured default role is used
			roleTeamMapping.Roles = defaultRoles
		}

		user.FullSsoFields("ldap", username, nickname, phone, email, roleTeamMapping.Roles)
		if err = models.DB(ctx).Create(user).Error; err != nil {
			return nil, errors.WithMessage(err, "failed to add user")
		}

		if len(roleTeamMapping.Teams) == 0 {
			for _, gid := range defaultTeams {
				err = models.UserGroupMemberAdd(ctx, gid, user.Id)
				if err != nil {
					logger.Errorf("user:%v gid:%d UserGroupMemberAdd: %s", username, gid, err)
				}
			}
		}

		if err = models.UserGroupMemberSync(ctx, roleTeamMapping.Teams, user.Id, false); err != nil {
			logger.Errorf("ldap.error: failed to update user(%s) group member err: %+v", username, err)
		}
	}

	return user, nil
}
================================================
FILE: pkg/ldapx/user_sync.go
================================================
package ldapx
import (
"fmt"
"time"
"github.com/ccfos/nightingale/v6/dumper"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// SyncAddAndDelUsers reconciles the local "ldap" users with the LDAP
// directory. It is a no-op unless both Enable and SyncAdd are set.
//
// Users present in LDAP but not locally are created together with their
// teams; users present on both sides only get their team membership
// synchronized. When SyncDel is also set, local LDAP users that no longer
// exist in the directory are deleted. Per-user failures are logged and do not
// abort the run; a failure to list users is recorded and returned.
func (s *SsoClient) SyncAddAndDelUsers(ctx *ctx.Context) error {
	// Read the switches under the read lock, like the other readers of this
	// client, so a concurrent reload cannot be observed half-way.
	s.RLock()
	enabled, syncAdd, syncDel, coverTeams := s.Enable, s.SyncAdd, s.SyncDel, s.CoverTeams
	s.RUnlock()

	if !enabled || !syncAdd {
		return nil
	}

	start := time.Now()

	usersFromSso, usersFromDb, err := s.getUsersFromSsoAndDb(ctx)
	if err != nil {
		dumper.PutSyncRecord("sso_user", start.Unix(), -1, -1, "failed to query users: "+err.Error())
		return err
	}

	usersToBeAdd, usersExists := diffUsers(usersFromDb, usersFromSso)

	// Incremental users synchronize both user information and group information
	for _, user := range usersToBeAdd {
		if err = user.AddUserAndGroups(ctx, coverTeams); err != nil {
			logger.Warningf("failed to sync add user[%v] to db, err: %v", user.Username, err)
		}
	}

	// Existing users synchronize group information only
	for _, user := range usersExists {
		if err = models.UserGroupMemberSyncByUser(ctx, user, coverTeams); err != nil {
			logger.Warningf("failed to sync group members of user[%v] to db, err: %v", user.Username, err)
		}
	}

	var usersToBeDel []*models.User
	if syncDel {
		// Swapping the arguments yields the users that exist only in the DB.
		usersToBeDel, _ = diffUsers(usersFromSso, usersFromDb)
		if len(usersToBeDel) > 0 {
			delIds := make([]int64, 0, len(usersToBeDel))
			for _, user := range usersToBeDel {
				delIds = append(delIds, user.Id)
			}

			if err := models.UserDelByIds(ctx, delIds); err != nil {
				logger.Warningf("failed to sync del users[%v] to db, err: %v", usersToBeDel, err)
			}
		}
	}

	ms := time.Since(start).Milliseconds()
	logger.Infof("timer: sync sso users done, cost: %dms, number: %d", ms, len(usersToBeDel)+len(usersToBeAdd))
	dumper.PutSyncRecord("sso_user", start.Unix(), ms, len(usersToBeDel)+len(usersToBeAdd), "success")

	return nil
}
// getUsersFromSsoAndDb loads both sides of the user reconciliation: every
// user found in LDAP and every local user whose SSO source is "ldap", each
// keyed by username. If either lookup fails, both maps are nil and the error
// is returned.
func (s *SsoClient) getUsersFromSsoAndDb(ctx *ctx.Context) (usersFromSso, usersFromDb map[string]*models.User, err error) {
	fromSso, err := s.UserGetAll()
	if err != nil {
		return nil, nil, err
	}

	fromDb, err := models.UserGetsBySso(ctx, "ldap")
	if err != nil {
		return nil, nil, err
	}

	return fromSso, fromDb, nil
}
// UserGetAll lists every LDAP entry matching UserFilter and converts it into
// a models.User (SSO source "ldap") keyed by username.
//
// The username is read from the configured username attribute, falling back
// to "uid" when none is configured (the same default the search attribute
// list uses). Entries without a username value are skipped, since they cannot
// be mapped to a local account. Roles come from the role/team mapping, or
// DefaultRoles when the mapping yields none.
func (s *SsoClient) UserGetAll() (map[string]*models.User, error) {
	lc := s.Copy()

	conn, err := lc.newLdapConn()
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	srs, err := lc.ldapReq(conn, lc.UserFilter)
	if err != nil {
		return nil, fmt.Errorf("ldap.error: ldap search fail: %v", err)
	}

	attrs := lc.Attributes
	usernameAttr := attrs.Username
	if usernameAttr == "" {
		usernameAttr = "uid"
	}

	res := make(map[string]*models.User)

	for i := range srs {
		if srs[i] == nil {
			continue
		}

		for _, entry := range srs[i].Entries {
			username := entry.GetAttributeValue(usernameAttr)
			if username == "" {
				// Without a username the entry would be stored under the
				// empty key and later created as a nameless user.
				logger.Warningf("ldap.warning: skip entry(%s) without attribute %s", entry.DN, usernameAttr)
				continue
			}

			nickname := entry.GetAttributeValue(attrs.Nickname)
			email := entry.GetAttributeValue(attrs.Email)
			phone := entry.GetAttributeValue(attrs.Phone)

			// Gets the roles and teams for this entry
			roleTeamMapping := lc.GetUserRolesAndTeams(entry)
			if len(roleTeamMapping.Roles) == 0 {
				// No role mapping is configured, the configured default role is used
				roleTeamMapping.Roles = lc.DefaultRoles
			}

			user := new(models.User)
			user.FullSsoFieldsWithTeams("ldap", username, nickname, phone, email, roleTeamMapping.Roles, roleTeamMapping.Teams)

			res[username] = user
		}
	}

	return res, nil
}
// diffUsers compares newUsers against base, both keyed by username.
//
// newExtraUsers: users that are in newUsers but not in base.
// updatedUsers: the base records of users present in both maps; when a base
// record has no TeamsLst it inherits the TeamsLst of its newUsers counterpart.
//
// The order of both result slices follows map iteration and is unspecified.
func diffUsers(base, newUsers map[string]*models.User) (newExtraUsers, updatedUsers []*models.User) {
	for name, candidate := range newUsers {
		existing, found := base[name]
		if !found {
			newExtraUsers = append(newExtraUsers, candidate)
			continue
		}

		if len(existing.TeamsLst) == 0 {
			// Need to pass on the team message
			existing.TeamsLst = candidate.TeamsLst
		}
		updatedUsers = append(updatedUsers, existing)
	}

	return newExtraUsers, updatedUsers
}
// SyncDelUsers deletes local "ldap" users that can no longer be found in the
// directory. It only runs when the client is enabled, SyncDel is set and
// SyncAdd is not (with SyncAdd set, SyncAddAndDelUsers handles deletion).
//
// Each local user is looked up individually; a user whose lookup fails is
// kept and the failure is recorded. Errors loading the local users or
// deleting them are recorded and returned.
func (s *SsoClient) SyncDelUsers(ctx *ctx.Context) error {
	// Read the switches under the read lock, like the other readers of this
	// client, so a concurrent reload cannot be observed half-way.
	s.RLock()
	enabled, syncAdd, syncDel := s.Enable, s.SyncAdd, s.SyncDel
	s.RUnlock()

	if !enabled || syncAdd || !syncDel {
		return nil
	}

	start := time.Now()

	usersFromDb, err := models.UserGetsBySso(ctx, "ldap")
	if err != nil {
		dumper.PutSyncRecord("sso_user", start.Unix(), -1, -1, "failed to query users: "+err.Error())
		return err
	}

	delIds := make([]int64, 0)

	for _, user := range usersFromDb {
		exist, err := s.UserExist(user.Username)
		if err != nil {
			// Unknown state: never delete a user whose lookup failed.
			dumper.PutSyncRecord("sso_user", start.Unix(), -1, -1, "failed to check whether the user exists: "+err.Error())
		} else if !exist {
			delIds = append(delIds, user.Id)
		}
	}

	if len(delIds) > 0 {
		if err := models.UserDelByIds(ctx, delIds); err != nil {
			dumper.PutSyncRecord("sso_user", start.Unix(), -1, -1, "failed to sync del users: "+err.Error())
			return err
		}
	}

	ms := time.Since(start).Milliseconds()
	logger.Infof("timer: sync del sso users done, cost: %dms, number: %d", ms, len(delIds))
	dumper.PutSyncRecord("sso_user", start.Unix(), ms, len(delIds), "success")

	return nil
}
// UserExist reports whether at least one LDAP entry has the configured
// username attribute equal to username, searching every base DN over a fresh
// connection. Connection and search errors are returned with a false result.
//
// NOTE(review): username is substituted into the filter via ldapReq; verify
// that special filter characters in stored usernames are handled there.
func (s *SsoClient) UserExist(username string) (bool, error) {
	lc := s.Copy()

	conn, err := lc.newLdapConn()
	if err != nil {
		return false, err
	}
	defer conn.Close()

	results, err := lc.ldapReq(conn, "(&(%s=%s))", lc.Attributes.Username, username)
	if err != nil {
		return false, err
	}

	for _, res := range results {
		if res != nil && len(res.Entries) > 0 {
			return true, nil
		}
	}

	return false, nil
}
================================================
FILE: pkg/loggrep/loggrep.go
================================================
package loggrep
import (
"bufio"
"html/template"
"io"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
)
// MaxLogLines is the maximum number of matched log lines returned by the
// directory grep helpers.
const MaxLogLines = 5000

// Anchored patterns used to validate identifiers before they are searched for.
var (
	// hashPattern: 32 to 64 lowercase hexadecimal characters.
	hashPattern = regexp.MustCompile(`^[a-f0-9]{32,64}$`)
	// idPattern: a positive decimal integer without leading zeros.
	idPattern = regexp.MustCompile(`^[1-9]\d*$`)
	// traceIdPattern: 1 to 64 letters, digits, underscores or hyphens.
	traceIdPattern = regexp.MustCompile(`^[a-zA-Z0-9_-]{1,64}$`)
)

// IsValidHash reports whether s has the shape of an MD5/SHA hex digest
// (32-64 lowercase hex characters).
func IsValidHash(s string) bool {
	matched := hashPattern.MatchString(s)
	return matched
}

// IsValidRuleID reports whether s is a positive integer rule ID written
// without leading zeros.
func IsValidRuleID(s string) bool {
	matched := idPattern.MatchString(s)
	return matched
}

// IsValidTraceID reports whether s is a trace ID made of 1-64 alphanumeric
// characters, hyphens or underscores.
func IsValidTraceID(s string) bool {
	matched := traceIdPattern.MatchString(s)
	return matched
}
// EventDetailResp pairs a list of log lines with an instance identifier and
// is serialized as JSON with the keys "logs" and "instance".
type EventDetailResp struct {
	Logs []string `json:"logs"`
	Instance string `json:"instance"`
}
// PageData is the input of RenderHTML (the event detail page). The template
// reads Hash, Total and Logs.
type PageData struct {
	Hash string
	Instance string
	Logs []string
	Total int
}
// AlertEvalPageData is the input of RenderAlertEvalHTML (the alert eval
// detail page). The template reads RuleID, Total and Logs.
type AlertEvalPageData struct {
	RuleID string
	Instance string
	Logs []string
	Total int
}
// TraceLogsPageData is the input of RenderTraceLogsHTML (the trace logs
// page). The template reads TraceID, Total and Logs.
type TraceLogsPageData struct {
	TraceID string
	Instance string
	Logs []string
	Total int
}
// GrepLogDir searches all log files in logDir (current and rotated, i.e.
// files matching *.log*) for lines containing keyword, sorts them in
// descending string order (newest first when lines start with a timestamp),
// and truncates to MaxLogLines.
func GrepLogDir(logDir string, keyword string) ([]string, error) {
	return grepGlob(filepath.Join(logDir, "*.log*"), keyword)
}

// GrepLatestLogFiles searches only the current (non-rotated) log files in logDir
// (i.e. files matching *.log without any additional suffix like .log.20240101).
// Results are ordered and truncated exactly like GrepLogDir.
func GrepLatestLogFiles(logDir string, keyword string) ([]string, error) {
	return grepGlob(filepath.Join(logDir, "*.log"), keyword)
}

// grepGlob greps every file matching pattern for keyword and returns the
// matches sorted descending and capped at MaxLogLines. Only a malformed glob
// pattern is reported as an error.
func grepGlob(pattern string, keyword string) ([]string, error) {
	logFiles, err := filepath.Glob(pattern)
	if err != nil {
		return nil, err
	}

	var logs []string
	for _, logFile := range logFiles {
		// Best effort: an unreadable file contributes nothing, and a file
		// whose scan stops early (e.g. on an over-long line) still
		// contributes the lines matched before the failure, so the error is
		// intentionally not propagated.
		lines, _ := GrepFile(logFile, keyword)
		logs = append(logs, lines...)
	}

	sort.Sort(sort.Reverse(sort.StringSlice(logs)))

	if len(logs) > MaxLogLines {
		logs = logs[:MaxLogLines]
	}
	return logs, nil
}

// GrepFile scans a file line by line and returns lines containing the keyword.
// Lines longer than 1 MiB stop the scan; in that case the lines matched so
// far are returned together with the scanner error.
func GrepFile(filePath string, keyword string) ([]string, error) {
	f, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var lines []string
	scanner := bufio.NewScanner(f)
	// Raise the per-line limit from the scanner default to 1 MiB.
	scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.Contains(line, keyword) {
			lines = append(lines, line)
		}
	}
	return lines, scanner.Err()
}
// RenderHTML writes the event detail HTML page to w.
// It executes htmlTpl with data and returns the template execution error, if any.
func RenderHTML(w io.Writer, data PageData) error {
	return htmlTpl.Execute(w, data)
}
// RenderAlertEvalHTML writes the alert eval detail HTML page to w.
// It executes alertEvalHtmlTpl with data and returns the template execution error, if any.
func RenderAlertEvalHTML(w io.Writer, data AlertEvalPageData) error {
	return alertEvalHtmlTpl.Execute(w, data)
}
// RenderTraceLogsHTML writes the trace logs HTML page to w.
// It executes traceLogsHtmlTpl with data and returns the template execution error, if any.
func RenderTraceLogsHTML(w io.Writer, data TraceLogsPageData) error {
	return traceLogsHtmlTpl.Execute(w, data)
}
// htmlTpl is the event detail page. It is parsed once at package
// initialization (template.Must panics on a malformed template). It shows a
// placeholder message when Total is 0, otherwise one numbered row per log line.
var htmlTpl = template.Must(template.New("event-detail").Parse(`
Event Detail - {{.Hash}}
{{- if eq .Total 0}}
No log lines found for this event hash.
{{- else}}
{{- range $i, $line := .Logs}}
{{$i}} {{$line}}
{{- end}}
{{- end}}
`))
// traceLogsHtmlTpl is the trace logs page, parsed once at package
// initialization. It shows a placeholder message when Total is 0, otherwise
// one numbered row per log line.
var traceLogsHtmlTpl = template.Must(template.New("trace-logs").Parse(`
Trace Logs - {{.TraceID}}
{{- if eq .Total 0}}
No log lines found for trace ID {{.TraceID}}.
{{- else}}
{{- range $i, $line := .Logs}}
{{$i}} {{$line}}
{{- end}}
{{- end}}
`))
// alertEvalHtmlTpl is the alert eval detail page, parsed once at package
// initialization. It shows a placeholder message when Total is 0, otherwise
// one numbered row per log line.
var alertEvalHtmlTpl = template.Must(template.New("alert-eval-detail").Parse(`
Alert Eval Detail - Rule {{.RuleID}}
{{- if eq .Total 0}}
No log lines found for alert rule {{.RuleID}}.
{{- else}}
{{- range $i, $line := .Logs}}
{{$i}} {{$line}}
{{- end}}
{{- end}}
`))
================================================
FILE: pkg/logx/logx.go
================================================
package logx
import (
"context"
"fmt"
"github.com/pkg/errors"
"github.com/toolkits/pkg/logger"
)
// Config holds the logging settings consumed by Init.
type Config struct {
	// Dir is the directory handed to the file backend.
	Dir string
	// Level is the severity passed to the logger.
	Level string
	// Output selects the sink; Init recognizes "stderr" and "file".
	Output string
	// KeepHours, when non-zero, enables hourly rotation with this retention.
	KeepHours uint
	// RotateNum, when non-zero and KeepHours is zero, enables size-based rotation.
	RotateNum int
	// RotateSize is multiplied by 1024*1024 by Init before being passed to
	// the backend's Rotate (i.e. it is expressed in MiB).
	RotateSize uint64
	// OutputToOneFile is forwarded to the file backend as-is.
	OutputToOneFile bool
}
// Init configures the global logger from c and returns a function that
// closes it.
//
// With Output "stderr" logs go to standard error. With Output "file" a file
// backend rooted at c.Dir is installed; it rotates hourly when KeepHours is
// set, otherwise by size when RotateNum is set, and Init fails when both are
// zero. Any other Output value leaves the logger's sink untouched.
func Init(c Config) (func(), error) {
	logger.SetSeverity(c.Level)

	switch c.Output {
	case "stderr":
		logger.LogToStderr()
	case "file":
		backend, err := logger.NewFileBackend(c.Dir)
		if err != nil {
			return nil, errors.WithMessage(err, "NewFileBackend failed")
		}

		switch {
		case c.KeepHours != 0:
			backend.SetRotateByHour(true)
			backend.SetKeepHours(c.KeepHours)
		case c.RotateNum != 0:
			backend.Rotate(c.RotateNum, c.RotateSize*1024*1024)
		default:
			return nil, errors.New("KeepHours and Rotatenum both are 0")
		}

		backend.OutputToOneFile(c.OutputToOneFile)
		logger.SetLogging(c.Level, backend)
	}

	shutdown := func() {
		fmt.Println("logger exiting")
		logger.Close()
	}
	return shutdown, nil
}
// traceKey is the context key for storing traceId.
type traceKey struct{}

// NewTraceContext returns a new context carrying the given traceId.
func NewTraceContext(ctx context.Context, traceId string) context.Context {
	return context.WithValue(ctx, traceKey{}, traceId)
}

// GetTraceId extracts the traceId from ctx, or returns "" if absent.
func GetTraceId(ctx context.Context) string {
	if ctx == nil {
		return ""
	}
	id, _ := ctx.Value(traceKey{}).(string)
	return id
}

// prefix returns the "trace_id=<id> " fragment that the logging helpers put
// in front of their format string, or "" when ctx carries no trace id.
//
// Because the result becomes part of a printf format string, every '%' in
// the id is doubled so that it is printed literally instead of being parsed
// as a formatting verb.
func prefix(ctx context.Context) string {
	id := GetTraceId(ctx)
	if id == "" {
		return ""
	}
	return "trace_id=" + escapePercent(id) + " "
}

// escapePercent doubles every '%' in s; strings without '%' are returned
// unchanged and without allocation.
func escapePercent(s string) string {
	count := 0
	for i := 0; i < len(s); i++ {
		if s[i] == '%' {
			count++
		}
	}
	if count == 0 {
		return s
	}

	buf := make([]byte, 0, len(s)+count)
	for i := 0; i < len(s); i++ {
		if s[i] == '%' {
			buf = append(buf, '%')
		}
		buf = append(buf, s[i])
	}
	return string(buf)
}
func Infof(ctx context.Context, format string, args ...interface{}) {
logger.Infof(prefix(ctx)+format, args...)
}
func Errorf(ctx context.Context, format string, args ...interface{}) {
logger.Errorf(prefix(ctx)+format, args...)
}
func Warningf(ctx context.Context, format string, args ...interface{}) {
logger.Warningf(prefix(ctx)+format, args...)
}
func Debugf(ctx context.Context, format string, args ...interface{}) {
logger.Debugf(prefix(ctx)+format, args...)
}
================================================
FILE: pkg/macros/macros.go
================================================
package macros
// Macro is the currently registered SQL macro expander. It receives a SQL
// string and a start/end pair and returns the rewritten SQL. It is nil until
// RegisterMacro is called.
var Macro func(sql string, start, end int64) (string, error)

// RegisterMacro installs f as the package-wide macro expander, replacing any
// previously registered one.
func RegisterMacro(f func(sql string, start, end int64) (string, error)) {
	Macro = f
}

// MacroInVain is a no-op expander: it returns sql unchanged and never fails.
func MacroInVain(sql string, start, end int64) (string, error) {
	unchanged := sql
	return unchanged, nil
}
================================================
FILE: pkg/oauth2x/oauth2x.go
================================================
package oauth2x
import (
	"bytes"
	"context"
	"crypto/tls"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"sync"
	"time"

	"github.com/ccfos/nightingale/v6/storage"
	"github.com/google/uuid"
	jsoniter "github.com/json-iterator/go"
	"github.com/toolkits/pkg/logger"
	"golang.org/x/oauth2"
)
// SsoClient is the runtime state of the OAuth2 login integration. Its fields
// are filled from Config by Reload; the embedded RWMutex is taken for writing
// by Reload and for reading by the accessor and login methods.
type SsoClient struct {
	// Enable reports whether OAuth2 login is active.
	Enable bool
	// Config is the oauth2 client configuration built by Reload.
	Config oauth2.Config
	SsoAddr string
	SsoLogoutAddr string
	// UserInfoAddr is the endpoint queried for the user profile.
	UserInfoAddr string
	// TranTokenMethod selects how the access token is sent to UserInfoAddr:
	// "formdata", "querystring", or anything else for header-only.
	TranTokenMethod string
	CallbackAddr string
	DisplayName string
	CoverAttributes bool
	// Attributes names the user info fields holding each user property.
	Attributes struct {
		Username string
		Nickname string
		Phone string
		Email string
	}
	// UserinfoIsArray and UserinfoPrefix describe where the user object sits
	// inside the user info response (see getUserinfoField).
	UserinfoIsArray bool
	UserinfoPrefix string
	DefaultRoles []string
	// Ctx is passed to the token exchange; when TLS verification is skipped
	// it carries the custom HTTP client.
	Ctx context.Context
	sync.RWMutex
}
// Config is the user-facing OAuth2 SSO configuration that Reload copies into
// an SsoClient.
type Config struct {
	// Enable turns the integration on; when false Reload only records that.
	Enable bool
	DisplayName string
	// RedirectURL becomes both the client's CallbackAddr and the oauth2 redirect URL.
	RedirectURL string
	// SsoAddr is used as the oauth2 authorization URL.
	SsoAddr string
	SsoLogoutAddr string
	// TokenAddr is used as the oauth2 token URL.
	TokenAddr string
	UserInfoAddr string
	TranTokenMethod string
	ClientId string
	ClientSecret string
	CoverAttributes bool
	// SkipTlsVerify makes Reload install an HTTP client that does not verify
	// TLS certificates.
	SkipTlsVerify bool
	// Attributes names the user info fields holding each user property.
	Attributes struct {
		Username string
		Nickname string
		Phone string
		Email string
	}
	DefaultRoles []string
	UserinfoIsArray bool
	UserinfoPrefix string
	Scopes []string
}
// New returns an SsoClient for cf. A disabled configuration yields a
// zero-valued (disabled) client; otherwise the client is populated through
// Reload.
func New(cf Config) *SsoClient {
	client := new(SsoClient)
	if cf.Enable {
		client.Reload(cf)
	}
	return client
}
// Reload replaces the client's settings with cf under the write lock.
//
// A disabled cf only clears Enable and leaves every other field as it was.
// Otherwise all settings are copied, the request context is rebuilt (carrying
// an HTTP client without TLS verification when SkipTlsVerify is set) and the
// oauth2 configuration is regenerated.
func (s *SsoClient) Reload(cf Config) {
	s.Lock()
	defer s.Unlock()

	s.Enable = cf.Enable
	if !cf.Enable {
		return
	}

	s.SsoAddr = cf.SsoAddr
	s.SsoLogoutAddr = cf.SsoLogoutAddr
	s.UserInfoAddr = cf.UserInfoAddr
	s.TranTokenMethod = cf.TranTokenMethod
	s.CallbackAddr = cf.RedirectURL
	s.DisplayName = cf.DisplayName
	s.CoverAttributes = cf.CoverAttributes
	// Both Attributes fields share the same anonymous struct type.
	s.Attributes = cf.Attributes
	s.UserinfoIsArray = cf.UserinfoIsArray
	s.UserinfoPrefix = cf.UserinfoPrefix
	s.DefaultRoles = cf.DefaultRoles

	reqCtx := context.Background()
	if cf.SkipTlsVerify {
		// Custom HTTP client whose transport skips certificate verification.
		insecure := &http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
			},
		}
		reqCtx = context.WithValue(reqCtx, oauth2.HTTPClient, insecure)
	}
	s.Ctx = reqCtx

	s.Config = oauth2.Config{
		ClientID:     cf.ClientId,
		ClientSecret: cf.ClientSecret,
		Endpoint: oauth2.Endpoint{
			AuthURL:  cf.SsoAddr,
			TokenURL: cf.TokenAddr,
		},
		RedirectURL: cf.RedirectURL,
		Scopes:      cf.Scopes,
	}
}
// GetDisplayName returns the configured display name, or "" while the client
// is disabled. It takes the read lock.
func (s *SsoClient) GetDisplayName() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.DisplayName
	}
	return ""
}
// GetSsoLogoutAddr returns the configured logout address, or "" while the
// client is disabled. It takes the read lock.
func (s *SsoClient) GetSsoLogoutAddr() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.SsoLogoutAddr
	}
	return ""
}
// wrapStateKey namespaces an OAuth2 state value for use as a Redis key.
func wrapStateKey(key string) string {
	const namespace = "n9e_oauth_"
	return namespace + key
}
// Authorize starts a login: it generates a random state, stores redirect in
// Redis under that state for 300 seconds, and returns the provider's
// authorization URL carrying the state. A Redis failure is returned as-is.
func (s *SsoClient) Authorize(redis storage.Redis, redirect string) (string, error) {
	state := uuid.New().String()

	ttl := time.Duration(300 * time.Second)
	if err := redis.Set(context.Background(), wrapStateKey(state), redirect, ttl).Err(); err != nil {
		return "", err
	}

	s.RLock()
	defer s.RUnlock()
	return s.Config.AuthCodeURL(state), nil
}
// fetchRedirect reads the redirect target stored for state from Redis.
func fetchRedirect(redis storage.Redis, ctx context.Context, state string) (string, error) {
	key := wrapStateKey(state)
	return redis.Get(ctx, key).Result()
}

// deleteRedirect removes the redirect target stored for state from Redis.
func deleteRedirect(redis storage.Redis, ctx context.Context, state string) error {
	key := wrapStateKey(state)
	return redis.Del(ctx, key).Err()
}
// Callback exchanges code for an access token and the user's profile, then
// attaches the redirect target that Authorize stored for state.
//
// Only a failed exchange is returned as an error. Failing to read or delete
// the stored redirect is logged and otherwise ignored.
// NOTE(review): an unknown state therefore does not reject the callback.
func (s *SsoClient) Callback(redis storage.Redis, ctx context.Context, code, state string) (*CallbackOutput, error) {
	output, err := s.exchangeUser(code)
	if err != nil {
		return nil, fmt.Errorf("illegal user:%v", err)
	}

	redirect, getErr := fetchRedirect(redis, ctx, state)
	if getErr != nil {
		logger.Errorf("get redirect err:%v code:%s state:%s", getErr, code, state)
	}
	output.Redirect = redirect

	if delErr := deleteRedirect(redis, ctx, state); delErr != nil {
		logger.Errorf("delete redirect err:%v code:%s state:%s", delErr, code, state)
	}

	return output, nil
}
// CallbackOutput is the result of a completed OAuth2 callback: the redirect
// target stored at authorize time, the access token, and the user properties
// extracted from the user info response.
type CallbackOutput struct {
	Redirect string `json:"redirect"`
	Msg string `json:"msg"`
	AccessToken string `json:"accessToken"`
	Username string `json:"Username"`
	Nickname string `json:"Nickname"`
	// NOTE(review): Phone and Email carry yaml tags while the other fields
	// carry json tags - confirm whether json tags were intended.
	Phone string `yaml:"Phone"`
	Email string `yaml:"Email"`
}
// exchangeUser trades the authorization code for an access token, fetches the
// user info document with it and maps the configured attributes into a
// CallbackOutput. The read lock is held for the whole operation.
func (s *SsoClient) exchangeUser(code string) (*CallbackOutput, error) {
	s.RLock()
	defer s.RUnlock()

	token, err := s.Config.Exchange(s.Ctx, code)
	if err != nil {
		return nil, fmt.Errorf("failed to exchange token: %s", err)
	}

	userInfo, err := s.getUserInfo(s.Config.ClientID, s.UserInfoAddr, token.AccessToken, s.TranTokenMethod)
	if err != nil {
		logger.Errorf("failed to get user info: %s", err)
		return nil, fmt.Errorf("failed to get user info: %s", err)
	}
	logger.Debugf("get userInfo: %s", string(userInfo))

	// pick reads one configured attribute out of the user info document.
	pick := func(field string) string {
		return getUserinfoField(userInfo, s.UserinfoIsArray, s.UserinfoPrefix, field)
	}

	output := &CallbackOutput{
		AccessToken: token.AccessToken,
		Username:    pick(s.Attributes.Username),
		Nickname:    pick(s.Attributes.Nickname),
		Phone:       pick(s.Attributes.Phone),
		Email:       pick(s.Attributes.Email),
	}
	return output, nil
}
// getUserInfo requests the user info document from UserInfoAddr and returns
// the raw response body.
//
// TranTokenMethod selects how the credentials are transmitted:
//   - "formdata":    POST with access_token and client_id as a form body;
//   - "querystring": GET with both as query parameters plus a Bearer header;
//   - anything else: GET with a Bearer header and a client_id header.
//
// Form and query values are URL-encoded, so tokens containing characters such
// as '+', '/', '=' or '&' are transmitted intact. The request is sent with
// the HTTP client stored in s.Ctx (set when TLS verification is skipped) or
// http.DefaultClient otherwise.
func (s *SsoClient) getUserInfo(ClientId, UserInfoAddr, accessToken string, TranTokenMethod string) ([]byte, error) {
	params := url.Values{}
	params.Set("access_token", accessToken)
	params.Set("client_id", ClientId)

	var (
		req *http.Request
		err error
	)

	switch TranTokenMethod {
	case "formdata":
		req, err = http.NewRequest("POST", UserInfoAddr, bytes.NewBufferString(params.Encode()))
		if err != nil {
			return nil, err
		}
		req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	case "querystring":
		req, err = http.NewRequest("GET", UserInfoAddr+"?"+params.Encode(), nil)
		if err != nil {
			return nil, err
		}
		req.Header.Add("Authorization", "Bearer "+accessToken)
	default:
		req, err = http.NewRequest("GET", UserInfoAddr, nil)
		if err != nil {
			return nil, err
		}
		req.Header.Add("Authorization", "Bearer "+accessToken)
		req.Header.Add("client_id", ClientId)
	}

	client := http.DefaultClient
	// Checked assertion: fall back to the default client rather than panic
	// if the context value is missing or of an unexpected type.
	if custom, ok := s.Ctx.Value(oauth2.HTTPClient).(*http.Client); ok && custom != nil {
		client = custom
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	return ioutil.ReadAll(resp.Body)
}
// getUserinfoField extracts field from the JSON user info document as a
// string.
//
// prefix, when non-empty, names the top-level key under which the user object
// is nested; isArray means the user object is the first element of an array
// at that position.
func getUserinfoField(input []byte, isArray bool, prefix, field string) string {
	switch {
	case prefix == "" && isArray:
		return jsoniter.Get(input, 0).Get(field).ToString()
	case prefix == "":
		return jsoniter.Get(input, field).ToString()
	case isArray:
		return jsoniter.Get(input, prefix, 0).Get(field).ToString()
	default:
		return jsoniter.Get(input, prefix).Get(field).ToString()
	}
}
================================================
FILE: pkg/oidcx/oidc.go
================================================
package oidcx
import (
"context"
"crypto/tls"
"fmt"
"net/http"
"strings"
"sync"
"time"
"github.com/ccfos/nightingale/v6/storage"
oidc "github.com/coreos/go-oidc"
"github.com/google/uuid"
"github.com/toolkits/pkg/logger"
"golang.org/x/oauth2"
)
// SsoClient is the runtime state of the OIDC login integration. Its fields
// are filled from Config by Reload; the embedded RWMutex is taken for writing
// by Reload and for reading by the accessor and login methods.
type SsoClient struct {
	// Enable reports whether OIDC login is active.
	Enable bool
	// Verifier validates ID tokens; it is created from Provider in Reload.
	Verifier *oidc.IDTokenVerifier
	// Config is the oauth2 client configuration built by Reload.
	Config oauth2.Config
	SsoAddr string
	// SsoLogoutAddr may contain the {{$__id_token__}} placeholder that
	// GetSsoLogoutAddr substitutes.
	SsoLogoutAddr string
	CallbackAddr string
	CoverAttributes bool
	DisplayName string
	// Attributes names the claims holding each user property.
	Attributes struct {
		Username string
		Nickname string
		Phone string
		Email string
	}
	DefaultRoles []string
	DefaultTeams []int64
	// Ctx is passed to provider calls; when TLS verification is skipped it
	// carries the custom HTTP client.
	Ctx context.Context
	// Provider is the discovered OIDC provider for SsoAddr.
	Provider *oidc.Provider
	sync.RWMutex
}
// Config is the user-facing OIDC SSO configuration that Reload copies into an
// SsoClient.
type Config struct {
	// Enable turns the integration on; when false Reload only records that.
	Enable bool
	DisplayName string
	// RedirectURL becomes both the client's CallbackAddr and the oauth2 redirect URL.
	RedirectURL string
	// SsoAddr is the issuer address handed to provider discovery.
	SsoAddr string
	SsoLogoutAddr string
	ClientId string
	ClientSecret string
	CoverAttributes bool
	// SkipTlsVerify makes Reload install an HTTP client that does not verify
	// TLS certificates.
	SkipTlsVerify bool
	// Attributes names the claims holding each user property; Reload
	// defaults Username to "sub" when it is empty.
	Attributes struct {
		Username string
		Nickname string
		Phone string
		Email string
	}
	DefaultRoles []string
	DefaultTeams []int64
	// Scopes defaults to openid, profile, email and phone when empty.
	Scopes []string
}
// New returns an SsoClient for cf. A disabled configuration yields a
// zero-valued (disabled) client and no error; otherwise the client is
// populated through Reload and Reload's error, if any, is returned alongside
// the (non-nil) client.
func New(cf Config) (*SsoClient, error) {
	client := new(SsoClient)
	if !cf.Enable {
		return client, nil
	}

	if err := client.Reload(cf); err != nil {
		return client, err
	}
	return client, nil
}
// Reload replaces the client's settings with cf under the write lock.
//
// A disabled cf only clears Enable. Otherwise the OIDC provider at cf.SsoAddr
// is discovered first and the client is updated only after discovery
// succeeded, so a failed reload returns the error and leaves the previous
// state (including Enable, Verifier and Provider) untouched instead of
// leaving the client enabled without a usable verifier.
//
// Defaults: the username claim is "sub" when not configured, and the scopes
// are openid/profile/email/phone when none are configured.
func (s *SsoClient) Reload(cf Config) error {
	s.Lock()
	defer s.Unlock()

	if !cf.Enable {
		s.Enable = cf.Enable
		return nil
	}

	if cf.Attributes.Username == "" {
		cf.Attributes.Username = "sub"
	}

	reqCtx := context.Background()
	if cf.SkipTlsVerify {
		transport := &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
		// Create an HTTP client that uses our custom transport
		client := &http.Client{Transport: transport}
		reqCtx = context.WithValue(reqCtx, oauth2.HTTPClient, client)
	}

	// Discovery is the only step that can fail; run it before touching s.
	provider, err := oidc.NewProvider(reqCtx, cf.SsoAddr)
	if err != nil {
		return err
	}

	scopes := cf.Scopes
	if len(scopes) == 0 {
		scopes = []string{oidc.ScopeOpenID, "profile", "email", "phone"}
	}

	s.Enable = cf.Enable
	s.SsoAddr = cf.SsoAddr
	s.SsoLogoutAddr = cf.SsoLogoutAddr
	s.CallbackAddr = cf.RedirectURL
	s.CoverAttributes = cf.CoverAttributes
	s.Attributes.Username = cf.Attributes.Username
	s.Attributes.Nickname = cf.Attributes.Nickname
	s.Attributes.Phone = cf.Attributes.Phone
	s.Attributes.Email = cf.Attributes.Email
	s.DisplayName = cf.DisplayName
	s.DefaultRoles = cf.DefaultRoles
	s.DefaultTeams = cf.DefaultTeams
	s.Ctx = reqCtx

	s.Verifier = provider.Verifier(&oidc.Config{
		ClientID: cf.ClientId,
	})
	s.Provider = provider
	s.Config = oauth2.Config{
		ClientID:     cf.ClientId,
		ClientSecret: cf.ClientSecret,
		Endpoint:     provider.Endpoint(),
		RedirectURL:  cf.RedirectURL,
		Scopes:       scopes,
	}

	return nil
}
// GetDisplayName returns the configured display name, or "" while the client
// is disabled. It takes the read lock.
func (s *SsoClient) GetDisplayName() string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.DisplayName
	}
	return ""
}
// GetSsoLogoutAddr returns the logout address with the ID token placeholder
// substituted by idToken, or "" while the client is disabled. It takes the
// read lock.
func (s *SsoClient) GetSsoLogoutAddr(idToken string) string {
	s.RLock()
	defer s.RUnlock()

	if s.Enable {
		return s.replaceIdTokenTemplate(s.SsoLogoutAddr, idToken)
	}
	return ""
}
// replaceIdTokenTemplate substitutes every {{$__id_token__}} placeholder in
// the logout URL with idToken. With an empty idToken the URL is returned
// unchanged (placeholders included).
func (s *SsoClient) replaceIdTokenTemplate(logoutAddr, idToken string) string {
	const placeholder = "{{$__id_token__}}"

	if idToken != "" {
		return strings.ReplaceAll(logoutAddr, placeholder, idToken)
	}
	return logoutAddr
}
// wrapStateKey namespaces an OIDC state value for use as a Redis key.
func wrapStateKey(key string) string {
	const namespace = "n9e_oidc_"
	return namespace + key
}
// Authorize starts a login: it generates a random state, stores redirect in
// Redis under that state for 300 seconds, and returns the provider's
// authorization URL carrying the state. The read lock is held throughout; a
// Redis failure is returned as-is.
func (s *SsoClient) Authorize(redis storage.Redis, redirect string) (string, error) {
	s.RLock()
	defer s.RUnlock()

	state := uuid.New().String()

	ttl := time.Duration(300 * time.Second)
	if err := redis.Set(context.Background(), wrapStateKey(state), redirect, ttl).Err(); err != nil {
		return "", err
	}

	return s.Config.AuthCodeURL(state), nil
}
// fetchRedirect reads the redirect target stored for state from Redis.
func fetchRedirect(redis storage.Redis, ctx context.Context, state string) (string, error) {
	key := wrapStateKey(state)
	return redis.Get(ctx, key).Result()
}

// deleteRedirect removes the redirect target stored for state from Redis.
func deleteRedirect(redis storage.Redis, ctx context.Context, state string) error {
	key := wrapStateKey(state)
	return redis.Del(ctx, key).Err()
}
// Callback exchanges code for tokens and the user's profile, then attaches
// the redirect target that Authorize stored for state.
//
// Only a failed exchange is returned as an error. Failing to read or delete
// the stored redirect is logged and otherwise ignored.
// NOTE(review): an unknown state therefore does not reject the callback.
func (s *SsoClient) Callback(redis storage.Redis, ctx context.Context, code, state string) (*CallbackOutput, error) {
	ret, err := s.exchangeUser(code)
	if err != nil {
		return nil, fmt.Errorf("sso_exchange_user fail. code:%s, error:%v", code, err)
	}

	ret.Redirect, err = fetchRedirect(redis, ctx, state)
	if err != nil {
		// Arguments follow the order of the verbs: err, code, state.
		logger.Errorf("get redirect err:%v code:%s state:%s", err, code, state)
	}

	err = deleteRedirect(redis, ctx, state)
	if err != nil {
		logger.Errorf("delete redirect err:%v code:%s state:%s", err, code, state)
	}

	return ret, nil
}
// CallbackOutput is the result of a completed OIDC callback: the redirect
// target stored at authorize time, the access and ID tokens, and the user
// properties extracted from the token claims / user info.
type CallbackOutput struct {
	Redirect string `json:"redirect"`
	Msg string `json:"msg"`
	AccessToken string `json:"accessToken"`
	// IdToken is the raw (encoded) ID token.
	IdToken string `json:"idToken"`
	Username string `json:"username"`
	Nickname string `json:"nickname"`
	// NOTE(review): Phone and Email carry yaml tags while the other fields
	// carry json tags - confirm whether json tags were intended.
	Phone string `yaml:"phone"`
	Email string `yaml:"email"`
}
// exchangeUser trades the authorization code for tokens, verifies the ID
// token and builds the CallbackOutput from its claims.
//
// Username, nickname, phone and email are first read from the ID token
// claims. The user info endpoint is then queried to fill in email, nickname
// and phone that are still empty; a user info failure is logged and the
// claims-only output is returned. The read lock is held for the whole
// operation.
//
// Raw tokens are credentials, so they are never written to logs or error
// messages; the verified token's subject is logged instead.
func (s *SsoClient) exchangeUser(code string) (*CallbackOutput, error) {
	s.RLock()
	defer s.RUnlock()

	oauth2Token, err := s.Config.Exchange(s.Ctx, code)
	if err != nil {
		return nil, fmt.Errorf("failed to exchange token: %v", err)
	}

	rawIDToken, ok := oauth2Token.Extra("id_token").(string)
	if !ok {
		rerr := fmt.Errorf("sso_exchange_user: no id_token field in oauth2 token")
		logger.Error(rerr)
		return nil, rerr
	}

	idToken, err := s.Verifier.Verify(s.Ctx, rawIDToken)
	if err != nil {
		rerr := fmt.Errorf("sso_exchange_user: failed to verify id_token, error:%v", err)
		logger.Error(rerr)
		return nil, rerr
	}

	logger.Infof("sso_exchange_user: verify id_token success. subject:%s", idToken.Subject)

	data := map[string]interface{}{}
	if err := idToken.Claims(&data); err != nil {
		rerr := fmt.Errorf("sso_exchange_user: failed to parse id_token, error:%+v", err)
		logger.Error(rerr)
		return nil, rerr
	}

	for k, v := range data {
		logger.Debugf("sso_exchange_user: oidc info key:%s value:%v", k, v)
	}

	output := &CallbackOutput{
		AccessToken: oauth2Token.AccessToken,
		IdToken:     rawIDToken,
		Username:    extractClaim(data, s.Attributes.Username),
		Nickname:    extractClaim(data, s.Attributes.Nickname),
		Phone:       extractClaim(data, s.Attributes.Phone),
		Email:       extractClaim(data, s.Attributes.Email),
	}

	// The user info endpoint is optional enrichment: any failure below keeps
	// what the ID token already provided.
	userInfo, err := s.Provider.UserInfo(s.Ctx, oauth2.StaticTokenSource(oauth2Token))
	if err != nil {
		logger.Errorf("sso_exchange_user: failed to get userinfo: %v", err)
		return output, nil
	}

	if userInfo == nil {
		logger.Errorf("sso_exchange_user: userinfo is nil")
		return output, nil
	}

	logger.Debugf("sso_exchange_user: userinfo subject:%s email:%s profile:%s", userInfo.Subject, userInfo.Email, userInfo.Profile)

	if output.Email == "" {
		output.Email = userInfo.Email
	}

	data = map[string]interface{}{}
	if err := userInfo.Claims(&data); err != nil {
		logger.Errorf("sso_exchange_user: failed to parse userinfo claims: %v", err)
		return output, nil
	}
	logger.Debugf("sso_exchange_user: userinfo claims:%+v", data)

	if output.Nickname == "" {
		output.Nickname = extractClaim(data, s.Attributes.Nickname)
	}

	if output.Phone == "" {
		output.Phone = extractClaim(data, s.Attributes.Phone)
	}

	return output, nil
}
// extractClaim returns the claim stored under key when it is a string, and ""
// when the key is absent or holds a value of any other type.
func extractClaim(data map[string]interface{}, key string) string {
	// A missing key yields a nil interface, for which the assertion fails
	// and text stays "".
	text, _ := data[key].(string)
	return text
}
================================================
FILE: pkg/ormx/database_init.go
================================================
package ormx
import (
"database/sql"
"fmt"
"strconv"
"strings"
"time"
"github.com/toolkits/pkg/logger"
"gorm.io/gorm"
)
// InitUser describes the schema of the "users" table through gorm tags.
// Its TableOptions string uses MySQL syntax (InnoDB, utf8mb4).
type InitUser struct {
	ID uint64 `gorm:"primaryKey;autoIncrement"`
	Username string `gorm:"size:64;not null;unique;comment:login name, cannot rename;uniqueIndex"`
	Nickname string `gorm:"size:64;not null;comment:display name, chinese name"`
	Password string `gorm:"size:128;not null;default:''"`
	Phone string `gorm:"size:16;not null;default:''"`
	Email string `gorm:"size:64;not null;default:''"`
	Portrait string `gorm:"size:255;not null;default:'';comment:portrait image url"`
	Roles string `gorm:"size:255;not null;comment:Admin | Standard | Guest, split by space"`
	// NOTE(review): the tag reads "default null" without a colon - confirm
	// that gorm interprets it as intended.
	Contacts sql.NullString `gorm:"size:1024;default null;comment:json e.g. {wecom:xx, dingtalk_robot_token:yy}"`
	Maintainer bool `gorm:"type:tinyint(1);not null;default:0"`
	Belong string `gorm:"size:16;not null;default:'';comment:belong"`
	LastActiveTime int64 `gorm:"not null;default:0"`
	CreateAt int64 `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64 `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}
// TableName maps InitUser to the "users" table.
func (InitUser) TableName() string {
	return "users"
}
// TableOptions returns the storage engine and charset clause for the table.
func (InitUser) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresUser describes the "users" table with the same columns as
// InitUser, except that Maintainer is an int16 stored as smallint instead of
// a tinyint(1) bool, and no TableOptions are defined.
type InitPostgresUser struct {
	ID uint64 `gorm:"primaryKey;autoIncrement"`
	Username string `gorm:"size:64;not null;unique;comment:login name, cannot rename;uniqueIndex"`
	Nickname string `gorm:"size:64;not null;comment:display name, chinese name"`
	Password string `gorm:"size:128;not null;default:''"`
	Phone string `gorm:"size:16;not null;default:''"`
	Email string `gorm:"size:64;not null;default:''"`
	Portrait string `gorm:"size:255;not null;default:'';comment:portrait image url"`
	Roles string `gorm:"size:255;not null;comment:Admin | Standard | Guest, split by space"`
	Contacts sql.NullString `gorm:"size:1024;default null;comment:json e.g. {wecom:xx, dingtalk_robot_token:yy}"`
	// Maintainer is a 0/1 flag stored as smallint.
	Maintainer int16 `gorm:"type:smallint;not null;default:0"`
	Belong string `gorm:"size:16;not null;default:'';comment:belong"`
	LastActiveTime int64 `gorm:"not null;default:0"`
	CreateAt int64 `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64 `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}
// TableName maps InitPostgresUser to the "users" table.
func (InitPostgresUser) TableName() string {
	return "users"
}
// InitUserGroup is the GORM model for the "user_group" table.
type InitUserGroup struct {
	ID   uint64 `gorm:"primaryKey;autoIncrement"`
	Name string `gorm:"size:128;not null;default:''"`
	Note string `gorm:"size:255;not null;default:''"`

	// Creation / modification bookkeeping; both timestamps are indexed.
	CreateAt int64  `gorm:"not null;default:0;index"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0;index"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitUserGroup.
func (InitUserGroup) TableName() string {
	return "user_group"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitUserGroup) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitUserGroupMember is the GORM model for the "user_group_member" table.
// Each row pairs a group id with a user id; both columns are indexed.
type InitUserGroupMember struct {
	ID      uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID int64  `gorm:"not null;index"`
	UserID  uint64 `gorm:"not null;index"`
}

// TableName reports the database table backing InitUserGroupMember.
func (InitUserGroupMember) TableName() string {
	return "user_group_member"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitUserGroupMember) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitConfig is the GORM model for the "configs" table, a key/value store
// whose key and value live in the explicitly named ckey / cval columns.
type InitConfig struct {
	ID   uint64 `gorm:"primaryKey;autoIncrement"`
	CKey string `gorm:"column:ckey;size:191;not null"`
	CVal string `gorm:"column:cval;type:text;not null"`
	Note string `gorm:"size:1024;not null;default:''"`

	// Flag columns, stored as tinyint(1).
	External  bool `gorm:"type:tinyint(1);not null;default:0"`
	Encrypted bool `gorm:"type:tinyint(1);not null;default:0"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitConfig.
func (InitConfig) TableName() string {
	return "configs"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitConfig) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresConfig is the PostgreSQL variant of the "configs" table model.
// It differs from InitConfig in storing the External and Encrypted flags as
// smallint (int16) and in declaring no TableOptions.
type InitPostgresConfig struct {
	ID   uint64 `gorm:"primaryKey;autoIncrement"`
	CKey string `gorm:"column:ckey;size:191;not null"`
	CVal string `gorm:"column:cval;type:text;not null"`
	Note string `gorm:"size:1024;not null;default:''"`

	// Flag columns, stored as smallint.
	External  int16 `gorm:"type:smallint;not null;default:0"`
	Encrypted int16 `gorm:"type:smallint;not null;default:0"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitPostgresConfig.
func (InitPostgresConfig) TableName() string {
	return "configs"
}
// InitRole is the GORM model for the "role" table.
type InitRole struct {
	ID uint64 `gorm:"primaryKey;autoIncrement"`
	// Name carries a unique index. The tag key must be "uniqueIndex"; the
	// previous spelling "uniqueIdx" is not a GORM tag key, so no unique
	// index was being requested for the column.
	Name string `gorm:"size:191;not null;default:'';uniqueIndex"`
	Note string `gorm:"size:255;not null;default:''"`
}

// TableName reports the database table backing InitRole.
func (InitRole) TableName() string {
	return "role"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitRole) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitRoleOperation is the GORM model for the "role_operation" table.
// Each row pairs a role name with an operation; both columns are indexed.
type InitRoleOperation struct {
	ID        uint64 `gorm:"primaryKey;autoIncrement"`
	RoleName  string `gorm:"size:128;not null;index"`
	Operation string `gorm:"size:191;not null;index"`
}

// TableName reports the database table backing InitRoleOperation.
func (InitRoleOperation) TableName() string {
	return "role_operation"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitRoleOperation) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitBusiGroup is the GORM model for the "busi_group" table.
// Name is covered by a unique index.
type InitBusiGroup struct {
	ID   uint64 `gorm:"primaryKey;autoIncrement"`
	Name string `gorm:"size:191;not null;uniqueIndex"`

	// Label switch and its value; per the column comment the value must not
	// be blank when the switch is on.
	LabelEnable bool   `gorm:"type:tinyint(1);not null;default:0"`
	LabelValue  string `gorm:"size:191;not null;default:'';comment:if label_enable: label_value can not be blank"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitBusiGroup.
func (InitBusiGroup) TableName() string {
	return "busi_group"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitBusiGroup) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresBusiGroup is the PostgreSQL variant of the "busi_group" table
// model. It differs from InitBusiGroup in storing LabelEnable as smallint
// (int16) and in declaring no TableOptions.
type InitPostgresBusiGroup struct {
	ID   uint64 `gorm:"primaryKey;autoIncrement"`
	Name string `gorm:"size:191;not null;uniqueIndex"`

	// Label switch and its value; per the column comment the value must not
	// be blank when the switch is on.
	LabelEnable int16  `gorm:"type:smallint;not null;default:0"`
	LabelValue  string `gorm:"size:191;not null;default:'';comment:if label_enable: label_value can not be blank"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitPostgresBusiGroup.
func (InitPostgresBusiGroup) TableName() string {
	return "busi_group"
}
// InitBusiGroupMember is the GORM model for the "busi_group_member" table.
// Each row links a busi group to a user group with a permission flag
// ("ro" or "rw" according to the column comment).
type InitBusiGroupMember struct {
	ID          uint64 `gorm:"primaryKey;autoIncrement"`
	BusiGroupID int64  `gorm:"not null;comment:busi group id;index"`
	UserGroupID int64  `gorm:"not null;comment:user group id;index"`
	PermFlag    string `gorm:"size:2;not null;comment:ro | rw"`
}

// TableName reports the database table backing InitBusiGroupMember.
func (InitBusiGroupMember) TableName() string {
	return "busi_group_member"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitBusiGroupMember) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitBoard is the GORM model for the "board" table.
// GroupID and Name together form the unique index idx_groupid_name.
type InitBoard struct {
	ID      uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID uint64 `gorm:"not null;default:0;comment:busi group id;uniqueIndex:idx_groupid_name"`
	Name    string `gorm:"size:191;not null;uniqueIndex:idx_groupid_name"`
	Ident   string `gorm:"size:200;not null;default:'';index"`
	Tags    string `gorm:"size:255;not null;comment:split by space"`

	// Boolean flags, stored as tinyint(1).
	Public  bool `gorm:"type:tinyint(1);not null;default:0;comment:0:false 1:true"`
	BuiltIn bool `gorm:"type:tinyint(1);not null;default:0;comment:0:false 1:true"`
	Hide    bool `gorm:"type:tinyint(1);not null;default:0;comment:0:false 1:true"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitBoard.
func (InitBoard) TableName() string {
	return "board"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitBoard) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresBoard is the PostgreSQL variant of the "board" table model.
// It differs from InitBoard in storing the Public, BuiltIn and Hide flags as
// smallint (int16) and in declaring no TableOptions.
type InitPostgresBoard struct {
	ID      uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID uint64 `gorm:"not null;default:0;comment:busi group id;uniqueIndex:idx_groupid_name"`
	Name    string `gorm:"size:191;not null;uniqueIndex:idx_groupid_name"`
	Ident   string `gorm:"size:200;not null;default:'';index"`
	Tags    string `gorm:"size:255;not null;comment:split by space"`

	// Flags, stored as smallint.
	Public  int16 `gorm:"type:smallint;not null;default:0;comment:0:false 1:true"`
	BuiltIn int16 `gorm:"type:smallint;not null;default:0;comment:0:false 1:true"`
	Hide    int16 `gorm:"type:smallint;not null;default:0;comment:0:false 1:true"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitPostgresBoard.
func (InitPostgresBoard) TableName() string {
	return "board"
}
// InitBoardPayload is the GORM model for the "board_payload" table.
// ID holds the dashboard id (see the column comment) and Payload is stored
// as mediumtext.
type InitBoardPayload struct {
	ID      uint64 `gorm:"not null;comment:dashboard id"`
	Payload string `gorm:"type:mediumtext;not null"`
}

// TableName reports the database table backing InitBoardPayload.
func (InitBoardPayload) TableName() string {
	return "board_payload"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitBoardPayload) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresBoardPayload is the PostgreSQL variant of the "board_payload"
// table model. ID is tagged as the primary key and Payload uses the TEXT
// column type; no TableOptions are declared.
type InitPostgresBoardPayload struct {
	ID      uint64 `gorm:"primaryKey;comment:dashboard id"`
	Payload string `gorm:"type:TEXT;not null"`
}

// TableName reports the database table backing InitPostgresBoardPayload.
func (InitPostgresBoardPayload) TableName() string {
	return "board_payload"
}
// InitDashboard is the GORM model for the "dashboard" table.
// GroupID and Name together form the unique index idx_group_name.
type InitDashboard struct {
	ID      uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID uint64 `gorm:"not null;default:0;comment:busi group id;uniqueIndex:idx_group_name"`
	Name    string `gorm:"size:191;not null;uniqueIndex:idx_group_name"`
	Tags    string `gorm:"size:255;not null;comment:split by space"`
	Configs string `gorm:"size:8192;comment:dashboard variables"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitDashboard.
func (InitDashboard) TableName() string {
	return "dashboard"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitDashboard) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitChartGroup is the GORM model for the "chart_group" table.
// DashboardID is indexed.
type InitChartGroup struct {
	ID          uint64 `gorm:"primaryKey;autoIncrement"`
	DashboardID uint64 `gorm:"not null;index"`
	Name        string `gorm:"size:255;not null"`
	Weight      int32  `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitChartGroup.
func (InitChartGroup) TableName() string {
	return "chart_group"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitChartGroup) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitChart is the GORM model for the "chart" table.
// GroupID refers to a chart group (see the column comment) and is indexed.
type InitChart struct {
	ID      uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID int64  `gorm:"not null;comment:chart group id;index"`
	Configs string `gorm:"type:text"`
	Weight  int32  `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitChart.
func (InitChart) TableName() string {
	return "chart"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitChart) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitChartShare is the GORM model for the "chart_share" table.
// CreateAt is indexed.
type InitChartShare struct {
	ID           uint64 `gorm:"primaryKey;autoIncrement"`
	Cluster      string `gorm:"size:128;not null"`
	DatasourceID int64  `gorm:"not null;default:0"`
	Configs      string `gorm:"type:text"`
	CreateAt     int64  `gorm:"not null;default:0;index"`
	CreateBy     string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitChartShare.
func (InitChartShare) TableName() string {
	return "chart_share"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitChartShare) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitAlertRule is the GORM model for the "alert_rule" table.
// Flag columns are stored as tinyint(1); GroupID and UpdateAt are indexed.
type InitAlertRule struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID       uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Cate          string `gorm:"size:128;not null"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Cluster       string `gorm:"size:128;not null"`
	Name          string `gorm:"size:255;not null"`
	Note          string `gorm:"size:1024;not null;default:''"`
	Prod          string `gorm:"size:255;not null;default:''"`
	Algorithm     string `gorm:"size:255;not null;default:''"`
	AlgoParams    string `gorm:"size:255"`
	Delay         int32  `gorm:"not null;default:0"`
	Severity      int16  `gorm:"type:tinyint(1);not null;comment:1:Emergency 2:Warning 3:Notice"`
	Disabled      bool   `gorm:"type:tinyint(1);not null;comment:0:enabled 1:disabled"`

	// Rule definition and evaluation cadence.
	PromForDuration  int32  `gorm:"not null;comment:prometheus for, unit:s"`
	RuleConfig       string `gorm:"type:text;not null;comment:rule_config"`
	PromQL           string `gorm:"type:text;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`

	// Enable window: start/end time of day, days of week, and scope flag.
	EnableStime      string `gorm:"size:255;not null;default:'00:00'"`
	EnableEtime      string `gorm:"size:255;not null;default:'23:59'"`
	EnableDaysOfWeek string `gorm:"size:255;not null;default:'';comment:split by space: 0 1 2 3 4 5 6"`
	EnableInBg       bool   `gorm:"type:tinyint(1);not null;default:0;comment:1: only this bg 0: global"`

	// Notification settings.
	NotifyRecovered  bool   `gorm:"type:tinyint(1);not null;comment:whether notify when recovery"`
	NotifyChannels   string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`
	NotifyGroups     string `gorm:"size:255;not null;default:'';comment:split by space: 233 43"`
	NotifyRepeatStep int32  `gorm:"not null;default:0;comment:unit: min"`
	NotifyMaxNumber  int32  `gorm:"not null;default:0"`
	RecoverDuration  int32  `gorm:"not null;default:0;comment:unit: s"`
	Callbacks        string `gorm:"size:4096;not null;default:'';comment:split by space: http://a.com/api/x http://a.com/api/y"`
	RunbookURL       string `gorm:"size:4096"`

	// Extra metadata attached to the rule.
	AppendTags  string `gorm:"size:255;not null;default:'';comment:split by space: service=n9e mod=api"`
	Annotations string `gorm:"type:text;not null;comment:annotations"`
	ExtraConfig string `gorm:"type:text;not null;comment:extra_config"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0;index"`
	UpdateBy string `gorm:"size:64;not null;default:''"`

	TimeZone          string `gorm:"size:64;not null;default:''"`
	DatasourceQueries string `gorm:"type:text"`
}

// TableName reports the database table backing InitAlertRule.
func (InitAlertRule) TableName() string {
	return "alert_rule"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitAlertRule) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresAlertRule is the PostgreSQL variant of the "alert_rule" table
// model. It differs from InitAlertRule in storing Severity and the flag
// columns as smallint (int16) and in declaring no TableOptions.
type InitPostgresAlertRule struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID       uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Cate          string `gorm:"size:128;not null"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Cluster       string `gorm:"size:128;not null"`
	Name          string `gorm:"size:255;not null"`
	Note          string `gorm:"size:1024;not null;default:''"`
	Prod          string `gorm:"size:255;not null;default:''"`
	Algorithm     string `gorm:"size:255;not null;default:''"`
	AlgoParams    string `gorm:"size:255"`
	Delay         int32  `gorm:"not null;default:0"`
	Severity      int16  `gorm:"type:smallint;not null;comment:1:Emergency 2:Warning 3:Notice"`
	Disabled      int16  `gorm:"type:smallint;not null;comment:0:enabled 1:disabled"`

	// Rule definition and evaluation cadence.
	PromForDuration  int32  `gorm:"not null;comment:prometheus for, unit:s"`
	RuleConfig       string `gorm:"type:text;not null;comment:rule_config"`
	PromQL           string `gorm:"type:text;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`

	// Enable window: start/end time of day, days of week, and scope flag.
	EnableStime      string `gorm:"size:255;not null;default:'00:00'"`
	EnableEtime      string `gorm:"size:255;not null;default:'23:59'"`
	EnableDaysOfWeek string `gorm:"size:255;not null;default:'';comment:split by space: 0 1 2 3 4 5 6"`
	EnableInBg       int16  `gorm:"type:smallint;not null;default:0;comment:1: only this bg 0: global"`

	// Notification settings.
	NotifyRecovered  int16  `gorm:"type:smallint;not null;comment:whether notify when recovery"`
	NotifyChannels   string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`
	NotifyGroups     string `gorm:"size:255;not null;default:'';comment:split by space: 233 43"`
	NotifyRepeatStep int32  `gorm:"not null;default:0;comment:unit: min"`
	NotifyMaxNumber  int32  `gorm:"not null;default:0"`
	RecoverDuration  int32  `gorm:"not null;default:0;comment:unit: s"`
	Callbacks        string `gorm:"size:4096;not null;default:'';comment:split by space: http://a.com/api/x http://a.com/api/y"`
	RunbookURL       string `gorm:"size:4096"`

	// Extra metadata attached to the rule.
	AppendTags  string `gorm:"size:255;not null;default:'';comment:split by space: service=n9e mod=api"`
	Annotations string `gorm:"type:text;not null;comment:annotations"`
	ExtraConfig string `gorm:"type:text;not null;comment:extra_config"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0;index"`
	UpdateBy string `gorm:"size:64;not null;default:''"`

	TimeZone          string `gorm:"size:64;not null;default:''"`
	DatasourceQueries string `gorm:"type:text"`
}

// TableName reports the database table backing InitPostgresAlertRule.
func (InitPostgresAlertRule) TableName() string {
	return "alert_rule"
}
// InitAlertMute is the GORM model for the "alert_mute" table.
// GroupID and CreateAt are indexed; flag columns are stored as tinyint(1).
type InitAlertMute struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID       uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Prod          string `gorm:"size:255;not null;default:''"`
	Note          string `gorm:"size:1024;not null;default:''"`
	Cate          string `gorm:"size:128;not null"`
	Cluster       string `gorm:"size:128;not null"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Tags          string `gorm:"size:4096;default:'[]';comment:json,map,tagkey->regexp|value"`
	Cause         string `gorm:"size:255;not null;default:''"`

	// Mute window; the columns are explicitly named btime / etime.
	BTime int64 `gorm:"column:btime;not null;default:0;comment:begin time"`
	ETime int64 `gorm:"column:etime;not null;default:0;comment:end time"`

	Disabled      bool   `gorm:"type:tinyint(1);not null;default:0;comment:0:enabled 1:disabled"`
	MuteTimeType  bool   `gorm:"type:tinyint(1);not null;default:0"`
	PeriodicMutes string `gorm:"size:4096;not null;default:''"`
	Severities    string `gorm:"size:32;not null;default:''"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0;index"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitAlertMute.
func (InitAlertMute) TableName() string {
	return "alert_mute"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitAlertMute) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresAlertMute is the PostgreSQL variant of the "alert_mute" table
// model. It differs from InitAlertMute in storing Disabled and MuteTimeType
// as smallint (int16) and in declaring no TableOptions.
type InitPostgresAlertMute struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID       uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Prod          string `gorm:"size:255;not null;default:''"`
	Note          string `gorm:"size:1024;not null;default:''"`
	Cate          string `gorm:"size:128;not null"`
	Cluster       string `gorm:"size:128;not null"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Tags          string `gorm:"size:4096;default:'[]';comment:json,map,tagkey->regexp|value"`
	Cause         string `gorm:"size:255;not null;default:''"`

	// Mute window; the columns are explicitly named btime / etime.
	BTime int64 `gorm:"column:btime;not null;default:0;comment:begin time"`
	ETime int64 `gorm:"column:etime;not null;default:0;comment:end time"`

	Disabled      int16  `gorm:"type:smallint;not null;default:0;comment:0:enabled 1:disabled"`
	MuteTimeType  int16  `gorm:"type:smallint;not null;default:0"`
	PeriodicMutes string `gorm:"size:4096;not null;default:''"`
	Severities    string `gorm:"size:32;not null;default:''"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0;index"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitPostgresAlertMute.
func (InitPostgresAlertMute) TableName() string {
	return "alert_mute"
}
// InitAlertSubscribe is the GORM model for the "alert_subscribe" table.
// GroupID and UpdateAt are indexed.
type InitAlertSubscribe struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	Name          string `gorm:"size:255;not null;default:''"`
	Disabled      bool   `gorm:"type:tinyint(1);not null;default:0;comment:0:enabled 1:disabled"`
	GroupID       uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Prod          string `gorm:"size:255;not null;default:''"`
	Cate          string `gorm:"size:128;not null"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Cluster       string `gorm:"size:128;not null"`
	RuleID        int64  `gorm:"not null;default:0"`
	Severities    string `gorm:"size:32;not null;default:''"`
	Tags          string `gorm:"size:4096;not null;default:'';comment:json,map,tagkey->regexp|value"`

	// Redefinition switches and their replacement values.
	RedefineSeverity int16  `gorm:"type:tinyint(1);default:0;comment:is redefine severity?"`
	NewSeverity      int16  `gorm:"type:tinyint(1);not null;comment:0:Emergency 1:Warning 2:Notice"`
	RedefineChannels int16  `gorm:"type:tinyint(1);default:0;comment:is redefine channels?"`
	NewChannels      string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`

	UserGroupIDs     string `gorm:"size:250;not null;comment:split by space 1 34 5, notify cc to user_group_ids"`
	BusiGroups       string `gorm:"size:4096;not null;default:'[]'"`
	Note             string `gorm:"size:1024;default:'';comment:note"`
	RuleIDs          string `gorm:"size:1024;default:'';comment:rule_ids"`
	Webhooks         string `gorm:"type:text;not null"`
	ExtraConfig      string `gorm:"type:text;not null;comment:extra_config"`
	RedefineWebhooks bool   `gorm:"type:tinyint(1);default:0"`
	ForDuration      int64  `gorm:"not null;default:0"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0;index"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitAlertSubscribe.
func (InitAlertSubscribe) TableName() string {
	return "alert_subscribe"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitAlertSubscribe) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresAlertSubscribe is the PostgreSQL variant of the
// "alert_subscribe" table model. It differs from InitAlertSubscribe in
// storing all flag and severity columns as smallint (int16) and in declaring
// no TableOptions.
type InitPostgresAlertSubscribe struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	Name          string `gorm:"size:255;not null;default:''"`
	Disabled      int16  `gorm:"type:smallint;not null;default:0;comment:0:enabled 1:disabled"`
	GroupID       uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Prod          string `gorm:"size:255;not null;default:''"`
	Cate          string `gorm:"size:128;not null"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Cluster       string `gorm:"size:128;not null"`
	RuleID        int64  `gorm:"not null;default:0"`
	Severities    string `gorm:"size:32;not null;default:''"`
	Tags          string `gorm:"size:4096;not null;default:'';comment:json,map,tagkey->regexp|value"`

	// Redefinition switches and their replacement values.
	RedefineSeverity int16  `gorm:"type:smallint;default:0;comment:is redefine severity?"`
	NewSeverity      int16  `gorm:"type:smallint;not null;comment:0:Emergency 1:Warning 2:Notice"`
	RedefineChannels int16  `gorm:"type:smallint;default:0;comment:is redefine channels?"`
	NewChannels      string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`

	UserGroupIDs     string `gorm:"size:250;not null;comment:split by space 1 34 5, notify cc to user_group_ids"`
	BusiGroups       string `gorm:"size:4096;not null;default:'[]'"`
	Note             string `gorm:"size:1024;default:'';comment:note"`
	RuleIDs          string `gorm:"size:1024;default:'';comment:rule_ids"`
	Webhooks         string `gorm:"type:text;not null"`
	ExtraConfig      string `gorm:"type:text;not null;comment:extra_config"`
	RedefineWebhooks int16  `gorm:"type:smallint;default:0"`
	ForDuration      int64  `gorm:"not null;default:0"`

	// Creation / modification bookkeeping.
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0;index"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName reports the database table backing InitPostgresAlertSubscribe.
func (InitPostgresAlertSubscribe) TableName() string {
	return "alert_subscribe"
}
// InitTarget is the GORM model for the "target" table.
// Ident carries a unique index and GroupID a plain index.
type InitTarget struct {
	ID      uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID uint64 `gorm:"not null;default:0;comment:busi group id;index"`
	Ident   string `gorm:"size:191;not null;comment:target id;uniqueIndex"`
	Note    string `gorm:"size:255;not null;default:'';comment:append to alert event as field"`

	// Tag columns; both share the same column comment.
	Tags     string `gorm:"size:512;not null;default:'';comment:append to series data as tags, split by space, append external space at suffix"`
	HostTags string `gorm:"size:512;not null;default:'';comment:append to series data as tags, split by space, append external space at suffix"`

	// Nullable descriptive columns (no "not null" in their tags).
	HostIP       string `gorm:"size:15;default:'';comment:IPv4 string"`
	AgentVersion string `gorm:"size:255;default:'';comment:agent version"`
	EngineName   string `gorm:"size:255;default:'';comment:engine_name"`
	OS           string `gorm:"size:31;default:'';comment:os type"`

	UpdateAt int64 `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitTarget.
func (InitTarget) TableName() string {
	return "target"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitTarget) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitMetricView is the GORM model for the "metric_view" table.
// CreateBy holds a user id (see the column comment) and is indexed.
type InitMetricView struct {
	ID       uint64 `gorm:"primaryKey;autoIncrement"`
	Name     string `gorm:"size:191;not null;default:''"`
	Cate     bool   `gorm:"type:tinyint(1);not null;comment:0: preset 1: custom"`
	Configs  string `gorm:"size:8192;not null;default:''"`
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy uint64 `gorm:"not null;default:0;comment:user id;index"`
	UpdateAt int64  `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitMetricView.
func (InitMetricView) TableName() string {
	return "metric_view"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitMetricView) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresMetricView is the PostgreSQL variant of the "metric_view"
// table model. It differs from InitMetricView in storing Cate as smallint
// (int16) and in declaring no TableOptions.
type InitPostgresMetricView struct {
	ID       uint64 `gorm:"primaryKey;autoIncrement"`
	Name     string `gorm:"size:191;not null;default:''"`
	Cate     int16  `gorm:"type:smallint;not null;comment:0: preset 1: custom"`
	Configs  string `gorm:"size:8192;not null;default:''"`
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy uint64 `gorm:"not null;default:0;comment:user id;index"`
	UpdateAt int64  `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitPostgresMetricView.
func (InitPostgresMetricView) TableName() string {
	return "metric_view"
}
// InitRecordingRule is the GORM model for the "recording_rule" table.
// GroupID and UpdateAt are indexed.
type InitRecordingRule struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID       uint64 `gorm:"not null;default:0;comment:group_id;index"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Cluster       string `gorm:"size:128;not null"`
	Name          string `gorm:"size:255;not null;comment:new metric name"`
	Note          string `gorm:"size:255;not null;comment:rule note"`
	Disabled      bool   `gorm:"type:tinyint(1);not null;default:0;comment:0:enabled 1:disabled"`

	// Query definition and schedule.
	PromQL           string `gorm:"size:8192;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`
	CronPattern      string `gorm:"size:255;default:'';comment:cron pattern"`
	AppendTags       string `gorm:"size:255;default:'';comment:split by space: service=n9e mod=api"`
	QueryConfigs     string `gorm:"type:text;not null;comment:query configs"`

	// Creation / modification bookkeeping; unlike most sibling tables these
	// columns carry no "not null" constraint.
	CreateAt int64  `gorm:"default:0"`
	CreateBy string `gorm:"size:64;default:''"`
	UpdateAt int64  `gorm:"default:0;index"`
	UpdateBy string `gorm:"size:64;default:''"`

	DatasourceQueries string `gorm:"type:text"`
}

// TableName reports the database table backing InitRecordingRule.
func (InitRecordingRule) TableName() string {
	return "recording_rule"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitRecordingRule) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresRecordingRule is the PostgreSQL variant of the
// "recording_rule" table model. It differs from InitRecordingRule in storing
// Disabled as smallint (int16) and in declaring no TableOptions.
type InitPostgresRecordingRule struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID       uint64 `gorm:"not null;default:0;comment:group_id;index"`
	DatasourceIDs string `gorm:"size:255;not null;default:'';comment:datasource ids"`
	Cluster       string `gorm:"size:128;not null"`
	Name          string `gorm:"size:255;not null;comment:new metric name"`
	Note          string `gorm:"size:255;not null;comment:rule note"`
	Disabled      int16  `gorm:"type:smallint;not null;default:0;comment:0:enabled 1:disabled"`

	// Query definition and schedule.
	PromQL           string `gorm:"size:8192;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`
	CronPattern      string `gorm:"size:255;default:'';comment:cron pattern"`
	AppendTags       string `gorm:"size:255;default:'';comment:split by space: service=n9e mod=api"`
	QueryConfigs     string `gorm:"type:text;not null;comment:query configs"`

	// Creation / modification bookkeeping; these columns carry no
	// "not null" constraint.
	CreateAt int64  `gorm:"default:0"`
	CreateBy string `gorm:"size:64;default:''"`
	UpdateAt int64  `gorm:"default:0;index"`
	UpdateBy string `gorm:"size:64;default:''"`

	DatasourceQueries string `gorm:"type:text"`
}

// TableName reports the database table backing InitPostgresRecordingRule.
func (InitPostgresRecordingRule) TableName() string {
	return "recording_rule"
}
// InitAlertAggrView is the GORM model for the "alert_aggr_view" table.
// CreateBy holds a user id (see the column comment) and is covered by the
// index named create_by.
type InitAlertAggrView struct {
	ID       uint64 `gorm:"primaryKey;autoIncrement"`
	Name     string `gorm:"size:191;not null;default:''"`
	Rule     string `gorm:"size:2048;not null;default:''"`
	Cate     bool   `gorm:"type:tinyint(1);not null;comment:0: preset 1: custom"`
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy int64  `gorm:"not null;default:0;comment:user id;index:create_by"`
	UpdateAt int64  `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitAlertAggrView.
func (InitAlertAggrView) TableName() string {
	return "alert_aggr_view"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitAlertAggrView) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresAlertAggrView is the PostgreSQL variant of the
// "alert_aggr_view" table model. It differs from InitAlertAggrView in
// storing Cate as smallint (int16) and in declaring no TableOptions.
type InitPostgresAlertAggrView struct {
	ID       uint64 `gorm:"primaryKey;autoIncrement"`
	Name     string `gorm:"size:191;not null;default:''"`
	Rule     string `gorm:"size:2048;not null;default:''"`
	Cate     int16  `gorm:"type:smallint;not null;comment:0: preset 1: custom"`
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy int64  `gorm:"not null;default:0;comment:user id;index:create_by"`
	UpdateAt int64  `gorm:"not null;default:0"`
}

// TableName reports the database table backing InitPostgresAlertAggrView.
func (InitPostgresAlertAggrView) TableName() string {
	return "alert_aggr_view"
}
// InitAlertCurEvent is the GORM model for the "alert_cur_event" table.
// The primary key is not auto-incremented; per its column comment it reuses
// the id from alert_his_event.
type InitAlertCurEvent struct {
	ID           uint64 `gorm:"primaryKey;NOT NULL;COMMENT:use alert_his_event.id"`
	Cate         string `gorm:"size:128;not null"`
	DatasourceID int64  `gorm:"not null;default:0;comment:datasource id"`
	Cluster      string `gorm:"size:128;not null"`
	GroupID      uint64 `gorm:"not null;comment:busi group id of rule;index"`
	GroupName    string `gorm:"size:255;not null;default:'';comment:busi group name"`
	Hash         string `gorm:"size:64;not null;comment:rule_id + vector_pk;index"`

	// Snapshot of the rule that produced the event.
	RuleID           uint64 `gorm:"not null;index"`
	RuleName         string `gorm:"size:255;not null"`
	RuleNote         string `gorm:"size:2048;not null;default:'alert rule note'"`
	RuleProd         string `gorm:"size:255;not null;default:''"`
	RuleAlgo         string `gorm:"size:255;not null;default:''"`
	Severity         int16  `gorm:"type:tinyint(1);not null;comment:0:Emergency 1:Warning 2:Notice"`
	PromForDuration  int32  `gorm:"not null;comment:prometheus for, unit:s"`
	PromQL           string `gorm:"size:8192;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`
	Callbacks        string `gorm:"size:2048;not null;default:'';comment:split by space: http://a.com/api/x http://a.com/api/y"`
	RunbookURL       string `gorm:"size:255"`

	// Notification state.
	NotifyRecovered  bool   `gorm:"type:tinyint(1);not null;comment:whether notify when recovery"`
	NotifyChannels   string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`
	NotifyGroups     string `gorm:"size:255;not null;default:'';comment:split by space: 233 43"`
	NotifyRepeatNext int64  `gorm:"not null;default:0;comment:next timestamp to notify, get repeat settings from rule;index"`
	NotifyCurNumber  int32  `gorm:"not null;default:0"`

	TargetIdent string `gorm:"size:191;not null;default:'';comment:target ident, also in tags"`
	TargetNote  string `gorm:"size:191;not null;default:'';comment:target note"`

	// Trigger data. FirstTriggerTime intentionally has no gorm tag.
	FirstTriggerTime int64
	TriggerTime      int64  `gorm:"not null;index"`
	TriggerValue     string `gorm:"type:text;not null"`
	Annotations      string `gorm:"type:text;not null;comment:annotations"`
	// RuleConfig's column comment used to read "annotations", copied from
	// the field above; it now matches the alert_rule.rule_config comment.
	RuleConfig   string `gorm:"type:text;not null;comment:rule_config"`
	Tags         string `gorm:"size:1024;not null;default:'';comment:merge data_tags rule_tags, split by ,,"`
	OriginalTags string `gorm:"type:text;comment:labels key=val,,k2=v2"`
}

// TableName reports the database table backing InitAlertCurEvent.
func (InitAlertCurEvent) TableName() string {
	return "alert_cur_event"
}

// TableOptions returns the table options string: InnoDB engine with the
// utf8mb4 default charset.
func (InitAlertCurEvent) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitPostgresAlertCurEvent is the PostgreSQL variant of the
// "alert_cur_event" table model. Severity and NotifyRecovered are stored as
// smallint (int16) and no TableOptions are declared. The primary key is not
// auto-incremented; per its column comment it reuses the id from
// alert_his_event.
type InitPostgresAlertCurEvent struct {
	ID           uint64 `gorm:"primaryKey;NOT NULL;COMMENT:use alert_his_event.id"`
	Cate         string `gorm:"size:128;not null"`
	DatasourceID int64  `gorm:"not null;default:0;comment:datasource id"`
	Cluster      string `gorm:"size:128;not null"`
	GroupID      uint64 `gorm:"not null;comment:busi group id of rule;index"`
	GroupName    string `gorm:"size:255;not null;default:'';comment:busi group name"`
	Hash         string `gorm:"size:64;not null;comment:rule_id + vector_pk;index"`

	// Snapshot of the rule that produced the event.
	RuleID           uint64 `gorm:"not null;index"`
	RuleName         string `gorm:"size:255;not null"`
	RuleNote         string `gorm:"size:2048;not null;default:'alert rule note'"`
	RuleProd         string `gorm:"size:255;not null;default:''"`
	RuleAlgo         string `gorm:"size:255;not null;default:''"`
	Severity         int16  `gorm:"type:smallint;not null;comment:0:Emergency 1:Warning 2:Notice"`
	PromForDuration  int32  `gorm:"not null;comment:prometheus for, unit:s"`
	PromQL           string `gorm:"size:8192;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`
	Callbacks        string `gorm:"size:2048;not null;default:'';comment:split by space: http://a.com/api/x http://a.com/api/y"`
	RunbookURL       string `gorm:"size:255"`

	// Notification state.
	NotifyRecovered  int16  `gorm:"type:smallint;not null;comment:whether notify when recovery"`
	NotifyChannels   string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`
	NotifyGroups     string `gorm:"size:255;not null;default:'';comment:split by space: 233 43"`
	NotifyRepeatNext int64  `gorm:"not null;default:0;comment:next timestamp to notify, get repeat settings from rule;index"`
	NotifyCurNumber  int32  `gorm:"not null;default:0"`

	TargetIdent string `gorm:"size:191;not null;default:'';comment:target ident, also in tags"`
	TargetNote  string `gorm:"size:191;not null;default:'';comment:target note"`

	// Trigger data. FirstTriggerTime intentionally has no gorm tag.
	FirstTriggerTime int64
	TriggerTime      int64  `gorm:"not null;index"`
	TriggerValue     string `gorm:"type:text;not null"`
	Annotations      string `gorm:"type:text;not null;comment:annotations"`
	// RuleConfig's column comment used to read "annotations", copied from
	// the field above; it now matches the alert_rule.rule_config comment.
	RuleConfig   string `gorm:"type:text;not null;comment:rule_config"`
	Tags         string `gorm:"size:1024;not null;default:'';comment:merge data_tags rule_tags, split by ,,"`
	OriginalTags string `gorm:"type:text;comment:labels key=val,,k2=v2"`
}

// TableName reports the database table backing InitPostgresAlertCurEvent.
func (InitPostgresAlertCurEvent) TableName() string {
	return "alert_cur_event"
}
type InitAlertHisEvent struct {
ID uint64 `gorm:"primaryKey;autoIncrement"`
IsRecovered bool `gorm:"type:tinyint(1);not null"`
Cate string `gorm:"size:128;not null"`
DatasourceID int64 `gorm:"not null;default:0;comment:datasource id"`
Cluster string `gorm:"size:128;not null"`
GroupID int64 `gorm:"not null;comment:busi group id of rule;index"`
GroupName string `gorm:"size:255;not null;default:'';comment:busi group name"`
Hash string `gorm:"size:64;not null;comment:rule_id + vector_pk;index"`
RuleID int64 `gorm:"not null;index"`
RuleName string `gorm:"size:255;not null"`
RuleNote string `gorm:"size:2048;not null;default:'alert rule note'"`
RuleProd string `gorm:"size:255;not null;default:''"`
RuleAlgo string `gorm:"size:255;not null;default:''"`
Severity int16 `gorm:"type:tinyint(1);not null;comment:0:Emergency 1:Warning 2:Notice"`
PromForDuration int32 `gorm:"not null;comment:prometheus for, unit:s"`
PromQL string `gorm:"size:8192;not null;comment:promql"`
PromEvalInterval int32 `gorm:"not null;comment:evaluate interval"`
Callbacks string `gorm:"size:2048;not null;default:'';comment:split by space: http://a.com/api/x http://a.com/api/y"`
RunbookURL string `gorm:"size:255"`
NotifyRecovered bool `gorm:"type:tinyint(1);not null;comment:whether notify when recovery"`
NotifyChannels string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`
NotifyGroups string `gorm:"size:255;not null;default:'';comment:split by space: 233 43"`
NotifyCurNumber int32 `gorm:"not null;default:0"`
TargetIdent string `gorm:"size:191;not null;default:'';comment:target ident, also in tags"`
TargetNote string `gorm:"size:191;not null;default:'';comment:target note"`
FirstTriggerTime int64
TriggerTime int64 `gorm:"not null;index"`
TriggerValue string `gorm:"type:text;not null"`
RecoverTime int64 `gorm:"not null;default:0"`
LastEvalTime int64 `gorm:"not null;default:0;comment:for time filter;index"`
Tags string `gorm:"size:1024;not null;default:'';comment:merge data_tags rule_tags, split by ,,"`
OriginalTags string `gorm:"type:text;comment:labels key=val,,k2=v2"`
Annotations string `gorm:"type:text;not null;comment:annotations"`
RuleConfig string `gorm:"type:text;not null;comment:annotations"`
}
// TableName returns the name of the table backing InitAlertHisEvent.
func (InitAlertHisEvent) TableName() string {
	const table = "alert_his_event"
	return table
}
// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitAlertHisEvent.
func (InitAlertHisEvent) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitPostgresAlertHisEvent is the Postgres flavour of the alert_his_event
// schema model. Flag-like columns (IsRecovered, Severity, NotifyRecovered) are
// declared as smallint here.
type InitPostgresAlertHisEvent struct {
	ID               uint64 `gorm:"primaryKey;autoIncrement"`
	IsRecovered      int16  `gorm:"type:smallint;not null"`
	Cate             string `gorm:"size:128;not null"`
	DatasourceID     int64  `gorm:"not null;default:0;comment:datasource id"`
	Cluster          string `gorm:"size:128;not null"`
	GroupID          int64  `gorm:"not null;comment:busi group id of rule;index"`
	GroupName        string `gorm:"size:255;not null;default:'';comment:busi group name"`
	Hash             string `gorm:"size:64;not null;comment:rule_id + vector_pk;index"`
	RuleID           int64  `gorm:"not null;index"`
	RuleName         string `gorm:"size:255;not null"`
	RuleNote         string `gorm:"size:2048;not null;default:'alert rule note'"`
	RuleProd         string `gorm:"size:255;not null;default:''"`
	RuleAlgo         string `gorm:"size:255;not null;default:''"`
	Severity         int16  `gorm:"type:smallint;not null;comment:0:Emergency 1:Warning 2:Notice"`
	PromForDuration  int32  `gorm:"not null;comment:prometheus for, unit:s"`
	PromQL           string `gorm:"size:8192;not null;comment:promql"`
	PromEvalInterval int32  `gorm:"not null;comment:evaluate interval"`
	Callbacks        string `gorm:"size:2048;not null;default:'';comment:split by space: http://a.com/api/x http://a.com/api/y"`
	RunbookURL       string `gorm:"size:255"`
	NotifyRecovered  int16  `gorm:"type:smallint;not null;comment:whether notify when recovery"`
	NotifyChannels   string `gorm:"size:255;not null;default:'';comment:split by space: sms voice email dingtalk wecom"`
	NotifyGroups     string `gorm:"size:255;not null;default:'';comment:split by space: 233 43"`
	NotifyCurNumber  int32  `gorm:"not null;default:0"`
	TargetIdent      string `gorm:"size:191;not null;default:'';comment:target ident, also in tags"`
	TargetNote       string `gorm:"size:191;not null;default:'';comment:target note"`
	FirstTriggerTime int64
	TriggerTime      int64  `gorm:"not null;index"`
	TriggerValue     string `gorm:"type:text;not null"`
	RecoverTime      int64  `gorm:"not null;default:0"`
	LastEvalTime     int64  `gorm:"not null;default:0;comment:for time filter;index"`
	Tags             string `gorm:"size:1024;not null;default:'';comment:merge data_tags rule_tags, split by ,,"`
	OriginalTags     string `gorm:"type:text;comment:labels key=val,,k2=v2"`
	Annotations      string `gorm:"type:text;not null;comment:annotations"`
	// NOTE(review): the column comment below says "annotations"; it looks
	// copy-pasted from the Annotations field — confirm before changing.
	RuleConfig string `gorm:"type:text;not null;comment:annotations"`
}

// TableName returns the name of the table backing InitPostgresAlertHisEvent.
func (InitPostgresAlertHisEvent) TableName() string {
	const table = "alert_his_event"
	return table
}
// InitBoardBusiGroup is the schema model for the board_busigroup table, which
// pairs a board id with a busi group id.
//
// Both columns are part of the primary key. The primaryKey option has to live
// inside the gorm:"..." value: when it is written in front of the gorm key the
// struct tag key becomes "primaryKey;gorm", reflect cannot find a "gorm" tag,
// and every option (primary key, not null, default, comment) is ignored.
type InitBoardBusiGroup struct {
	BusiGroupID int64 `gorm:"primaryKey;not null;default:0;comment:busi group id"`
	BoardID     int64 `gorm:"primaryKey;not null;default:0;comment:board id"`
}

// TableName returns the name of the table backing InitBoardBusiGroup.
func (InitBoardBusiGroup) TableName() string {
	return "board_busigroup"
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitBoardBusiGroup.
func (InitBoardBusiGroup) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitBuiltinComponent is the schema model for the builtin_components table.
type InitBuiltinComponent struct {
	ID        int64  `gorm:"primaryKey;not null;autoIncrement;comment:unique identifier"`
	Ident     string `gorm:"size:191;not null;comment:identifier of component;index"`
	Logo      string `gorm:"size:191;not null;comment:logo of component"`
	Readme    string `gorm:"type:text;not null;comment:readme of component"`
	CreatedAt int64  `gorm:"not null;default:0;comment:create time"`
	CreatedBy string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdatedAt int64  `gorm:"not null;default:0;comment:update time"`
	UpdatedBy string `gorm:"size:191;not null;default:'';comment:updater"`
}

// TableName returns the name of the table backing InitBuiltinComponent.
func (InitBuiltinComponent) TableName() string {
	const table = "builtin_components"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitBuiltinComponent.
func (InitBuiltinComponent) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitpostgresBuiltinPayload is the Postgres flavour of the builtin_payloads
// schema model; its Content column is declared as TEXT.
type InitpostgresBuiltinPayload struct {
	ID          uint64 `gorm:"primaryKey;autoIncrement;comment:unique identifier"`
	ComponentID uint64 `gorm:"not null;default:0;comment:component_id"`
	UUID        uint64 `gorm:"not null;comment:uuid of payload;index"`
	Type        string `gorm:"size:191;not null;comment:type of payload;index"`
	Component   string `gorm:"size:191;not null;comment:component of payload;index"`
	Cate        string `gorm:"size:191;not null;comment:category of payload;index"`
	Name        string `gorm:"size:191;not null;comment:name of payload;index"`
	Tags        string `gorm:"size:191;not null;default:'';comment:tags of payload"`
	Content     string `gorm:"type:TEXT;not null;comment:content of payload"`
	CreatedAt   int64  `gorm:"not null;default:0;comment:create time"`
	CreatedBy   string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdatedAt   int64  `gorm:"not null;default:0;comment:update time"`
	UpdatedBy   string `gorm:"size:191;not null;default:'';comment:updater"`
}

// TableName returns the name of the table backing InitpostgresBuiltinPayload.
func (InitpostgresBuiltinPayload) TableName() string {
	const table = "builtin_payloads"
	return table
}
// InitBuiltinPayload is the schema model for the builtin_payloads table; its
// Content column is declared as longtext.
type InitBuiltinPayload struct {
	ID          uint64 `gorm:"primaryKey;autoIncrement;comment:unique identifier"`
	ComponentID uint64 `gorm:"not null;default:0;comment:component_id"`
	UUID        uint64 `gorm:"not null;comment:uuid of payload;index"`
	Type        string `gorm:"size:191;not null;comment:type of payload;index"`
	Component   string `gorm:"size:191;not null;comment:component of payload;index"`
	Cate        string `gorm:"size:191;not null;comment:category of payload;index"`
	Name        string `gorm:"size:191;not null;comment:name of payload;index"`
	Tags        string `gorm:"size:191;not null;default:'';comment:tags of payload"`
	Content     string `gorm:"type:longtext;not null;comment:content of payload"`
	CreatedAt   int64  `gorm:"not null;default:0;comment:create time"`
	CreatedBy   string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdatedAt   int64  `gorm:"not null;default:0;comment:update time"`
	UpdatedBy   string `gorm:"size:191;not null;default:'';comment:updater"`
}

// TableName returns the name of the table backing InitBuiltinPayload.
func (InitBuiltinPayload) TableName() string {
	const table = "builtin_payloads"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitBuiltinPayload.
func (InitBuiltinPayload) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitNotificationRecord is the schema model for the notification_record
// table. EventID is indexed under the name idx_evt.
type InitNotificationRecord struct {
	ID        uint64 `gorm:"primaryKey;autoIncrement"`
	EventID   uint64 `gorm:"not null;index:idx_evt"`
	SubID     uint64 `gorm:"not null"`
	Channel   string `gorm:"size:255;not null"`
	Status    int32  `gorm:"not null;default:0"`
	Target    string `gorm:"size:1024;not null"`
	Details   string `gorm:"size:2048"`
	CreatedAt int64  `gorm:"not null"`
}

// TableName returns the name of the table backing InitNotificationRecord.
func (InitNotificationRecord) TableName() string {
	const table = "notification_record"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitNotificationRecord.
func (InitNotificationRecord) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskTpl is the schema model for the task_tpl table.
type InitTaskTpl struct {
	ID        uint64 `gorm:"primaryKey;autoIncrement"`
	GroupID   int64  `gorm:"not null;comment:busi group id;index"`
	Title     string `gorm:"size:255;not null;default:''"`
	Account   string `gorm:"size:64;not null"`
	Batch     uint   `gorm:"not null;default:0"`
	Tolerance uint   `gorm:"not null;default:0"`
	Timeout   uint   `gorm:"not null;default:0"`
	Pause     string `gorm:"size:255;not null;default:''"`
	Script    string `gorm:"type:text;not null"`
	Args      string `gorm:"size:512;not null;default:''"`
	Tags      string `gorm:"size:255;not null;default:'';comment:split by space"`
	CreateAt  int64  `gorm:"not null;default:0"`
	CreateBy  string `gorm:"size:64;not null;default:''"`
	UpdateAt  int64  `gorm:"not null;default:0"`
	UpdateBy  string `gorm:"size:64;not null;default:''"`
}

// TableName returns the name of the table backing InitTaskTpl.
func (InitTaskTpl) TableName() string {
	const table = "task_tpl"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskTpl.
func (InitTaskTpl) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskTplHost is the schema model for the task_tpl_host table. ID and
// Host share the composite index idx_id_host; II is the auto-increment
// primary key.
type InitTaskTplHost struct {
	II   uint64 `gorm:"primaryKey;autoIncrement"`
	ID   uint64 `gorm:"not null;comment:task tpl id;index:idx_id_host"`
	Host string `gorm:"size:128;not null;comment:ip or hostname;index:idx_id_host"`
}

// TableName returns the name of the table backing InitTaskTplHost.
func (InitTaskTplHost) TableName() string {
	const table = "task_tpl_host"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskTplHost.
func (InitTaskTplHost) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskRecord is the schema model for the task_record table. GroupID and
// CreateAt share the composite index idx_group_id_create_at. The primary key
// is declared without autoIncrement.
type InitTaskRecord struct {
	ID           uint64 `gorm:"primaryKey"`
	EventID      uint64 `gorm:"not null;default:0;comment:event id;index"`
	GroupID      uint64 `gorm:"not null;comment:busi group id;index:idx_group_id_create_at"`
	IbexAddress  string `gorm:"size:128;not null"`
	IbexAuthUser string `gorm:"size:128;not null;default:''"`
	IbexAuthPass string `gorm:"size:128;not null;default:''"`
	Title        string `gorm:"size:255;not null;default:''"`
	Account      string `gorm:"size:64;not null"`
	Batch        uint   `gorm:"not null;default:0"`
	Tolerance    uint   `gorm:"not null;default:0"`
	Timeout      uint   `gorm:"not null;default:0"`
	Pause        string `gorm:"size:255;not null;default:''"`
	Script       string `gorm:"type:text;not null"`
	Args         string `gorm:"size:512;not null;default:''"`
	CreateAt     int64  `gorm:"not null;default:0;index:idx_group_id_create_at"`
	CreateBy     string `gorm:"size:64;not null;default:'';index"`
}

// TableName returns the name of the table backing InitTaskRecord.
func (InitTaskRecord) TableName() string {
	const table = "task_record"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskRecord.
func (InitTaskRecord) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitAlertingEngine is the schema model for the alerting_engines table.
type InitAlertingEngine struct {
	ID            uint64 `gorm:"primaryKey;autoIncrement"`
	Instance      string `gorm:"size:128;not null;default:'';comment:instance identification, e.g. 10.9.0.9:9090"`
	DatasourceID  int64  `gorm:"not null;default:0;comment:datasource id"`
	EngineCluster string `gorm:"size:128;not null;default:'';comment:n9e-alert cluster"`
	Clock         int64  `gorm:"not null"`
}

// TableName returns the name of the table backing InitAlertingEngine.
func (InitAlertingEngine) TableName() string {
	const table = "alerting_engines"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitAlertingEngine.
func (InitAlertingEngine) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitDatasource is the schema model for the datasource table. Name carries a
// unique index and IsDefault is declared as tinyint(1).
type InitDatasource struct {
	ID             uint64 `gorm:"primaryKey;autoIncrement"`
	Name           string `gorm:"size:191;not null;default:'';uniqueIndex"`
	Description    string `gorm:"size:255;not null;default:''"`
	Category       string `gorm:"size:255;not null;default:''"`
	PluginID       uint   `gorm:"not null;default:0"`
	PluginType     string `gorm:"size:255;not null;default:''"`
	PluginTypeName string `gorm:"size:255;not null;default:''"`
	ClusterName    string `gorm:"size:255;not null;default:''"`
	Settings       string `gorm:"type:text;not null"`
	Status         string `gorm:"size:255;not null;default:''"`
	HTTP           string `gorm:"size:4096;not null;default:''"`
	Auth           string `gorm:"size:8192;not null;default:''"`
	IsDefault      bool   `gorm:"type:tinyint(1);not null;default:0"`
	CreatedAt      int64  `gorm:"not null;default:0"`
	CreatedBy      string `gorm:"size:64;not null;default:''"`
	UpdatedAt      int64  `gorm:"not null;default:0"`
	UpdatedBy      string `gorm:"size:64;not null;default:''"`
}

// TableName returns the name of the table backing InitDatasource.
func (InitDatasource) TableName() string {
	const table = "datasource"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitDatasource.
func (InitDatasource) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitPostgresDatasource is the Postgres flavour of the datasource schema
// model; IsDefault is declared as boolean.
type InitPostgresDatasource struct {
	ID             uint64 `gorm:"primaryKey;autoIncrement"`
	Name           string `gorm:"size:191;not null;default:'';uniqueIndex"`
	Description    string `gorm:"size:255;not null;default:''"`
	Category       string `gorm:"size:255;not null;default:''"`
	PluginID       uint   `gorm:"not null;default:0"`
	PluginType     string `gorm:"size:255;not null;default:''"`
	PluginTypeName string `gorm:"size:255;not null;default:''"`
	ClusterName    string `gorm:"size:255;not null;default:''"`
	Settings       string `gorm:"type:text;not null"`
	Status         string `gorm:"size:255;not null;default:''"`
	HTTP           string `gorm:"size:4096;not null;default:''"`
	Auth           string `gorm:"size:8192;not null;default:''"`
	IsDefault      bool   `gorm:"type:boolean;not null;default:0"`
	CreatedAt      int64  `gorm:"not null;default:0"`
	CreatedBy      string `gorm:"size:64;not null;default:''"`
	UpdatedAt      int64  `gorm:"not null;default:0"`
	UpdatedBy      string `gorm:"size:64;not null;default:''"`
}

// TableName returns the name of the table backing InitPostgresDatasource.
func (InitPostgresDatasource) TableName() string {
	const table = "datasource"
	return table
}
// InitBuiltinCate is the schema model for the builtin_cate table.
type InitBuiltinCate struct {
	ID     uint64 `gorm:"primaryKey;autoIncrement"`
	Name   string `gorm:"size:191;not null"`
	UserID int64  `gorm:"not null;default:0"`
}

// TableName returns the name of the table backing InitBuiltinCate.
func (InitBuiltinCate) TableName() string {
	const table = "builtin_cate"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitBuiltinCate.
func (InitBuiltinCate) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitNotifyTpl is the schema model for the notify_tpl table. Channel carries
// a unique index.
type InitNotifyTpl struct {
	ID       uint64 `gorm:"primaryKey;autoIncrement"`
	Channel  string `gorm:"size:32;not null;uniqueIndex"`
	Name     string `gorm:"size:255;not null"`
	Content  string `gorm:"type:text;not null"`
	CreateAt int64  `gorm:"not null;default:0"`
	CreateBy string `gorm:"size:64;not null;default:''"`
	UpdateAt int64  `gorm:"not null;default:0"`
	UpdateBy string `gorm:"size:64;not null;default:''"`
}

// TableName returns the name of the table backing InitNotifyTpl.
func (InitNotifyTpl) TableName() string {
	const table = "notify_tpl"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitNotifyTpl.
func (InitNotifyTpl) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitSSOConfig is the schema model for the sso_config table. Name carries a
// unique index.
type InitSSOConfig struct {
	ID       uint64 `gorm:"primaryKey;autoIncrement"`
	Name     string `gorm:"size:191;not null;uniqueIndex"`
	Content  string `gorm:"type:text;not null"`
	UpdateAt int64  `gorm:"not null;default:0"`
}

// TableName returns the name of the table backing InitSSOConfig.
func (InitSSOConfig) TableName() string {
	const table = "sso_config"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitSSOConfig.
func (InitSSOConfig) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitESIndexPattern is the schema model for the es_index_pattern table.
// DatasourceID and Name together form the unique index idx_datasource_name.
type InitESIndexPattern struct {
	ID                     uint64 `gorm:"primaryKey;autoIncrement"`
	DatasourceID           int64  `gorm:"not null;default:0;comment:datasource id;uniqueIndex:idx_datasource_name"`
	Name                   string `gorm:"size:191;not null;uniqueIndex:idx_datasource_name"`
	TimeField              string `gorm:"size:128;not null;default:'@timestamp'"`
	AllowHideSystemIndices bool   `gorm:"type:tinyint(1);not null;default:0"`
	FieldsFormat           string `gorm:"size:4096;not null;default:''"`
	CreateAt               int64  `gorm:"default:0"`
	CreateBy               string `gorm:"size:64;default:''"`
	UpdateAt               int64  `gorm:"default:0"`
	UpdateBy               string `gorm:"size:64;default:''"`
}

// TableName returns the name of the table backing InitESIndexPattern.
func (InitESIndexPattern) TableName() string {
	const table = "es_index_pattern"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitESIndexPattern.
func (InitESIndexPattern) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitSqliteESIndexPattern is the SQLite flavour of the es_index_pattern
// schema model.
//
// NOTE(review): unlike the MySQL and Postgres models, which share one unique
// index across DatasourceID and Name, this model declares two separate unique
// indexes (idx_datasource and idx_name), making each column unique on its
// own — confirm this is intended.
type InitSqliteESIndexPattern struct {
	ID                     uint64 `gorm:"primaryKey;autoIncrement"`
	DatasourceID           int64  `gorm:"not null;default:0;comment:datasource id;uniqueIndex:idx_datasource"`
	Name                   string `gorm:"size:191;not null;uniqueIndex:idx_name"`
	TimeField              string `gorm:"size:128;not null;default:'@timestamp'"`
	AllowHideSystemIndices bool   `gorm:"type:tinyint(1);not null;default:0"`
	FieldsFormat           string `gorm:"size:4096;not null;default:''"`
	CreateAt               int64  `gorm:"default:0"`
	CreateBy               string `gorm:"size:64;default:''"`
	UpdateAt               int64  `gorm:"default:0"`
	UpdateBy               string `gorm:"size:64;default:''"`
}

// TableName returns the name of the table backing InitSqliteESIndexPattern.
func (InitSqliteESIndexPattern) TableName() string {
	const table = "es_index_pattern"
	return table
}
// InitPostgresESIndexPattern is the Postgres flavour of the es_index_pattern
// schema model; AllowHideSystemIndices is declared as smallint.
type InitPostgresESIndexPattern struct {
	ID                     uint64 `gorm:"primaryKey;autoIncrement"`
	DatasourceID           int64  `gorm:"not null;default:0;comment:datasource id;uniqueIndex:idx_datasource_name"`
	Name                   string `gorm:"size:191;not null;uniqueIndex:idx_datasource_name"`
	TimeField              string `gorm:"size:128;not null;default:'@timestamp'"`
	AllowHideSystemIndices int16  `gorm:"type:smallint;not null;default:0"`
	FieldsFormat           string `gorm:"size:4096;not null;default:''"`
	CreateAt               int64  `gorm:"default:0"`
	CreateBy               string `gorm:"size:64;default:''"`
	UpdateAt               int64  `gorm:"default:0"`
	UpdateBy               string `gorm:"size:64;default:''"`
}

// TableName returns the name of the table backing InitPostgresESIndexPattern.
func (InitPostgresESIndexPattern) TableName() string {
	const table = "es_index_pattern"
	return table
}
// InitBuiltinMetric is the schema model for the builtin_metrics table.
//
// Collector, Typ, Name and Lang each carry a named single-column index. Their
// gorm tags must end with a closing double quote: an unterminated tag value
// cannot be parsed by reflect.StructTag, which makes the whole gorm tag (size,
// not null, comment and index) silently disappear for that field.
type InitBuiltinMetric struct {
	ID         uint64 `gorm:"primaryKey;autoIncrement;comment:unique identifier"`
	Collector  string `gorm:"size:191;not null;comment:type of collector;index:idx_collector"`
	Typ        string `gorm:"size:191;not null;comment:type of metric;index:idx_typ"`
	Name       string `gorm:"size:191;not null;comment:name of metric;index:idx_name"`
	Unit       string `gorm:"size:191;not null;comment:unit of metric"`
	Lang       string `gorm:"size:191;not null;default:'';comment:language of metric;index:idx_lang"`
	Note       string `gorm:"size:4096;not null;comment:description of metric in Chinese"`
	Expression string `gorm:"size:4096;not null;comment:expression of metric"`
	CreatedAt  int64  `gorm:"not null;default:0;comment:create time"`
	CreatedBy  string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdatedAt  int64  `gorm:"not null;default:0;comment:update time"`
	UpdatedBy  string `gorm:"size:191;not null;default:'';comment:updater"`
	UUID       int64  `gorm:"not null;default:0;comment:'uuid'"`
}

// TableName returns the name of the table backing InitBuiltinMetric.
func (InitBuiltinMetric) TableName() string {
	return "builtin_metrics"
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitBuiltinMetric.
func (InitBuiltinMetric) TableOptions() string {
	return "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
}
// InitSqliteBuiltinMetric is the SQLite flavour of the builtin_metrics schema
// model; the index on Name is called idx_name_sqlite here.
//
// Collector, Typ, Name and Lang each carry a named single-column index. Their
// gorm tags must end with a closing double quote: an unterminated tag value
// cannot be parsed by reflect.StructTag, which makes the whole gorm tag (size,
// not null, comment and index) silently disappear for that field.
type InitSqliteBuiltinMetric struct {
	ID         uint64 `gorm:"primaryKey;autoIncrement;comment:unique identifier"`
	Collector  string `gorm:"size:191;not null;comment:type of collector;index:idx_collector"`
	Typ        string `gorm:"size:191;not null;comment:type of metric;index:idx_typ"`
	Name       string `gorm:"size:191;not null;comment:name of metric;index:idx_name_sqlite"`
	Unit       string `gorm:"size:191;not null;comment:unit of metric"`
	Lang       string `gorm:"size:191;not null;default:'';comment:language of metric;index:idx_lang"`
	Note       string `gorm:"size:4096;not null;comment:description of metric in Chinese"`
	Expression string `gorm:"size:4096;not null;comment:expression of metric"`
	CreatedAt  int64  `gorm:"not null;default:0;comment:create time"`
	CreatedBy  string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdatedAt  int64  `gorm:"not null;default:0;comment:update time"`
	UpdatedBy  string `gorm:"size:191;not null;default:'';comment:updater"`
	UUID       int64  `gorm:"not null;default:0;comment:'uuid'"`
}

// TableName returns the name of the table backing InitSqliteBuiltinMetric.
func (InitSqliteBuiltinMetric) TableName() string {
	return "builtin_metrics"
}
// InitMetricFilter is the schema model for the metric_filter table. Name is
// indexed under the name idx_name.
type InitMetricFilter struct {
	ID         uint64 `gorm:"primaryKey;autoIncrement;comment:unique identifier"`
	Name       string `gorm:"size:191;not null;comment:name of metric filter;index:idx_name"`
	Configs    string `gorm:"size:4096;not null;comment:configuration of metric filter"`
	GroupsPerm string `gorm:"type:text"`
	CreateAt   int64  `gorm:"not null;default:0;comment:create time"`
	CreateBy   string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdateAt   int64  `gorm:"not null;default:0;comment:update time"`
	UpdateBy   string `gorm:"size:191;not null;default:'';comment:updater"`
}

// TableName returns the name of the table backing InitMetricFilter.
func (InitMetricFilter) TableName() string {
	const table = "metric_filter"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitMetricFilter.
func (InitMetricFilter) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitSqliteMetricFilter is the SQLite flavour of the metric_filter schema
// model; the index on Name is called idx_name_metric_filter_sqlite here.
type InitSqliteMetricFilter struct {
	ID         uint64 `gorm:"primaryKey;autoIncrement;comment:unique identifier"`
	Name       string `gorm:"size:191;not null;comment:name of metric filter;index:idx_name_metric_filter_sqlite"`
	Configs    string `gorm:"size:4096;not null;comment:configuration of metric filter"`
	GroupsPerm string `gorm:"type:text"`
	CreateAt   int64  `gorm:"not null;default:0;comment:create time"`
	CreateBy   string `gorm:"size:191;not null;default:'';comment:creator"`
	UpdateAt   int64  `gorm:"not null;default:0;comment:update time"`
	UpdateBy   string `gorm:"size:191;not null;default:'';comment:updater"`
}

// TableName returns the name of the table backing InitSqliteMetricFilter.
func (InitSqliteMetricFilter) TableName() string {
	const table = "metric_filter"
	return table
}
// InitTargetBusiGroup is the schema model for the target_busi_group table.
// TargetIdent and GroupID together form the unique index idx_target_group.
type InitTargetBusiGroup struct {
	ID          uint64 `gorm:"primaryKey;autoIncrement"`
	TargetIdent string `gorm:"size:191;not null;uniqueIndex:idx_target_group"`
	GroupID     uint64 `gorm:"not null;uniqueIndex:idx_target_group"`
	UpdateAt    int64  `gorm:"not null"`
}

// TableName returns the name of the table backing InitTargetBusiGroup.
func (InitTargetBusiGroup) TableName() string {
	const table = "target_busi_group"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTargetBusiGroup.
func (InitTargetBusiGroup) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskMeta is the schema model for the task_meta table. Created maps to
// the "created" timestamp column, which defaults to CURRENT_TIMESTAMP.
type InitTaskMeta struct {
	ID        uint64    `gorm:"primaryKey;autoIncrement"`
	Title     string    `gorm:"size:255;not null;default:''"`
	Account   string    `gorm:"size:64;not null"`
	Batch     uint      `gorm:"not null;default:0"`
	Tolerance uint      `gorm:"not null;default:0"`
	Timeout   uint      `gorm:"not null;default:0"`
	Pause     string    `gorm:"size:255;not null;default:''"`
	Script    string    `gorm:"type:text;not null"`
	Args      string    `gorm:"size:512;not null;default:''"`
	Stdin     string    `gorm:"size:1024;not null;default:''"`
	Creator   string    `gorm:"size:64;not null;default:'';index"`
	Created   time.Time `gorm:"column:created;not null;default:CURRENT_TIMESTAMP;type:timestamp;index" json:"created"`
}

// TableName returns the name of the table backing InitTaskMeta.
func (InitTaskMeta) TableName() string {
	const table = "task_meta"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskMeta.
func (InitTaskMeta) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskAction is the schema model for the task_action table. The primary
// key is declared without autoIncrement.
type InitTaskAction struct {
	ID     uint64 `gorm:"primaryKey"`
	Action string `gorm:"size:32;not null"`
	Clock  int64  `gorm:"not null;default:0"`
}

// TableName returns the name of the table backing InitTaskAction.
func (InitTaskAction) TableName() string {
	const table = "task_action"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskAction.
func (InitTaskAction) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskScheduler is the schema model for the task_scheduler table. Both
// columns are indexed; ID is additionally the primary key.
type InitTaskScheduler struct {
	ID        uint64 `gorm:"primaryKey;index"`
	Scheduler string `gorm:"size:128;not null;default:'';index"`
}

// TableName returns the name of the table backing InitTaskScheduler.
func (InitTaskScheduler) TableName() string {
	const table = "task_scheduler"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskScheduler.
func (InitTaskScheduler) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskSchedulerHealth is the schema model for the task_scheduler_health
// table. Scheduler carries a unique index and Clock a plain index; the model
// declares no primary key.
type InitTaskSchedulerHealth struct {
	Scheduler string `gorm:"size:128;not null;uniqueIndex"`
	Clock     int64  `gorm:"not null;index"`
}

// TableName returns the name of the table backing InitTaskSchedulerHealth.
func (InitTaskSchedulerHealth) TableName() string {
	const table = "task_scheduler_health"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskSchedulerHealth.
func (InitTaskSchedulerHealth) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskHostDoing is the schema model for the task_host_doing table. ID and
// Host are each indexed.
type InitTaskHostDoing struct {
	ID     uint64 `gorm:"primaryKey;index"`
	Host   string `gorm:"size:128;not null;index"`
	Clock  int64  `gorm:"not null;default:0"`
	Action string `gorm:"size:16;not null"`
}

// TableName returns the name of the table backing InitTaskHostDoing.
func (InitTaskHostDoing) TableName() string {
	const table = "task_host_doing"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskHostDoing.
func (InitTaskHostDoing) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitTaskHost is the schema model for the task_host_0 table; the MySQL and
// Postgres initializers also migrate it onto task_host_1 .. task_host_99.
// ID and Host together form the unique index id_host.
type InitTaskHost struct {
	II     uint64 `gorm:"primaryKey;autoIncrement"`
	ID     uint64 `gorm:"not null;uniqueIndex:id_host"`
	Host   string `gorm:"size:128;not null;uniqueIndex:id_host"`
	Status string `gorm:"size:32;not null"`
	Stdout string `gorm:"type:text"`
	Stderr string `gorm:"type:text"`
}

// TableName returns the default table name for InitTaskHost.
func (InitTaskHost) TableName() string {
	const table = "task_host_0"
	return table
}

// TableOptions returns the MySQL-style table options string (InnoDB engine,
// utf8mb4 default charset) associated with InitTaskHost.
func (InitTaskHost) TableOptions() string {
	const options = "ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"
	return options
}
// InitSqliteTaskHost is the SQLite flavour of the task_host_0 schema model;
// the SQLite initializer also migrates it onto task_host_1 .. task_host_99.
// Unlike InitTaskHost it declares no index on ID and Host.
type InitSqliteTaskHost struct {
	II     uint64 `gorm:"primaryKey;autoIncrement"`
	ID     uint64 `gorm:"not null;"`
	Host   string `gorm:"size:128;not null;"`
	Status string `gorm:"size:32;not null"`
	Stdout string `gorm:"type:text"`
	Stderr string `gorm:"type:text"`
}

// TableName returns the default table name for InitSqliteTaskHost.
func (InitSqliteTaskHost) TableName() string {
	const table = "task_host_0"
	return table
}
// DataBaseInit creates the schema and seed data for the database described by
// c, dispatching on c.DBType (compared case-insensitively) to the MySQL,
// Postgres or SQLite initializer. Any other DBType yields an "unsupported
// database type" error.
func DataBaseInit(c DBConfig, db *gorm.DB) error {
	initializers := map[string]func(*gorm.DB) error{
		"mysql":    mysqlDataBaseInit,
		"postgres": postgresDataBaseInit,
		"sqlite":   sqliteDataBaseInit,
	}

	initialize, supported := initializers[strings.ToLower(c.DBType)]
	if !supported {
		return fmt.Errorf("unsupported database type: %s", c.DBType)
	}
	return initialize(db)
}
// sqliteDataBaseInit creates the SQLite schema and inserts the default seed
// rows.
//
// It auto-migrates every model listed below, then migrates InitSqliteTaskHost
// onto the tables task_host_1 .. task_host_99, and finally inserts the default
// role operations followed by the default user, groups, roles and views.
//
// A migration failure aborts initialization; the returned error wraps the
// underlying one and names the model type or table that failed. Failures while
// inserting seed rows are only logged, so the function still returns nil in
// that case.
func sqliteDataBaseInit(db *gorm.DB) error {
	models := []interface{}{
		&InitTaskMeta{},
		&InitTaskAction{},
		&InitTaskScheduler{},
		&InitTaskSchedulerHealth{},
		&InitTaskHostDoing{},
		&InitSqliteTaskHost{},
		&InitBoardBusiGroup{},
		&InitBuiltinComponent{},
		&InitBuiltinPayload{},
		&InitNotificationRecord{},
		&InitTaskTpl{},
		&InitTaskTplHost{},
		&InitTaskRecord{},
		&InitAlertingEngine{},
		&InitDatasource{},
		&InitBuiltinCate{},
		&InitNotifyTpl{},
		&InitSSOConfig{},
		&InitSqliteESIndexPattern{},
		&InitSqliteBuiltinMetric{},
		&InitSqliteMetricFilter{},
		&InitTargetBusiGroup{},
		&InitAlertAggrView{},
		&InitAlertCurEvent{},
		&InitAlertHisEvent{},
		&InitAlertMute{},
		&InitAlertSubscribe{},
		&InitTarget{},
		&InitMetricView{},
		&InitRecordingRule{},
		&InitUser{},
		&InitUserGroup{},
		&InitUserGroupMember{},
		&InitConfig{},
		&InitRole{},
		&InitRoleOperation{},
		&InitBusiGroup{},
		&InitBusiGroupMember{},
		&InitBoard{},
		&InitBoardPayload{},
		&InitDashboard{},
		&InitChartGroup{},
		&InitChart{},
		&InitChartShare{},
		&InitAlertRule{},
	}
	for _, model := range models {
		if err := db.AutoMigrate(model); err != nil {
			fmt.Printf("sqliteDataBaseInit AutoMigrate error: %v\n", err)
			// Name the failing model so the caller can tell which table broke.
			return fmt.Errorf("sqlite auto migrate %T: %w", model, err)
		}
	}

	// task_host_0 is covered by InitSqliteTaskHost's own table name above; the
	// remaining shards reuse the same model under an explicit table name.
	for i := 1; i <= 99; i++ {
		tableName := "task_host_" + strconv.Itoa(i)
		if err := db.Table(tableName).AutoMigrate(&InitSqliteTaskHost{}); err != nil {
			return fmt.Errorf("sqlite auto migrate %s: %w", tableName, err)
		}
	}

	guestOperations := []string{
		"/metric/explorer",
		"/object/explorer",
		"/log/explorer",
		"/trace/explorer",
		"/help/version",
		"/help/contact",
	}
	standardOperations := []string{
		"/metric/explorer",
		"/object/explorer",
		"/log/explorer",
		"/trace/explorer",
		"/help/version",
		"/help/contact",
		"/help/servers",
		"/help/migrate",
		"/alert-rules-built-in",
		"/dashboards-built-in",
		"/trace/dependencies",
		"/users",
		"/user-groups",
		"/user-groups/add",
		"/user-groups/put",
		"/user-groups/del",
		"/busi-groups",
		"/busi-groups/add",
		"/busi-groups/put",
		"/busi-groups/del",
		"/targets",
		"/targets/add",
		"/targets/put",
		"/targets/del",
		"/dashboards",
		"/dashboards/add",
		"/dashboards/put",
		"/dashboards/del",
		"/alert-rules",
		"/alert-rules/add",
		"/alert-rules/put",
		"/alert-rules/del",
		"/alert-mutes",
		"/alert-mutes/add",
		"/alert-mutes/del",
		"/alert-subscribes",
		"/alert-subscribes/add",
		"/alert-subscribes/put",
		"/alert-subscribes/del",
		"/alert-cur-events",
		"/alert-cur-events/del",
		"/alert-his-events",
		"/job-tpls",
		"/job-tpls/add",
		"/job-tpls/put",
		"/job-tpls/del",
		"/job-tasks",
		"/job-tasks/add",
		"/job-tasks/put",
		"/recording-rules",
		"/recording-rules/add",
		"/recording-rules/put",
		"/recording-rules/del",
	}
	roleOperations := make([]InitRoleOperation, 0, len(guestOperations)+len(standardOperations))
	for _, operation := range guestOperations {
		roleOperations = append(roleOperations, InitRoleOperation{RoleName: "Guest", Operation: operation})
	}
	for _, operation := range standardOperations {
		roleOperations = append(roleOperations, InitRoleOperation{RoleName: "Standard", Operation: operation})
	}

	// One timestamp for all seed rows keeps CreateAt and UpdateAt identical
	// even when initialization straddles a second boundary.
	now := time.Now().Unix()
	entries := []struct {
		name  string
		entry interface{}
	}{
		{
			name:  "InitUser",
			entry: &InitUser{ID: 1, Username: "root", Nickname: "超管", Password: "root.2020", Roles: "Admin", CreateAt: now, CreateBy: "system", UpdateAt: now, UpdateBy: "system"},
		},
		{
			name:  "InitUserGroup",
			entry: &InitUserGroup{ID: 1, Name: "demo-root-group", CreateAt: now, CreateBy: "root", UpdateAt: now, UpdateBy: "root"},
		},
		{
			name:  "InitUserGroupMember",
			entry: &InitUserGroupMember{GroupID: 1, UserID: 1},
		},
		{
			name:  "InitRole",
			entry: &InitRole{Name: "Admin", Note: "Administrator role"},
		},
		{
			name:  "InitRole",
			entry: &InitRole{Name: "Standard", Note: "Ordinary user role"},
		},
		{
			name:  "InitRole",
			entry: &InitRole{Name: "Guest", Note: "Readonly user role"},
		},
		{
			name:  "InitBusiGroup",
			entry: &InitBusiGroup{ID: 1, Name: "Default Busi Group", CreateAt: now, CreateBy: "root", UpdateAt: now, UpdateBy: "root"},
		},
		{
			name:  "InitBusiGroupMember",
			entry: &InitBusiGroupMember{BusiGroupID: 1, UserGroupID: 1, PermFlag: "rw"},
		},
		{
			name:  "InitMetricView",
			entry: &InitMetricView{Name: "Host View", Cate: false, Configs: `{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}`},
		},
		{
			name:  "InitAlertAggrView",
			entry: &InitAlertAggrView{Name: "By BusiGroup, Severity", Rule: "field:group_name::field:severity", Cate: false},
		},
		{
			name:  "InitAlertAggrView",
			entry: &InitAlertAggrView{Name: "By RuleName", Rule: "field:rule_name", Cate: false},
		},
	}

	// Seed inserts are best-effort: failures are logged and skipped.
	for i := range roleOperations {
		if err := db.Create(&roleOperations[i]).Error; err != nil {
			logger.Errorf("[sqlite database init]create role operation error: %v", err)
		}
	}
	for _, entry := range entries {
		if err := db.Create(entry.entry).Error; err != nil {
			logger.Errorf("[sqlite database init]create %s error: %v", entry.name, err)
		}
	}
	return nil
}
// mysqlDataBaseInit creates the MySQL schema and inserts the default seed
// rows.
//
// It auto-migrates every model listed below, then migrates InitTaskHost onto
// the tables task_host_1 .. task_host_99, and finally inserts the default role
// operations followed by the default user, groups, roles and views.
//
// Every step is best-effort: migration and insert failures are logged and the
// function carries on, so it always returns nil.
func mysqlDataBaseInit(db *gorm.DB) error {
	models := []interface{}{
		&InitTaskMeta{},
		&InitTaskAction{},
		&InitTaskScheduler{},
		&InitTaskSchedulerHealth{},
		&InitTaskHostDoing{},
		&InitTaskHost{},
		&InitBoardBusiGroup{},
		&InitBuiltinComponent{},
		&InitBuiltinPayload{},
		&InitNotificationRecord{},
		&InitTaskTpl{},
		&InitTaskTplHost{},
		&InitTaskRecord{},
		&InitAlertingEngine{},
		&InitDatasource{},
		&InitBuiltinCate{},
		&InitNotifyTpl{},
		&InitSSOConfig{},
		&InitESIndexPattern{},
		&InitBuiltinMetric{},
		&InitMetricFilter{},
		&InitTargetBusiGroup{},
		&InitAlertAggrView{},
		&InitAlertCurEvent{},
		&InitAlertHisEvent{},
		&InitAlertMute{},
		&InitAlertSubscribe{},
		&InitTarget{},
		&InitMetricView{},
		&InitRecordingRule{},
		&InitUser{},
		&InitUserGroup{},
		&InitUserGroupMember{},
		&InitConfig{},
		&InitRole{},
		&InitRoleOperation{},
		&InitBusiGroup{},
		&InitBusiGroupMember{},
		&InitBoard{},
		&InitBoardPayload{},
		&InitDashboard{},
		&InitChartGroup{},
		&InitChart{},
		&InitChartShare{},
		&InitAlertRule{},
	}
	for idx := range models {
		if migrateErr := db.AutoMigrate(models[idx]); migrateErr != nil {
			logger.Errorf("mysqlDataBaseInit AutoMigrate error: %v\n", migrateErr)
		}
	}

	// task_host_0 is covered by InitTaskHost's own table name above; the
	// remaining shards reuse the same model under an explicit table name.
	for shard := 1; shard < 100; shard++ {
		shardTable := "task_host_" + strconv.Itoa(shard)
		if migrateErr := db.Table(shardTable).AutoMigrate(&InitTaskHost{}); migrateErr != nil {
			logger.Errorf("mysqlDataBaseInit AutoMigrate task_host_%d error: %v\n", shard, migrateErr)
		}
	}

	// Default permissions, grouped per role in insertion order.
	defaultPermissions := []struct {
		role       string
		operations []string
	}{
		{
			role: "Guest",
			operations: []string{
				"/metric/explorer",
				"/object/explorer",
				"/log/explorer",
				"/trace/explorer",
				"/help/version",
				"/help/contact",
			},
		},
		{
			role: "Standard",
			operations: []string{
				"/metric/explorer",
				"/object/explorer",
				"/log/explorer",
				"/trace/explorer",
				"/help/version",
				"/help/contact",
				"/help/servers",
				"/help/migrate",
				"/alert-rules-built-in",
				"/dashboards-built-in",
				"/trace/dependencies",
				"/users",
				"/user-groups",
				"/user-groups/add",
				"/user-groups/put",
				"/user-groups/del",
				"/busi-groups",
				"/busi-groups/add",
				"/busi-groups/put",
				"/busi-groups/del",
				"/targets",
				"/targets/add",
				"/targets/put",
				"/targets/del",
				"/dashboards",
				"/dashboards/add",
				"/dashboards/put",
				"/dashboards/del",
				"/alert-rules",
				"/alert-rules/add",
				"/alert-rules/put",
				"/alert-rules/del",
				"/alert-mutes",
				"/alert-mutes/add",
				"/alert-mutes/del",
				"/alert-subscribes",
				"/alert-subscribes/add",
				"/alert-subscribes/put",
				"/alert-subscribes/del",
				"/alert-cur-events",
				"/alert-cur-events/del",
				"/alert-his-events",
				"/job-tpls",
				"/job-tpls/add",
				"/job-tpls/put",
				"/job-tpls/del",
				"/job-tasks",
				"/job-tasks/add",
				"/job-tasks/put",
				"/recording-rules",
				"/recording-rules/add",
				"/recording-rules/put",
				"/recording-rules/del",
			},
		},
	}

	seeds := []struct {
		label string
		row   interface{}
	}{
		{
			label: "InitUser",
			row:   &InitUser{ID: 1, Username: "root", Nickname: "超管", Password: "root.2020", Roles: "Admin", CreateAt: time.Now().Unix(), CreateBy: "system", UpdateAt: time.Now().Unix(), UpdateBy: "system"},
		},
		{
			label: "InitUserGroup",
			row:   &InitUserGroup{ID: 1, Name: "demo-root-group", CreateAt: time.Now().Unix(), CreateBy: "root", UpdateAt: time.Now().Unix(), UpdateBy: "root"},
		},
		{
			label: "InitUserGroupMember",
			row:   &InitUserGroupMember{GroupID: 1, UserID: 1},
		},
		{
			label: "InitRole",
			row:   &InitRole{Name: "Admin", Note: "Administrator role"},
		},
		{
			label: "InitRole",
			row:   &InitRole{Name: "Standard", Note: "Ordinary user role"},
		},
		{
			label: "InitRole",
			row:   &InitRole{Name: "Guest", Note: "Readonly user role"},
		},
		{
			label: "InitBusiGroup",
			row:   &InitBusiGroup{ID: 1, Name: "Default Busi Group", CreateAt: time.Now().Unix(), CreateBy: "root", UpdateAt: time.Now().Unix(), UpdateBy: "root"},
		},
		{
			label: "InitBusiGroupMember",
			row:   &InitBusiGroupMember{BusiGroupID: 1, UserGroupID: 1, PermFlag: "rw"},
		},
		{
			label: "InitMetricView",
			row:   &InitMetricView{Name: "Host View", Cate: false, Configs: `{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}`},
		},
		{
			label: "InitAlertAggrView",
			row:   &InitAlertAggrView{Name: "By BusiGroup, Severity", Rule: "field:group_name::field:severity", Cate: false},
		},
		{
			label: "InitAlertAggrView",
			row:   &InitAlertAggrView{Name: "By RuleName", Rule: "field:rule_name", Cate: false},
		},
	}

	// Role operations are inserted first, one row per (role, operation) pair.
	for _, perm := range defaultPermissions {
		for _, operation := range perm.operations {
			row := InitRoleOperation{RoleName: perm.role, Operation: operation}
			if createErr := db.Create(&row).Error; createErr != nil {
				logger.Errorf("[mysql database init]create role operation error: %v", createErr)
			}
		}
	}
	for idx := range seeds {
		createErr := db.Create(seeds[idx].row).Error
		if createErr == nil {
			continue
		}
		logger.Errorf("[mysql database init]create %s error: %v", seeds[idx].label, createErr)
	}
	return nil
}
// postgresDataBaseInit creates the schema and seed data for a PostgreSQL database.
//
// It runs AutoMigrate for every model listed in dts, then for the tables
// task_host_1 .. task_host_99 (all using the InitTaskHost model), and finally
// inserts the default role operations, user, groups, roles and views.
//
// A migration error aborts the function and is returned. Errors while inserting
// seed rows are only logged, so the function still returns nil in that case.
func postgresDataBaseInit(db *gorm.DB) error {
// Models to migrate, processed in slice order.
dts := []interface{}{
&InitTaskMeta{},
&InitTaskAction{},
&InitTaskScheduler{},
&InitTaskSchedulerHealth{},
&InitTaskHostDoing{},
&InitTaskHost{},
&InitBoardBusiGroup{},
&InitBuiltinComponent{},
&InitpostgresBuiltinPayload{},
&InitNotificationRecord{},
&InitTaskTpl{},
&InitTaskTplHost{},
&InitTaskRecord{},
&InitAlertingEngine{},
&InitPostgresDatasource{},
&InitBuiltinCate{},
&InitNotifyTpl{},
&InitSSOConfig{},
&InitPostgresESIndexPattern{},
&InitBuiltinMetric{},
&InitMetricFilter{},
&InitTargetBusiGroup{},
&InitPostgresAlertAggrView{},
&InitPostgresAlertCurEvent{},
&InitPostgresAlertHisEvent{},
&InitPostgresAlertMute{},
&InitPostgresAlertSubscribe{},
&InitTarget{},
&InitPostgresMetricView{},
&InitPostgresRecordingRule{},
&InitPostgresUser{},
&InitUserGroup{},
&InitUserGroupMember{},
&InitPostgresConfig{},
&InitRole{},
&InitRoleOperation{},
&InitPostgresBusiGroup{},
&InitBusiGroupMember{},
&InitPostgresBoard{},
&InitPostgresBoardPayload{},
&InitDashboard{},
&InitChartGroup{},
&InitChart{},
&InitChartShare{},
&InitPostgresAlertRule{}}
// Stop at the first model that fails to migrate.
for _, dt := range dts {
err := db.AutoMigrate(dt)
if err != nil {
fmt.Printf("postgresDataBaseInit AutoMigrate error: %v\n", err)
return err
}
}
// Create the numbered task_host_N tables, each with the InitTaskHost schema.
for i := 1; i <= 99; i++ {
tableName := "task_host_" + strconv.Itoa(i)
err := db.Table(tableName).AutoMigrate(&InitTaskHost{})
if err != nil {
return err
}
}
// Default operations granted to the Guest and Standard roles.
roleOperations := []InitRoleOperation{
{RoleName: "Guest", Operation: "/metric/explorer"},
{RoleName: "Guest", Operation: "/object/explorer"},
{RoleName: "Guest", Operation: "/log/explorer"},
{RoleName: "Guest", Operation: "/trace/explorer"},
{RoleName: "Guest", Operation: "/help/version"},
{RoleName: "Guest", Operation: "/help/contact"},
{RoleName: "Standard", Operation: "/metric/explorer"},
{RoleName: "Standard", Operation: "/object/explorer"},
{RoleName: "Standard", Operation: "/log/explorer"},
{RoleName: "Standard", Operation: "/trace/explorer"},
{RoleName: "Standard", Operation: "/help/version"},
{RoleName: "Standard", Operation: "/help/contact"},
{RoleName: "Standard", Operation: "/help/servers"},
{RoleName: "Standard", Operation: "/help/migrate"},
{RoleName: "Standard", Operation: "/alert-rules-built-in"},
{RoleName: "Standard", Operation: "/dashboards-built-in"},
{RoleName: "Standard", Operation: "/trace/dependencies"},
{RoleName: "Standard", Operation: "/users"},
{RoleName: "Standard", Operation: "/user-groups"},
{RoleName: "Standard", Operation: "/user-groups/add"},
{RoleName: "Standard", Operation: "/user-groups/put"},
{RoleName: "Standard", Operation: "/user-groups/del"},
{RoleName: "Standard", Operation: "/busi-groups"},
{RoleName: "Standard", Operation: "/busi-groups/add"},
{RoleName: "Standard", Operation: "/busi-groups/put"},
{RoleName: "Standard", Operation: "/busi-groups/del"},
{RoleName: "Standard", Operation: "/targets"},
{RoleName: "Standard", Operation: "/targets/add"},
{RoleName: "Standard", Operation: "/targets/put"},
{RoleName: "Standard", Operation: "/targets/del"},
{RoleName: "Standard", Operation: "/dashboards"},
{RoleName: "Standard", Operation: "/dashboards/add"},
{RoleName: "Standard", Operation: "/dashboards/put"},
{RoleName: "Standard", Operation: "/dashboards/del"},
{RoleName: "Standard", Operation: "/alert-rules"},
{RoleName: "Standard", Operation: "/alert-rules/add"},
{RoleName: "Standard", Operation: "/alert-rules/put"},
{RoleName: "Standard", Operation: "/alert-rules/del"},
{RoleName: "Standard", Operation: "/alert-mutes"},
{RoleName: "Standard", Operation: "/alert-mutes/add"},
{RoleName: "Standard", Operation: "/alert-mutes/del"},
{RoleName: "Standard", Operation: "/alert-subscribes"},
{RoleName: "Standard", Operation: "/alert-subscribes/add"},
{RoleName: "Standard", Operation: "/alert-subscribes/put"},
{RoleName: "Standard", Operation: "/alert-subscribes/del"},
{RoleName: "Standard", Operation: "/alert-cur-events"},
{RoleName: "Standard", Operation: "/alert-cur-events/del"},
{RoleName: "Standard", Operation: "/alert-his-events"},
{RoleName: "Standard", Operation: "/job-tpls"},
{RoleName: "Standard", Operation: "/job-tpls/add"},
{RoleName: "Standard", Operation: "/job-tpls/put"},
{RoleName: "Standard", Operation: "/job-tpls/del"},
{RoleName: "Standard", Operation: "/job-tasks"},
{RoleName: "Standard", Operation: "/job-tasks/add"},
{RoleName: "Standard", Operation: "/job-tasks/put"},
{RoleName: "Standard", Operation: "/recording-rules"},
{RoleName: "Standard", Operation: "/recording-rules/add"},
{RoleName: "Standard", Operation: "/recording-rules/put"},
{RoleName: "Standard", Operation: "/recording-rules/del"},
}
// Seed rows; name is only used to label the log line if the insert fails.
entries := []struct {
name string
entry interface{}
}{
{
name: "InitUser",
entry: &InitPostgresUser{ID: 1, Username: "root", Nickname: "超管", Password: "root.2020", Roles: "Admin", CreateAt: time.Now().Unix(), CreateBy: "system", UpdateAt: time.Now().Unix(), UpdateBy: "system"},
},
{
name: "InitUserGroup",
entry: &InitUserGroup{ID: 1, Name: "demo-root-group", CreateAt: time.Now().Unix(), CreateBy: "root", UpdateAt: time.Now().Unix(), UpdateBy: "root"},
},
{
name: "InitUserGroupMember",
entry: &InitUserGroupMember{GroupID: 1, UserID: 1},
},
{
name: "InitRole",
entry: &InitRole{Name: "Admin", Note: "Administrator role"},
},
{
name: "InitRole",
entry: &InitRole{Name: "Standard", Note: "Ordinary user role"},
},
{
name: "InitRole",
entry: &InitRole{Name: "Guest", Note: "Readonly user role"},
},
{
name: "InitBusiGroup",
entry: &InitPostgresBusiGroup{ID: 1, Name: "Default Busi Group", CreateAt: time.Now().Unix(), CreateBy: "root", UpdateAt: time.Now().Unix(), UpdateBy: "root"},
},
{
name: "InitBusiGroupMember",
entry: &InitBusiGroupMember{BusiGroupID: 1, UserGroupID: 1, PermFlag: "rw"},
},
{
name: "InitMetricView",
entry: &InitPostgresMetricView{Name: "Host View", Cate: 0, Configs: `{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}`},
},
{
name: "InitAlertAggrView",
entry: &InitPostgresAlertAggrView{Name: "By BusiGroup, Severity", Rule: "field:group_name::field:severity", Cate: 0},
},
{
name: "InitAlertAggrView",
entry: &InitPostgresAlertAggrView{Name: "By RuleName", Rule: "field:rule_name", Cate: 0},
},
}
// Insert failures are logged and skipped rather than returned.
for _, roleOperation := range roleOperations {
err := db.Create(&roleOperation).Error
if err != nil {
logger.Errorf("[postgres database init]create role operation error: %v", err)
}
}
for _, entry := range entries {
if err := db.Create(entry.entry).Error; err != nil {
logger.Errorf("[postgres database init]create %s error: %v", entry.name, err)
}
}
return nil
}
================================================
FILE: pkg/ormx/database_init_test.go
================================================
package ormx
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"gorm.io/driver/mysql"
"gorm.io/driver/postgres"
"gorm.io/driver/sqlite"
"gorm.io/gorm"
)
// TestCheckPostgresDatabaseExist calls checkPostgresDatabaseExist with a
// MySQL, a Postgres and a SQLite configuration and asserts that no error is
// returned. The DSNs point at 127.0.0.1 / a local file, so the test needs the
// corresponding servers to be reachable.
func TestCheckPostgresDatabaseExist(t *testing.T) {
	tests := []struct {
		name   string
		config DBConfig
	}{
		{
			name: "MySQL",
			config: DBConfig{
				DBType: "mysql",
				DSN:    "root:1234@tcp(127.0.0.1:3306)/test?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true",
			},
		},
		{
			name: "Postgres",
			config: DBConfig{
				DBType: "postgres",
				DSN:    "host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable",
			},
		},
		{
			name: "SQLite",
			config: DBConfig{
				DBType: "sqlite",
				DSN:    "./test.db",
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			exist, err := checkPostgresDatabaseExist(tt.config)
			// Diagnostic output only; terminated with a newline so consecutive
			// sub-tests do not run together on one line.
			fmt.Printf("exist: %v\n", exist)
			assert.NoError(t, err)
		})
	}
}
// TestDataBaseInit creates the database for each configured backend, opens it
// with the matching gorm driver and runs DataBaseInit against it. Every step
// is expected to succeed, so the referenced servers must be reachable.
func TestDataBaseInit(t *testing.T) {
	type initCase struct {
		name   string
		config DBConfig
	}
	cases := []initCase{
		{
			name: "MySQL",
			config: DBConfig{
				DBType: "mysql",
				DSN:    "root:1234@tcp(127.0.0.1:3306)/test?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true",
			},
		},
		{
			name: "Postgres",
			config: DBConfig{
				DBType: "postgres",
				DSN:    "host=127.0.0.1 port=5432 user=postgres dbname=test password=1234 sslmode=disable",
			},
		},
		{
			name: "SQLite",
			config: DBConfig{
				DBType: "sqlite",
				DSN:    "./test.db",
			},
		},
	}

	// Driver constructors keyed by DBType; an unknown type leaves the dialector nil.
	openers := map[string]func(string) gorm.Dialector{
		"mysql":    mysql.Open,
		"postgres": postgres.Open,
		"sqlite":   sqlite.Open,
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			createErr := createDatabase(tc.config, &gorm.Config{})
			assert.NoError(t, createErr)

			var dialector gorm.Dialector
			if open, known := openers[tc.config.DBType]; known {
				dialector = open(tc.config.DSN)
			}

			db, openErr := gorm.Open(dialector, &gorm.Config{})
			assert.NoError(t, openErr)

			initErr := DataBaseInit(tc.config, db)
			assert.NoError(t, initErr)
		})
	}
}
================================================
FILE: pkg/ormx/ormx.go
================================================
package ormx
import (
"fmt"
"os"
"reflect"
"strings"
"time"
"github.com/glebarez/sqlite"
tklog "github.com/toolkits/pkg/logger"
"gorm.io/driver/mysql"
"gorm.io/driver/postgres"
"gorm.io/gorm"
"gorm.io/gorm/logger"
"gorm.io/gorm/schema"
)
// DBConfig holds the settings New uses to open a GORM database connection.
type DBConfig struct {
// Debug makes New return a handle with gorm's Debug mode enabled.
Debug bool
// DBType selects the driver; it is lower-cased before matching against
// "mysql", "postgres" and "sqlite".
DBType string
// DSN is the driver-specific connection string (a file path for sqlite).
DSN string
// MaxLifetime is the connection max lifetime in seconds (not applied for sqlite).
MaxLifetime int
// MaxOpenConns is passed to SetMaxOpenConns (not applied for sqlite).
MaxOpenConns int
// MaxIdleConns is passed to SetMaxIdleConns (not applied for sqlite).
MaxIdleConns int
// TablePrefix is used as the TablePrefix of gorm's naming strategy.
TablePrefix string
}
// gormLogger is the gorm logger shared by the connections opened in this file.
// Output goes through TKitLogger (wrapping tklog.GetLogger()); the log level is
// Warn and the slow-query threshold is two seconds.
var gormLogger = logger.New(
&TKitLogger{tklog.GetLogger()},
logger.Config{
SlowThreshold: 2 * time.Second,
LogLevel: logger.Warn,
IgnoreRecordNotFoundError: false,
Colorful: true,
},
)
// logLevelMap maps the format strings held by gormLogger to the log level
// TKitLogger.Printf should use for them.
var logLevelMap map[string]logger.LogLevel

// init fills logLevelMap by reading gormLogger's unexported format-string
// fields through reflection. Entries are written in the listed order, so a
// later field wins if two fields hold the same string.
func init() {
	logLevelMap = make(map[string]logger.LogLevel, 8)

	loggerFields := reflect.ValueOf(gormLogger).Elem()
	bindings := []struct {
		field string
		level logger.LogLevel
	}{
		{"infoStr", logger.Info},
		{"warnStr", logger.Warn},
		{"errStr", logger.Error},
		{"traceStr", logger.Info},
		{"traceWarnStr", logger.Warn},
		{"traceErrStr", logger.Error},
	}
	for _, b := range bindings {
		format := loggerFields.FieldByName(b.field).String()
		logLevelMap[format] = b.level
	}
}
// TKitLogger forwards Printf-style log output to a toolkits/pkg logger.
type TKitLogger struct {
// writer is the destination logger.
writer *tklog.Logger
}
// Printf writes a formatted message to the wrapped logger.
//
// The level is chosen by looking up the format string s in logLevelMap:
// Info, Warn and Error map to Infof, Warningf and Errorf. A format string
// that is not in the map, or that maps to any other level, is written once
// at debug level.
func (l *TKitLogger) Printf(s string, i ...interface{}) {
	level, ok := logLevelMap[s]
	if !ok {
		l.writer.Debugf(s, i...)
		// Return here: falling through to the switch would hit its default
		// branch and emit the same message a second time.
		return
	}
	switch level {
	case logger.Info:
		l.writer.Infof(s, i...)
	case logger.Warn:
		l.writer.Warningf(s, i...)
	case logger.Error:
		l.writer.Errorf(s, i...)
	default:
		l.writer.Debugf(s, i...)
	}
}
// createDatabase creates the database named in c.DSN using the helper that
// matches c.DBType (compared case-insensitively). An unsupported type yields
// an error.
func createDatabase(c DBConfig, gconfig *gorm.Config) error {
	dbType := strings.ToLower(c.DBType)
	if dbType == "mysql" {
		return createMysqlDatabase(c.DSN, gconfig)
	}
	if dbType == "postgres" {
		return createPostgresDatabase(c.DSN, gconfig)
	}
	if dbType == "sqlite" {
		return createSqliteDatabase(c.DSN, gconfig)
	}
	return fmt.Errorf("dialector(%s) not supported", c.DBType)
}
// createSqliteDatabase opens a temporary gorm connection to the sqlite DSN so
// that the database file comes into existence, then releases the connection.
// It returns an error when the connection cannot be opened.
func createSqliteDatabase(dsn string, gconfig *gorm.Config) error {
	tempDialector := sqlite.Open(dsn)

	tempDB, err := gorm.Open(tempDialector, gconfig)
	if err != nil {
		return fmt.Errorf("failed to open temporary connection: %v", err)
	}
	// The handle is only needed to trigger file creation; close its pool so
	// the connection is not left open for the life of the process.
	if sqlDB, dbErr := tempDB.DB(); dbErr == nil {
		defer sqlDB.Close()
	}

	fmt.Println("sqlite file created")

	return nil
}
// createPostgresDatabase creates the database named by the "dbname=" field of
// a space-separated key=value Postgres DSN.
//
// The CREATE DATABASE statement is executed over a temporary connection to
// the "postgres" maintenance database, the same database
// checkPostgresDatabaseExist connects to. The temporary connection is closed
// before returning.
//
// An error is returned when the DSN has no dbname, when the temporary
// connection cannot be opened, or when the statement fails.
func createPostgresDatabase(dsn string, gconfig *gorm.Config) error {
	dbName := ""
	parts := strings.Split(dsn, " ")
	for idx, part := range parts {
		if strings.HasPrefix(part, "dbname=") {
			dbName = strings.TrimPrefix(part, "dbname=")
			// The target database does not exist yet, so connect to the
			// maintenance database instead of leaving dbname unset.
			// NOTE(review): with dbname unset the server presumably falls back
			// to a database named after the user — confirm.
			parts[idx] = "dbname=postgres"
		}
	}
	if dbName == "" {
		// Without a name the statement below would be invalid SQL.
		return fmt.Errorf("dbname not specified in postgres DSN")
	}
	maintenanceDSN := strings.Join(parts, " ")

	createDBQuery := fmt.Sprintf("CREATE DATABASE %s ENCODING='UTF8' LC_COLLATE='en_US.utf8' LC_CTYPE='en_US.utf8';", dbName)

	tempDialector := postgres.Open(maintenanceDSN)

	tempDB, err := gorm.Open(tempDialector, gconfig)
	if err != nil {
		return fmt.Errorf("failed to open temporary connection: %v", err)
	}
	// Release the temporary pool once the statement has run.
	if sqlDB, dbErr := tempDB.DB(); dbErr == nil {
		defer sqlDB.Close()
	}

	result := tempDB.Exec(createDBQuery)
	if result.Error != nil {
		return fmt.Errorf("failed to execute create database query: %v", result.Error)
	}

	return nil
}
// createMysqlDatabase creates the database named in a MySQL DSN of the form
// [user[:password]@][net[(addr)]]/dbname[?param=value&...].
//
// The database name is the text between the last "/" and the first "?" after
// it, so addresses or passwords containing "/" (for example a unix socket
// path) are handled, and the "?params" part is optional. The statement runs
// over a temporary connection that uses the same DSN without a database name;
// that connection is closed before returning.
//
// An error is returned when the DSN has no "/" or no database name, when the
// temporary connection cannot be opened, or when the statement fails.
func createMysqlDatabase(dsn string, gconfig *gorm.Config) error {
	slash := strings.LastIndex(dsn, "/")
	if slash < 0 {
		return fmt.Errorf("failed to parse DSN: %s", dsn)
	}

	connectionInfo := dsn[:slash]
	dbInfo := dsn[slash+1:]

	// Split "dbname?params" into the name and the (optional) "?params" suffix.
	dbName, params := dbInfo, ""
	if queryIndex := strings.Index(dbInfo, "?"); queryIndex != -1 {
		dbName, params = dbInfo[:queryIndex], dbInfo[queryIndex:]
	}
	if dbName == "" {
		return fmt.Errorf("failed to parse database name from DSN: %s", dsn)
	}

	connectionWithoutDB := connectionInfo + "/" + params

	// Quote the identifier so names such as "n9e-v6" are accepted; embedded
	// backticks are doubled as MySQL requires.
	quotedName := "`" + strings.ReplaceAll(dbName, "`", "``") + "`"
	createDBQuery := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS %s CHARACTER SET utf8mb4", quotedName)

	tempDialector := mysql.Open(connectionWithoutDB)

	tempDB, err := gorm.Open(tempDialector, gconfig)
	if err != nil {
		return fmt.Errorf("failed to open temporary connection: %v", err)
	}
	// Release the temporary pool once the statement has run.
	if sqlDB, dbErr := tempDB.DB(); dbErr == nil {
		defer sqlDB.Close()
	}

	result := tempDB.Exec(createDBQuery)
	if result.Error != nil {
		return fmt.Errorf("failed to execute create database query: %v", result.Error)
	}

	return nil
}
// checkDatabaseExist reports whether the database described by c already
// exists, delegating to the checker for c.DBType (compared
// case-insensitively). An unsupported type yields an error.
func checkDatabaseExist(c DBConfig) (bool, error) {
	dbType := strings.ToLower(c.DBType)
	if dbType == "mysql" {
		return checkMysqlDatabaseExist(c)
	}
	if dbType == "postgres" {
		return checkPostgresDatabaseExist(c)
	}
	if dbType == "sqlite" {
		return checkSqliteDatabaseExist(c)
	}
	return false, fmt.Errorf("dialector(%s) not supported", c.DBType)
}
// checkSqliteDatabaseExist treats c.DSN as a file path and reports whether it
// exists. Only a "not exist" stat error counts as missing; any other stat
// outcome is reported as existing. The error result is always nil.
func checkSqliteDatabaseExist(c DBConfig) (bool, error) {
	_, statErr := os.Stat(c.DSN)
	if !os.IsNotExist(statErr) {
		return true, nil
	}
	fmt.Printf("sqlite file not exists: %s\n", c.DSN)
	return false, nil
}
// checkPostgresDatabaseExist reports whether the database named by the
// "dbname=" field of c.DSN (space-separated key=value form) exists.
//
// It connects to the "postgres" maintenance database with the remaining DSN
// fields unchanged, lists the database names with the query from genQuery and
// looks for an exact match. The connection is closed before returning.
//
// An error is returned when the DSN has no dbname, when the connection cannot
// be opened, or when the query fails.
func checkPostgresDatabaseExist(c DBConfig) (bool, error) {
	dbName := ""
	parts := strings.Split(c.DSN, " ")
	for idx, part := range parts {
		if strings.HasPrefix(part, "dbname=") {
			dbName = strings.TrimPrefix(part, "dbname=")
			parts[idx] = "dbname=postgres"
		}
	}
	if dbName == "" {
		// Nothing to look for, and there is no field to redirect to the
		// maintenance database.
		return false, fmt.Errorf("dbname not specified in postgres DSN")
	}
	connectionStr := strings.Join(parts, " ")

	dialector := postgres.Open(connectionStr)

	gconfig := &gorm.Config{
		NamingStrategy: schema.NamingStrategy{
			TablePrefix:   c.TablePrefix,
			SingularTable: true,
		},
		Logger: gormLogger,
	}

	db, err := gorm.Open(dialector, gconfig)
	if err != nil {
		return false, fmt.Errorf("failed to open database: %v", err)
	}
	// This handle is only used for the lookup; release its pool afterwards.
	if sqlDB, dbErr := db.DB(); dbErr == nil {
		defer sqlDB.Close()
	}

	var databases []string
	query := genQuery(c)
	if err := db.Raw(query).Scan(&databases).Error; err != nil {
		return false, fmt.Errorf("failed to query: %v", err)
	}

	for _, database := range databases {
		if database == dbName {
			fmt.Println("Database exist")
			return true, nil
		}
	}

	return false, nil
}
// checkMysqlDatabaseExist reports whether the database named in the MySQL DSN
// c.DSN ([user[:password]@][net[(addr)]]/dbname[?param=value&...]) exists.
//
// The database name is the text between the last "/" and the first "?" after
// it; the "?params" part is optional. The function connects with the same DSN
// minus the database name, lists the databases with the query from genQuery
// and looks for an exact match. The connection is closed before returning.
//
// An error is returned when c.DBType is not "mysql", when the DSN has no "/"
// or no database name, when the connection cannot be opened, or when the
// query fails.
func checkMysqlDatabaseExist(c DBConfig) (bool, error) {
	// A MySQL-style DSN is only meaningful for the MySQL driver.
	if strings.ToLower(c.DBType) != "mysql" {
		return false, fmt.Errorf("unsupported database type: %s", c.DBType)
	}

	slash := strings.LastIndex(c.DSN, "/")
	if slash < 0 {
		return false, fmt.Errorf("failed to parse DSN: %s", c.DSN)
	}

	connectionInfo := c.DSN[:slash]
	dbInfo := c.DSN[slash+1:]

	// Split "dbname?params" into the name and the (optional) "?params" suffix.
	dbName, params := dbInfo, ""
	if queryIndex := strings.Index(dbInfo, "?"); queryIndex != -1 {
		dbName, params = dbInfo[:queryIndex], dbInfo[queryIndex:]
	}
	if dbName == "" {
		return false, fmt.Errorf("failed to parse database name from DSN: %s", c.DSN)
	}

	connectionWithoutDB := connectionInfo + "/" + params

	dialector := mysql.Open(connectionWithoutDB)

	gconfig := &gorm.Config{
		NamingStrategy: schema.NamingStrategy{
			TablePrefix:   c.TablePrefix,
			SingularTable: true,
		},
		Logger: gormLogger,
	}

	db, err := gorm.Open(dialector, gconfig)
	if err != nil {
		return false, fmt.Errorf("failed to open database: %v", err)
	}
	// This handle is only used for the lookup; release its pool afterwards.
	if sqlDB, dbErr := db.DB(); dbErr == nil {
		defer sqlDB.Close()
	}

	var databases []string
	query := genQuery(c)
	if err := db.Raw(query).Scan(&databases).Error; err != nil {
		return false, fmt.Errorf("failed to query: %v", err)
	}

	for _, database := range databases {
		if database == dbName {
			return true, nil
		}
	}

	return false, nil
}
// genQuery returns the SQL statement that lists database names for c.DBType
// (compared case-insensitively). It returns an empty string for sqlite and
// for any unrecognised type.
func genQuery(c DBConfig) string {
	listQueries := map[string]string{
		"mysql":    "SHOW DATABASES",
		"postgres": "SELECT datname FROM pg_database",
	}
	// A missing key yields the zero value "", which covers sqlite and unknown types.
	return listQueries[strings.ToLower(c.DBType)]
}
// New creates a gorm.DB instance for the given configuration.
//
// If the database does not exist it is created first. The schema and seed
// data are installed through DataBaseInit when the database was just created,
// or when it already existed but has no "users" table (for example an empty
// database created by hand). Exactly one connection pool is opened, and it is
// the one returned to the caller.
//
// For non-sqlite backends the pool limits from c (MaxIdleConns, MaxOpenConns,
// MaxLifetime in seconds) are applied. When c.Debug is set the returned
// handle has gorm debug mode enabled.
func New(c DBConfig) (*gorm.DB, error) {
	var dialector gorm.Dialector
	sqliteUsed := false

	switch strings.ToLower(c.DBType) {
	case "mysql":
		dialector = mysql.Open(c.DSN)
	case "postgres":
		dialector = postgres.Open(c.DSN)
	case "sqlite":
		dialector = sqlite.Open(c.DSN)
		sqliteUsed = true
	default:
		return nil, fmt.Errorf("dialector(%s) not supported", c.DBType)
	}

	gconfig := &gorm.Config{
		NamingStrategy: schema.NamingStrategy{
			TablePrefix:   c.TablePrefix,
			SingularTable: true,
		},
		Logger: gormLogger,
	}

	dbExist, checkErr := checkDatabaseExist(c)
	if checkErr != nil {
		return nil, checkErr
	}

	if !dbExist {
		fmt.Println("Database not exist, trying to create it")
		if createErr := createDatabase(c, gconfig); createErr != nil {
			return nil, fmt.Errorf("failed to create database: %v", createErr)
		}
	}

	// Open a single pool and use it both for initialisation and as the return
	// value; opening a second one would leave the first never closed.
	db, err := gorm.Open(dialector, gconfig)
	if err != nil {
		if !dbExist {
			return nil, fmt.Errorf("failed to reopen database after creation: %v", err)
		}
		return nil, fmt.Errorf("failed to open database: %v", err)
	}

	// A freshly created database always needs initialising. An existing one
	// needs it only when the users table is missing, which covers an empty
	// database the user created themselves.
	needInit := !dbExist
	if dbExist && !db.Migrator().HasTable("users") {
		fmt.Printf("Database exists but user table not found, initializing tables for %s\n", c.DBType)
		needInit = true
	}
	if needInit {
		if err := DataBaseInit(c, db); err != nil {
			return nil, fmt.Errorf("failed to init database: %v", err)
		}
	}

	if c.Debug {
		db = db.Debug()
	}

	sqlDB, err := db.DB()
	if err != nil {
		return nil, err
	}

	if !sqliteUsed {
		sqlDB.SetMaxIdleConns(c.MaxIdleConns)
		sqlDB.SetMaxOpenConns(c.MaxOpenConns)
		sqlDB.SetConnMaxLifetime(time.Duration(c.MaxLifetime) * time.Second)
	}

	return db, nil
}
================================================
FILE: pkg/ormx/types.go
================================================
package ormx
import (
"database/sql/driver"
"encoding/json"
"errors"
"fmt"
)
// JSONObj is raw JSON text stored in a database column. An empty value is
// written to the database as NULL and rendered as `{}` when marshalled.
type JSONObj json.RawMessage

// JSONArr is raw JSON text stored in a database column. An empty value is
// written to the database as NULL and rendered as `[]` when marshalled.
type JSONArr json.RawMessage

// jsonColumnBytes extracts the bytes of a scanned column value.
//
// isNull is true for an untyped nil (SQL NULL). []byte and string values are
// returned as bytes (some drivers hand back varchar columns as string). Any
// other type is rejected with an error.
func jsonColumnBytes(value interface{}) (data []byte, isNull bool, err error) {
	switch v := value.(type) {
	case nil:
		return nil, true, nil
	case []byte:
		return v, false, nil
	case string:
		return []byte(v), false, nil
	default:
		return nil, false, errors.New(fmt.Sprint("Failed to unmarshal JSONB value:", value))
	}
}

// Scan implements sql.Scanner. It accepts nil (SQL NULL, stored as an empty
// value), []byte and string input. Non-NULL input must be valid JSON; on a
// parse error the receiver is reset to empty and the error is returned.
func (j *JSONObj) Scan(value interface{}) error {
	data, isNull, err := jsonColumnBytes(value)
	if err != nil {
		return err
	}
	if isNull {
		*j = nil
		return nil
	}

	// Unmarshalling into a RawMessage both validates the input and copies it,
	// so the receiver does not alias the driver's buffer.
	result := json.RawMessage{}
	err = json.Unmarshal(data, &result)
	*j = JSONObj(result)
	return err
}

// Value implements driver.Valuer. An empty value is stored as NULL; otherwise
// the raw JSON bytes are returned.
func (j JSONObj) Value() (driver.Value, error) {
	if len(j) == 0 {
		return nil, nil
	}
	return json.RawMessage(j).MarshalJSON()
}

// MarshalJSON implements json.Marshaler. It returns the raw bytes, or `{}`
// when the value is empty or starts with a double quote (a JSON string, which
// is not an object).
//
// The receiver is a value so the method is also used when a JSONObj is
// marshalled as a non-addressable value, such as a field of a struct passed
// by value; with a pointer receiver encoding/json would fall back to encoding
// the underlying []byte as a base64 string.
func (j JSONObj) MarshalJSON() ([]byte, error) {
	if len(j) == 0 || j[0] == '"' {
		return []byte(`{}`), nil
	}
	return []byte(j), nil
}

// UnmarshalJSON implements json.Unmarshaler by storing a copy of data. The
// copy is required because the decoder may reuse data after this call returns.
func (j *JSONObj) UnmarshalJSON(data []byte) error {
	*j = append(JSONObj(nil), data...)
	return nil
}

// Scan implements sql.Scanner. It accepts nil (SQL NULL, stored as an empty
// value), []byte and string input. Non-NULL input must be valid JSON; on a
// parse error the receiver is reset to empty and the error is returned.
func (j *JSONArr) Scan(value interface{}) error {
	data, isNull, err := jsonColumnBytes(value)
	if err != nil {
		return err
	}
	if isNull {
		*j = nil
		return nil
	}

	// Unmarshalling into a RawMessage both validates the input and copies it.
	result := json.RawMessage{}
	err = json.Unmarshal(data, &result)
	*j = JSONArr(result)
	return err
}

// Value implements driver.Valuer. An empty value is stored as NULL; otherwise
// the raw JSON bytes are returned.
func (j JSONArr) Value() (driver.Value, error) {
	if len(j) == 0 {
		return nil, nil
	}
	return json.RawMessage(j).MarshalJSON()
}

// MarshalJSON implements json.Marshaler. It returns the raw bytes, or `[]`
// when the value is empty or starts with a double quote (a JSON string, which
// is not an array). The value receiver makes it apply to non-addressable
// values as well.
func (j JSONArr) MarshalJSON() ([]byte, error) {
	if len(j) == 0 || j[0] == '"' {
		return []byte(`[]`), nil
	}
	return []byte(j), nil
}

// UnmarshalJSON implements json.Unmarshaler by storing a copy of data. The
// copy is required because the decoder may reuse data after this call returns.
func (j *JSONArr) UnmarshalJSON(data []byte) error {
	*j = append(JSONArr(nil), data...)
	return nil
}
================================================
FILE: pkg/osx/osx.go
================================================
package osx
import "os"
// GetEnv looks up the environment variable named key. It returns the
// variable's value when it is set (even to an empty string) and fallback
// otherwise.
func GetEnv(key, fallback string) string {
	value, found := os.LookupEnv(key)
	if !found {
		return fallback
	}
	return value
}
================================================
FILE: pkg/parser/calc.go
================================================
package parser
import (
"regexp"
"strings"
"github.com/expr-lang/expr"
"github.com/toolkits/pkg/logger"
)
// defaultFuncMap lists the custom functions that MathCalc adds to every
// expression environment, keyed by the name used inside expressions.
var defaultFuncMap = map[string]interface{}{
"between": between,
}
func MathCalc(s string, data map[string]interface{}) (float64, error) {
m := make(map[string]interface{})
for k, v := range data {
m[cleanStr(k)] = v
}
for k, v := range defaultFuncMap {
m[k] = v
}
// 表达式要求类型一致,否则此处编译会报错
program, err := expr.Compile(cleanStr(s), expr.Env(m))
if err != nil {
return 0, err
}
output, err := expr.Run(program, m)
if err != nil {
return 0, err
}
if result, ok := output.(float64); ok {
return result, nil
} else if result, ok := output.(bool); ok {
if result {
return 1, nil
} else {
return 0, nil
}
} else if result, ok := output.(int); ok {
return float64(result), nil
} else {
return 0, nil
}
}
// Calc evaluates the expression s against data with MathCalc and reports
// whether the numeric result is greater than zero. An evaluation error is
// logged and reported as false.
func Calc(s string, data map[string]interface{}) bool {
	result, calcErr := MathCalc(s, data)
	if calcErr == nil {
		return result > 0
	}

	logger.Errorf("Calc exp:%s data:%v error: %v", s, data, calcErr)
	return false
}
// cleanStr normalises variable references in s: it first applies
// replaceDollarSigns and then removes every remaining "$." sequence.
func cleanStr(s string) string {
	return strings.ReplaceAll(replaceDollarSigns(s), "$.", "")
}
// dollarVarPattern matches "$X." where X is a single upper-case ASCII letter.
// It is compiled once at package initialisation rather than on every call.
var dollarVarPattern = regexp.MustCompile(`\$([A-Z])\.`)

// replaceDollarSigns rewrites every "$X." in s (X being a single upper-case
// ASCII letter) to "X_", e.g. "$A.count" becomes "A_count". Text that does
// not match, such as "$.A" or "$AB.x", is left unchanged.
func replaceDollarSigns(s string) string {
	return dollarVarPattern.ReplaceAllString(s, "${1}_")
}
// Custom expr function.
// between reports whether target lies within the closed interval
// [arr[0], arr[1]]. arr must hold exactly two elements, each a float64 or an
// int; any other length or element type yields false.
func between(target float64, arr []interface{}) bool {
	if len(arr) != 2 {
		return false
	}

	// asFloat converts a supported bound to float64 and flags unsupported types.
	asFloat := func(bound interface{}) (float64, bool) {
		if f, isFloat := bound.(float64); isFloat {
			return f, true
		}
		if n, isInt := bound.(int); isInt {
			return float64(n), true
		}
		return 0, false
	}

	lower, lowerOK := asFloat(arr[0])
	if !lowerOK {
		return false
	}
	upper, upperOK := asFloat(arr[1])
	if !upperOK {
		return false
	}

	return lower <= target && target <= upper
}
// CalcWithRid evaluates the expression s against data with MathCalc and
// reports whether the numeric result is greater than zero. An evaluation
// error is logged together with rid and reported as false.
func CalcWithRid(s string, data map[string]interface{}, rid int64) bool {
	result, calcErr := MathCalc(s, data)
	if calcErr == nil {
		return result > 0
	}

	logger.Errorf("rid:%d exp:%s data:%v error: %v", rid, s, data, calcErr)
	return false
}
================================================
FILE: pkg/parser/calc_test.go
================================================
package parser
import (
"testing"
)
// TestMathCalc is a table-driven test for MathCalc. Each case supplies an
// expression, the variable map it is evaluated against and the expected
// float64 result (compared with exact equality). Cases with wantErr set only
// require that MathCalc returns an error.
func TestMathCalc(t *testing.T) {
tests := []struct {
name string
expr string
data map[string]interface{}
expected float64
wantErr bool
}{
{
name: "Add and Subtract",
expr: "一个 + $.B - $.C",
data: map[string]interface{}{"一个": 1, "$.B": 2, "$.C": 3},
expected: 0,
wantErr: false,
},
{
// Boolean expression: a true result is reported as 1.
name: "Multiply and Divide",
expr: "($A.err_count >0&& $A.err_count <=3)||($B.err_count>0 && $B.err_count <=5)",
data: map[string]interface{}{"$A.err_count": 4, "$B.err_count": 2},
expected: 1,
wantErr: false,
},
{
name: "Subtract and Add",
expr: "$.C - $.D + $.A",
data: map[string]interface{}{"$.A": 5, "$.C": 3, "$.D": 2},
expected: 6,
wantErr: false,
},
{
name: "Divide and Multiply",
expr: "$.B / $.C * $.D",
data: map[string]interface{}{"$.B": 6, "$.C": 2, "$.D": 3},
expected: 9,
wantErr: false,
},
{
name: "Divide and Multiply",
expr: "$.B / $.C * $.D",
data: map[string]interface{}{"$.B": 6, "$.C": 2, "$.D": 3},
expected: 9,
wantErr: false,
},
{
name: "Multiply and Add",
expr: "$.A * $.B + $.C",
data: map[string]interface{}{"$.A": 2, "$.B": 3, "$.C": 4},
expected: 10,
wantErr: false,
},
{
name: "Subtract and Divide",
expr: "$.D - $.A / $.B",
data: map[string]interface{}{"$.D": 10, "$.A": 4, "$.B": 2},
expected: 8,
wantErr: false,
},
{
name: "Add, Subtract and Subtract",
expr: "$.C + $.D - $.A",
data: map[string]interface{}{"$.C": 3, "$.D": 4, "$.A": 5},
expected: 2,
wantErr: false,
},
{
name: "Multiply and Subtract",
expr: "$.B * $.A - $.D",
data: map[string]interface{}{"$.B": 2, "$.A": 3, "$.D": 4},
expected: 2,
wantErr: false,
},
{
name: "Divide and Add",
expr: "$.A / $.B + $.C",
data: map[string]interface{}{"$.A": 4, "$.B": 2, "$.C": 3},
expected: 5,
wantErr: false,
},
{
name: "Add and Multiply",
expr: "$.D + $.A * $.B",
data: map[string]interface{}{"$.D": 1, "$.A": 2, "$.B": 3},
expected: 7,
wantErr: false,
},
{
name: "Divide and Add with Parentheses",
expr: "($A / $B) + ($C * $D)",
data: map[string]interface{}{"$A": 4, "$B": 2, "$C": 1, "$D": 3},
expected: 5.0,
wantErr: false,
},
{
name: "Divide with Parentheses",
expr: "($.A - $.B) / ($.C + $.D)",
data: map[string]interface{}{"$.A": 6, "$.B": 2, "$.C": 3, "$.D": 1},
expected: 1.0,
wantErr: false,
},
{
name: "Add and Multiply with Parentheses",
expr: "($.A + $.B) * ($.C - $.D)",
data: map[string]interface{}{"$.A": 8, "$.B": 2, "$.C": 4, "$.D": 2},
expected: 20,
wantErr: false,
},
{
name: "Divide and Multiply with Parentheses",
expr: "($.A * $.B) / ($.C - $.D)",
data: map[string]interface{}{"$.A": 8, "$.B": 2, "$.C": 4, "$.D": 2},
expected: 8,
wantErr: false,
},
{
name: "Add and Divide with Parentheses",
expr: "$.A + ($.B * $.C) / $.D",
data: map[string]interface{}{"$.A": 1, "$.B": 2, "$.C": 3, "$.D": 4},
expected: 2.5,
wantErr: false,
},
{
name: "Subtract and Multiply with Parentheses",
expr: "($.A + $.B) - ($.C * $.D)",
data: map[string]interface{}{"$.A": 5, "$.B": 2, "$.C": 3, "$.D": 1},
expected: 4,
wantErr: false,
},
{
name: "Multiply and Divide with Parentheses",
expr: "$.A / ($.B - $.C) * $.D",
data: map[string]interface{}{"$.A": 4, "$.B": 3, "$.C": 2, "$.D": 5},
expected: 20.0,
wantErr: false,
},
{
name: "Multiply and Divide with Parentheses 2",
expr: "($.A - $.B) * ($.C / $.D)",
data: map[string]interface{}{"$.A": 3, "$.B": 1, "$.C": 2, "$.D": 4},
expected: 1.0,
wantErr: false,
},
{
name: "Complex expression",
expr: "$.A/$.B*$.D",
data: map[string]interface{}{"$.A": 1, "$.B": 2, "$.C": 3, "$.D": 4},
expected: 2,
wantErr: false,
},
{
name: "Complex expression",
expr: "$.A/$.B*$.C",
data: map[string]interface{}{"$.A": 2, "$.B": 2, "$.C": 2},
expected: 2,
wantErr: false,
},
{
name: "Complex expression",
expr: "$.A/($.B*$.C)",
data: map[string]interface{}{"$.A": 2, "$.B": 2, "$.C": 2},
expected: 0.5,
wantErr: false,
},
{
name: "Addition",
expr: "$.A + $.B",
data: map[string]interface{}{"$.A": 2, "$.B": 3},
expected: 5,
wantErr: false,
},
{
name: "Subtraction",
expr: "$.A - $.B",
data: map[string]interface{}{"$.A": 5, "$.B": 3},
expected: 2,
wantErr: false,
},
{
name: "Multiplication",
expr: "$.A * $.B",
data: map[string]interface{}{"$.A": 4, "$.B": 3},
expected: 12,
wantErr: false,
},
{
name: "Division",
expr: "$.A / $.B",
data: map[string]interface{}{"$.A": 10, "$.B": 2},
expected: 5,
wantErr: false,
},
{
name: "Mixed operations",
expr: "($.A + $.B) * ($.C - $.D)",
data: map[string]interface{}{"$.A": 1, "$.B": 2, "$.C": 5, "$.D": 3},
expected: 6, // Corrected from 9 to 6
wantErr: false,
},
{
name: "Parentheses",
expr: "($.A + $.B) / ($.C - $.D)",
data: map[string]interface{}{"$.A": 6, "$.B": 4, "$.C": 10, "$.D": 2},
expected: 1.25, // Corrected from 2.5 to 1.25
wantErr: false,
},
{
// Mixes float64 and int operands in one expression.
name: "Add and Multiply with Parentheses for float64 and int",
expr: "($.A + $.B) * ($.C - $.D)",
data: map[string]interface{}{"$.A": 8.0, "$.B": 2.0, "$.C": 4.0, "$.D": 2},
expected: 20,
wantErr: false,
},
{
// Mixes float64 and int operands in one expression.
name: "Divide and Multiply with Parentheses for float64 and int",
expr: "($.A * $.B) / ($.C - $.D)",
data: map[string]interface{}{"$.A": 8, "$.B": 2, "$.C": 4.0, "$.D": 2},
expected: 8,
wantErr: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
// Run the MathCalc function
result, err := MathCalc(tc.expr, tc.data)
// Check for expected errors
if tc.wantErr {
if err == nil {
t.Errorf("Expected an error for expr '%s', but got none:%v", tc.expr, result)
}
return
}
// If an error is not expected, but occurs, fail the test
if err != nil {
t.Fatalf("Unexpected error for expr '%s' data:%v err:%v", tc.expr, tc.data, err)
}
// Compare the expected result with the actual result
if result != tc.expected {
t.Errorf("Expected result for expr '%s' to be %v, got %v", tc.expr, tc.expected, result)
}
})
}
}
func TestCalc(t *testing.T) {
tests := []struct {
name string
expr string
data map[string]interface{}
expected bool
}{
{
name: "Greater than - true",
expr: "$.A > $.B",
data: map[string]interface{}{"$.A": 5, "$.B": 3},
expected: true,
},
{
name: "Multiply and Subtract with Parentheses",
expr: "$A.yesterday_rate > 0.1 && $A.last_week_rate>0.1 or ($A.今天 >300 || $A.昨天>300 || $A.上周今天 > 300)",
data: map[string]interface{}{"$A.yesterday_rate": 0.1, "$A.last_week_rate": 2, "$A.今天": 200.4, "$A.昨天": 200.4, "$A.上周今天": 200.4},
expected: false,
},
{
name: "Count Greater Than Zero with Code",
expr: "$A.count > 0",
data: map[string]interface{}{"$A.count": 197, "$A.code": 30000},
expected: true,
},
{
name: "Today, Yesterday, and Lastweek Rate Comparison",
expr: "$A.todayRate<0.3 && $A.yesterdayRate<0.3 && $A.lastweekRate<0.3",
data: map[string]interface{}{"$A.todayRate": 1.1, "$A.yesterdayRate": 0.8, "$A.lastweekRate": 1.2},
expected: false,
},
{
name: "Today, Yesterday, and Lastweek Rate Low Threshold",
expr: "$A.todayRate<0.1 && $A.yesterdayRate<0.1 && $A.lastweekRate<0.1",
data: map[string]interface{}{"$A.todayRate": 0.9, "$A.yesterdayRate": 0.8, "$A.lastweekRate": 0.9},
expected: false,
},
{
name: "Agent Specific Today, Yesterday, and Lastweek Rate Comparison",
expr: "$A.agent == 11 && $A.todayRate<0.3 && $A.yesterdayRate<0.3 && $A.lastweekRate<0.3",
data: map[string]interface{}{"$A.agent": 11, "$A.todayRate": 0.9, "$A.yesterdayRate": 0.9, "$A.lastweekRate": 1},
expected: false,
},
{
name: "Today, Yesterday, and Lastweek Rate Below 0.1 - Case 1",
expr: "$A<0.1 && $A.yesterdayRate<0.1 && $A.lastweekRate<0.1",
data: map[string]interface{}{"$A": 0.8, "$A.yesterdayRate": 0.9, "$A.lastweekRate": 0.9},
expected: false,
},
{
name: "Today, Yesterday, and Lastweek Rate Below 0.1 - Case 2",
expr: "$A.today_rate<0.1 && $A.yesterday_rate<0.1 && $A.lastweek_rate<0.1",
data: map[string]interface{}{"$A.today_rate": 0.9, "$A.yesterday_rate": 0.9, "$A.lastweek_rate": 0.9},
expected: false,
},
{
name: "Today, Yesterday, and Lastweek Rate Below 0.1 - Case 3",
expr: "$B.today_rate<0.1 && $A.yesterday_rate<0.1 && $A.lastweek_rate<0.1",
data: map[string]interface{}{"$B.today_rate": 0.5, "$A.yesterday_rate": 0.9, "$A.lastweek_rate": 0.8},
expected: false,
},
{
name: "Yesterday and Byesterday Rates Logical Conditions - Case 1",
expr: "($A.yesterday_rate > 2 && $A.byesterday_rate > 2) or ($A.yesterday_rate <= 0.7 && $A.byesterday_rate <= 0.7)",
data: map[string]interface{}{"$A.yesterday_rate": 3, "$A.byesterday_rate": 3},
expected: true,
},
{
name: "Yesterday and Byesterday Rates Higher Thresholds - Case 1",
expr: "($A.yesterday_rate > 1.5 && $A.byesterday_rate > 1.5) or ($A.yesterday_rate <= 0.8 && $A.byesterday_rate <= 0.8)",
data: map[string]interface{}{"$A.yesterday_rate": 1.08, "$A.byesterday_rate": 1.02},
expected: false,
},
{
name: "Greater than - false",
expr: "($A.yesterday_rate > 1.0 && $A.byesterday_rate > 1.0 ) or ($A.yesterday_rate <= 0.9 && $A.byesterday_rate <= 0.9)",
data: map[string]interface{}{"$A.byesterday_rate": 0.33, "$A.yesterday_rate": 2},
expected: false,
},
{
name: "Less than - true",
expr: "$A.count > 100 or $A.count2 > -3",
data: map[string]interface{}{"$A.count": 5, "$A.count2": -1, "$.D": 2},
expected: true,
},
{
name: "Less than - false",
expr: "$.A < $.B/$.B*4",
data: map[string]interface{}{"$.A": 5, "$.B": 3},
expected: false,
},
{
name: "Greater than or equal - true",
expr: "$.A >= $.B",
data: map[string]interface{}{"$.A": 3, "$.B": 3},
expected: true,
},
{
name: "Less than or equal - true",
expr: "$.A <= $.B",
data: map[string]interface{}{"$.A": 2, "$.B": 2},
expected: true,
},
{
name: "Not equal - true",
expr: "$.A != $.B",
data: map[string]interface{}{"$.A": 3, "$.B": 2},
expected: true,
},
{
name: "Not equal - false",
expr: "$.A != $.B",
data: map[string]interface{}{"$.A": 2, "$.B": 2},
expected: false,
},
{
name: "Addition resulting in true",
expr: "$.A + $.B > $.C",
data: map[string]interface{}{"$.A": 3, "$.B": 2, "$.C": 4},
expected: true,
},
{
name: "Subtraction resulting in false",
expr: "$.A - $.B < $.C",
data: map[string]interface{}{"$.A": 1, "$.B": 3, "$.C": 1},
expected: true,
},
{
name: "Multiplication resulting in true",
expr: "$.A * $.B > $.C",
data: map[string]interface{}{"$.A": 2, "$.B": 3, "$.C": 5},
expected: true,
},
{
name: "Division resulting in false",
expr: "$.A / $.B*$.C < $.C",
data: map[string]interface{}{"$.A": 4, "$.B": 2, "$.C": 2},
expected: false,
},
{
name: "Addition with parentheses resulting in true",
expr: "($.A + $.B) > $.C && $.A >0",
data: map[string]interface{}{"$.A": 1, "$.B": 4, "$.C": 4},
expected: true,
},
{
name: "Addition with parentheses resulting in true",
expr: "($.A + $.B) > $.C || $.A < 0",
data: map[string]interface{}{"$.A": 1, "$.B": 4, "$.C": 4},
expected: true,
},
{
name: "Complex expression with parentheses resulting in false",
expr: "($.A + $.B) * $.C < $.D",
data: map[string]interface{}{"$.A": 1, "$.B": 2, "$.C": 3, "$.D": 10},
expected: true,
},
{
name: "Nested parentheses resulting in true",
expr: "($.A + ($.B - $.C)) * $.D > $.E",
data: map[string]interface{}{"$.A": 2, "$.B": 5, "$.C": 2, "$.D": 2, "$.E": 8},
expected: true,
},
{
name: "Division with parentheses resulting in false",
expr: " ( true || false ) && true",
data: map[string]interface{}{"$A": 673601, "$A.": 673601, "$B": 250218, "$C": 456513, "$C.": 456513, "$D": 456513, "$D.": 456513},
expected: true,
},
// $A:673601.5 $A.:673601.5 $B:361520 $B.:361520 $C:456513 $C.:456513 $D:422634 $D.:422634]
{
name: "Greater than or equal for string - true",
expr: "$.A >= $.B",
data: map[string]interface{}{"$.A": "123", "$.B": "123"},
expected: true,
},
{
name: "Less than or equal - true",
expr: "$.A <= $.B",
data: map[string]interface{}{"$.A": "abc", "$.B": "abc"},
expected: true,
},
{
name: "Not equal - true",
expr: "$.A != $.B",
data: map[string]interface{}{"$.A": "abcde", "$.B": "abcdf"},
expected: true,
},
{
name: "Not equal - false",
expr: "$.A != $.B",
data: map[string]interface{}{"$.A": "!@#$qwer1234", "$.B": "!@#$qwer1234"},
expected: false,
},
{
name: "In operation for string resulting in false",
expr: `$.A in ["admin", "moderator"]`,
data: map[string]interface{}{"$.A": "admin1"},
expected: false,
},
{
name: "In operation for string resulting in true",
expr: `$.A in ["admin", "moderator"]`,
data: map[string]interface{}{"$.A": "admin"},
expected: true,
},
{
name: "In operation for int resulting in false",
expr: `$.A not in [1, 2, 3]`,
data: map[string]interface{}{"$.A": 2},
expected: false,
},
{
name: "In operation for int resulting in true",
expr: `$.A not in [1, 2, 3]`,
data: map[string]interface{}{"$.A": 5},
expected: true,
},
{
name: "Contains operation resulting in true",
expr: `$.A contains $.B`,
data: map[string]interface{}{"$.A": "hello world", "$.B": "world"},
expected: true,
},
{
name: "Contains operation resulting in false",
expr: `$.A contains $.B`,
data: map[string]interface{}{"$.A": "hello world", "$.B": "go"},
expected: false,
},
{
name: "Contains operation resulting in false",
expr: `$.A not contains $.B`,
data: map[string]interface{}{"$.A": "hello world", "$.B": "world"},
expected: false,
},
{
name: "Contains operation resulting in true",
expr: `$.A not contains $.B`,
data: map[string]interface{}{"$.A": "hello world", "$.B": "go"},
expected: true,
},
{
name: "regex operation resulting in true",
expr: `$.A matches $.B`,
data: map[string]interface{}{"$.A": "123", "$.B": "^[0-9]+$"},
expected: true,
},
{
name: "regex operation resulting in false",
expr: `$.A matches $.B`,
data: map[string]interface{}{"$.A": "abc", "$.B": "^[0-9]+$"},
expected: false,
},
{
name: "between function resulting in true",
expr: `between($.A, [100,200])`,
data: map[string]interface{}{"$.A": 155.0},
expected: true,
},
{
name: "between function resulting in false",
expr: `not between($.A, [100.3,200.3])`,
data: map[string]interface{}{"$.A": 155.1},
expected: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
result := Calc(tc.expr, tc.data)
if result != tc.expected {
t.Errorf("Expected result for expr '%s' to be %v, got %v", tc.expr, tc.expected, result)
}
})
}
}
================================================
FILE: pkg/poster/post.go
================================================
package poster
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"os"
"strings"
"time"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/toolkits/pkg/logger"
)
// DataResponse is the JSON envelope decoded by GetByUrl and PostByUrl: the
// payload is carried in the "dat" field and an error message in "err".
type DataResponse[T any] struct {
// Dat is the payload decoded from the "dat" field.
Dat T `json:"dat"`
// Err is the "err" field; GetByUrl and PostByUrl treat a non-empty value as a failure.
Err string `json:"err"`
}
// GetByUrls issues a GET for path against the configured center API
// addresses and returns the first successful result.
//
// The first address is picked at random and the remaining ones are tried in
// order, wrapping around, until one call to GetByUrl succeeds. Each failed
// attempt is logged as a warning. If no address is configured, or every
// address fails, the zero value of T is returned together with an error; in
// the latter case the error text carries the last failure.
func GetByUrls[T any](ctx *ctx.Context, path string) (T, error) {
	var (
		result  T
		lastErr error
	)

	centers := ctx.CenterApi.Addrs
	total := len(centers)
	if total == 0 {
		return result, fmt.Errorf("no center api addresses configured")
	}

	// Start at a random address, then walk every address exactly once.
	offset := rand.Intn(total)
	for step := 0; step < total; step++ {
		target := centers[(offset+step)%total] + path
		result, lastErr = GetByUrl[T](target, ctx.CenterApi)
		if lastErr == nil {
			return result, nil
		}
		logger.Warningf("failed to get data from center, url: %s, err: %v", target, lastErr)
	}

	return result, fmt.Errorf("failed to get data from center, path= %s, addrs= %v err: %v", path, centers, lastErr)
}
// GetByUrl performs a single GET against url and decodes the response body
// as a DataResponse[T].
//
// Basic auth is attached when cfg.BasicAuthUser is non-empty. cfg.Timeout is
// interpreted as milliseconds and falls back to 5000 when it is below 1. The
// request goes through ProxyTransporter when UseProxy(url) reports true.
//
// An error is returned when the request cannot be built or sent, the status
// code is not 200, the body cannot be read or decoded, or the decoded
// envelope has a non-empty Err field. In every error case the zero value of
// T is returned.
func GetByUrl[T any](url string, cfg conf.CenterApi) (T, error) {
	var zero T

	if cfg.Timeout < 1 {
		cfg.Timeout = 5000
	}
	client := &http.Client{Timeout: time.Duration(cfg.Timeout) * time.Millisecond}
	if UseProxy(url) {
		client.Transport = ProxyTransporter
	}

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return zero, fmt.Errorf("failed to create request: %w", err)
	}
	if len(cfg.BasicAuthUser) > 0 {
		req.SetBasicAuth(cfg.BasicAuthUser, cfg.BasicAuthPass)
	}

	resp, err := client.Do(req)
	if err != nil {
		return zero, fmt.Errorf("failed to fetch from url: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return zero, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	raw, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return zero, fmt.Errorf("failed to read response body: %w", err)
	}

	var envelope DataResponse[T]
	if err = json.Unmarshal(raw, &envelope); err != nil {
		return zero, fmt.Errorf("failed to decode:%s response: %w", string(raw), err)
	}
	if envelope.Err != "" {
		return zero, fmt.Errorf("error from server: %s", envelope.Err)
	}

	logger.Debugf("get data from %s, data: %+v", url, envelope.Dat)
	return envelope.Dat, nil
}
// PostByUrls posts v as JSON to path on the configured center API addresses
// and returns nil as soon as one address accepts the request.
//
// The first address is picked at random and the remaining ones are tried in
// order, wrapping around. Each failed attempt is logged as a warning. An
// error is returned when no address is configured or when every address
// fails; in the latter case the last underlying error is wrapped so callers
// can see why the final attempt failed (matching GetByUrls and
// PostByUrlsWithResp, which also report it).
func PostByUrls(ctx *ctx.Context, path string, v interface{}) error {
	addrs := ctx.CenterApi.Addrs
	if len(addrs) == 0 {
		return fmt.Errorf("submission of the POST request from the center has failed, "+
			"path= %s, v= %v, ctx.CenterApi.Addrs= %v", path, v, addrs)
	}

	// Start at a random address, then walk every address exactly once.
	startIdx := rand.Intn(len(addrs))

	var lastErr error
	for i := 0; i < len(addrs); i++ {
		idx := (startIdx + i) % len(addrs)
		url := fmt.Sprintf("%s%s", addrs[idx], path)

		if _, lastErr = PostByUrl[interface{}](url, ctx.CenterApi, v); lastErr == nil {
			return nil
		}
		logger.Warningf("failed to post data to center, url: %s, err: %v", url, lastErr)
	}

	return fmt.Errorf("failed to post data to center, path= %s, addrs= %v err: %w", path, addrs, lastErr)
}
// PostByUrlsWithResp posts v as JSON to path on the configured center API
// addresses and returns the decoded payload of the first successful call.
//
// The first address is picked at random and the remaining ones are tried in
// order, wrapping around. Each failed attempt is logged as a warning. An
// error is returned when no address is configured or when every address
// fails; in the latter case the error text carries the last failure.
func PostByUrlsWithResp[T any](ctx *ctx.Context, path string, v interface{}) (t T, err error) {
	centers := ctx.CenterApi.Addrs
	count := len(centers)
	if count < 1 {
		return t, fmt.Errorf("submission of the POST request from the center has failed, "+
			"path= %s, v= %v, ctx.CenterApi.Addrs= %v", path, v, centers)
	}

	// Start at a random address, then walk every address exactly once.
	first := rand.Intn(count)
	for n := 0; n < count; n++ {
		endpoint := fmt.Sprintf("%s%s", centers[(first+n)%count], path)
		if t, err = PostByUrl[T](endpoint, ctx.CenterApi, v); err == nil {
			return t, nil
		}
		logger.Warningf("failed to post data to center, url: %s, err: %v", endpoint, err)
	}

	return t, fmt.Errorf("failed to post data to center, path= %s, addrs= %v err: %v", path, centers, err)
}
// PostByUrl marshals v to JSON, POSTs it to url and decodes the response
// body as a DataResponse[T].
//
// Basic auth is attached when cfg.BasicAuthUser is non-empty. cfg.Timeout is
// interpreted as milliseconds and falls back to 5000 when it is below 1. The
// request goes through ProxyTransporter when UseProxy(url) reports true.
//
// An error is returned when v cannot be marshalled, the request cannot be
// built or sent, the status code is not 200, the body cannot be read or
// decoded, or the decoded envelope has a non-empty Err field.
func PostByUrl[T any](url string, cfg conf.CenterApi, v interface{}) (t T, err error) {
	payload, err := json.Marshal(v)
	if err != nil {
		return t, err
	}

	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
	if err != nil {
		return t, fmt.Errorf("failed to create request %q: %w", url, err)
	}
	req.Header.Set("Content-Type", "application/json")
	if len(cfg.BasicAuthUser) > 0 {
		req.SetBasicAuth(cfg.BasicAuthUser, cfg.BasicAuthPass)
	}

	if cfg.Timeout < 1 {
		cfg.Timeout = 5000
	}
	client := http.Client{Timeout: time.Duration(cfg.Timeout) * time.Millisecond}
	if UseProxy(url) {
		client.Transport = ProxyTransporter
	}

	resp, err := client.Do(req)
	if err != nil {
		return t, fmt.Errorf("failed to fetch from url: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return t, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	raw, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return t, fmt.Errorf("failed to read response body: %w", err)
	}

	var envelope DataResponse[T]
	if err = json.Unmarshal(raw, &envelope); err != nil {
		return t, fmt.Errorf("failed to decode response: %w", err)
	}
	if envelope.Err != "" {
		return t, fmt.Errorf("error from server: %s", envelope.Err)
	}

	logger.Debugf("get data from %s, data: %+v", url, envelope.Dat)
	return envelope.Dat, nil
}
// ProxyTransporter is the transport installed on HTTP clients in this package
// when UseProxy reports true for the target URL. Its proxy is resolved by
// http.ProxyFromEnvironment.
var ProxyTransporter = &http.Transport{
Proxy: http.ProxyFromEnvironment,
}
// UseProxy reports whether url should be sent through the proxy transport.
//
// The decision is driven by the N9E_PROXY_URL environment variable, a
// comma-separated list of substrings, for example:
//
//	N9E_PROXY_URL=oapi.dingtalk.com,feishu.com
//
// Each entry is trimmed of surrounding whitespace and empty entries are
// ignored. The result is true when url contains at least one entry, and
// false when the variable is unset, empty, or nothing matches.
func UseProxy(url string) bool {
	raw := os.Getenv("N9E_PROXY_URL")
	if raw == "" {
		return false
	}

	for _, entry := range strings.Split(raw, ",") {
		if pattern := strings.TrimSpace(entry); pattern != "" && strings.Contains(url, pattern) {
			return true
		}
	}
	return false
}
// PostJSON marshals v to JSON and POSTs it to url with the given client
// timeout, returning the raw response body and the HTTP status code.
//
// retries is optional. When it is supplied, retries[0] is the total number
// of attempts; each failed attempt is logged as a warning and followed by a
// 200ms pause before the next one. A value below 1 is treated as a single
// attempt (previously the request was never sent and the function
// dereferenced a nil response). When retries is omitted the request is sent
// exactly once and failures are not logged here.
//
// The request goes through ProxyTransporter when UseProxy(url) reports true.
// The status code is returned as-is; it is not treated as an error.
func PostJSON(url string, timeout time.Duration, v interface{}, retries ...int) (response []byte, code int, err error) {
	var bs []byte
	bs, err = json.Marshal(v)
	if err != nil {
		return
	}

	client := http.Client{
		Timeout: timeout,
	}
	if UseProxy(url) {
		client.Transport = ProxyTransporter
	}

	req, err := http.NewRequest("POST", url, bytes.NewReader(bs))
	if err != nil {
		return
	}
	req.Header.Set("Content-Type", "application/json")

	retrying := len(retries) > 0
	attempts := 1
	if retrying && retries[0] > 1 {
		attempts = retries[0]
	}

	var resp *http.Response
	for i := 0; i < attempts; i++ {
		if i > 0 {
			time.Sleep(time.Millisecond * 200)
			// The previous attempt may have consumed (and closed) the body;
			// hand the transport a fresh reader over the same payload.
			req.Body = ioutil.NopCloser(bytes.NewReader(bs))
		}

		resp, err = client.Do(req)
		if err == nil {
			break
		}
		if !retrying {
			break
		}

		tryagain := ""
		if i+1 < attempts {
			tryagain = " try again"
		}
		logger.Warningf("failed to curl %s error: %s"+tryagain, url, err)
	}
	if err != nil {
		return
	}

	// On a nil error the client guarantees a non-nil Body.
	defer resp.Body.Close()
	code = resp.StatusCode
	response, err = ioutil.ReadAll(resp.Body)
	return
}
================================================
FILE: pkg/poster/post_test.go
================================================
package poster
import (
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/pkg/ctx"
)
// TestPostByUrls checks that PostByUrls succeeds against a single test
// server that answers every request with an empty DataResponse envelope.
func TestPostByUrls(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		json.NewEncoder(w).Encode(DataResponse[interface{}]{Dat: "", Err: ""})
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	center := conf.CenterApi{Addrs: []string{server.URL}}
	c := &ctx.Context{CenterApi: center}

	err := PostByUrls(c, "/v1/n9e/server-heartbeat", map[string]string{"a": "aa"})
	if err != nil {
		t.Errorf("PostByUrls() error = %v ", err)
	}
}
// TestPostByUrlsWithResp checks that PostByUrlsWithResp returns the "dat"
// payload served by a single test server.
func TestPostByUrlsWithResp(t *testing.T) {
	const want = int64(123)

	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		json.NewEncoder(w).Encode(DataResponse[int64]{Dat: want, Err: ""})
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	c := &ctx.Context{CenterApi: conf.CenterApi{Addrs: []string{server.URL}}}

	got, err := PostByUrlsWithResp[int64](c, "/v1/n9e/event-persist", map[string]string{"b": "bb"})
	if err != nil {
		t.Errorf("PostByUrlsWithResp() error = %v", err)
		return
	}
	if got != want {
		t.Errorf("PostByUrlsWithResp() gotT = %v,expected = %v", got, want)
	}
}
================================================
FILE: pkg/prom/client_option.go
================================================
package prom
// ClientOptions carries per-client settings; NewAPI stores it on the
// underlying API client.
type ClientOptions struct {
// Url is presumably the base URL of the Prometheus-compatible backend — confirm against the client implementation.
Url string
// BasicAuthUser and BasicAuthPass are presumably HTTP basic-auth credentials — confirm against the client implementation.
BasicAuthUser string
BasicAuthPass string
// Headers holds extra header data; NOTE(review): the expected layout of this slice is not visible here.
Headers []string
}
================================================
FILE: pkg/prom/conv.go
================================================
package prom
import (
"fmt"
"sort"
"strings"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/promql/parser"
)
// Metric is a labelled series with its sample values, serialised to JSON
// under the keys "key", "labels" and "values".
type Metric struct {
Key string `json:"key"`
// Labels is the label set; ConvertPromQL renders it (minus the metric name) into selectors.
Labels model.Metric `json:"labels"`
Values []SamplePair `json:"values"`
}
// SamplePair is a single (timestamp, value) sample, serialised to JSON under
// the keys "timestamp" and "value".
type SamplePair struct {
Timestamp model.Time `json:"timestamp"`
Value model.SampleValue `json:"value"`
}
// ConvertPromQL rewrites ql so that every selector found by GetMetric is
// replaced with "<metric name>{<labels of metric>}", where the label part is
// produced by LabelsWithoutMetric(metric.Labels).
//
// The replacement is textual (strings.ReplaceAll on the selector text that
// GetMetric reconstructs). An error is returned, with an empty string, when
// ql cannot be parsed.
func ConvertPromQL(ql string, metric Metric) (string, error) {
	selectors, err := GetMetric(ql)
	if err != nil {
		return "", err
	}

	suffix := LabelsWithoutMetric(metric.Labels)
	for name, selector := range selectors {
		ql = strings.ReplaceAll(ql, selector, name+suffix)
	}
	return ql, nil
}
// AddLabelToPromQL injects label (written as a brace-wrapped matcher list,
// e.g. `{ident="host-1"}`) into every metric selector of promql.
//
// Spaces are stripped from label first. promql is returned unchanged when
// label is empty or when promql cannot be parsed by GetMetric. For each
// metric name found, the rewrite is textual:
//
//   - "name{}"        becomes "name" + label
//   - "name{...}"     gets the new matchers inserted right after the "{"
//   - a bare "name"   becomes "name" + label
func AddLabelToPromQL(label, promql string) string {
	if label == "" {
		return promql
	}

	// Drop every space from the label text.
	label = strings.ReplaceAll(label, " ", "")

	metricNames, err := GetMetric(promql)
	if err != nil {
		return promql
	}

	for name := range metricNames {
		emptySelector := name + "{}"
		openSelector := name + "{"

		switch {
		case strings.Contains(promql, emptySelector):
			// "name{}" -> "name{new}"
			promql = strings.ReplaceAll(promql, emptySelector, name+label)
		case strings.Contains(promql, openSelector):
			// "name{a="b"}" -> "name{new,a="b"}": strip the closing brace
			// from the label and splice it in ahead of the existing matchers.
			head := strings.ReplaceAll(label, "}", "")
			promql = strings.ReplaceAll(promql, openSelector, name+head+",")
		default:
			// Bare "name" -> "name{new}"
			promql = strings.ReplaceAll(promql, name, name+label)
		}
	}
	return promql
}
// GetMetric parses ql and returns a map from metric name to the selector
// text rebuilt from its matchers.
//
// For every selector extracted from the parsed expression, the "__name__"
// matcher supplies the metric name and every other matcher is rendered as
// name + operator + quoted value. The map value is "metric{m1,m2,...}" when
// there are such matchers and just "metric" otherwise. On a parse error the
// (empty) map is returned together with the error.
func GetMetric(ql string) (map[string]string, error) {
	metrics := make(map[string]string)

	expr, err := parser.ParseExpr(ql)
	if err != nil {
		return metrics, err
	}

	for _, matchers := range parser.ExtractSelectors(expr) {
		var (
			name  string
			parts []string
		)
		for _, m := range matchers {
			if m.Name == "__name__" {
				name = m.Value
				continue
			}
			parts = append(parts, m.Name+m.Type.String()+"\""+m.Value+"\"")
		}

		if len(parts) == 0 {
			metrics[name] = name
			continue
		}
		metrics[name] = name + "{" + strings.Join(parts, ",") + "}"
	}
	return metrics, nil
}
// LabelsWithoutMetric renders labels as a brace-wrapped matcher list,
// skipping the metric-name label.
//
// Each remaining label is formatted as name="value" (value quoted with %q);
// the entries are sorted and joined with ", ". When nothing remains the
// result is "{}".
func LabelsWithoutMetric(labels model.Metric) string {
	pairs := make([]string, 0, len(labels))
	for name, value := range labels {
		if name == model.MetricNameLabel {
			continue
		}
		pairs = append(pairs, fmt.Sprintf(`%s=%q`, name, value))
	}

	if len(pairs) == 0 {
		return "{}"
	}

	sort.Strings(pairs)
	return fmt.Sprintf("{%s}", strings.Join(pairs, ", "))
}
================================================
FILE: pkg/prom/conv_test.go
================================================
package prom
import (
"testing"
)
// TestAddLabelToPromQL is a table-driven test for AddLabelToPromQL. Each case
// supplies a label, an input PromQL string and the exact string expected
// back; every case runs as its own subtest.
func TestAddLabelToPromQL(t *testing.T) {
testCases := []struct {
name string
label string
promql string
expected string
}{
// Multi-line query mixing a selector that already has matchers with bare metric names.
{
name: "Add label to PromQL without existing labels",
label: "{ident=\"dev-backup-01\"}",
promql: "sum(\n irate(container_cpu_usage_seconds_total{image!=\"\", image!~\".*pause.*\"}[3m])\n) by (pod,namespace,container,image)\n/\nsum(\n container_spec_cpu_quota/container_spec_cpu_period\n) by (pod,namespace,container,image)",
expected: "sum(\n irate(container_cpu_usage_seconds_total{ident=\"dev-backup-01\",image!=\"\", image!~\".*pause.*\"}[3m])\n) by (pod,namespace,container,image)\n/\nsum(\n container_spec_cpu_quota{ident=\"dev-backup-01\"}/container_spec_cpu_period{ident=\"dev-backup-01\"}\n) by (pod,namespace,container,image)",
},
// Empty selector braces are replaced by the label.
{
name: "Add label to PromQL without existing labels",
label: "{new_label=\"value\"}",
promql: "metric_name{}",
expected: "metric_name{new_label=\"value\"}",
},
// An empty label leaves the query untouched.
{
name: "Add label to PromQL without existing labels",
label: "",
promql: "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"idle\"}[2m]) ) * 100",
expected: "avg without (mode,cpu) ( irate(node_cpu_seconds_total{mode=\"idle\"}[2m]) ) * 100",
},
// A bare metric name gets the label appended.
{
name: "Add label to PromQL without existing labels",
label: "{new_label=\"value\"}",
promql: "metric_name",
expected: "metric_name{new_label=\"value\"}",
},
// The new matcher is inserted ahead of the existing one.
{
name: "Add label to PromQL with existing labels",
label: "{new_label=\"value\"}",
promql: "metric_name{existing_label=\"value\"}",
expected: "metric_name{new_label=\"value\",existing_label=\"value\"}",
},
// Spaces inside the label are stripped before it is applied.
{
name: "Add label with spaces to PromQL",
label: "{ new_label = \"value\" }",
promql: "metric_name",
expected: "metric_name{new_label=\"value\"}",
},
// Every metric in a binary expression is rewritten.
{
name: "Add label to PromQL with multiple metrics",
label: "{new_label=\"value\"}",
promql: "metric1 + metric2{existing_label=\"value\"}",
expected: "metric1{new_label=\"value\"} + metric2{new_label=\"value\",existing_label=\"value\"}",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
result := AddLabelToPromQL(tc.label, tc.promql)
if result != tc.expected {
t.Errorf("Expected: %s, Got: %s", tc.expected, result)
}
})
}
}
================================================
FILE: pkg/prom/reader.go
================================================
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package prom provides bindings to the Prometheus HTTP API v1:
// http://prometheus.io/docs/querying/api/
package prom
import (
"context"
"errors"
"fmt"
"math"
"net/http"
"net/url"
"strconv"
"strings"
"time"
"unsafe"
json "github.com/json-iterator/go"
"github.com/prometheus/common/model"
"github.com/prometheus/client_golang/api"
)
// init registers the custom json-iterator encoder and decoder for
// model.SamplePair (marshalPointJSON / unMarshalPointJSON) so that samples
// are written and read as [timestamp, "value"] arrays.
func init() {
json.RegisterTypeEncoderFunc("model.SamplePair", marshalPointJSON, marshalPointJSONIsEmpty)
json.RegisterTypeDecoderFunc("model.SamplePair", unMarshalPointJSON)
}
// unMarshalPointJSON is the json-iterator decoder for model.SamplePair. It
// expects a two-element array: a numeric timestamp followed by the value
// encoded as a string. Any deviation is reported through iter.ReportError
// and decoding of the pair stops.
func unMarshalPointJSON(ptr unsafe.Pointer, iter *json.Iterator) {
p := (*model.SamplePair)(ptr)
// Enter the array; a false result means there is no first element.
if !iter.ReadArray() {
iter.ReportError("unmarshal model.SamplePair", "SamplePair must be [timestamp, value]")
return
}
// First element: the timestamp, read as a raw number and parsed by model.Time.
t := iter.ReadNumber()
if err := p.Timestamp.UnmarshalJSON([]byte(t)); err != nil {
iter.ReportError("unmarshal model.SamplePair", err.Error())
return
}
// Advance to the second element.
if !iter.ReadArray() {
iter.ReportError("unmarshal model.SamplePair", "SamplePair missing value")
return
}
// Second element: the sample value, a string parsed as a 64-bit float.
f, err := strconv.ParseFloat(iter.ReadString(), 64)
if err != nil {
iter.ReportError("unmarshal model.SamplePair", err.Error())
return
}
p.Value = model.SampleValue(f)
// The array must end here; a third element is an error.
if iter.ReadArray() {
iter.ReportError("unmarshal model.SamplePair", "SamplePair has too many values, must be [timestamp, value]")
return
}
}
// marshalPointJSON is the json-iterator encoder for model.SamplePair. It
// writes the pair as [timestamp, "value"], where the timestamp is the
// integer p.Timestamp divided by 1000 with up to three fractional digits and
// the value is a quoted float.
func marshalPointJSON(ptr unsafe.Pointer, stream *json.Stream) {
p := *((*model.SamplePair)(ptr))
stream.WriteArrayStart()
// Write out the timestamp as a float divided by 1000.
// This is ~3x faster than converting to a float.
t := int64(p.Timestamp)
// Emit the sign separately so the integer and fractional parts below are non-negative.
if t < 0 {
stream.WriteRaw(`-`)
t = -t
}
stream.WriteInt64(t / 1000)
fraction := t % 1000
if fraction != 0 {
stream.WriteRaw(`.`)
// Left-pad the fraction with zeros to three digits.
if fraction < 100 {
stream.WriteRaw(`0`)
}
if fraction < 10 {
stream.WriteRaw(`0`)
}
stream.WriteInt64(fraction)
}
stream.WriteMore()
stream.WriteRaw(`"`)
// Taken from https://github.com/json-iterator/go/blob/master/stream_float.go#L71 as a workaround
// to https://github.com/json-iterator/go/issues/365 (jsoniter, to follow json standard, doesn't allow inf/nan)
buf := stream.Buffer()
abs := math.Abs(float64(p.Value))
// fmt here is a local format byte for strconv.AppendFloat; it shadows the fmt package inside this function.
fmt := byte('f')
// NOTE(review): the note copied from upstream mentions float32 comparisons, but the
// comparisons below are done on a float64 value.
// Switch to exponent notation for very small or very large magnitudes.
if abs != 0 {
if abs < 1e-6 || abs >= 1e21 {
fmt = 'e'
}
}
buf = strconv.AppendFloat(buf, float64(p.Value), fmt, -1, 64)
stream.SetBuffer(buf)
stream.WriteRaw(`"`)
stream.WriteArrayEnd()
}
// marshalPointJSONIsEmpty is the emptiness callback registered alongside
// marshalPointJSON; it always reports false, so a SamplePair is never
// considered empty.
func marshalPointJSONIsEmpty(ptr unsafe.Pointer) bool {
return false
}
// HTTP status code and endpoint paths used by this client. Every ep* path is
// built from apiPrefix.
const (
statusAPIError = 422
apiPrefix = "/api/v1"
epAlerts = apiPrefix + "/alerts"
epAlertManagers = apiPrefix + "/alertmanagers"
epQuery = apiPrefix + "/query"
epQueryRange = apiPrefix + "/query_range"
epLabels = apiPrefix + "/labels"
// ":name" is a placeholder filled in from the args map passed to the client's URL method (see LabelValues).
epLabelValues = apiPrefix + "/label/:name/values"
epSeries = apiPrefix + "/series"
epTargets = apiPrefix + "/targets"
epTargetsMetadata = apiPrefix + "/targets/metadata"
epMetadata = apiPrefix + "/metadata"
epRules = apiPrefix + "/rules"
epSnapshot = apiPrefix + "/admin/tsdb/snapshot"
epDeleteSeries = apiPrefix + "/admin/tsdb/delete_series"
epCleanTombstones = apiPrefix + "/admin/tsdb/clean_tombstones"
epConfig = apiPrefix + "/status/config"
epFlags = apiPrefix + "/status/flags"
)
// AlertState models the state of an alert (see the AlertState* constants).
type AlertState string
// ErrorType models the different API error types (see the Err* constants).
type ErrorType string
// HealthStatus models the health status of a scrape target (see the Health* constants).
type HealthStatus string
// RuleType models the type of a rule (see the RuleType* constants).
type RuleType string
// RuleHealth models the health status of a rule.
type RuleHealth string
// MetricType models the type of a metric (see the MetricType* constants).
type MetricType string
// Enumerated values for the string types declared in this file.
const (
// Possible values for AlertState.
AlertStateFiring AlertState = "firing"
AlertStateInactive AlertState = "inactive"
AlertStatePending AlertState = "pending"
// Possible values for ErrorType.
ErrBadData ErrorType = "bad_data"
ErrTimeout ErrorType = "timeout"
ErrCanceled ErrorType = "canceled"
ErrExec ErrorType = "execution"
ErrBadResponse ErrorType = "bad_response"
ErrServer ErrorType = "server_error"
ErrClient ErrorType = "client_error"
// Possible values for HealthStatus.
HealthGood HealthStatus = "up"
HealthUnknown HealthStatus = "unknown"
HealthBad HealthStatus = "down"
// Possible values for RuleType.
RuleTypeRecording RuleType = "recording"
RuleTypeAlerting RuleType = "alerting"
// Possible values for RuleHealth.
// NOTE(review): unlike the other groups these three are untyped string
// constants, not RuleHealth values.
RuleHealthGood = "ok"
RuleHealthUnknown = "unknown"
RuleHealthBad = "err"
// Possible values for MetricType
MetricTypeCounter MetricType = "counter"
MetricTypeGauge MetricType = "gauge"
MetricTypeHistogram MetricType = "histogram"
MetricTypeGaugeHistogram MetricType = "gaugehistogram"
MetricTypeSummary MetricType = "summary"
MetricTypeInfo MetricType = "info"
MetricTypeStateset MetricType = "stateset"
MetricTypeUnknown MetricType = "unknown"
)
// Error is an error returned by the API. It carries the error category, a
// message and an optional detail string.
type Error struct {
	Type   ErrorType
	Msg    string
	Detail string
}

// Error implements the error interface. The text is "<Type>: <Msg>"; Detail
// is not included.
func (e *Error) Error() string {
	kind, msg := e.Type, e.Msg
	return fmt.Sprintf("%s: %s", kind, msg)
}
// Range represents a sliced time range.
type Range struct {
	// Start and End are the boundaries of the time range.
	Start, End time.Time
	// Step is the maximum time between two slices within the boundaries.
	Step time.Duration
}

const (
	// DefaultStep is the step Validate installs when Step is not positive.
	DefaultStep = 30 * time.Second
	// MaxPoints is the largest number of steps Validate allows in a range.
	MaxPoints = 30000
)

// Validate normalises the range in place and reports whether it is usable.
//
// A non-positive Step is replaced with DefaultStep. The range is rejected
// (false) unless End is strictly after Start. Otherwise Step is widened
// until (End-Start)/Step no longer exceeds MaxPoints: steps below one second
// grow tenfold, steps below one hour grow sixtyfold, and anything larger
// doubles.
func (r *Range) Validate() bool {
	if r.Step <= 0 {
		r.Step = DefaultStep
	}

	if !r.End.After(r.Start) {
		return false
	}

	span := r.End.Sub(r.Start)
	for span/r.Step > MaxPoints {
		switch {
		case r.Step < time.Second:
			r.Step *= 10
		case r.Step < time.Hour:
			r.Step *= 60
		default:
			r.Step *= 2
		}
	}
	return true
}
// API provides bindings for Prometheus's v1 API. Every method takes a
// context as its first argument; NewAPI returns the HTTP-backed
// implementation.
type API interface {
// Alerts returns a list of all active alerts.
Alerts(ctx context.Context) (AlertsResult, error)
// AlertManagers returns an overview of the current state of the Prometheus alert manager discovery.
AlertManagers(ctx context.Context) (AlertManagersResult, error)
// CleanTombstones removes the deleted data from disk and cleans up the existing tombstones.
CleanTombstones(ctx context.Context) error
// Config returns the current Prometheus configuration.
Config(ctx context.Context) (ConfigResult, error)
// DeleteSeries deletes data for a selection of series in a time range.
DeleteSeries(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) error
// Flags returns the flag values that Prometheus was launched with.
Flags(ctx context.Context) (FlagsResult, error)
// LabelNames returns all the unique label names present in the block in sorted order.
LabelNames(ctx context.Context) ([]string, Warnings, error)
// LabelValues performs a query for the values of the given label.
LabelValues(ctx context.Context, label string, matches []string) (model.LabelValues, Warnings, error)
// Query performs a query for the given time.
Query(ctx context.Context, query string, ts time.Time) (model.Value, Warnings, error)
// QueryRange performs a query for the given range.
QueryRange(ctx context.Context, query string, r Range) (model.Value, Warnings, error)
// Series finds series by label matchers.
Series(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) ([]model.LabelSet, Warnings, error)
// Snapshot creates a snapshot of all current data in a directory under
// snapshots/ in the TSDB's data directory and returns the directory as response.
Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, error)
// Rules returns a list of alerting and recording rules that are currently loaded.
Rules(ctx context.Context) (RulesResult, error)
// Targets returns an overview of the current state of the Prometheus target discovery.
Targets(ctx context.Context) (TargetsResult, error)
// TargetsMetadata returns metadata about metrics currently scraped by the target.
TargetsMetadata(ctx context.Context, matchTarget string, metric string, limit string) ([]MetricMetadata, error)
// Metadata returns metadata about metrics currently scraped by the metric name.
Metadata(ctx context.Context, metric string, limit string) (map[string][]Metadata, error)
}
// AlertsResult contains the result from querying the alerts endpoint.
type AlertsResult struct {
// Alerts is decoded from the "alerts" key.
Alerts []Alert `json:"alerts"`
}
// AlertManagersResult contains the result from querying the alertmanagers endpoint.
type AlertManagersResult struct {
// Active is decoded from the "activeAlertManagers" key.
Active []AlertManager `json:"activeAlertManagers"`
// Dropped is decoded from the "droppedAlertManagers" key.
Dropped []AlertManager `json:"droppedAlertManagers"`
}
// AlertManager models a configured Alert Manager.
type AlertManager struct {
URL string `json:"url"`
}
// ConfigResult contains the result from querying the config endpoint.
type ConfigResult struct {
// YAML is decoded from the "yaml" key.
YAML string `json:"yaml"`
}
// FlagsResult contains the result from querying the flag endpoint, as a
// string-to-string map.
type FlagsResult map[string]string
// SnapshotResult contains the result from querying the snapshot endpoint.
type SnapshotResult struct {
Name string `json:"name"`
}
// RulesResult contains the result from querying the rules endpoint.
type RulesResult struct {
Groups []RuleGroup `json:"groups"`
}
// RuleGroup models a rule group that contains a set of recording and alerting rules.
// It has a custom UnmarshalJSON that decodes each entry of "rules" into an
// AlertingRule or a RecordingRule.
type RuleGroup struct {
Name string `json:"name"`
File string `json:"file"`
Interval float64 `json:"interval"`
Rules Rules `json:"rules"`
}
// Rules is a heterogeneous list of rules; RuleGroup.UnmarshalJSON fills it
// with AlertingRule and RecordingRule values.
type Rules []interface{}
// AlertingRule models a alerting rule.
type AlertingRule struct {
Name string `json:"name"`
Query string `json:"query"`
Duration float64 `json:"duration"`
Labels model.LabelSet `json:"labels"`
Annotations model.LabelSet `json:"annotations"`
Alerts []*Alert `json:"alerts"`
Health RuleHealth `json:"health"`
LastError string `json:"lastError,omitempty"`
}
// RecordingRule models a recording rule.
type RecordingRule struct {
Name string `json:"name"`
Query string `json:"query"`
Labels model.LabelSet `json:"labels,omitempty"`
Health RuleHealth `json:"health"`
LastError string `json:"lastError,omitempty"`
}
// Alert models an active alert. Only ActiveAt carries an explicit JSON tag;
// the remaining fields use their Go names.
type Alert struct {
ActiveAt time.Time `json:"activeAt"`
Annotations model.LabelSet
Labels model.LabelSet
State AlertState
Value string
}
// TargetsResult contains the result from querying the targets endpoint.
type TargetsResult struct {
// Active is decoded from the "activeTargets" key.
Active []ActiveTarget `json:"activeTargets"`
// Dropped is decoded from the "droppedTargets" key.
Dropped []DroppedTarget `json:"droppedTargets"`
}
// ActiveTarget models an active Prometheus scrape target.
type ActiveTarget struct {
DiscoveredLabels map[string]string `json:"discoveredLabels"`
Labels model.LabelSet `json:"labels"`
ScrapeURL string `json:"scrapeUrl"`
LastError string `json:"lastError"`
LastScrape time.Time `json:"lastScrape"`
Health HealthStatus `json:"health"`
}
// DroppedTarget models a dropped Prometheus scrape target.
type DroppedTarget struct {
DiscoveredLabels map[string]string `json:"discoveredLabels"`
}
// MetricMetadata models the metadata of a metric with its scrape target and name.
type MetricMetadata struct {
Target map[string]string `json:"target"`
Metric string `json:"metric,omitempty"`
Type MetricType `json:"type"`
Help string `json:"help"`
Unit string `json:"unit"`
}
// Metadata models the metadata of a metric.
type Metadata struct {
Type MetricType `json:"type"`
Help string `json:"help"`
Unit string `json:"unit"`
}
// QueryResult contains result data for a query. Its UnmarshalJSON decodes
// the "result" payload according to "resultType" and stores it in the
// unexported field v; it does not assign Type or Result.
type QueryResult struct {
Type model.ValueType `json:"resultType"`
Result interface{} `json:"result"`
// The decoded value.
v model.Value
}
// UnmarshalJSON decodes a rule group. The scalar fields are copied as-is;
// each entry of "rules" is first tried as an AlertingRule and then as a
// RecordingRule, and the first one that decodes is appended to rg.Rules. An
// entry that decodes as neither aborts with an error.
func (rg *RuleGroup) UnmarshalJSON(b []byte) error {
	var raw struct {
		Name     string            `json:"name"`
		File     string            `json:"file"`
		Interval float64           `json:"interval"`
		Rules    []json.RawMessage `json:"rules"`
	}
	if err := json.Unmarshal(b, &raw); err != nil {
		return err
	}

	rg.Name, rg.File, rg.Interval = raw.Name, raw.File, raw.Interval

	for _, item := range raw.Rules {
		var alerting AlertingRule
		if json.Unmarshal(item, &alerting) == nil {
			rg.Rules = append(rg.Rules, alerting)
			continue
		}

		var recording RecordingRule
		if json.Unmarshal(item, &recording) == nil {
			rg.Rules = append(rg.Rules, recording)
			continue
		}

		return errors.New("failed to decode JSON into an alerting or recording rule")
	}
	return nil
}
// UnmarshalJSON decodes an alerting rule. The payload must carry a "type"
// field equal to RuleTypeAlerting; a missing or different type is an error.
// The receiver is only modified once the whole payload has decoded.
func (r *AlertingRule) UnmarshalJSON(b []byte) error {
	var header struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(b, &header); err != nil {
		return err
	}

	switch header.Type {
	case "":
		return errors.New("type field not present in rule")
	case string(RuleTypeAlerting):
		// Expected type; fall out of the switch and decode the body.
	default:
		return fmt.Errorf("expected rule of type %s but got %s", string(RuleTypeAlerting), header.Type)
	}

	// Decode into a method-less mirror so this UnmarshalJSON is not re-entered.
	var decoded struct {
		Name        string         `json:"name"`
		Query       string         `json:"query"`
		Duration    float64        `json:"duration"`
		Labels      model.LabelSet `json:"labels"`
		Annotations model.LabelSet `json:"annotations"`
		Alerts      []*Alert       `json:"alerts"`
		Health      RuleHealth     `json:"health"`
		LastError   string         `json:"lastError,omitempty"`
	}
	if err := json.Unmarshal(b, &decoded); err != nil {
		return err
	}

	*r = AlertingRule{
		Name:        decoded.Name,
		Query:       decoded.Query,
		Duration:    decoded.Duration,
		Labels:      decoded.Labels,
		Annotations: decoded.Annotations,
		Alerts:      decoded.Alerts,
		Health:      decoded.Health,
		LastError:   decoded.LastError,
	}
	return nil
}
// UnmarshalJSON decodes a recording rule. The payload must carry a "type"
// field equal to RuleTypeRecording; a missing or different type is an error.
// The receiver is only modified once the whole payload has decoded.
func (r *RecordingRule) UnmarshalJSON(b []byte) error {
	var header struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(b, &header); err != nil {
		return err
	}

	switch header.Type {
	case "":
		return errors.New("type field not present in rule")
	case string(RuleTypeRecording):
		// Expected type; fall out of the switch and decode the body.
	default:
		return fmt.Errorf("expected rule of type %s but got %s", string(RuleTypeRecording), header.Type)
	}

	// Decode into a method-less mirror so this UnmarshalJSON is not re-entered.
	var decoded struct {
		Name      string         `json:"name"`
		Query     string         `json:"query"`
		Labels    model.LabelSet `json:"labels,omitempty"`
		Health    RuleHealth     `json:"health"`
		LastError string         `json:"lastError,omitempty"`
	}
	if err := json.Unmarshal(b, &decoded); err != nil {
		return err
	}

	*r = RecordingRule{
		Name:      decoded.Name,
		Query:     decoded.Query,
		Labels:    decoded.Labels,
		Health:    decoded.Health,
		LastError: decoded.LastError,
	}
	return nil
}
// UnmarshalJSON decodes a query result. "resultType" selects how "result" is
// decoded (scalar, vector or matrix) and the decoded value is stored in
// qr.v. Any other result type is an error. qr.v is assigned even when
// decoding the payload fails; the decode error is returned.
func (qr *QueryResult) UnmarshalJSON(b []byte) error {
	var envelope struct {
		Type   model.ValueType `json:"resultType"`
		Result json.RawMessage `json:"result"`
	}
	if err := json.Unmarshal(b, &envelope); err != nil {
		return err
	}

	switch envelope.Type {
	case model.ValScalar:
		scalar := new(model.Scalar)
		decodeErr := json.Unmarshal(envelope.Result, scalar)
		qr.v = scalar
		return decodeErr
	case model.ValVector:
		var vector model.Vector
		decodeErr := json.Unmarshal(envelope.Result, &vector)
		qr.v = vector
		return decodeErr
	case model.ValMatrix:
		var matrix model.Matrix
		decodeErr := json.Unmarshal(envelope.Result, &matrix)
		qr.v = matrix
		return decodeErr
	}
	return fmt.Errorf("unexpected value type %q", envelope.Type)
}
// NewAPI returns a new API backed by the given client. The options are
// stored on the wrapping API client alongside c.
//
// It is safe to use the returned API from multiple goroutines.
func NewAPI(c api.Client, opt ClientOptions) API {
	wrapped := &apiClientImpl{client: c, opt: opt}
	return &httpAPI{client: wrapped}
}
// httpAPI is the API implementation returned by NewAPI. Every method builds
// its URL with client.URL and sends the request through client.Do.
type httpAPI struct {
client apiClient
}
// Alerts GETs the alerts endpoint and decodes the response body into an
// AlertsResult. A request or transport error yields an empty result; a
// decode error is returned together with whatever was decoded so far.
func (h *httpAPI) Alerts(ctx context.Context) (AlertsResult, error) {
	u := h.client.URL(epAlerts, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return AlertsResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return AlertsResult{}, err
	}

	// Decode first, then return: reading res in the same return statement as
	// the call that fills it relies on an evaluation order the Go spec does
	// not define.
	var res AlertsResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// AlertManagers GETs the alertmanagers endpoint and decodes the response
// body into an AlertManagersResult. A request or transport error yields an
// empty result; a decode error is returned together with whatever was
// decoded so far.
func (h *httpAPI) AlertManagers(ctx context.Context) (AlertManagersResult, error) {
	u := h.client.URL(epAlertManagers, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return AlertManagersResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return AlertManagersResult{}, err
	}

	// Decode first, then return: reading res in the same return statement as
	// the call that fills it relies on an evaluation order the Go spec does
	// not define.
	var res AlertManagersResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// CleanTombstones sends a body-less POST request to the epCleanTombstones
// endpoint and reports any error from building or executing the request.
func (h *httpAPI) CleanTombstones(ctx context.Context) error {
	endpoint := h.client.URL(epCleanTombstones, nil)

	req, reqErr := http.NewRequest(http.MethodPost, endpoint.String(), nil)
	if reqErr != nil {
		return reqErr
	}

	// Only the error matters here; response, body and warnings are dropped.
	_, _, _, doErr := h.client.Do(ctx, req)
	return doErr
}
// Config sends a GET request to the epConfig endpoint and decodes the
// response data into a ConfigResult.
//
// A request-construction or transport/API error yields a zero ConfigResult.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) Config(ctx context.Context) (ConfigResult, error) {
	u := h.client.URL(epConfig, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return ConfigResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return ConfigResult{}, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res ConfigResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// DeleteSeries sends a body-less POST request to the epDeleteSeries endpoint.
// Each entry of matches is passed as a "match[]" query parameter, and the
// time range as "start"/"end" formatted by formatTime.
func (h *httpAPI) DeleteSeries(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) error {
	endpoint := h.client.URL(epDeleteSeries, nil)

	params := endpoint.Query()
	for i := range matches {
		params.Add("match[]", matches[i])
	}
	params.Set("start", formatTime(startTime))
	params.Set("end", formatTime(endTime))
	endpoint.RawQuery = params.Encode()

	req, reqErr := http.NewRequest(http.MethodPost, endpoint.String(), nil)
	if reqErr != nil {
		return reqErr
	}

	// Only the error matters here; response, body and warnings are dropped.
	_, _, _, doErr := h.client.Do(ctx, req)
	return doErr
}
// Flags sends a GET request to the epFlags endpoint and decodes the response
// data into a FlagsResult.
//
// A request-construction or transport/API error yields a zero FlagsResult.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) Flags(ctx context.Context) (FlagsResult, error) {
	u := h.client.URL(epFlags, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return FlagsResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return FlagsResult{}, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res FlagsResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// LabelNames sends a GET request to the epLabels endpoint and decodes the
// response data into a list of strings.
//
// Warnings reported by the client are returned on both success and request
// failure. A JSON decoding error is returned together with whatever was
// decoded.
func (h *httpAPI) LabelNames(ctx context.Context) ([]string, Warnings, error) {
	u := h.client.URL(epLabels, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, nil, err
	}

	_, body, w, err := h.client.Do(ctx, req)
	if err != nil {
		return nil, w, err
	}

	// Decode in its own statement: reading labelNames and calling
	// json.Unmarshal in the same return statement has no specified
	// evaluation order, so the slice could be read while still nil.
	var labelNames []string
	err = json.Unmarshal(body, &labelNames)
	return labelNames, w, err
}
// LabelValues sends a GET request to the epLabelValues endpoint for the given
// label name and decodes the response data into model.LabelValues. Each entry
// of matches is passed as a "match[]" query parameter.
//
// Warnings reported by the client are returned on both success and request
// failure. A JSON decoding error is returned together with whatever was
// decoded.
func (h *httpAPI) LabelValues(ctx context.Context, label string, matches []string) (model.LabelValues, Warnings, error) {
	u := h.client.URL(epLabelValues, map[string]string{"name": label})
	q := u.Query()
	for _, m := range matches {
		q.Add("match[]", m)
	}
	u.RawQuery = q.Encode()

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, nil, err
	}

	_, body, w, err := h.client.Do(ctx, req)
	if err != nil {
		return nil, w, err
	}

	// Decode in its own statement: reading labelValues and calling
	// json.Unmarshal in the same return statement has no specified
	// evaluation order.
	var labelValues model.LabelValues
	err = json.Unmarshal(body, &labelValues)
	return labelValues, w, err
}
// Query evaluates an instant query at ts by delegating to h.query.
//
// The call is attempted up to maxAttempts times (currently 1). A failure
// whose status code is in the 4xx range is never retried. Between attempts
// the method waits retryDelay, giving up early if ctx is cancelled; no delay
// is added after the final attempt.
//
// On failure the returned value is nil and the warnings and error of the last
// attempt are returned.
func (h *httpAPI) Query(ctx context.Context, query string, ts time.Time) (model.Value, Warnings, error) {
	const (
		maxAttempts = 1
		retryDelay  = 100 * time.Millisecond
	)

	var (
		warnings Warnings
		err      error
	)
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		var (
			value      model.Value
			statusCode int
		)
		value, warnings, statusCode, err = h.query(ctx, query, ts)
		if err == nil {
			return value, warnings, nil
		}

		// Client errors (4xx) will not succeed on retry.
		if statusCode >= 400 && statusCode < 500 {
			break
		}
		// Nothing left to wait for after the last attempt.
		if attempt == maxAttempts {
			break
		}

		select {
		case <-ctx.Done():
			return nil, warnings, err
		case <-time.After(retryDelay):
		}
	}
	return nil, warnings, err
}
// query performs a single instant-query request against the epQuery endpoint
// via DoGetFallback. The "time" parameter is only sent when ts is non-zero.
//
// It returns the decoded value, any warnings, the HTTP status code and an
// error. When the request fails but a response was received, that response's
// status code is still reported so the caller can tell client errors (4xx)
// from other failures; the code is 0 only when no response is available.
func (h *httpAPI) query(ctx context.Context, query string, ts time.Time) (model.Value, Warnings, int, error) {
	u := h.client.URL(epQuery, nil)
	q := u.Query()

	q.Set("query", query)
	if !ts.IsZero() {
		q.Set("time", formatTime(ts))
	}

	resp, body, warnings, err := h.client.DoGetFallback(ctx, u, q)
	if err != nil {
		statusCode := 0
		if resp != nil {
			statusCode = resp.StatusCode
		}
		return nil, warnings, statusCode, err
	}

	// Decode in its own statement: reading qres.v and calling json.Unmarshal
	// in the same return statement has no specified evaluation order.
	var qres QueryResult
	err = json.Unmarshal(body, &qres)
	return model.Value(qres.v), warnings, resp.StatusCode, err
}
// QueryRange evaluates a range query against the epQueryRange endpoint via
// DoGetFallback. The range is sent as "start" and "end" (formatted by
// formatTime) and "step" (the step duration in seconds).
//
// Warnings reported by the client are returned on both success and failure.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) QueryRange(ctx context.Context, query string, r Range) (model.Value, Warnings, error) {
	u := h.client.URL(epQueryRange, nil)
	q := u.Query()

	q.Set("query", query)
	q.Set("start", formatTime(r.Start))
	q.Set("end", formatTime(r.End))
	q.Set("step", strconv.FormatFloat(r.Step.Seconds(), 'f', -1, 64))

	_, body, warnings, err := h.client.DoGetFallback(ctx, u, q)
	if err != nil {
		return nil, warnings, err
	}

	// Decode in its own statement: reading qres.v and calling json.Unmarshal
	// in the same return statement has no specified evaluation order.
	var qres QueryResult
	err = json.Unmarshal(body, &qres)
	return qres.v, warnings, err
}
// Series sends a GET request to the epSeries endpoint and decodes the
// response data into a list of label sets. Each entry of matches is passed as
// a "match[]" query parameter, and the time range as "start"/"end" formatted
// by formatTime.
//
// Warnings reported by the client are returned on both success and request
// failure. A JSON decoding error is returned together with whatever was
// decoded.
func (h *httpAPI) Series(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) ([]model.LabelSet, Warnings, error) {
	u := h.client.URL(epSeries, nil)
	q := u.Query()
	for _, m := range matches {
		q.Add("match[]", m)
	}
	q.Set("start", formatTime(startTime))
	q.Set("end", formatTime(endTime))
	u.RawQuery = q.Encode()

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, nil, err
	}

	_, body, warnings, err := h.client.Do(ctx, req)
	if err != nil {
		return nil, warnings, err
	}

	// Decode in its own statement: reading mset and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var mset []model.LabelSet
	err = json.Unmarshal(body, &mset)
	return mset, warnings, err
}
// Snapshot sends a body-less POST request to the epSnapshot endpoint with the
// "skip_head" query parameter set from skipHead, and decodes the response
// data into a SnapshotResult.
//
// A request-construction or transport/API error yields a zero result.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, error) {
	u := h.client.URL(epSnapshot, nil)
	q := u.Query()
	q.Set("skip_head", strconv.FormatBool(skipHead))
	u.RawQuery = q.Encode()

	req, err := http.NewRequest(http.MethodPost, u.String(), nil)
	if err != nil {
		return SnapshotResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return SnapshotResult{}, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res SnapshotResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// Rules sends a GET request to the epRules endpoint and decodes the response
// data into a RulesResult.
//
// A request-construction or transport/API error yields a zero RulesResult.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) Rules(ctx context.Context) (RulesResult, error) {
	u := h.client.URL(epRules, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return RulesResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return RulesResult{}, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res RulesResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// Targets sends a GET request to the epTargets endpoint and decodes the
// response data into a TargetsResult.
//
// A request-construction or transport/API error yields a zero TargetsResult.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) Targets(ctx context.Context) (TargetsResult, error) {
	u := h.client.URL(epTargets, nil)

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return TargetsResult{}, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return TargetsResult{}, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res TargetsResult
	err = json.Unmarshal(body, &res)
	return res, err
}
// TargetsMetadata sends a GET request to the epTargetsMetadata endpoint with
// the "match_target", "metric" and "limit" query parameters, and decodes the
// response data into a list of MetricMetadata.
//
// A request-construction or transport/API error yields a nil list.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) TargetsMetadata(ctx context.Context, matchTarget string, metric string, limit string) ([]MetricMetadata, error) {
	u := h.client.URL(epTargetsMetadata, nil)
	q := u.Query()
	q.Set("match_target", matchTarget)
	q.Set("metric", metric)
	q.Set("limit", limit)
	u.RawQuery = q.Encode()

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return nil, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res []MetricMetadata
	err = json.Unmarshal(body, &res)
	return res, err
}
// Metadata sends a GET request to the epMetadata endpoint with the "metric"
// and "limit" query parameters, and decodes the response data into a map from
// string keys to Metadata lists.
//
// A request-construction or transport/API error yields a nil map.
// A JSON decoding error is returned together with whatever was decoded.
func (h *httpAPI) Metadata(ctx context.Context, metric string, limit string) (map[string][]Metadata, error) {
	u := h.client.URL(epMetadata, nil)
	q := u.Query()
	q.Set("metric", metric)
	q.Set("limit", limit)
	u.RawQuery = q.Encode()

	req, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, err
	}

	_, body, _, err := h.client.Do(ctx, req)
	if err != nil {
		return nil, err
	}

	// Decode in its own statement: reading res and calling json.Unmarshal in
	// the same return statement has no specified evaluation order.
	var res map[string][]Metadata
	err = json.Unmarshal(body, &res)
	return res, err
}
// Warnings is a list of non-critical error messages. It carries the
// "warnings" field of an API response back to the caller alongside the data.
type Warnings []string
// apiClient wraps a regular client and processes successful API responses.
// "Successful" here also includes responses that errored at the API level.
type apiClient interface {
// URL resolves an endpoint pattern and its arguments to a URL.
URL(ep string, args map[string]string) *url.URL
// Do executes the request and returns the response, the response data,
// any warnings, and an error.
Do(context.Context, *http.Request) (*http.Response, []byte, Warnings, error)
// DoGetFallback sends args as a form-encoded POST and falls back to a GET
// request when the server answers 405 Method Not Allowed.
DoGetFallback(ctx context.Context, u *url.URL, args url.Values) (*http.Response, []byte, Warnings, error)
}
// apiClientImpl is the apiClient used by NewAPI. It decorates requests with
// the configured options before handing them to the underlying client.
type apiClientImpl struct {
// client is the underlying client that performs the HTTP round trip.
client api.Client
// opt supplies the basic-auth credentials and extra headers applied in Do.
opt ClientOptions
}
// apiResponse is the JSON envelope that Do decodes every response body into.
type apiResponse struct {
// Status is compared against "error" to detect API-level failures.
Status string `json:"status"`
// Data is the raw payload handed back to callers for further decoding.
Data json.RawMessage `json:"data"`
// ErrorType and Error describe the failure when Status is "error".
ErrorType ErrorType `json:"errorType"`
Error string `json:"error"`
// Warnings holds non-critical messages returned with the response.
Warnings []string `json:"warnings,omitempty"`
}
// apiError reports whether code is one of the status codes Prometheus uses
// when it returns an API-level error: statusAPIError or 400 Bad Request.
func apiError(code int) bool {
	if code == http.StatusBadRequest {
		return true
	}
	return code == statusAPIError
}
// errorTypeAndMsgFor classifies a response by its status code: 4xx is a
// client error, 5xx a server error, and anything else a bad response. It
// returns the matching ErrorType and a message that includes the code.
func errorTypeAndMsgFor(resp *http.Response) (ErrorType, string) {
	status := resp.StatusCode
	class := status / 100

	if class == 4 {
		return ErrClient, fmt.Sprintf("client error: %d", status)
	}
	if class == 5 {
		return ErrServer, fmt.Sprintf("server error: %d", status)
	}
	return ErrBadResponse, fmt.Sprintf("bad response code %d", status)
}
// URL delegates endpoint resolution to the underlying client.
func (h *apiClientImpl) URL(ep string, args map[string]string) *url.URL {
	resolved := h.client.URL(ep, args)
	return resolved
}
// Do applies the configured options to req, executes it through the
// underlying client and interprets the response envelope.
//
// Basic auth is set only when both user and password are non-empty. Extra
// headers are taken from a flat name/value list and applied only when that
// list is non-empty and of even length; a "Host" entry also overrides
// req.Host.
//
// A non-2xx status that is not an API error code is returned as an *Error
// carrying the raw body in Detail. Otherwise the body is decoded as an
// apiResponse (skipped for 204 No Content) and its Data and Warnings are
// returned, with an *Error when the envelope reports a failure or disagrees
// with the status code.
func (h *apiClientImpl) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, Warnings, error) {
	if user, pass := h.opt.BasicAuthUser, h.opt.BasicAuthPass; user != "" && pass != "" {
		req.SetBasicAuth(user, pass)
	}

	if hdrs := h.opt.Headers; len(hdrs) > 0 && len(hdrs)%2 == 0 {
		for i := 0; i < len(hdrs); i += 2 {
			name, value := hdrs[i], hdrs[i+1]
			req.Header.Add(name, value)
			if name == "Host" {
				req.Host = value
			}
		}
	}

	resp, body, err := h.client.Do(ctx, req)
	if err != nil {
		return resp, body, nil, err
	}

	code := resp.StatusCode
	isAPIError := apiError(code)

	if code/100 != 2 && !isAPIError {
		errorType, errorMsg := errorTypeAndMsgFor(resp)
		return resp, body, nil, &Error{
			Type:   errorType,
			Msg:    errorMsg,
			Detail: string(body),
		}
	}

	var result apiResponse
	if code != http.StatusNoContent {
		if jsonErr := json.Unmarshal(body, &result); jsonErr != nil {
			return resp, body, nil, &Error{
				Type: ErrBadResponse,
				Msg:  jsonErr.Error(),
			}
		}
	}

	bodySaysError := result.Status == "error"
	switch {
	case isAPIError && bodySaysError:
		// Status code and body agree that this is an API-level error.
		err = &Error{
			Type: result.ErrorType,
			Msg:  result.Error,
		}
	case isAPIError != bodySaysError:
		// Status code and body disagree about whether this is an error.
		err = &Error{
			Type: ErrBadResponse,
			Msg:  "inconsistent body for response code",
		}
	}

	return resp, []byte(result.Data), result.Warnings, err
}
// DoGetFallback first sends args as a form-encoded POST to u. If the server
// answers 405 Method Not Allowed, it retries as a GET with args encoded into
// the query string of u (u.RawQuery is overwritten). Any other outcome of the
// POST is returned unchanged.
func (h *apiClientImpl) DoGetFallback(ctx context.Context, u *url.URL, args url.Values) (*http.Response, []byte, Warnings, error) {
	encoded := args.Encode()

	postReq, err := http.NewRequest(http.MethodPost, u.String(), strings.NewReader(encoded))
	if err != nil {
		return nil, nil, nil, err
	}
	postReq.Header.Set("Content-Type", "application/x-www-form-urlencoded")

	resp, body, warnings, err := h.Do(ctx, postReq)
	if resp == nil || resp.StatusCode != http.StatusMethodNotAllowed {
		return resp, body, warnings, err
	}

	// The server rejected POST: repeat the call as a GET.
	u.RawQuery = encoded
	getReq, err := http.NewRequest(http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, nil, warnings, err
	}
	return h.Do(ctx, getReq)
}
// formatTime renders t as Unix seconds with a fractional part, using the
// shortest decimal representation (no exponent).
func formatTime(t time.Time) string {
	seconds := float64(t.Unix())
	fraction := float64(t.Nanosecond()) / 1e9
	return strconv.FormatFloat(seconds+fraction, 'f', -1, 64)
}
================================================
FILE: pkg/prom/writer.go
================================================
package prom
import (
"bytes"
"context"
"fmt"
"net/http"
"strings"
"github.com/golang/protobuf/proto"
"github.com/golang/snappy"
"github.com/prometheus/client_golang/api"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
)
// WriterType pushes time series to one or more remote-write endpoints.
type WriterType struct {
// Opts holds the target URL list (comma separated), basic-auth
// credentials and extra headers used by Post.
Opts ClientOptions
// Client executes the HTTP requests built by Post.
Client api.Client
}
// NewWriter returns a WriterType that sends requests through cli using the
// settings in opt.
func NewWriter(cli api.Client, opt ClientOptions) WriterType {
	return WriterType{Opts: opt, Client: cli}
}
// Write serializes items into a remote-write request, compresses it with
// snappy and posts it via Post. An empty items slice is a no-op.
//
// A marshalling failure is logged and reported as nil (the batch is dropped).
// A failure from Post is logged and returned to the caller.
func (w WriterType) Write(items []prompb.TimeSeries, headers ...map[string]string) error {
	if len(items) == 0 {
		return nil
	}

	req := &prompb.WriteRequest{
		Timeseries: items,
	}

	data, err := proto.Marshal(req)
	if err != nil {
		logger.Warningf("marshal prom data to proto got error: %v, data: %+v", err, items)
		return nil
	}

	// Return the post error directly. Previously it was declared in the if
	// statement's own scope, so the function's final return always saw the
	// (nil) marshal error and post failures were silently lost.
	if err := w.Post(snappy.Encode(nil, data), headers...); err != nil {
		// Log only the target URL: formatting the whole options struct would
		// also print the basic-auth password.
		logger.Warningf("post to %s got error: %v", w.Opts.Url, err)
		logger.Debug("example timeseries:", items[0].String())
		return err
	}

	return nil
}
// Post sends the already-encoded payload req to the configured remote-write
// URLs (comma separated), trying them in order and stopping at the first one
// that answers with a status code below 400.
//
// Optional headers (first map only) are set on the request, followed by basic
// auth when a user is configured, followed by the extra headers from the
// options (applied only when that flat name/value list is non-empty and of
// even length; a "Host" entry also overrides the request host).
//
// The returned error is the failure of the last URL tried, or nil when a URL
// accepted the payload.
func (w WriterType) Post(req []byte, headers ...map[string]string) error {
	var lastErr error

	for _, target := range strings.Split(w.Opts.Url, ",") {
		httpReq, newErr := http.NewRequest("POST", target, bytes.NewReader(req))
		lastErr = newErr
		if newErr != nil {
			logger.Warningf("create remote write:%s request got error: %s", target, newErr.Error())
			continue
		}

		httpReq.Header.Add("Content-Encoding", "snappy")
		httpReq.Header.Set("Content-Type", "application/x-protobuf")
		httpReq.Header.Set("User-Agent", "n9e")
		httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

		if len(headers) > 0 {
			for name, value := range headers[0] {
				httpReq.Header.Set(name, value)
			}
		}

		if w.Opts.BasicAuthUser != "" {
			httpReq.SetBasicAuth(w.Opts.BasicAuthUser, w.Opts.BasicAuthPass)
		}

		if extra := w.Opts.Headers; len(extra) > 0 && len(extra)%2 == 0 {
			for i := 0; i < len(extra); i += 2 {
				name, value := extra[i], extra[i+1]
				httpReq.Header.Add(name, value)
				if name == "Host" {
					httpReq.Host = value
				}
			}
		}

		resp, body, doErr := w.Client.Do(context.Background(), httpReq)
		if doErr != nil {
			logger.Warningf("push data with remote write:%s request got error: %v, response body: %s", target, doErr, string(body))
			lastErr = doErr
			continue
		}

		if resp.StatusCode >= 400 {
			lastErr = fmt.Errorf("push data with remote write:%s request got status code: %v, response body: %s", target, resp.StatusCode, string(body))
			logger.Warning(lastErr)
			continue
		}

		// This endpoint accepted the payload; no need to try the others.
		break
	}

	return lastErr
}
================================================
FILE: pkg/promql/parser.go
================================================
package promql
import (
	"regexp"
	"sort"
	"strings"

	"github.com/VictoriaMetrics/metricsql"
	"github.com/prometheus/prometheus/promql/parser"
)
// SplitBinaryOp parses code with metricsql and returns the distinct
// sub-expressions that ParseExpr collects while walking it.
//
// The result is sorted so that callers get a stable order; the entries are
// gathered in a map, whose iteration order is otherwise random. A parse error
// is returned with a nil slice.
func SplitBinaryOp(code string) ([]string, error) {
	var lst []string

	expr, err := metricsql.Parse(code)
	if err != nil {
		return lst, err
	}

	m := make(map[string]struct{})
	ParseExpr(expr, false, m)
	for k := range m {
		lst = append(lst, k)
	}
	sort.Strings(lst)

	return lst, nil
}
// GetMetric parses ql and returns, for every selector in it, a map entry
// keyed by the metric name. The value is the metric name followed by its
// non-name matchers rendered as {name<op>"value",...}, or just the metric
// name when the selector has no such matchers.
//
// On a parse error the (empty) map is returned together with the error.
func GetMetric(ql string) (map[string]string, error) {
	metrics := make(map[string]string)

	expr, err := parser.ParseExpr(ql)
	if err != nil {
		return metrics, err
	}

	for _, matchers := range parser.ExtractSelectors(expr) {
		var name string
		var rendered []string

		for _, m := range matchers {
			if m.Name == "__name__" {
				name = m.Value
				continue
			}
			rendered = append(rendered, m.Name+m.Type.String()+"\""+m.Value+"\"")
		}

		if len(rendered) == 0 {
			metrics[name] = name
			continue
		}
		metrics[name] = name + "{" + strings.Join(rendered, ",") + "}"
	}

	return metrics, nil
}
// GetLabels parses a PromQL query and returns every label matcher found in
// its selectors as a name -> value map. The metric name matcher (__name__) is
// skipped; when a label appears more than once the last value wins.
//
// On a parse error the (empty) map is returned together with the error.
func GetLabels(ql string) (map[string]string, error) {
	result := make(map[string]string)

	// Parse the PromQL expression.
	expr, err := parser.ParseExpr(ql)
	if err != nil {
		return result, err
	}

	// Walk every matcher of every selector.
	selectors := parser.ExtractSelectors(expr)
	for i := range selectors {
		for _, matcher := range selectors[i] {
			if matcher.Name == "__name__" {
				continue
			}
			result[matcher.Name] = matcher.Value
		}
	}

	return result, nil
}
// GetLabelsAndMetricName parses a PromQL query and returns its label matchers
// as a name -> value map together with the metric name. When the query holds
// several selectors, later ones overwrite earlier labels and the metric name
// of the last selector carrying a __name__ matcher is returned.
//
// On a parse error the (empty) map and an empty name are returned with the
// error.
func GetLabelsAndMetricName(ql string) (map[string]string, string, error) {
	result := make(map[string]string)
	var name string

	// Parse the PromQL expression.
	expr, err := parser.ParseExpr(ql)
	if err != nil {
		return result, name, err
	}

	// Walk every matcher of every selector.
	selectors := parser.ExtractSelectors(expr)
	for i := range selectors {
		for _, matcher := range selectors[i] {
			if matcher.Name == "__name__" {
				name = matcher.Value
				continue
			}
			result[matcher.Name] = matcher.Value
		}
	}

	return result, name, nil
}
// Label describes one label matcher extracted from a PromQL selector.
type Label struct {
// Name is the label name.
Name string
// Value is the value the matcher compares against.
Value string
// Op is the string form of the matcher type (for example "=" or "=~").
Op string
}
// GetLabelsAndMetricNameWithReplace extracts label matchers and the metric
// name from a query that may contain occurrences of rep (such as "$" in
// variable references).
//
// Before parsing, every occurrence of rep is swapped for the placeholder
// "____" and the query is passed through removeBrackets. After parsing, the
// placeholder is swapped back to rep in every matcher value. Metric names
// that then contain "$" are ignored.
//
// On a parse error the (empty) map and an empty name are returned with the
// error.
func GetLabelsAndMetricNameWithReplace(ql string, rep string) (map[string]Label, string, error) {
	const placeholder = "____"

	result := make(map[string]Label)
	var name string

	sanitized := removeBrackets(strings.ReplaceAll(ql, rep, placeholder))

	// Parse the PromQL expression.
	expr, err := parser.ParseExpr(sanitized)
	if err != nil {
		return result, name, err
	}

	// Walk every matcher of every selector.
	selectors := parser.ExtractSelectors(expr)
	for i := range selectors {
		for _, matcher := range selectors[i] {
			value := strings.ReplaceAll(matcher.Value, placeholder, rep)

			if matcher.Name == "__name__" {
				if !strings.Contains(value, "$") {
					name = value
				}
				continue
			}

			result[matcher.Name] = Label{
				Name:  matcher.Name,
				Value: value,
				Op:    matcher.Type.String(),
			}
		}
	}

	return result, name, nil
}
// GetFirstMetric parses ql and returns the value of the first __name__
// matcher found while scanning its selectors in order. It returns an empty
// string when no selector names a metric, and the parse error if ql is not
// valid.
func GetFirstMetric(ql string) (string, error) {
	expr, err := parser.ParseExpr(ql)
	if err != nil {
		return "", err
	}

	for _, matchers := range parser.ExtractSelectors(expr) {
		for _, m := range matchers {
			if m.Name == "__name__" {
				return m.Value, nil
			}
		}
	}

	return "", nil
}
// bracketedSpanPattern matches a "[...]" span: an opening bracket, any run of
// characters other than "]", and the closing bracket.
var bracketedSpanPattern = regexp.MustCompile(`\[[^\]]*\]`)

// bracketKeepKeywords lists the substrings that make removeBrackets leave a
// query untouched.
var bracketKeepKeywords = []string{
	"_over_time", "rate", "increase",
	"predict_linear", "resets",
	"changes", "holt_winters",
	"delta", "deriv",
}

// removeBrackets deletes every "[...]" span from promql, unless the query
// contains one of the keywords in bracketKeepKeywords, in which case it is
// returned unchanged. Queries without "[" are also returned unchanged.
//
// The pattern is compiled once at package level instead of on every call.
func removeBrackets(promql string) string {
	for _, keyword := range bracketKeepKeywords {
		if strings.Contains(promql, keyword) {
			return promql
		}
	}

	if !strings.Contains(promql, "[") {
		return promql
	}

	return bracketedSpanPattern.ReplaceAllString(promql, "")
}
================================================
FILE: pkg/promql/perser_test.go
================================================
package promql
import (
"reflect"
"testing"
)
// TestGetMetric checks the metric -> rendered selector map produced by
// GetMetric.
func TestGetMetric(t *testing.T) {
	type testCase struct {
		name    string
		ql      string
		want    map[string]string
		wantErr error
	}

	cases := []testCase{
		{
			name:    "Valid query with labels",
			ql:      "metric_name{label1=\"value1\",label2=\"value2\"}",
			want:    map[string]string{"metric_name": "metric_name{label1=\"value1\",label2=\"value2\"}"},
			wantErr: nil,
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, err := GetMetric(tc.ql)
			if err != nil && err != tc.wantErr {
				t.Errorf("GetMetric() error = %v, wantErr %v ql:%s", err, tc.wantErr, tc.ql)
				return
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("GetMetric() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestGetLabels checks the label name -> value map produced by GetLabels,
// including values that contain "$" and a query without any labels.
func TestGetLabels(t *testing.T) {
	type testCase struct {
		name    string
		ql      string
		want    map[string]string
		wantErr bool
	}

	cases := []testCase{
		{
			name: "Valid query with multiple labels",
			ql:   "metric_name{label1=\"value1\", label2=\"value2\"} > 3",
			want: map[string]string{"label1": "value1", "label2": "value2"},
		},
		{
			name: "Valid query with multiple labels",
			ql:   "metric_name{label1=\"$value1\", label2=\"$value2\"} > 3",
			want: map[string]string{"label1": "$value1", "label2": "$value2"},
		},
		{
			name: "Query without labels",
			ql:   "metric_name",
			want: map[string]string{},
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, err := GetLabels(tc.ql)
			gotErr := err != nil
			if gotErr != tc.wantErr {
				t.Errorf("GetLabels() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("GetLabels() = %v, want %v ql:%s", got, tc.want, tc.ql)
			}
		})
	}
}
// TestGetLabelsAndMetricNameWithReplace checks label and metric-name
// extraction for queries that contain "$" variable references.
func TestGetLabelsAndMetricNameWithReplace(t *testing.T) {
	// Test cases.
	tests := []struct {
		name               string
		ql                 string
		rep                string
		expectedLabels     map[string]Label
		expectedMetricName string
		expectError        bool
	}{
		{
			name: "正常情况",
			ql:   `(snmp_arista_system_cpuuse{ent_descr="$ent_descr"} / 100 > $cpu_high_threshold[1m])`,
			rep:  "$",
			expectedLabels: map[string]Label{
				"ent_descr": {Name: "ent_descr", Value: "$ent_descr", Op: "="},
			},
			expectedMetricName: "snmp_arista_system_cpuuse",
			expectError:        false,
		},
		{
			name: "正常情况",
			ql:   `rate(snmp_interface_incoming{agent_host='$agent_host',ifname='$ifname'}[2m]) * 8 / 10^9 > snmp_interface_speed{agent_host='$agent_host',ifname='$ifname'}/ 10^3 * $traffic_in and snmp_interface_speed{agent_host='$agent_host',ifname='$ifname'} > 0`,
			rep:  "$",
			expectedLabels: map[string]Label{
				"agent_host": {Name: "agent_host", Value: "$agent_host", Op: "="},
				"ifname":     {Name: "ifname", Value: "$ifname", Op: "="},
			},
			expectedMetricName: "snmp_interface_speed",
			expectError:        false,
		},
		{
			name: "正常情况",
			ql:   `rate(snmp_interface_incoming{agent_host='$agent_host',ifname='$ifname'}[2m]) * 8 / 10^9 > snmp_interface_speed{agent_host='$agent_host',ifname='$ifname'}/ 10^3 * $traffic_in`,
			rep:  "$",
			expectedLabels: map[string]Label{
				"agent_host": {Name: "agent_host", Value: "$agent_host", Op: "="},
				"ifname":     {Name: "ifname", Value: "$ifname", Op: "="},
			},
			expectedMetricName: "snmp_interface_speed",
			expectError:        false,
		},
		{
			name: "正常情况",
			ql:   `rate(snmp_interface_incoming{agent_host='$agent_host',ifname='$ifname'}[2m]) * 8 / 10^9 > 10`,
			rep:  "$",
			expectedLabels: map[string]Label{
				"agent_host": {Name: "agent_host", Value: "$agent_host", Op: "="},
				"ifname":     {Name: "ifname", Value: "$ifname", Op: "="},
			},
			expectedMetricName: "snmp_interface_incoming",
			expectError:        false,
		},
		{
			name: "带有替换字符",
			ql:   `rate(snmp_interface_outgoing{Role=~'ZRT.*',agent_host='$agent_host',ifname='$ifname'}[2m]) * 8 / 10^9 > snmp_interface_speed{Role=~'ZRT.*',agent_host='$agent_host',ifname='$ifname'}/ 10^3 * $outgoing_warning and snmp_interface_speed{Role=~'ZRT.*',agent_host='$agent_host',ifname='$ifname'} > 0`,
			rep:  "$",
			expectedLabels: map[string]Label{
				"agent_host": {Name: "agent_host", Value: "$agent_host", Op: "="},
				"ifname":     {Name: "ifname", Value: "$ifname", Op: "="},
				"Role":       {Name: "Role", Value: "ZRT.*", Op: "=~"},
			},
			expectedMetricName: "snmp_interface_speed",
			expectError:        false,
		},
		// More test cases...
		{
			// The expectations describe the mem selector itself. The
			// "$val" operand parses as a selector whose name contains "$",
			// which the function skips when picking the metric name.
			name: "告警规则支持变量",
			ql:   `mem{test1="$test1", test2="$test2", test3="test3"} > $val`,
			rep:  "$",
			expectedLabels: map[string]Label{
				"test1": {Name: "test1", Value: "$test1", Op: "="},
				"test2": {Name: "test2", Value: "$test2", Op: "="},
				"test3": {Name: "test3", Value: "test3", Op: "="},
			},
			expectedMetricName: "mem",
			expectError:        false,
		},
	}

	// Run the test cases.
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			labels, metricName, err := GetLabelsAndMetricNameWithReplace(tc.ql, tc.rep)

			if (err != nil) != tc.expectError {
				t.Errorf("ql:%s 测试 '%v' 发生错误: %v, 期望的错误状态: %v", tc.ql, tc.name, err, tc.expectError)
			}
			if !reflect.DeepEqual(labels, tc.expectedLabels) {
				t.Errorf("ql:%s 测试 '%v' 返回的标签不匹配: got %v, want %v", tc.ql, tc.name, labels, tc.expectedLabels)
			}
			if metricName != tc.expectedMetricName {
				t.Errorf("ql:%s 测试 '%v' 返回的度量名称不匹配: got %s, want %s", tc.ql, tc.name, metricName, tc.expectedMetricName)
			}
		})
	}
}
// TestSplitBinaryOp checks which sub-expressions SplitBinaryOp extracts.
//
// Results are compared as multisets rather than ordered slices, so the test
// does not depend on the order in which SplitBinaryOp emits its entries.
func TestSplitBinaryOp(t *testing.T) {
	tests := []struct {
		name    string
		code    string
		want    []string
		wantErr bool
	}{
		{
			name: "valid binary operation with spaces",
			code: "cpu_usage + memory_usage",
			want: []string{"cpu_usage + memory_usage"},
		},
		{
			name: "12",
			code: "cpu_usage > 0 and memory_usage>0",
			want: []string{"cpu_usage", "memory_usage"},
		},
		{
			name: "12",
			code: "cpu_usage +1> 0",
			want: []string{"cpu_usage + 1"},
		},
		{
			name: "valid complex binary operation",
			code: "(cpu_usage + memory_usage) / 2",
			want: []string{"(cpu_usage + memory_usage) / 2"},
		},
		{
			name:    "invalid binary operation",
			code:    "cpu_usage + ",
			wantErr: true,
		},
	}

	// counts turns a slice into an occurrence count per element, which makes
	// two slices comparable regardless of element order.
	counts := func(items []string) map[string]int {
		out := make(map[string]int, len(items))
		for _, item := range items {
			out[item]++
		}
		return out
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := SplitBinaryOp(tt.code)
			if (err != nil) != tt.wantErr {
				t.Errorf("SplitBinaryOp() code:%s error = %v, wantErr %v", tt.code, err, tt.wantErr)
				return
			}
			if !reflect.DeepEqual(counts(got), counts(tt.want)) {
				t.Errorf("SplitBinaryOp() got = %v, want %v", got, tt.want)
			}
		})
	}
}
================================================
FILE: pkg/promql/promql.go
================================================
package promql
import (
"github.com/VictoriaMetrics/metricsql"
)
// copy from https://github.com/laixintao/promqlpy/blob/main/go/promql/promql.go
// ModifierExpr represents a MetricsQL modifier of the form `<op> (...)`.
// ParseExpr fills it by converting the group and join modifiers of a
// metricsql binary operation.
type ModifierExpr struct {
// Op is modifier operation.
Op string `json:"op"`
// Args contains modifier args from parens.
Args []string `json:"args"`
}
// Expression is the tree node produced by ParseExpr. A node is either a
// binary operation with Left/Right children or a leaf that only carries Code.
type Expression struct {
// if true, all fields are set
// if false, then it's a normal expression, only `code` is set
IsBinaryOp bool `json:"is_binary_op"`
// Left and Right are the operands of the binary operation.
Left *Expression `json:"left"`
Right *Expression `json:"right"`
// Op is the binary operator.
Op string `json:"op"`
// GroupModifier contains modifier such as "on" or "ignoring".
GroupModifier ModifierExpr `json:"group_modifier"`
// JoinModifier contains modifier such as "group_left" or "group_right".
JoinModifier ModifierExpr `json:"join_modifier"`
// Code is the expression rendered back to text.
Code string `json:"code"`
}
// compareOps is the set of comparison operators. ParseExpr treats both
// operands of such an operation as plain expressions and records the left
// operand in its result set.
var compareOps = map[string]bool{
"==": true,
"!=": true,
">": true,
"<": true,
">=": true,
"<=": true,
}
// logicalOps is the set of logical/set operators. ParseExpr keeps descending
// into both operands of such an operation.
var logicalOps = map[string]bool{
"and": true,
"or": true,
"unless": true,
}
// ParseExpr converts a metricsql expression into an Expression tree and
// collects sub-expression strings into m (which must be non-nil).
//
// When mustBeExpression is true the caller already knows expr is an operand
// of a comparison, so it is returned as a leaf and m is left untouched.
// Example: in `(a > 10) > b`, `a > 10` is a plain expression compared to b.
//
// Otherwise:
//   - a logical operation (and/or/unless) recurses into both operands;
//   - a comparison records its left operand in m and turns both operands
//     into leaves;
//   - anything else (including +, -, * ...) is a leaf, and is recorded in m
//     only if m is still empty.
func ParseExpr(expr metricsql.Expr, mustBeExpression bool, m map[string]struct{}) *Expression {
	if !mustBeExpression {
		if bop, isBinary := expr.(*metricsql.BinaryOpExpr); isBinary {
			switch {
			case logicalOps[bop.Op]:
				return &Expression{
					Left:          ParseExpr(bop.Left, false, m),
					Right:         ParseExpr(bop.Right, false, m),
					GroupModifier: ModifierExpr(bop.GroupModifier),
					JoinModifier:  ModifierExpr(bop.JoinModifier),
					Op:            bop.Op,
					Code:          string(bop.AppendString(nil)),
					IsBinaryOp:    true,
				}
			case compareOps[bop.Op]:
				leftCode := string(bop.Left.AppendString(nil))
				m[leftCode] = struct{}{}
				return &Expression{
					Left:          ParseExpr(bop.Left, true, m),
					Right:         ParseExpr(bop.Right, true, m),
					GroupModifier: ModifierExpr(bop.GroupModifier),
					JoinModifier:  ModifierExpr(bop.JoinModifier),
					Op:            bop.Op,
					Code:          string(bop.AppendString(nil)),
					IsBinaryOp:    true,
				}
			}
		}

		// Nothing collected so far: record the expression itself.
		if len(m) == 0 {
			m[string(expr.AppendString(nil))] = struct{}{}
		}
	}

	// Leaf: return the literal code as it is.
	return &Expression{
		Code:       string(expr.AppendString(nil)),
		IsBinaryOp: false,
	}
}
================================================
FILE: pkg/secu/aes.go
================================================
package secu
import (
"bytes"
"crypto/aes"
"crypto/cipher"
"encoding/base64"
"strings"
)
// BASE64StdEncode returns the standard base64 encoding of src.
func BASE64StdEncode(src []byte) string {
	encoded := base64.StdEncoding.EncodeToString(src)
	return encoded
}
// BASE64StdDecode decodes a standard base64 string. On invalid input it
// returns a nil slice and the decoding error.
func BASE64StdDecode(src string) ([]byte, error) {
	if decoded, err := base64.StdEncoding.DecodeString(src); err == nil {
		return decoded, nil
	} else {
		return nil, err
	}
}
// PKCS7Padding returns ciphertext extended to a multiple of blockSize. The
// added bytes all hold the number of bytes added; input that is already
// aligned gains a full extra block.
//
// The result never shares memory with the input: the capacity of ciphertext
// is clipped before appending, so spare capacity in the caller's backing
// array is not overwritten.
func PKCS7Padding(ciphertext []byte, blockSize int) []byte {
	padding := blockSize - len(ciphertext)%blockSize
	padtext := bytes.Repeat([]byte{byte(padding)}, padding)
	n := len(ciphertext)
	return append(ciphertext[:n:n], padtext...)
}
// PKCS7UnPadding strips trailing padding from originData. The last byte is
// read as the padding length and that many bytes are removed from the end.
//
// Input that cannot carry such padding (empty input, or a padding length
// larger than the input) is returned unchanged instead of causing a panic.
func PKCS7UnPadding(originData []byte) []byte {
	length := len(originData)
	if length == 0 {
		return originData
	}

	unpadding := int(originData[length-1])
	if unpadding > length {
		return originData
	}

	return originData[:(length - unpadding)]
}
// AesEncrypt encrypts origData with AES in CBC mode. The data is padded with
// PKCS7Padding to the cipher block size, and the first block-size bytes of
// key are used as the initialization vector.
//
// It returns an error when key is not a valid AES key.
func AesEncrypt(origData, key []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}

	size := block.BlockSize()

	// Pad the plaintext to a whole number of blocks.
	plain := PKCS7Padding(origData, size)

	// Set up CBC encryption, with the IV taken from the key.
	encrypter := cipher.NewCBCEncrypter(block, key[:size])

	// Encrypt.
	out := make([]byte, len(plain))
	encrypter.CryptBlocks(out, plain)
	return out, nil
}
// aesDecryptError is the error type AesDecrypt returns for malformed input.
type aesDecryptError string

// Error implements the error interface.
func (e aesDecryptError) Error() string { return string(e) }

// AesDecrypt decrypts data produced by AES in CBC mode where the first
// block-size bytes of key serve as the initialization vector, then strips the
// trailing padding (the last byte gives the number of bytes to remove).
//
// It returns an error, rather than panicking, when key is not a valid AES
// key, when encrypted is empty or not a whole number of blocks, or when the
// decrypted padding length exceeds the data length (as typically happens
// with a wrong key).
func AesDecrypt(encrypted, key []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}

	blockSize := block.BlockSize()
	if len(encrypted) == 0 || len(encrypted)%blockSize != 0 {
		return nil, aesDecryptError("ciphertext is empty or not a multiple of the block size")
	}

	blockMode := cipher.NewCBCDecrypter(block, key[:blockSize])
	origData := make([]byte, len(encrypted))

	// Decrypt.
	blockMode.CryptBlocks(origData, encrypted)

	// Strip the padding.
	length := len(origData)
	unpadding := int(origData[length-1])
	if unpadding > length {
		return nil, aesDecryptError("invalid padding in decrypted data")
	}

	return origData[:length-unpadding], nil
}
// DealWithDecrypt decrypts a configuration value. A value that starts with
// the "{{cipher}}" prefix is treated as encrypted: the remainder is
// base64-decoded and decrypted with AesDecrypt using key, and the plaintext
// is returned. Any other value is returned unchanged.
//
// When decoding or decryption fails, the original src is returned together
// with the error.
func DealWithDecrypt(src string, key string) (string, error) {
	const cipherPrefix = "{{cipher}}"

	// No prefix means the value was never encrypted.
	if !strings.HasPrefix(src, cipherPrefix) {
		return src, nil
	}

	raw, err := BASE64StdDecode(src[len(cipherPrefix):])
	if err != nil {
		return src, err
	}

	// Decrypt.
	plain, err := AesDecrypt(raw, []byte(key))
	if err != nil {
		return src, err
	}

	// Hand back the plaintext.
	return string(plain), nil
}
// DealWithEncrypt encrypts a configuration value with AesEncrypt using key
// and returns it base64-encoded behind the "{{cipher}}" prefix. When
// encryption fails, the original src is returned together with the error.
func DealWithEncrypt(src string, key string) (string, error) {
	cipherBytes, err := AesEncrypt([]byte(src), []byte(key))
	if err != nil {
		return src, err
	}
	return "{{cipher}}" + BASE64StdEncode(cipherBytes), nil
}
================================================
FILE: pkg/secu/rsa.go
================================================
package secu
import (
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"encoding/base64"
"encoding/pem"
"fmt"
"github.com/toolkits/pkg/logger"
)
// Decrypt decrypts a base64-encoded RSA PKCS#1 v1.5 ciphertext with the
// PEM-encoded PKCS#1 private key in privateKeyByte.
//
// A leading "enc:" marker on cipherText is removed first. The key's PEM block
// is decrypted with password; if that fails and password is empty, the block
// is parsed as an unencrypted key instead.
//
// It returns an error when cipherText is not valid base64, when the private
// key cannot be decoded or parsed, or when decryption fails.
func Decrypt(cipherText string, privateKeyByte []byte, password string) (decrypted string, err error) {
	// Strip the "enc:" prefix, if present.
	if len(cipherText) > 4 && cipherText[:4] == "enc:" {
		cipherText = cipherText[4:]
	}

	// Surface malformed input here instead of passing truncated bytes on to
	// the RSA step.
	decodeCipher, err := base64.StdEncoding.DecodeString(cipherText)
	if err != nil {
		logger.Error("Failed to decode cipher text:", err)
		return "", fmt.Errorf("failed to decode cipher text: %w", err)
	}

	// PEM-decode the private key.
	block, _ := pem.Decode(privateKeyByte)
	if block == nil {
		return "", fmt.Errorf("private key block is nil")
	}

	var privateKey *rsa.PrivateKey
	var decryptedPrivateKeyBytes []byte

	decryptedPrivateKeyBytes, err = x509.DecryptPEMBlock(block, []byte(password))
	if err == nil {
		privateKey, err = x509.ParsePKCS1PrivateKey(decryptedPrivateKeyBytes)
	} else if password == "" { // has error. retry unencrypted
		privateKey, err = x509.ParsePKCS1PrivateKey(block.Bytes)
	}
	if err != nil {
		logger.Error("Failed to parse private key:", err)
		return "", err
	}

	decryptedByte, err := rsa.DecryptPKCS1v15(rand.Reader, privateKey, decodeCipher)
	if err != nil {
		logger.Error("Failed to decrypt data:", err)
		return "", err
	}

	return string(decryptedByte), nil
}
// EncryptValue encrypts value with RSA PKCS#1 v1.5 using the PEM-encoded PKIX
// public key in publicKeyData. The result is the base64-encoded ciphertext
// prefixed with "enc:" to mark it as encrypted.
//
// It returns an error when publicKeyData holds no PEM block, when the key
// cannot be parsed or is not an RSA key, or when encryption fails.
func EncryptValue(value string, publicKeyData []byte) (string, error) {
	publicKeyBlock, _ := pem.Decode(publicKeyData)
	if publicKeyBlock == nil {
		// pem.Decode returns a nil block when no PEM data is found.
		return "", fmt.Errorf("failed to parse public key: no PEM block found")
	}

	parsedPublicKey, err := x509.ParsePKIXPublicKey(publicKeyBlock.Bytes)
	if err != nil {
		return "", fmt.Errorf("failed to parse public key: %v", err)
	}

	publicKey, ok := parsedPublicKey.(*rsa.PublicKey)
	if !ok {
		return "", fmt.Errorf("failed to assert parsed key as RSA public key")
	}

	ciphertext, err := rsa.EncryptPKCS1v15(rand.Reader, publicKey, []byte(value))
	if err != nil {
		return "", fmt.Errorf("failed to encrypt value: %v", err)
	}

	// The "enc:" prefix marks the value as encrypted data.
	return "enc:" + base64.StdEncoding.EncodeToString(ciphertext), nil
}
// GenerateRsaKeyPair creates a 2048-bit RSA key pair and returns both halves
// PEM-encoded: the private key as a PKCS#1 "RSA PRIVATE KEY" block and the
// public key as a PKIX "PUBLIC KEY" block.
//
// When password is non-empty the private key block is encrypted with it using
// AES-256; otherwise it is emitted unencrypted.
func GenerateRsaKeyPair(password string) (privateByte, publicByte []byte, err error) {
	key, genErr := rsa.GenerateKey(rand.Reader, 2048)
	if genErr != nil {
		return nil, nil, fmt.Errorf("failed to GenerateKey: %v", genErr)
	}

	privateBlock := &pem.Block{
		Type:  "RSA PRIVATE KEY",
		Bytes: x509.MarshalPKCS1PrivateKey(key),
	}
	if password != "" {
		encrypted, encErr := x509.EncryptPEMBlock(rand.Reader, privateBlock.Type, privateBlock.Bytes, []byte(password), x509.PEMCipherAES256)
		if encErr != nil {
			return nil, nil, fmt.Errorf("failed to EncryptPEMBlock: %v", encErr)
		}
		privateBlock = encrypted
	}
	privateByte = pem.EncodeToMemory(privateBlock)

	publicDER, marshalErr := x509.MarshalPKIXPublicKey(&key.PublicKey)
	if marshalErr != nil {
		// The private half is already encoded at this point and is returned
		// alongside the error.
		return privateByte, nil, fmt.Errorf("failed to MarshalPKIXPublicKey: %v", marshalErr)
	}

	publicByte = pem.EncodeToMemory(&pem.Block{
		Type:  "PUBLIC KEY",
		Bytes: publicDER,
	})
	return privateByte, publicByte, nil
}
================================================
FILE: pkg/slice/contains.go
================================================
package slice
// HaveIntersection reports whether slice1 and slice2 share at least one
// element. It returns false when either slice is empty.
func HaveIntersection[T comparable](slice1, slice2 []T) bool {
	seen := make(map[T]struct{})
	for i := range slice1 {
		seen[slice1[i]] = struct{}{}
	}

	for i := range slice2 {
		if _, found := seen[slice2[i]]; found {
			return true
		}
	}
	return false
}
================================================
FILE: pkg/strx/verify.go
================================================
package strx
import (
"net/http"
"regexp"
"strconv"
"strings"
"github.com/toolkits/pkg/errorx"
)
// validURLPattern is the pattern used by IsValidURL. It requires an http://
// or https:// prefix followed by at least two characters, of which the first
// is not whitespace, "/", "$", ".", "?" or "#", and allows no whitespace
// after the second.
var validURLPattern = regexp.MustCompile(`^https?://[^\s/$.?#].[^\s]*$`)

// IsValidURL reports whether url matches validURLPattern.
//
// The pattern is compiled once at package level instead of on every call.
func IsValidURL(url string) bool {
	return validURLPattern.MatchString(url)
}
// IdsInt64ForAPI splits ids into int64 values. The separator defaults to ","
// and can be overridden with the first element of sep; a single space as the
// separator splits on any run of whitespace. Empty fields are skipped, and an
// empty ids string yields an empty, non-nil slice.
//
// A field that is not a valid base-10 int64 is reported through errorx.Bomb
// with http.StatusBadRequest.
func IdsInt64ForAPI(ids string, sep ...string) []int64 {
	if ids == "" {
		return []int64{}
	}

	delimiter := ","
	if len(sep) > 0 {
		delimiter = sep[0]
	}

	var fields []string
	switch delimiter {
	case " ":
		fields = strings.Fields(ids)
	default:
		fields = strings.Split(ids, delimiter)
	}

	result := make([]int64, 0, len(fields))
	for _, field := range fields {
		if field == "" {
			continue
		}

		id, err := strconv.ParseInt(field, 10, 64)
		if err != nil {
			errorx.Bomb(http.StatusBadRequest, "cannot convert %s to int64", field)
		}
		result = append(result, id)
	}

	return result
}
================================================
FILE: pkg/tlsx/common.go
================================================
package tlsx
import (
"crypto/tls"
"fmt"
)
// tlsVersionMap maps TLS version names to the corresponding crypto/tls
// version constants.
var tlsVersionMap = map[string]uint16{
"TLS10": tls.VersionTLS10,
"TLS11": tls.VersionTLS11,
"TLS12": tls.VersionTLS12,
"TLS13": tls.VersionTLS13,
}
// tlsCipherMap maps the cipher suite names accepted by ParseCiphers to the
// corresponding crypto/tls cipher suite constants. Each key is spelled exactly
// like the constant it maps to. The table also lists 3DES and RC4 based suites.
var tlsCipherMap = map[string]uint16{
"TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305": tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,
"TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305": tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
"TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256": tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
"TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256": tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
"TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384": tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
"TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384": tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256": tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA": tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256": tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA": tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA": tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,
"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA": tls.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,
"TLS_RSA_WITH_AES_128_GCM_SHA256": tls.TLS_RSA_WITH_AES_128_GCM_SHA256,
"TLS_RSA_WITH_AES_256_GCM_SHA384": tls.TLS_RSA_WITH_AES_256_GCM_SHA384,
"TLS_RSA_WITH_AES_128_CBC_SHA256": tls.TLS_RSA_WITH_AES_128_CBC_SHA256,
"TLS_RSA_WITH_AES_128_CBC_SHA": tls.TLS_RSA_WITH_AES_128_CBC_SHA,
"TLS_RSA_WITH_AES_256_CBC_SHA": tls.TLS_RSA_WITH_AES_256_CBC_SHA,
"TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA": tls.TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA,
"TLS_RSA_WITH_3DES_EDE_CBC_SHA": tls.TLS_RSA_WITH_3DES_EDE_CBC_SHA,
"TLS_RSA_WITH_RC4_128_SHA": tls.TLS_RSA_WITH_RC4_128_SHA,
"TLS_ECDHE_RSA_WITH_RC4_128_SHA": tls.TLS_ECDHE_RSA_WITH_RC4_128_SHA,
"TLS_ECDHE_ECDSA_WITH_RC4_128_SHA": tls.TLS_ECDHE_ECDSA_WITH_RC4_128_SHA,
"TLS_AES_128_GCM_SHA256": tls.TLS_AES_128_GCM_SHA256,
"TLS_AES_256_GCM_SHA384": tls.TLS_AES_256_GCM_SHA384,
"TLS_CHACHA20_POLY1305_SHA256": tls.TLS_CHACHA20_POLY1305_SHA256,
}
// ParseCiphers converts cipher suite names into their crypto/tls identifiers,
// preserving the input order.
//
// If any name is not present in tlsCipherMap, ParseCiphers returns nil and an
// error naming the first unsupported cipher. An empty input yields an empty,
// non-nil slice.
func ParseCiphers(ciphers []string) ([]uint16, error) {
	ids := make([]uint16, 0, len(ciphers))
	for _, name := range ciphers {
		id, known := tlsCipherMap[name]
		if !known {
			return nil, fmt.Errorf("unsupported cipher %q", name)
		}
		ids = append(ids, id)
	}
	return ids, nil
}
// ParseTLSVersion converts a TLS version name (a key of tlsVersionMap, such as
// "TLS12") into the matching crypto/tls version constant.
//
// It returns 0 and an error when the name is not supported.
func ParseTLSVersion(version string) (uint16, error) {
	id, known := tlsVersionMap[version]
	if !known {
		return 0, fmt.Errorf("unsupported version %q", version)
	}
	return id, nil
}
================================================
FILE: pkg/tlsx/config.go
================================================
package tlsx
import (
"crypto/tls"
"crypto/x509"
"fmt"
"os"
"strings"
"github.com/ccfos/nightingale/v6/pkg/choice"
)
// ClientConfig represents the standard client TLS config.
type ClientConfig struct {
// UseTLS enables TLS; when false, TLSConfig returns (nil, nil).
UseTLS bool
// TLSCA is the path of a PEM file whose certificates become the root CA pool.
TLSCA string
// TLSCert and TLSKey are the paths of the client certificate and private
// key; the pair is only loaded when both are set.
TLSCert string
TLSKey string
// TLSKeyPwd is not read by the TLSConfig method in this file.
// NOTE(review): presumably the private key password — confirm against callers.
TLSKeyPwd string
// InsecureSkipVerify is copied to tls.Config.InsecureSkipVerify.
InsecureSkipVerify bool
// ServerName, when non-empty, is copied to tls.Config.ServerName.
ServerName string
// TLSMinVersion and TLSMaxVersion accept "1.0", "1.1", "1.2" or "1.3";
// any other value leaves the corresponding tls.Config field unset.
TLSMinVersion string
TLSMaxVersion string
}
// ServerConfig represents the standard server TLS config.
type ServerConfig struct {
// TLSCert and TLSKey are the paths of the server certificate and private
// key; the pair is only loaded when both are set.
TLSCert string
TLSKey string
// TLSKeyPwd is not read by the TLSConfig method in this file.
// NOTE(review): presumably the private key password — confirm against callers.
TLSKeyPwd string
// TLSAllowedCACerts lists PEM files used as the client CA pool. Setting it
// makes TLSConfig require and verify client certificates.
TLSAllowedCACerts []string
// TLSCipherSuites lists cipher suite names understood by ParseCiphers.
TLSCipherSuites []string
// TLSMinVersion and TLSMaxVersion are version names understood by
// ParseTLSVersion (for example "TLS12").
TLSMinVersion string
TLSMaxVersion string
// TLSAllowedDNSNames restricts client certificates to those carrying at
// least one of these DNS names; it only takes effect when
// TLSAllowedCACerts is also set.
TLSAllowedDNSNames []string
}
// TLSConfig builds the client-side *tls.Config described by c.
//
// It returns (nil, nil) when UseTLS is false. An error is returned when the CA
// file or the certificate/key pair cannot be loaded. Renegotiation is always
// disabled. Version strings other than "1.0", "1.1", "1.2" and "1.3" are
// ignored, leaving the crypto/tls defaults in place.
func (c *ClientConfig) TLSConfig() (*tls.Config, error) {
	if !c.UseTLS {
		return nil, nil
	}

	cfg := &tls.Config{
		InsecureSkipVerify: c.InsecureSkipVerify,
		Renegotiation:      tls.RenegotiateNever,
		// The zero value of ServerName is "", so copying it unconditionally
		// matches "set only when non-empty".
		ServerName: c.ServerName,
	}

	if c.TLSCA != "" {
		roots, err := makeCertPool([]string{c.TLSCA})
		if err != nil {
			return nil, err
		}
		cfg.RootCAs = roots
	}

	if c.TLSCert != "" && c.TLSKey != "" {
		if err := loadCertificate(cfg, c.TLSCert, c.TLSKey); err != nil {
			return nil, err
		}
	}

	switch c.TLSMinVersion {
	case "1.0":
		cfg.MinVersion = tls.VersionTLS10
	case "1.1":
		cfg.MinVersion = tls.VersionTLS11
	case "1.2":
		cfg.MinVersion = tls.VersionTLS12
	case "1.3":
		cfg.MinVersion = tls.VersionTLS13
	}

	switch c.TLSMaxVersion {
	case "1.0":
		cfg.MaxVersion = tls.VersionTLS10
	case "1.1":
		cfg.MaxVersion = tls.VersionTLS11
	case "1.2":
		cfg.MaxVersion = tls.VersionTLS12
	case "1.3":
		cfg.MaxVersion = tls.VersionTLS13
	}

	return cfg, nil
}
// TLSConfig builds the server-side *tls.Config described by c.
//
// It returns (nil, nil) when neither a certificate, a key, nor any allowed CA
// certificate is configured. When TLSAllowedCACerts is set, client
// certificates are required and verified against that pool; if
// TLSAllowedDNSNames is also set, the client certificate must additionally
// carry one of those DNS names.
//
// An error is returned when a certificate file cannot be loaded, when a cipher
// suite or TLS version name is not supported, or when the minimum version is
// greater than the maximum version.
func (c *ServerConfig) TLSConfig() (*tls.Config, error) {
	if c.TLSCert == "" && c.TLSKey == "" && len(c.TLSAllowedCACerts) == 0 {
		return nil, nil
	}

	tlsConfig := &tls.Config{}

	if len(c.TLSAllowedCACerts) != 0 {
		pool, err := makeCertPool(c.TLSAllowedCACerts)
		if err != nil {
			return nil, err
		}
		tlsConfig.ClientCAs = pool
		tlsConfig.ClientAuth = tls.RequireAndVerifyClientCert
	}

	if c.TLSCert != "" && c.TLSKey != "" {
		if err := loadCertificate(tlsConfig, c.TLSCert, c.TLSKey); err != nil {
			return nil, err
		}
	}

	if len(c.TLSCipherSuites) != 0 {
		cipherSuites, err := ParseCiphers(c.TLSCipherSuites)
		if err != nil {
			return nil, fmt.Errorf(
				"could not parse server cipher suites %s: %v", strings.Join(c.TLSCipherSuites, ","), err)
		}
		tlsConfig.CipherSuites = cipherSuites
	}

	if c.TLSMaxVersion != "" {
		version, err := ParseTLSVersion(c.TLSMaxVersion)
		if err != nil {
			return nil, fmt.Errorf(
				"could not parse tls max version %q: %v", c.TLSMaxVersion, err)
		}
		tlsConfig.MaxVersion = version
	}

	if c.TLSMinVersion != "" {
		version, err := ParseTLSVersion(c.TLSMinVersion)
		if err != nil {
			return nil, fmt.Errorf(
				"could not parse tls min version %q: %v", c.TLSMinVersion, err)
		}
		tlsConfig.MinVersion = version
	}

	if tlsConfig.MinVersion != 0 && tlsConfig.MaxVersion != 0 && tlsConfig.MinVersion > tlsConfig.MaxVersion {
		// Report the configured names: %q applied to the numeric uint16
		// versions would render them as quoted runes.
		return nil, fmt.Errorf(
			"tls min version %q can't be greater than tls max version %q", c.TLSMinVersion, c.TLSMaxVersion)
	}

	// ClientAuth is tls.RequireAndVerifyClientCert whenever allowed CA
	// certificates are configured, so there is always a client certificate to
	// check against the allowed DNS names.
	if len(c.TLSAllowedCACerts) > 0 && len(c.TLSAllowedDNSNames) > 0 {
		tlsConfig.VerifyPeerCertificate = c.verifyPeerCertificate
	}

	return tlsConfig, nil
}
// makeCertPool reads every file in certFiles and adds the PEM certificates it
// contains to a new x509.CertPool.
//
// It returns an error as soon as a file cannot be read or yields no parsable
// certificate. An empty certFiles produces an empty pool.
func makeCertPool(certFiles []string) (*x509.CertPool, error) {
	pool := x509.NewCertPool()
	for _, certFile := range certFiles {
		data, err := os.ReadFile(certFile)
		if err != nil {
			return nil, fmt.Errorf(
				"could not read certificate %q: %w", certFile, err)
		}
		if !pool.AppendCertsFromPEM(data) {
			// AppendCertsFromPEM only reports success or failure; there is no
			// underlying error to attach here.
			return nil, fmt.Errorf(
				"could not parse any PEM certificates %q", certFile)
		}
	}
	return pool, nil
}
// loadCertificate loads the PEM key pair stored in certFile and keyFile and
// installs it as the only certificate of config.
//
// config.NameToCertificate is left nil on purpose: the field and
// BuildNameToCertificate are deprecated, and crypto/tls selects the
// certificate from config.Certificates on its own.
func loadCertificate(config *tls.Config, certFile, keyFile string) error {
	cert, err := tls.LoadX509KeyPair(certFile, keyFile)
	if err != nil {
		return fmt.Errorf(
			"could not load keypair %s:%s: %v", certFile, keyFile, err)
	}

	config.Certificates = []tls.Certificate{cert}
	return nil
}
// verifyPeerCertificate accepts the peer only if its leaf certificate carries
// at least one DNS name listed in c.TLSAllowedDNSNames.
//
// It has the signature of tls.Config.VerifyPeerCertificate. rawCerts[0] is
// parsed as the client (leaf) certificate; verifiedChains is not used. An
// error is returned when no certificate is presented, when the leaf cannot be
// parsed, or when none of its DNS names is allowed.
func (c *ServerConfig) verifyPeerCertificate(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
	// Guard the index below: an empty slice would otherwise panic.
	if len(rawCerts) == 0 {
		return fmt.Errorf("could not validate peer certificate: no certificate presented")
	}

	// The certificate chain is client + intermediate + root.
	// Only the client certificate is inspected.
	cert, err := x509.ParseCertificate(rawCerts[0])
	if err != nil {
		return fmt.Errorf("could not validate peer certificate: %v", err)
	}

	for _, name := range cert.DNSNames {
		if choice.Contains(name, c.TLSAllowedDNSNames) {
			return nil
		}
	}

	return fmt.Errorf("peer certificate not in allowed DNS Name list: %v", cert.DNSNames)
}
================================================
FILE: pkg/tplx/conv.go
================================================
package tplx
import (
"fmt"
"strconv"
)
// ToFloat64 converts val to a float64.
//
// Numeric types are converted directly and booleans map to 1 (true) or 0
// (false). Strings are tried, in order, as a float, as an integer with base
// prefix detection (so "0x10" is 16), as a strconv.ParseBool value, and
// finally against a fixed list of yes/no style words. Any other type is
// formatted with fmt.Sprint and parsed as a float.
//
// An error is returned when no interpretation succeeds.
func ToFloat64(val interface{}) (float64, error) {
	switch v := val.(type) {
	case float64:
		return v, nil
	case float32:
		return float64(v), nil
	case int:
		return float64(v), nil
	case int8:
		return float64(v), nil
	case int16:
		return float64(v), nil
	case int32:
		return float64(v), nil
	case int64:
		return float64(v), nil
	case uint:
		return float64(v), nil
	case uint8:
		return float64(v), nil
	case uint16:
		return float64(v), nil
	case uint32:
		return float64(v), nil
	case uint64:
		return float64(v), nil
	case bool:
		if v {
			return 1, nil
		}
		return 0, nil
	case string:
		if f, err := strconv.ParseFloat(v, 64); err == nil {
			return f, nil
		}
		// Base 0 lets ParseInt honour prefixes such as 0x, 0o and 0b.
		if n, err := strconv.ParseInt(v, 0, 64); err == nil {
			return float64(n), nil
		}
		if b, err := strconv.ParseBool(v); err == nil {
			if b {
				return 1, nil
			}
			return 0, nil
		}
		switch v {
		case "Yes", "yes", "YES", "Y", "ON", "on", "On", "ok", "up":
			return 1, nil
		case "No", "no", "NO", "N", "OFF", "off", "Off", "fail", "err", "down":
			return 0, nil
		}
		return 0, fmt.Errorf("unparsable value %v", v)
	default:
		return strconv.ParseFloat(fmt.Sprint(v), 64)
	}
}
================================================
FILE: pkg/tplx/fns.go
================================================
package tplx
import (
"encoding/json"
"errors"
"fmt"
"html/template"
"math"
"net"
"net/url"
"reflect"
"regexp"
"sort"
"strconv"
"strings"
"time"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/util/strutil"
)
// errNaNOrInf is returned by floatToTime when the value is NaN or infinite;
// HumanizeTimestamp checks for it with errors.Is.
var (
errNaNOrInf = errors.New("value is NaN or Inf")
)
// sample is one entry of a QueryResult: a set of labels and a numeric value.
type sample struct {
// Labels holds label names mapped to label values.
Labels map[string]string
// Value is the numeric value of the sample.
Value float64
}
// QueryFunc is the signature of the function used by Query. Query passes a
// datasource ID and a PromQL expression, in that order.
type QueryFunc func(int64, string) model.Value
// queryFunc is the function registered through RegisterQueryFunc; it is nil
// until one is registered.
var queryFunc QueryFunc
// RegisterQueryFunc registers the function used by Query. The function is
// injected from outside this package to avoid an import cycle.
func RegisterQueryFunc(f QueryFunc) {
queryFunc = f
}
// QueryResult is a list of samples, as produced by ConvertToQueryResult.
type QueryResult []*sample
// queryResultByLabelSorter implements sort.Interface to order a QueryResult
// by the value of a single label.
type queryResultByLabelSorter struct {
// results is the slice being sorted in place.
results QueryResult
// by is the name of the label whose value is used as the sort key.
by string
}
// Len returns the number of samples.
func (q queryResultByLabelSorter) Len() int {
return len(q.results)
}
// Less compares the values of label q.by as strings; a sample without that
// label compares as the empty string.
func (q queryResultByLabelSorter) Less(i, j int) bool {
return q.results[i].Labels[q.by] < q.results[j].Labels[q.by]
}
// Swap exchanges the samples at positions i and j.
func (q queryResultByLabelSorter) Swap(i, j int) {
q.results[i], q.results[j] = q.results[j], q.results[i]
}
// Unescaped returns str as a template.HTML value, which html/template emits
// without escaping. Only use it with trusted content.
func Unescaped(str string) interface{} {
return template.HTML(str)
}
// Urlconvert returns str as a template.URL value, which html/template treats
// as a trusted URL. Only use it with trusted content.
func Urlconvert(str string) interface{} {
return template.URL(str)
}
// Timeformat formats the Unix timestamp ts (in seconds) in the local time
// zone. The layout defaults to "2006-01-02 15:04:05"; the first value of
// pattern, if any, overrides it.
func Timeformat(ts int64, pattern ...string) string {
	layout := "2006-01-02 15:04:05"
	if len(pattern) != 0 {
		layout = pattern[0]
	}
	moment := time.Unix(ts, 0)
	return moment.Format(layout)
}
// Timestamp formats the current local time. The layout defaults to
// "2006-01-02 15:04:05"; the first value of pattern, if any, overrides it.
func Timestamp(pattern ...string) string {
	layout := "2006-01-02 15:04:05"
	if len(pattern) != 0 {
		layout = pattern[0]
	}
	current := time.Now()
	return current.Format(layout)
}
// Now returns the current local time (time.Now).
func Now() time.Time {
return time.Now()
}
// Args packs its arguments into a map keyed "arg0", "arg1", ... in call
// order. With no arguments it returns an empty, non-nil map.
func Args(args ...interface{}) map[string]interface{} {
	named := make(map[string]interface{}, len(args))
	for idx := range args {
		named["arg"+strconv.Itoa(idx)] = args[idx]
	}
	return named
}
// ReReplaceAll replaces every match of the regular expression pattern in text
// with repl; repl may reference capture groups ($1, ${name}).
//
// The pattern is compiled with regexp.MustCompile, so an invalid pattern
// panics.
func ReReplaceAll(pattern, repl, text string) string {
	return regexp.MustCompile(pattern).ReplaceAllString(text, repl)
}
// Humanize parses s as a float and renders it with two decimals and a metric
// (power of 1000) prefix: k, M, G, ... for magnitudes of at least 1 and
// m, u, n, ... for smaller magnitudes.
//
// Zero, NaN and infinities are printed without a prefix. If s is not a valid
// float it is returned unchanged.
func Humanize(s string) string {
	val, err := strconv.ParseFloat(s, 64)
	switch {
	case err != nil:
		return s
	case val == 0 || math.IsNaN(val) || math.IsInf(val, 0):
		return fmt.Sprintf("%.2f", val)
	}

	unit := ""
	if math.Abs(val) >= 1 {
		// Scale down until the value drops below 1000.
		for _, next := range [...]string{"k", "M", "G", "T", "P", "E", "Z", "Y"} {
			if math.Abs(val) < 1000 {
				break
			}
			unit, val = next, val/1000
		}
	} else {
		// Scale up until the value reaches at least 1.
		for _, next := range [...]string{"m", "u", "n", "p", "f", "a", "z", "y"} {
			if math.Abs(val) >= 1 {
				break
			}
			unit, val = next, val*1000
		}
	}
	return fmt.Sprintf("%.2f%s", val, unit)
}
// Humanize1024 parses s as a float and renders it with four significant
// digits and a binary (power of 1024) prefix: ki, Mi, Gi, ...
//
// Values whose magnitude is at most 1, NaN and infinities are printed without
// a prefix. If s is not a valid float it is returned unchanged.
func Humanize1024(s string) string {
	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return s
	}

	unit := ""
	// NaN fails the comparison and infinities are excluded explicitly, so
	// both keep an empty prefix.
	if math.Abs(val) > 1 && !math.IsInf(val, 0) {
		for _, next := range [...]string{"ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"} {
			if math.Abs(val) < 1024 {
				break
			}
			unit, val = next, val/1024
		}
	}
	return fmt.Sprintf("%.4g%s", val, unit)
}
// ToString formats v with fmt.Sprint.
func ToString(v interface{}) string {
return fmt.Sprint(v)
}
// HumanizeDuration parses s as a number of seconds and formats it with
// HumanizeDurationFloat64. If s is not a valid float it is returned unchanged.
func HumanizeDuration(s string) string {
	if secs, err := strconv.ParseFloat(s, 64); err == nil {
		return HumanizeDurationFloat64(secs)
	}
	return s
}
// HumanizeDurationInterface converts i to seconds with ToFloat64 and formats
// the result with HumanizeDurationFloat64. If the conversion fails, i is
// returned formatted by ToString.
func HumanizeDurationInterface(i interface{}) string {
	if secs, err := ToFloat64(i); err == nil {
		return HumanizeDurationFloat64(secs)
	}
	return ToString(i)
}
// HumanizeDurationFloat64 formats v, a duration in seconds, for display.
//
//   - NaN and infinities are printed as is, without a unit.
//   - Zero is printed as "0s".
//   - Magnitudes below one second use a metric sub-unit (ms, us, ns, ...)
//     with four significant digits.
//   - Magnitudes below one minute are printed in seconds with four
//     significant digits.
//   - Anything longer is broken down into days, hours, minutes and whole
//     seconds, omitting leading zero components ("1h 0m 5s").
//
// Negative durations are prefixed with "-".
func HumanizeDurationFloat64(v float64) string {
	switch {
	case math.IsNaN(v) || math.IsInf(v, 0):
		return fmt.Sprintf("%.4g", v)
	case v == 0:
		return fmt.Sprintf("%.4gs", v)
	case math.Abs(v) < 1:
		unit := ""
		for _, next := range [...]string{"m", "u", "n", "p", "f", "a", "z", "y"} {
			if math.Abs(v) >= 1 {
				break
			}
			unit, v = next, v*1000
		}
		return fmt.Sprintf("%.4g%ss", v, unit)
	}

	sign := ""
	if v < 0 {
		sign, v = "-", -v
	}

	total := int64(v)
	seconds := total % 60
	minutes := (total / 60) % 60
	hours := (total / 60 / 60) % 24
	days := total / 60 / 60 / 24

	// From minutes upwards, seconds are shown as an integer.
	switch {
	case days != 0:
		return fmt.Sprintf("%s%dd %dh %dm %ds", sign, days, hours, minutes, seconds)
	case hours != 0:
		return fmt.Sprintf("%s%dh %dm %ds", sign, hours, minutes, seconds)
	case minutes != 0:
		return fmt.Sprintf("%s%dm %ds", sign, minutes, seconds)
	}
	// Below one minute, show four significant digits.
	return fmt.Sprintf("%s%.4gs", sign, v)
}
// HumanizePercentage parses s as a ratio (1 means 100%) and formats it as a
// percentage with two decimals. If s is not a valid float it is returned
// unchanged.
func HumanizePercentage(s string) string {
	ratio, err := strconv.ParseFloat(s, 64)
	if err == nil {
		return fmt.Sprintf("%.2f%%", ratio*100)
	}
	return s
}
// HumanizePercentageH parses s as a value that is already expressed in
// percent (50 means 50%) and formats it with two decimals and a "%" sign. If
// s is not a valid float it is returned unchanged.
func HumanizePercentageH(s string) string {
	percent, err := strconv.ParseFloat(s, 64)
	if err == nil {
		return fmt.Sprintf("%.2f%%", percent)
	}
	return s
}
// HumanizeTimestamp converts i, a Unix timestamp in seconds, to a printable
// UTC time.
//
// NaN and infinite values are not an error: they are printed as numbers.
// An error is returned when i cannot be converted to a float or when the
// timestamp cannot be represented.
func HumanizeTimestamp(i interface{}) (string, error) {
	secs, err := convertToFloat(i)
	if err != nil {
		return "", err
	}

	tm, err := floatToTime(secs)
	if errors.Is(err, errNaNOrInf) {
		return fmt.Sprintf("%.4g", secs), nil
	}
	if err != nil {
		return "", err
	}
	return fmt.Sprint(tm), nil
}
// Add returns the sum of a and b.
//
// Both operands may be any signed integer, unsigned integer or float kind.
// The result is a float64 if either operand is a float, a uint64 if both are
// unsigned, and an int64 otherwise. An operand of any other kind yields an
// error; a is checked before b.
func Add(a, b interface{}) (interface{}, error) {
	const (
		signed = iota + 1
		unsigned
		floating
	)
	// classOf groups reflect kinds into the three supported numeric families;
	// 0 means unsupported.
	classOf := func(k reflect.Kind) int {
		switch k {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			return signed
		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
			return unsigned
		case reflect.Float32, reflect.Float64:
			return floating
		}
		return 0
	}

	lhs, rhs := reflect.ValueOf(a), reflect.ValueOf(b)
	lc, rc := classOf(lhs.Kind()), classOf(rhs.Kind())
	if lc == 0 {
		return nil, fmt.Errorf("add: unknown type for %q (%T)", lhs, a)
	}
	if rc == 0 {
		return nil, fmt.Errorf("add: unknown type for %q (%T)", rhs, b)
	}

	switch lc {
	case signed:
		x := lhs.Int()
		switch rc {
		case signed:
			return x + rhs.Int(), nil
		case unsigned:
			return x + int64(rhs.Uint()), nil
		default:
			return float64(x) + rhs.Float(), nil
		}
	case unsigned:
		x := lhs.Uint()
		switch rc {
		case signed:
			return int64(x) + rhs.Int(), nil
		case unsigned:
			return x + rhs.Uint(), nil
		default:
			return float64(x) + rhs.Float(), nil
		}
	default:
		x := lhs.Float()
		switch rc {
		case signed:
			return x + float64(rhs.Int()), nil
		case unsigned:
			return x + float64(rhs.Uint()), nil
		default:
			return x + rhs.Float(), nil
		}
	}
}
// Subtract returns a - b.
//
// Both operands may be any signed integer, unsigned integer or float kind.
// The result is a float64 if either operand is a float, a uint64 if both are
// unsigned (wrapping around on underflow), and an int64 otherwise. An operand
// of any other kind yields an error; a is checked before b.
func Subtract(a, b interface{}) (interface{}, error) {
	const (
		signed = iota + 1
		unsigned
		floating
	)
	// classOf groups reflect kinds into the three supported numeric families;
	// 0 means unsupported.
	classOf := func(k reflect.Kind) int {
		switch k {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			return signed
		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
			return unsigned
		case reflect.Float32, reflect.Float64:
			return floating
		}
		return 0
	}

	lhs, rhs := reflect.ValueOf(a), reflect.ValueOf(b)
	lc, rc := classOf(lhs.Kind()), classOf(rhs.Kind())
	if lc == 0 {
		return nil, fmt.Errorf("subtract: unknown type for %q (%T)", lhs, a)
	}
	if rc == 0 {
		return nil, fmt.Errorf("subtract: unknown type for %q (%T)", rhs, b)
	}

	switch lc {
	case signed:
		x := lhs.Int()
		switch rc {
		case signed:
			return x - rhs.Int(), nil
		case unsigned:
			return x - int64(rhs.Uint()), nil
		default:
			return float64(x) - rhs.Float(), nil
		}
	case unsigned:
		x := lhs.Uint()
		switch rc {
		case signed:
			return int64(x) - rhs.Int(), nil
		case unsigned:
			return x - rhs.Uint(), nil
		default:
			return float64(x) - rhs.Float(), nil
		}
	default:
		x := lhs.Float()
		switch rc {
		case signed:
			return x - float64(rhs.Int()), nil
		case unsigned:
			return x - float64(rhs.Uint()), nil
		default:
			return x - rhs.Float(), nil
		}
	}
}
// Multiply returns the product of a and b.
//
// Both operands may be any signed integer, unsigned integer or float kind.
// The result is a float64 if either operand is a float, a uint64 if both are
// unsigned, and an int64 otherwise. An operand of any other kind yields an
// error; a is checked before b.
func Multiply(a, b interface{}) (interface{}, error) {
	const (
		signed = iota + 1
		unsigned
		floating
	)
	// classOf groups reflect kinds into the three supported numeric families;
	// 0 means unsupported.
	classOf := func(k reflect.Kind) int {
		switch k {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			return signed
		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
			return unsigned
		case reflect.Float32, reflect.Float64:
			return floating
		}
		return 0
	}

	lhs, rhs := reflect.ValueOf(a), reflect.ValueOf(b)
	lc, rc := classOf(lhs.Kind()), classOf(rhs.Kind())
	if lc == 0 {
		return nil, fmt.Errorf("multiply: unknown type for %q (%T)", lhs, a)
	}
	if rc == 0 {
		return nil, fmt.Errorf("multiply: unknown type for %q (%T)", rhs, b)
	}

	switch lc {
	case signed:
		x := lhs.Int()
		switch rc {
		case signed:
			return x * rhs.Int(), nil
		case unsigned:
			return x * int64(rhs.Uint()), nil
		default:
			return float64(x) * rhs.Float(), nil
		}
	case unsigned:
		x := lhs.Uint()
		switch rc {
		case signed:
			return int64(x) * rhs.Int(), nil
		case unsigned:
			return x * rhs.Uint(), nil
		default:
			return float64(x) * rhs.Float(), nil
		}
	default:
		x := lhs.Float()
		switch rc {
		case signed:
			return x * float64(rhs.Int()), nil
		case unsigned:
			return x * float64(rhs.Uint()), nil
		default:
			return x * rhs.Float(), nil
		}
	}
}
// Divide returns a divided by b.
//
// Both operands may be any signed integer, unsigned integer or float kind.
// The result is a float64 if either operand is a float, a uint64 if both are
// unsigned, and an int64 otherwise; integer results are truncated.
//
// An error is returned when an operand is of any other kind (a is checked
// before b) and when an integer division has a zero divisor. Float division by
// zero is not an error and follows IEEE 754 (±Inf or NaN).
func Divide(a, b interface{}) (interface{}, error) {
	// Integer division by zero would panic at run time; report it instead.
	errZero := errors.New("divide: division by zero")

	av := reflect.ValueOf(a)
	bv := reflect.ValueOf(b)

	switch av.Kind() {
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		switch bv.Kind() {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			d := bv.Int()
			if d == 0 {
				return nil, errZero
			}
			return av.Int() / d, nil
		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
			d := int64(bv.Uint())
			if d == 0 {
				return nil, errZero
			}
			return av.Int() / d, nil
		case reflect.Float32, reflect.Float64:
			return float64(av.Int()) / bv.Float(), nil
		default:
			return nil, fmt.Errorf("divide: unknown type for %q (%T)", bv, b)
		}
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		switch bv.Kind() {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			d := bv.Int()
			if d == 0 {
				return nil, errZero
			}
			return int64(av.Uint()) / d, nil
		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
			d := bv.Uint()
			if d == 0 {
				return nil, errZero
			}
			return av.Uint() / d, nil
		case reflect.Float32, reflect.Float64:
			return float64(av.Uint()) / bv.Float(), nil
		default:
			return nil, fmt.Errorf("divide: unknown type for %q (%T)", bv, b)
		}
	case reflect.Float32, reflect.Float64:
		switch bv.Kind() {
		case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
			return av.Float() / float64(bv.Int()), nil
		case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
			return av.Float() / float64(bv.Uint()), nil
		case reflect.Float32, reflect.Float64:
			return av.Float() / bv.Float(), nil
		default:
			return nil, fmt.Errorf("divide: unknown type for %q (%T)", bv, b)
		}
	default:
		return nil, fmt.Errorf("divide: unknown type for %q (%T)", av, a)
	}
}
// FormatDecimal parses s as a float and formats it in fixed-point notation
// with n digits after the decimal point. If s is not a valid float it is
// returned unchanged.
func FormatDecimal(s string, n int) string {
	num, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return s
	}
	// Build a verb such as "%.2f" carrying the requested precision.
	layout := "%." + strconv.Itoa(n) + "f"
	return fmt.Sprintf(layout, num)
}
// First returns the first sample of v, or an error when v is empty.
func First(v QueryResult) (*sample, error) {
	if len(v) == 0 {
		return nil, errors.New("first() called on vector with no elements")
	}
	return v[0], nil
}
// Label returns the value of the given label on s, or "" when the label is
// absent. s must not be nil.
func Label(label string, s *sample) string {
return s.Labels[label]
}
// Value returns the numeric value of s. s must not be nil.
func Value(s *sample) float64 {
return s.Value
}
// StrValue returns the value of the "__value__" label of s, or "" when that
// label is absent. s must not be nil.
func StrValue(s *sample) string {
return s.Labels["__value__"]
}
// SafeHtml returns text as template.HTML, which html/template emits without
// escaping. Only use it with trusted content.
func SafeHtml(text string) template.HTML {
return template.HTML(text)
}
// Match reports whether s contains a match of the regular expression
// pattern. It returns an error when the pattern is invalid.
func Match(pattern, s string) (bool, error) {
return regexp.MatchString(pattern, s)
}
// Title returns s with the first letter of each word upper-cased, as done by
// strings.Title.
//
// NOTE(review): strings.Title is deprecated in the standard library; its
// documented replacement lives in golang.org/x/text/cases.
func Title(s string) string {
return strings.Title(s)
}
// ToUpper returns s converted to upper case (strings.ToUpper).
func ToUpper(s string) string {
return strings.ToUpper(s)
}
// ToLower returns s converted to lower case (strings.ToLower).
func ToLower(s string) string {
return strings.ToLower(s)
}
// GraphLink returns the link produced by strutil.GraphLinkForExpression for
// the expression expr.
func GraphLink(expr string) string {
return strutil.GraphLinkForExpression(expr)
}
// TableLink returns the link produced by strutil.TableLinkForExpression for
// the expression expr.
func TableLink(expr string) string {
return strutil.TableLinkForExpression(expr)
}
// SortByLabel stably sorts v in place by the string value of the given label
// and returns v. Samples without the label sort as the empty string.
func SortByLabel(label string, v QueryResult) QueryResult {
	sort.Stable(queryResultByLabelSorter{results: v, by: label})
	return v
}
// StripPort returns the host part of a "host:port" string. Input that
// net.SplitHostPort cannot split (for example a bare host) is returned
// unchanged.
func StripPort(hostPort string) string {
	if host, _, err := net.SplitHostPort(hostPort); err == nil {
		return host
	}
	return hostPort
}
// StripDomain shortens the host of a "host[:port]" string to its first DNS
// label, keeping the port if there is one ("foo.example.com:80" becomes
// "foo:80"). When the host is an IP address the input is returned unchanged.
func StripDomain(hostPort string) string {
	host, port, err := net.SplitHostPort(hostPort)
	if err != nil {
		// No port present: treat the whole input as the host.
		host = hostPort
	}
	if net.ParseIP(host) != nil {
		return hostPort
	}

	short, _, _ := strings.Cut(host, ".")
	if port == "" {
		return short
	}
	return net.JoinHostPort(short, port)
}
// ToTime converts i, a Unix timestamp in seconds, to a UTC time. i is first
// converted with convertToFloat and then with floatToTime; the error of
// either step is returned as is.
func ToTime(i interface{}) (*time.Time, error) {
v, err := convertToFloat(i)
if err != nil {
return nil, err
}
return floatToTime(v)
}
// PathPrefix returns the path component of externalURL, which must not be
// nil.
func PathPrefix(externalURL *url.URL) string {
return externalURL.Path
}
// ExternalURL returns externalURL formatted as a string (url.URL.String).
func ExternalURL(externalURL *url.URL) string {
return externalURL.String()
}
// ParseDuration parses d with model.ParseDuration and returns the duration in
// seconds. It returns 0 and the parse error when d is not a valid duration.
func ParseDuration(d string) (float64, error) {
v, err := model.ParseDuration(d)
if err != nil {
return 0, err
}
// Convert nanoseconds to (possibly fractional) seconds.
return float64(time.Duration(v)) / float64(time.Second), nil
}
// Printf formats value according to format, with special handling for
// strings so that numeric strings can be used with numeric verbs:
//
//   - a string that looks like a number with a unit ("11.5%", "100MB") is
//     returned unchanged;
//   - a string that parses as a float is formatted as that float;
//   - any other value, including nil, is passed to fmt.Sprintf as is.
//
// Values whose underlying kind is string (named string types) are treated
// like plain strings.
func Printf(format string, value interface{}) string {
	// reflect.TypeOf(nil) is nil, so nil must be handled before inspecting
	// the kind.
	if value == nil {
		return fmt.Sprintf(format, value)
	}

	rv := reflect.ValueOf(value)
	if rv.Kind() != reflect.String {
		return fmt.Sprintf(format, value)
	}

	// rv.String() also works for named string types, where a value.(string)
	// assertion would panic.
	strValue := rv.String()

	// A value with a unit (digits plus non-numeric characters such as
	// letters or %) is returned as is.
	if isValueWithUnit(strValue) {
		return strValue
	}

	if floatValue, err := strconv.ParseFloat(strValue, 64); err == nil {
		return fmt.Sprintf(format, floatValue)
	}
	return fmt.Sprintf(format, value)
}

// isValueWithUnit reports whether s is a numeric value with a unit, that is,
// whether it contains at least one ASCII digit and at least one character
// that is not a digit, '.', '-' or '+'.
//
// e.g., "11.5%", "100MB", "10a" returns true
// e.g., "11", "11.11", "-3.14" returns false
func isValueWithUnit(s string) bool {
	hasDigit, hasUnit := false, false
	for _, r := range s {
		switch {
		case r >= '0' && r <= '9':
			hasDigit = true
		case r == '.' || r == '-' || r == '+':
			// Valid numeric characters, not units.
		default:
			// Any other character (letters, %, ...) counts as a unit.
			hasUnit = true
		}
	}
	return hasDigit && hasUnit
}
// floatToTime converts v, a Unix timestamp in seconds, to a UTC time.
//
// It returns errNaNOrInf when v is NaN or infinite, and a descriptive error
// when v scaled to nanoseconds falls outside the range compared against
// math.MinInt64 and math.MaxInt64.
func floatToTime(v float64) (*time.Time, error) {
if math.IsNaN(v) || math.IsInf(v, 0) {
return nil, errNaNOrInf
}
// Seconds to nanoseconds.
timestamp := v * 1e9
if timestamp > math.MaxInt64 || timestamp < math.MinInt64 {
return nil, fmt.Errorf("%v cannot be represented as a nanoseconds timestamp since it overflows int64", v)
}
// The conversion goes through model.TimeFromUnixNano.
// NOTE(review): model.Time presumably has millisecond resolution — confirm.
t := model.TimeFromUnixNano(int64(timestamp)).Time().UTC()
return &t, nil
}
// convertToFloat converts i to a float64. It accepts float64, string (parsed
// with strconv.ParseFloat), int, uint, int64 and uint64; any other type,
// including nil, yields an error.
func convertToFloat(i interface{}) (float64, error) {
	switch num := i.(type) {
	case string:
		return strconv.ParseFloat(num, 64)
	case float64:
		return num, nil
	case int:
		return float64(num), nil
	case int64:
		return float64(num), nil
	case uint:
		return float64(num), nil
	case uint64:
		return float64(num), nil
	default:
		return 0, fmt.Errorf("can't convert %T to float", num)
	}
}
// Query runs promql against the datasource identified by datasourceID using
// the function registered with RegisterQueryFunc and returns its result.
//
// It returns nil when no query function has been registered, instead of
// panicking on a nil function call.
func Query(datasourceID int64, promql string) model.Value {
	if queryFunc == nil {
		return nil
	}
	return queryFunc(datasourceID, promql)
}
// ConvertToQueryResult converts a model.Value into a QueryResult.
//
//   - A vector yields one sample per element.
//   - A matrix yields one sample per series, using the last point of the
//     series; empty series are skipped.
//   - A scalar yields a single sample with an empty label set.
//
// Samples whose value is NaN are dropped. A nil value, an unsupported value
// type or a NaN scalar yields nil.
func ConvertToQueryResult(value model.Value) QueryResult {
	if value == nil {
		return nil
	}

	// toSample copies the metric labels into a plain string map.
	toSample := func(metric model.Metric, val float64) *sample {
		labels := make(map[string]string)
		for name, content := range metric {
			labels[string(name)] = string(content)
		}
		return &sample{Labels: labels, Value: val}
	}

	var result QueryResult

	switch value.Type() {
	case model.ValVector:
		vector, ok := value.(model.Vector)
		if !ok {
			return nil
		}
		for _, elem := range vector {
			val := float64(elem.Value)
			if math.IsNaN(val) {
				continue
			}
			result = append(result, toSample(elem.Metric, val))
		}
	case model.ValMatrix:
		matrix, ok := value.(model.Matrix)
		if !ok {
			return nil
		}
		for _, series := range matrix {
			if len(series.Values) == 0 {
				continue
			}
			val := float64(series.Values[len(series.Values)-1].Value)
			if math.IsNaN(val) {
				continue
			}
			result = append(result, toSample(series.Metric, val))
		}
	case model.ValScalar:
		scalar, ok := value.(*model.Scalar)
		if !ok {
			return nil
		}
		val := float64(scalar.Value)
		if math.IsNaN(val) {
			return nil
		}
		result = append(result, &sample{
			Labels: map[string]string{},
			Value:  val,
		})
	default:
		return nil
	}

	return result
}
// MappingAndJoin wraps every element of arr in prefix and suffix and joins
// the results with join.
//
// arr may be a []int, a []string or a []interface{}; elements are formatted
// with "%v". Elements that format to the empty string are skipped. Any other
// type of arr, or an input without usable elements, yields "".
func MappingAndJoin(arr interface{}, prefix, suffix, join string) string {
	var items []string
	switch list := arr.(type) {
	case []string:
		items = list
	case []int:
		for _, n := range list {
			items = append(items, fmt.Sprintf("%v", n))
		}
	case []interface{}:
		for _, elem := range list {
			items = append(items, fmt.Sprintf("%v", elem))
		}
	}

	wrapped := make([]string, 0, len(items))
	for _, item := range items {
		if item != "" {
			wrapped = append(wrapped, prefix+item+suffix)
		}
	}
	// Joining an empty slice gives "", which covers the "nothing to join"
	// case.
	return strings.Join(wrapped, join)
}
// StrMappingAndJoin splits str on split and passes the resulting elements to
// MappingAndJoin with the given prefix, suffix and join.
func StrMappingAndJoin(str, split, prefix, suffix, join string) string {
arr := strings.Split(str, split)
return MappingAndJoin(arr, prefix, suffix, join)
}
// Ats turns a list of names into "@name" mentions separated by spaces.
//
// The list is split on "," when str contains a comma, otherwise on " " when
// it contains a space. A string with neither separator is returned unchanged,
// without an "@" prefix.
func Ats(str string) string {
	for _, sep := range []string{",", " "} {
		if strings.Contains(str, sep) {
			return MappingAndJoin(strings.Split(str, sep), "@", "", " ")
		}
	}
	return str
}
// BatchContactsAts prefixes every element of arr with "@" and joins the
// results with a space. See MappingAndJoin for the accepted types of arr.
func BatchContactsAts(arr interface{}) string {
return MappingAndJoin(arr, "@", "", " ")
}
// BatchContactsJsonMarshal renders the elements of arr as a JSON-style array
// of double-quoted strings and returns it as template.HTML so html/template
// does not escape it.
//
// The elements are only wrapped in quotes, not JSON-escaped, so an element
// containing a double quote or a backslash produces invalid JSON.
func BatchContactsJsonMarshal(arr interface{}) template.HTML {
return template.HTML("[" + MappingAndJoin(arr, "\"", "\"", ",") + "]")
}
// BatchContactsJoinComma joins the elements of arr with "," and no prefix or
// suffix. See MappingAndJoin for the accepted types of arr.
func BatchContactsJoinComma(arr interface{}) string {
return MappingAndJoin(arr, "", "", ",")
}
// BatchContactsAtsInFeishuEmail renders the elements of arr through
// MappingAndJoin and returns the result as template.HTML so html/template
// does not escape it.
//
// NOTE(review): the call as shown passes three arguments while MappingAndJoin
// declares four parameters — verify that the prefix/suffix literals are intact.
func BatchContactsAtsInFeishuEmail(arr interface{}) template.HTML {
return template.HTML(MappingAndJoin(arr, " ", " "))
}
// BatchContactsAtsInFeishuId renders the elements of arr through
// MappingAndJoin and returns the result as template.HTML so html/template
// does not escape it.
//
// NOTE(review): the call as shown passes three arguments while MappingAndJoin
// declares four parameters — verify that the prefix/suffix literals are intact.
func BatchContactsAtsInFeishuId(arr interface{}) template.HTML {
return template.HTML(MappingAndJoin(arr, " ", " "))
}
// JsonMarshal encodes v as JSON and returns it as template.HTML so
// html/template does not escape it. It returns the empty string when v cannot
// be encoded.
func JsonMarshal(v interface{}) template.HTML {
	encoded, err := json.Marshal(v)
	if err != nil {
		return ""
	}
	return template.HTML(encoded)
}
// MapDifference returns the entries of firstMap whose key is not present in
// secondMap. Only keys are compared; values are ignored. The result is always
// a new, non-nil map and the error is always nil.
func MapDifference(firstMap, secondMap map[string]string) (map[string]string, error) {
	diff := make(map[string]string)
	for k, v := range firstMap {
		if _, shared := secondMap[k]; shared {
			continue
		}
		diff[k] = v
	}
	return diff, nil
}
// TagsMapToStr renders m as comma separated "key=value" pairs, sorted
// lexicographically so the output is deterministic. An empty or nil map
// yields "".
func TagsMapToStr(m map[string]string) string {
	pairs := make([]string, 0, len(m))
	for k, v := range m {
		pairs = append(pairs, k+"="+v)
	}
	sort.Strings(pairs)
	return strings.Join(pairs, ",")
}
================================================
FILE: pkg/tplx/tpl_test.go
================================================
package tplx
import (
"html/template"
"testing"
)
// TestBatchContactJsonMarshal checks that BatchContactsJsonMarshal renders
// int, string and interface slices as a bracketed list of quoted elements.
func TestBatchContactJsonMarshal(t *testing.T) {
tests := []struct {
name string
input interface{}
expected string
}{
{
// Integer slice.
name: "整数切片",
input: []int{13800138001, 13800138002, 13800138003},
expected: `["13800138001","13800138002","13800138003"]`,
},
{
// String slice.
name: "字符串切片",
input: []string{"a", "b", "c"},
expected: `["a","b","c"]`,
},
{
// Interface slice with mixed element types.
name: "接口切片",
input: []interface{}{1, "b", 3.14},
expected: `["1","b","3.14"]`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := BatchContactsJsonMarshal(tt.input)
if result != template.HTML(tt.expected) {
t.Errorf("期望得到 %v,实际得到 %v", tt.expected, result)
}
})
}
}
// TestBatchContactJoinComma checks that BatchContactsJoinComma joins int,
// string and interface slices with commas and returns "" for an unsupported
// input type.
func TestBatchContactJoinComma(t *testing.T) {
tests := []struct {
name string
input interface{}
expected string
}{
{
// Integer slice.
name: "整数切片",
input: []int{13800138001, 13800138002, 13800138003},
expected: `13800138001,13800138002,13800138003`,
},
{
// String slice.
name: "字符串切片",
input: []string{"a", "b", "c"},
expected: `a,b,c`,
},
{
// Interface slice with mixed element types.
name: "接口切片",
input: []interface{}{1, "b", 3.14},
expected: `1,b,3.14`,
},
{
// Unsupported input type.
name: "不支持的类型",
input: 123,
expected: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := BatchContactsJoinComma(tt.input)
if result != tt.expected {
t.Errorf("期望得到 %v,实际得到 %v", tt.expected, result)
}
})
}
}
// TestMappingAndJoin checks MappingAndJoin with different input types,
// prefixes, suffixes and separators, including an unsupported input type.
func TestMappingAndJoin(t *testing.T) {
tests := []struct {
name string
input interface{}
prefix string
suffix string
join string
expected string
}{
{
// Integer slice with prefix and suffix.
name: "整数切片带前后缀",
input: []int{1, 2, 3},
prefix: "num_",
suffix: "_end",
join: ",",
expected: "num_1_end,num_2_end,num_3_end",
},
{
// String slice with an "@" prefix and no suffix.
name: "字符串切片带引号",
input: []string{"a", "b", "c"},
prefix: "@",
suffix: "",
join: " ",
expected: `@a @b @c`,
},
{
// Interface slice wrapped in parentheses.
name: "接口切片带括号",
input: []interface{}{1, "b", 3.14},
prefix: "(",
suffix: ")",
join: "|",
expected: "(1)|(b)|(3.14)",
},
{
// Empty prefix and suffix.
name: "空前后缀",
input: []int{1, 2, 3},
prefix: "",
suffix: "",
join: "-",
expected: "1-2-3",
},
{
// Unsupported input type.
name: "不支持的类型",
input: 123,
prefix: "test_",
suffix: "_test",
join: ",",
expected: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := MappingAndJoin(tt.input, tt.prefix, tt.suffix, tt.join)
if result != tt.expected {
t.Errorf("期望得到 %v,实际得到 %v", tt.expected, result)
}
})
}
}
================================================
FILE: pkg/tplx/tplx.go
================================================
package tplx
import (
"bytes"
"html/template"
"net/url"
"regexp"
"strings"
templateT "text/template"
"encoding/base64"
"github.com/toolkits/pkg/logger"
)
// TemplateFuncMap maps the function names available in templates to their
// implementations. Most entries point at helpers of this package; a few use
// standard library functions directly (url.PathEscape, regexp.MatchString and
// functions of package strings).
//
// The map is a shared package-level variable; NewTemplateFuncMap returns a
// copy that can be modified independently.
var TemplateFuncMap = template.FuncMap{
"escape": url.PathEscape,
"unescaped": Unescaped,
"urlconvert": Urlconvert,
"timeformat": Timeformat,
"timestamp": Timestamp,
"args": Args,
"reReplaceAll": ReReplaceAll,
"match": regexp.MatchString,
"toUpper": strings.ToUpper,
"toLower": strings.ToLower,
"contains": strings.Contains,
"humanize": Humanize,
"humanize1024": Humanize1024,
"humanizeDuration": HumanizeDuration,
"humanizeDurationInterface": HumanizeDurationInterface,
"humanizePercentage": HumanizePercentage,
"humanizePercentageH": HumanizePercentageH,
"add": Add,
"sub": Subtract,
"mul": Multiply,
"div": Divide,
"now": Now,
"toString": ToString,
"formatDecimal": FormatDecimal,
"first": First,
"label": Label,
"value": Value,
"strvalue": StrValue,
"safeHtml": SafeHtml,
"title": Title,
"graphLink": GraphLink,
"tableLink": TableLink,
"sortByLabel": SortByLabel,
"stripPort": StripPort,
"stripDomain": StripDomain,
"toTime": ToTime,
"pathPrefix": PathPrefix,
"externalURL": ExternalURL,
"parseDuration": ParseDuration,
"printf": Printf,
"split": strings.Split,
"join": strings.Join,
"ats": Ats,
"batchContactsJsonMarshal": BatchContactsJsonMarshal,
"batchContactsJoinComma": BatchContactsJoinComma,
"batchContactsAts": BatchContactsAts,
"mappingAndJoin": MappingAndJoin,
"batchContactsAtsInFeishuEmail": BatchContactsAtsInFeishuEmail,
"batchContactsAtsInFeishuId": BatchContactsAtsInFeishuId,
"jsonMarshal": JsonMarshal,
"mapDifference": MapDifference,
"tagsMapToStr": TagsMapToStr,
// b64enc encodes a string with standard base64.
"b64enc": func(s string) string {
return base64.StdEncoding.EncodeToString([]byte(s))
},
// b64dec decodes standard base64; input that is not valid base64 is
// returned unchanged.
"b64dec": func(s string) string {
data, err := base64.StdEncoding.DecodeString(s)
if err != nil {
return s
}
return string(data)
},
}
// NewTemplateFuncMap returns a new template.FuncMap containing every entry of
// TemplateFuncMap, so that a caller can add or override helpers without
// touching the shared package-level map. The copy is shallow: the function
// values themselves are shared with the original.
func NewTemplateFuncMap() template.FuncMap {
	clone := make(template.FuncMap, len(TemplateFuncMap))
	for name, fn := range TemplateFuncMap {
		clone[name] = fn
	}
	return clone
}
// ReplaceTemplateUseHtml renders templateText with html/template, substituting
// {{.Field}} actions with values taken from templateData.
//
// Parameters:
//
//   - name: the name given to the template when it is created
//   - templateText: the template source
//   - templateData: the value the template is executed against
//
// The function never fails from the caller's point of view: if parsing or
// execution returns an error, a warning is logged and templateText is returned
// unchanged. On success the rendered text is returned.
//
// Example:
//
//	type Data struct {
//		Name string
//	}
//
//	out := ReplaceTemplateUseHtml("mytpl", "Hello {{.Name}}!", Data{"John"})
func ReplaceTemplateUseHtml(name string, templateText string, templateData any) string {
	parsed, parseErr := template.New(name).Parse(templateText)
	if parseErr != nil {
		logger.Warningf("parse config error: %v", parseErr)
		return templateText
	}

	rendered := new(bytes.Buffer)
	execErr := parsed.Execute(rendered, templateData)
	if execErr != nil {
		logger.Warningf("execute config error: %v", execErr)
		return templateText
	}
	return rendered.String()
}
// ReplaceTemplateUseText is the text/template counterpart of
// ReplaceTemplateUseHtml: it parses templateText under the given name and
// executes it against templateData.
//
// If parsing or execution fails, a warning is logged and the original
// templateText is returned; otherwise the rendered text is returned.
func ReplaceTemplateUseText(name string, templateText string, templateData any) string {
	parsed, parseErr := templateT.New(name).Parse(templateText)
	if parseErr != nil {
		logger.Warningf("text parse config error: %v", parseErr)
		return templateText
	}

	rendered := new(bytes.Buffer)
	execErr := parsed.Execute(rendered, templateData)
	if execErr != nil {
		logger.Warningf("text execute config error: %v", execErr)
		return templateText
	}
	return rendered.String()
}
================================================
FILE: pkg/unit/unit_convert.go
================================================
package unit
import (
"fmt"
"math"
"strings"
"time"
)
// FormattedValue is the structure that holds a value after formatting.
type FormattedValue struct {
// Value is the number after unit scaling; the duration, byte and percent
// formatters also round it to the requested number of decimals.
Value float64 `json:"value"`
// Unit is the unit suffix chosen for Value (empty when there is none).
Unit string `json:"unit"`
// Text is the display string built from the scaled value and the unit.
Text string `json:"text"`
// Stat is the number before unit scaling (for "percentUnit" it is the
// value after the x100 conversion).
Stat float64 `json:"stat"`
}
// FormatOptions carries the options consumed by formatBytes.
type FormatOptions struct {
Type string // "si" or "iec"
Base string // "bits" or "bytes"
Decimals int // number of decimal places
Postfix string // suffix appended after the unit
}
// Time-related constants: the length of each unit expressed in seconds.
// A year is taken as 365 days.
const (
NanosecondVal = 0.000000001
MicrosecondVal = 0.000001
MillisecondVal = 0.001
SecondVal = 1
MinuteVal = 60
HourVal = 3600
DayVal = 86400
WeekVal = 86400 * 7
YearVal = 86400 * 365
)
var (
// valueMap lists the magnitude prefixes used by formatBytes. Exp is the
// decimal exponent (divider 10^Exp, prefix Si); IecExp is the binary
// exponent (divider 2^IecExp, prefix Iec).
// NOTE(review): the first row has IecExp 1 rather than 0; formatBytes
// returns early for magnitudes below its threshold, so that row does not
// appear to be reached in IEC mode - confirm before relying on it.
valueMap = []struct {
Exp int
Si string
Iec string
IecExp int
}{
{0, "", "", 1},
{3, "k", "Ki", 10},
{6, "M", "Mi", 20},
{9, "G", "Gi", 30},
{12, "T", "Ti", 40},
{15, "P", "Pi", 50},
{18, "E", "Ei", 60},
{21, "Z", "Zi", 70},
{24, "Y", "Yi", 80},
}
// baseUtilMap maps FormatOptions.Base to the base-unit letter appended
// after the magnitude prefix; a Base that is not a key yields no letter.
baseUtilMap = map[string]string{
"bits": "b",
"bytes": "B",
}
)
// ValueFormatter is the formatting entry point: it renders value according to
// the named unit using decimals fractional digits.
//
// NaN and +/-Inf are handled up front and yield fixed placeholder results
// (Text "NaN", "+Inf" or "-Inf"). Any other value is dispatched on unit to the
// duration, percent, byte/bit or datetime formatter. "none" and every unit not
// listed below are printed as a plain number.
func ValueFormatter(unit string, decimals int, value float64) FormattedValue {
	// Non-finite inputs never reach the per-unit formatters.
	switch {
	case math.IsNaN(value):
		return FormattedValue{Value: 0, Unit: "", Text: "NaN", Stat: 0}
	case math.IsInf(value, 1):
		return FormattedValue{Value: 9999999999, Unit: "", Text: "+Inf", Stat: 9999999999}
	case math.IsInf(value, -1):
		return FormattedValue{Value: -9999999999, Unit: "", Text: "-Inf", Stat: -9999999999}
	}

	// sized assembles the options shared by all byte/bit branches.
	sized := func(kind, base, postfix string) FormatOptions {
		return FormatOptions{Type: kind, Base: base, Decimals: decimals, Postfix: postfix}
	}

	switch unit {
	// Durations: long unit names are normalized to their short form.
	case "ns", "nanoseconds":
		return formatDuration(value, "ns", decimals)
	case "µs", "microseconds":
		return formatDuration(value, "µs", decimals)
	case "ms", "milliseconds":
		return formatDuration(value, "ms", decimals)
	case "s", "seconds":
		return formatDuration(value, "s", decimals)
	case "min", "h", "d", "w":
		return formatDuration(value, unit, decimals)

	// Percentages: "percentUnit" means the input is a 0..1 ratio.
	case "percent":
		return formatPercent(value, decimals, false)
	case "percentUnit":
		return formatPercent(value, decimals, true)

	// Sizes without a rate suffix. The base is the unit name with its
	// "(IEC)"/"IEC" (or "(SI)"/"SI") marker and then one trailing "s"
	// removed, e.g. "bytes(IEC)" becomes "byte". None of the resulting names
	// is a key of baseUtilMap, so no base letter is appended for these units.
	case "bytesIEC", "bytes(IEC)", "bitsIEC", "bits(IEC)":
		base := strings.TrimSuffix(strings.TrimSuffix(strings.TrimSuffix(unit, "(IEC)"), "IEC"), "s")
		return formatBytes(value, sized("iec", base, ""))
	case "bytesSI", "bytes(SI)", "bitsSI", "bits(SI)", "default", "sishort":
		base := strings.TrimSuffix(strings.TrimSuffix(strings.TrimSuffix(unit, "(SI)"), "SI"), "s")
		return formatBytes(value, sized("si", base, ""))

	// Rates: fixed base plus a "/s" postfix.
	case "bytesSecIEC":
		return formatBytes(value, sized("iec", "bytes", "/s"))
	case "bitsSecIEC":
		return formatBytes(value, sized("iec", "bits", "/s"))
	case "bytesSecSI":
		return formatBytes(value, sized("si", "bytes", "/s"))
	case "bitsSecSI":
		return formatBytes(value, sized("si", "bits", "/s"))

	case "datetimeSeconds", "datetimeMilliseconds":
		return formatDateTime(unit, value)
	}

	// "none" and every unrecognized unit.
	return formatNone(value, decimals)
}
// formatDuration converts a duration given in unit into the most suitable
// display unit.
//
// originValue is first normalized to seconds ("s" and any unit not listed in
// the first switch are taken to be seconds already). The display unit is then
// the largest of y, w, d, h, min, s, ms and µs that the magnitude reaches, with
// ns as the fallback; a zero duration therefore renders in ns.
//
// The unit is chosen from the absolute value, so a negative duration is scaled
// like its positive counterpart and keeps its sign (-90 s is "-1.50 min"),
// instead of always falling through to nanoseconds.
//
// The result carries the rounded converted value, the chosen unit, the text
// "<value> <unit>" with decimals fractional digits, and originValue as Stat.
func formatDuration(originValue float64, unit string, decimals int) FormattedValue {
	// Normalize to seconds.
	value := originValue
	switch unit {
	case "ns":
		value *= NanosecondVal
	case "µs":
		value *= MicrosecondVal
	case "ms":
		value *= MillisecondVal
	case "min":
		value *= MinuteVal
	case "h":
		value *= HourVal
	case "d":
		value *= DayVal
	case "w":
		value *= WeekVal
	}

	// Choose the display unit by magnitude so the sign does not affect it.
	magnitude := math.Abs(value)

	var converted float64
	var targetUnit string
	switch {
	case magnitude >= YearVal:
		converted = value / YearVal
		targetUnit = "y"
	case magnitude >= WeekVal:
		converted = value / WeekVal
		targetUnit = "w"
	case magnitude >= DayVal:
		converted = value / DayVal
		targetUnit = "d"
	case magnitude >= HourVal:
		converted = value / HourVal
		targetUnit = "h"
	case magnitude >= MinuteVal:
		converted = value / MinuteVal
		targetUnit = "min"
	case magnitude >= SecondVal:
		converted = value
		targetUnit = "s"
	case magnitude >= MillisecondVal:
		converted = value / MillisecondVal
		targetUnit = "ms"
	case magnitude >= MicrosecondVal:
		converted = value / MicrosecondVal
		targetUnit = "µs"
	default:
		converted = value / NanosecondVal
		targetUnit = "ns"
	}

	return FormattedValue{
		Value: roundFloat(converted, decimals),
		Unit:  targetUnit,
		Text:  fmt.Sprintf("%.*f %s", decimals, converted, targetUnit),
		Stat:  originValue,
	}
}
// formatBytes scales a byte/bit quantity to a magnitude prefix.
//
// opts.Type selects the system: "iec" uses a 1024 threshold, the Iec prefixes
// and power-of-two dividers; anything else uses 1000, the Si prefixes and
// power-of-ten dividers. The unit is <prefix><base letter><opts.Postfix>, where
// the base letter comes from baseUtilMap[opts.Base].
//
// Zero and magnitudes below the threshold are returned without a prefix. For
// larger magnitudes the prefix row is selected from the decimal exponent
// floor(log10(|value|)/3)*3, capped at 24, in both systems.
func formatBytes(value float64, opts FormatOptions) FormattedValue {
	baseUtil := baseUtilMap[opts.Base]
	suffix := baseUtil + opts.Postfix

	if value == 0 {
		return FormattedValue{
			Value: 0,
			Unit:  suffix,
			Text:  fmt.Sprintf("0%s%s", baseUtil, opts.Postfix),
			Stat:  0,
		}
	}

	isIEC := opts.Type == "iec"
	threshold := 1000.0
	if isIEC {
		threshold = 1024.0
	}

	magnitude := math.Abs(value)
	if magnitude < threshold {
		// Too small for any prefix: print the raw number.
		return FormattedValue{
			Value: roundFloat(value, opts.Decimals),
			Unit:  suffix,
			Text:  fmt.Sprintf("%.*f%s%s", opts.Decimals, value, baseUtil, opts.Postfix),
			Stat:  value,
		}
	}

	// Decimal exponent rounded down to a multiple of three.
	exp := int(math.Floor(math.Log10(magnitude)/3.0)) * 3
	if exp > 24 {
		exp = 24
	}

	// Look up the prefix and divider for that exponent.
	var prefix string
	var divider float64
	for _, row := range valueMap {
		if row.Exp != exp {
			continue
		}
		if isIEC {
			prefix = row.Iec
			divider = math.Pow(2, float64(row.IecExp))
		} else {
			prefix = row.Si
			divider = math.Pow(10, float64(row.Exp))
		}
		break
	}

	scaled := value / divider
	return FormattedValue{
		Value: roundFloat(scaled, opts.Decimals),
		Unit:  prefix + suffix,
		Text:  fmt.Sprintf("%.*f%s%s%s", opts.Decimals, scaled, prefix, baseUtil, opts.Postfix),
		Stat:  value,
	}
}
// formatPercent renders value as a percentage with decimals fractional digits.
// When isUnit is true the input is treated as a ratio and multiplied by 100
// first; Stat reports the number after that conversion.
func formatPercent(value float64, decimals int, isUnit bool) FormattedValue {
	pct := value
	if isUnit {
		pct *= 100
	}

	text := fmt.Sprintf("%.*f%%", decimals, pct)
	return FormattedValue{
		Value: roundFloat(pct, decimals),
		Unit:  "%",
		Text:  text,
		Stat:  pct,
	}
}
// formatNone renders value as a plain number without a unit. Only Text is
// limited to decimals fractional digits; Value and Stat keep the input as is.
func formatNone(value float64, decimals int) FormattedValue {
	text := fmt.Sprintf("%.*f", decimals, value)
	return FormattedValue{
		Text:  text,
		Unit:  "",
		Value: value,
		Stat:  value,
	}
}
// formatDateTime renders a Unix timestamp as "2006-01-02 15:04:05" in the
// process's local time zone.
//
// unit selects the resolution of value: "datetimeSeconds" or
// "datetimeMilliseconds". For any other unit the zero time.Time is formatted.
// Value and Stat both carry the untouched input and Unit is empty.
//
// The millisecond branch uses time.UnixMilli so that large inputs are not
// multiplied into nanoseconds first, which could overflow int64.
func formatDateTime(unit string, value float64) FormattedValue {
	var t time.Time
	switch unit {
	case "datetimeSeconds":
		t = time.Unix(int64(value), 0)
	case "datetimeMilliseconds":
		t = time.UnixMilli(int64(value))
	}

	return FormattedValue{
		Value: value,
		Unit:  "",
		Text:  t.Format("2006-01-02 15:04:05"),
		Stat:  value,
	}
}
// roundFloat rounds val to precision decimal places, with halves rounded away
// from zero (math.Round semantics).
func roundFloat(val float64, precision int) float64 {
	scale := math.Pow(10, float64(precision))
	scaled := val * scale
	return math.Round(scaled) / scale
}
================================================
FILE: pkg/unit/unit_convert_test.go
================================================
package unit
import (
"math"
"testing"
)
// TestValueFormatter is a table-driven test of ValueFormatter: each case feeds
// a unit, a decimals count and a value, and the result is compared with want
// through compareFormattedValues.
func TestValueFormatter(t *testing.T) {
tests := []struct {
name string
unit string
decimals int
value float64
want FormattedValue
}{
// Byte tests
{
name: "IEC字节测试",
unit: "bytes(IEC)",
decimals: 2,
value: 1024 * 1024,
want: FormattedValue{Value: 1, Unit: "Mi", Text: "1.00Mi", Stat: 1024 * 1024},
},
{
name: "SI字节测试",
unit: "bytes(SI)",
decimals: 2,
value: 1000 * 1000,
want: FormattedValue{Value: 1, Unit: "M", Text: "1.00M", Stat: 1000 * 1000},
},
// Time unit tests
{
name: "毫秒转秒",
unit: "ms",
decimals: 2,
value: 1500,
want: FormattedValue{
Value: 1.50,
Unit: "s",
Text: "1.50 s",
Stat: 1500,
},
},
{
name: "秒转分钟",
unit: "s",
decimals: 1,
value: 150,
want: FormattedValue{
Value: 2.5,
Unit: "min",
Text: "2.5 min",
Stat: 150,
},
},
// Percentage tests
{
name: "百分比",
unit: "percent",
decimals: 2,
value: 0.9555,
want: FormattedValue{
Value: 0.96,
Unit: "%",
Text: "0.96%",
Stat: 0.9555,
},
},
{
name: "百分比单位",
unit: "percentUnit",
decimals: 1,
value: 0.95,
want: FormattedValue{
Value: 95.0,
Unit: "%",
Text: "95.0%",
Stat: 95.0,
},
},
// SI format test
{
name: "SI格式",
unit: "sishort",
decimals: 2,
value: 1500,
want: FormattedValue{
Value: 1.50,
Unit: "k",
Text: "1.50k",
Stat: 1500,
},
},
// Timestamp tests.
// NOTE: formatDateTime formats in the local time zone; the expected text
// below is 2023-05-08 04:00:00 UTC shifted by +8 hours, so these two cases
// only pass when the test runs in a UTC+8 zone.
{
name: "时间戳 s",
unit: "datetimeSeconds",
decimals: 0,
value: 1683518400,
want: FormattedValue{
Value: 1683518400,
Unit: "",
Text: "2023-05-08 12:00:00",
Stat: 1683518400,
},
},
{
name: "时间戳 ms",
unit: "datetimeMilliseconds",
decimals: 0,
value: 1683518400000,
want: FormattedValue{
Value: 1683518400000,
Unit: "",
Text: "2023-05-08 12:00:00",
Stat: 1683518400000,
},
},
// Additional time unit tests
{
name: "纳秒测试",
unit: "ns",
decimals: 2,
value: 1500,
want: FormattedValue{
Value: 1.50,
Unit: "µs",
Text: "1.50 µs",
Stat: 1500,
},
},
{
name: "微秒测试",
unit: "µs",
decimals: 2,
value: 1500,
want: FormattedValue{
Value: 1.50,
Unit: "ms",
Text: "1.50 ms",
Stat: 1500,
},
},
{
name: "小时测试",
unit: "h",
decimals: 1,
value: 2.5,
want: FormattedValue{
Value: 2.5,
Unit: "h",
Text: "2.5 h",
Stat: 2.5,
},
},
{
name: "天数测试",
unit: "d",
decimals: 1,
value: 1.5,
want: FormattedValue{
Value: 1.5,
Unit: "d",
Text: "1.5 d",
Stat: 1.5,
},
},
{
name: "周数测试",
unit: "w",
decimals: 1,
value: 1.5,
want: FormattedValue{
Value: 1.5,
Unit: "w",
Text: "1.5 w",
Stat: 1.5,
},
},
// Additional byte-rate tests
{
name: "IEC字节每秒",
unit: "bytesSecIEC",
decimals: 2,
value: 1024 * 1024,
want: FormattedValue{
Value: 1,
Unit: "MiB/s",
Text: "1.00MiB/s",
Stat: 1024 * 1024,
},
},
{
name: "IEC比特每秒",
unit: "bitsSecIEC",
decimals: 2,
value: 1024 * 1024,
want: FormattedValue{
Value: 1,
Unit: "Mib/s",
Text: "1.00Mib/s",
Stat: 1024 * 1024,
},
},
{
name: "SI字节每秒",
unit: "bytesSecSI",
decimals: 2,
value: 1000 * 1000,
want: FormattedValue{
Value: 1,
Unit: "MB/s",
Text: "1.00MB/s",
Stat: 1000 * 1000,
},
},
{
name: "SI比特每秒",
unit: "bitsSecSI",
decimals: 2,
value: 1000 * 1000,
want: FormattedValue{
Value: 1,
Unit: "Mb/s",
Text: "1.00Mb/s",
Stat: 1000 * 1000,
},
},
// "none" unit test
{
name: "无单位测试",
unit: "none",
decimals: 2,
value: 1234.5678,
want: FormattedValue{
Value: 1234.5678,
Unit: "",
Text: "1234.57",
Stat: 1234.5678,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := ValueFormatter(tt.unit, tt.decimals, tt.value)
if !compareFormattedValues(got, tt.want) {
t.Errorf("ValueFormatter() = %v, want %v", got, tt.want)
}
})
}
}
// TestEdgeCases feeds ValueFormatter NaN, zero, a very small and a very large
// value and checks whether the result is the zero FormattedValue, as dictated
// by wantNil (false in every case here, i.e. a non-zero result is expected).
// The unit "bytes" is not one of ValueFormatter's named cases, so these inputs
// go through its default branch.
func TestEdgeCases(t *testing.T) {
	type edgeCase struct {
		name     string
		unit     string
		decimals int
		value    float64
		wantNil  bool
	}

	cases := []edgeCase{
		{name: "NaN值", unit: "bytes", decimals: 2, value: math.NaN(), wantNil: false},
		{name: "零值", unit: "bytes", decimals: 2, value: 0, wantNil: false},
		{name: "极小值", unit: "bytes", decimals: 2, value: 0.0000001, wantNil: false},
		{name: "极大值", unit: "bytes", decimals: 2, value: 1e30, wantNil: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := ValueFormatter(tc.unit, tc.decimals, tc.value)
			isZero := got == (FormattedValue{})
			if isZero != tc.wantNil {
				t.Errorf("ValueFormatter() got = %v, wantNil = %v", got, tc.wantNil)
			}
		})
	}
}
// compareFormattedValues reports whether two FormattedValue results match:
// Value and Stat may differ by at most epsilon, Unit and Text must be equal.
func compareFormattedValues(a, b FormattedValue) bool {
	const epsilon = 0.0001

	valueOff := math.Abs(a.Value-b.Value) > epsilon
	statOff := math.Abs(a.Stat-b.Stat) > epsilon
	if valueOff || statOff {
		return false
	}
	return a.Unit == b.Unit && a.Text == b.Text
}
================================================
FILE: pkg/version/version.go
================================================
package version
import (
"sync/atomic"
"time"
"github.com/hashicorp/go-version"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/net/httplib"
)
// Version is the version string of this build; it defaults to "unknown".
// NOTE(review): presumably overridden at build time (e.g. via -ldflags) - confirm
// against the build scripts.
var Version = "unknown"
// GithubVersion holds the tag name of the latest GitHub release as a string;
// it is written by GetGithubVersion.
var GithubVersion atomic.Value
// CompareVersion parses v1 and v2 as versions and compares them.
// It returns -1 when v1 is lower than v2, 1 when v1 is higher and 0 when they
// are equal. If either string cannot be parsed, the parse error is returned
// together with 0.
func CompareVersion(v1, v2 string) (int, error) {
	left, err := version.NewVersion(v1)
	if err != nil {
		return 0, err
	}

	right, err := version.NewVersion(v2)
	if err != nil {
		return 0, err
	}

	switch {
	case left.LessThan(right):
		return -1, nil
	case left.GreaterThan(right):
		return 1, nil
	default:
		return 0, nil
	}
}
// GetGithubVersion polls the GitHub "latest release" endpoint of
// ccfos/nightingale once every 24 hours and publishes the release tag through
// GithubVersion. It never returns and is meant to run in its own goroutine.
//
// A failed request (or a response without a tag name) is logged where
// applicable and does not overwrite a tag obtained by an earlier successful
// poll. If nothing has been stored yet, an empty string is stored so that
// GithubVersion always holds a string after the first iteration.
func GetGithubVersion() {
	for {
		req := httplib.Get("https://api.github.com/repos/ccfos/nightingale/releases/latest")
		var release GithubRelease
		err := req.ToJSON(&release)
		if err != nil {
			logger.Errorf("get github version fail: %v", err)
		}

		switch {
		case err == nil && release.TagName != "":
			GithubVersion.Store(release.TagName)
		case GithubVersion.Load() == nil:
			// Nothing known yet: keep the value initialized as a string.
			GithubVersion.Store("")
		}

		time.Sleep(24 * time.Hour)
	}
}
// GithubRelease is the subset of the GitHub release JSON that is decoded by
// GetGithubVersion: only the "tag_name" field.
type GithubRelease struct {
TagName string `json:"tag_name"`
}
================================================
FILE: prom/client.go
================================================
package prom
import (
"sync"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/prom"
)
// PromClientMap holds the reader and writer clients of each datasource, keyed
// by datasource id. Its methods take the embedded RWMutex around every access
// to the two maps.
type PromClientMap struct {
sync.RWMutex
ctx *ctx.Context
ReaderClients map[int64]prom.API
WriterClients map[int64]prom.WriterType
}
// Set stores the reader and writer clients for datasourceId under the write
// lock. A nil reader makes the call a no-op, so neither map is modified.
func (pc *PromClientMap) Set(datasourceId int64, r prom.API, w prom.WriterType) {
if r == nil {
return
}
pc.Lock()
defer pc.Unlock()
pc.ReaderClients[datasourceId] = r
pc.WriterClients[datasourceId] = w
}
// GetDatasourceIds returns the ids of all datasources that currently have a
// reader client, in no particular order. The result is nil when there are none.
func (pc *PromClientMap) GetDatasourceIds() []int64 {
	pc.RLock()
	defer pc.RUnlock()

	var ids []int64
	for id := range pc.ReaderClients {
		ids = append(ids, id)
	}
	return ids
}
// GetCli returns the reader client registered for datasourceId, or nil when
// none is registered.
func (pc *PromClientMap) GetCli(datasourceId int64) prom.API {
	pc.RLock()
	defer pc.RUnlock()

	return pc.ReaderClients[datasourceId]
}
// GetWriterCli returns the writer client registered for datasourceId, or the
// zero prom.WriterType when none is registered.
func (pc *PromClientMap) GetWriterCli(datasourceId int64) prom.WriterType {
	pc.RLock()
	defer pc.RUnlock()

	return pc.WriterClients[datasourceId]
}
// IsNil reports whether datasourceId has no usable reader client, i.e. there
// is no entry for it or the stored client is nil.
func (pc *PromClientMap) IsNil(datasourceId int64) bool {
	pc.RLock()
	defer pc.RUnlock()

	cli, found := pc.ReaderClients[datasourceId]
	return !found || cli == nil
}
// Reset drops every registered client by replacing both maps with new empty
// maps under the write lock.
func (pc *PromClientMap) Reset() {
pc.Lock()
defer pc.Unlock()
pc.ReaderClients = make(map[int64]prom.API)
pc.WriterClients = make(map[int64]prom.WriterType)
}
// Del removes the reader and writer clients of datasourceId under the write
// lock; it is a no-op when the id is not registered.
func (pc *PromClientMap) Del(datasourceId int64) {
pc.Lock()
defer pc.Unlock()
delete(pc.ReaderClients, datasourceId)
delete(pc.WriterClients, datasourceId)
}
================================================
FILE: prom/option.go
================================================
package prom
import (
"sync"
"github.com/ccfos/nightingale/v6/models"
)
// PromOption is the set of connection settings from which the reader and
// writer clients of one datasource are built.
type PromOption struct {
ClusterName string
// Url is the address used by the reader client.
Url string
// WriteAddr is the address used by the writer client.
WriteAddr string
BasicAuthUser string
BasicAuthPass string
// Timeout and DialTimeout are interpreted as milliseconds when the HTTP
// transport is built.
Timeout int64
DialTimeout int64
MaxIdleConnsPerHost int
// Headers is a flat list of alternating header names and values.
Headers []string
// TLS configuration (mTLS supported)
TLS models.TLS
}
// Equal reports whether target describes the same connection settings as po.
//
// It compares the URL, write address, basic-auth credentials, both timeouts,
// MaxIdleConnsPerHost, the Headers slice element by element (so order matters)
// and the individual TLS fields. ClusterName is not part of the comparison.
func (po *PromOption) Equal(target PromOption) bool {
	sameEndpoint := po.Url == target.Url &&
		po.BasicAuthUser == target.BasicAuthUser &&
		po.BasicAuthPass == target.BasicAuthPass &&
		po.WriteAddr == target.WriteAddr
	if !sameEndpoint {
		return false
	}

	sameTransport := po.Timeout == target.Timeout &&
		po.DialTimeout == target.DialTimeout &&
		po.MaxIdleConnsPerHost == target.MaxIdleConnsPerHost
	if !sameTransport {
		return false
	}

	if len(po.Headers) != len(target.Headers) {
		return false
	}
	for idx, h := range po.Headers {
		if h != target.Headers[idx] {
			return false
		}
	}

	// TLS settings, field by field.
	mine, theirs := po.TLS, target.TLS
	return mine.SkipTlsVerify == theirs.SkipTlsVerify &&
		mine.CACert == theirs.CACert &&
		mine.ClientCert == theirs.ClientCert &&
		mine.ClientKey == theirs.ClientKey &&
		mine.ServerName == theirs.ServerName &&
		mine.MinVersion == theirs.MinVersion &&
		mine.MaxVersion == theirs.MaxVersion
}
// PromOptionsStruct is a lock-protected cache of the PromOption last applied
// to each datasource, keyed by datasource id.
type PromOptionsStruct struct {
Data map[int64]PromOption
sync.RWMutex
}
// Set records po as the option of datasourceId, replacing any previous entry,
// while holding the write lock.
func (pos *PromOptionsStruct) Set(datasourceId int64, po PromOption) {
pos.Lock()
pos.Data[datasourceId] = po
pos.Unlock()
}
// Del removes the cached option of datasourceId while holding the write lock;
// it is a no-op when there is no such entry.
func (pos *PromOptionsStruct) Del(datasourceId int64) {
pos.Lock()
delete(pos.Data, datasourceId)
pos.Unlock()
}
// Get returns the cached option of datasourceId and whether an entry exists.
// When there is none, the zero PromOption and false are returned.
func (pos *PromOptionsStruct) Get(datasourceId int64) (PromOption, bool) {
	pos.RLock()
	defer pos.RUnlock()

	opt, found := pos.Data[datasourceId]
	return opt, found
}
// PromOptions is the process-wide cache of applied PromOption values.
// The Data map is keyed by datasource id.
var PromOptions = &PromOptionsStruct{Data: make(map[int64]PromOption)}
================================================
FILE: prom/reader.go
================================================
package prom
import (
	"fmt"
	"net"
	"net/http"
	"sort"
	"strings"
	"time"

	"github.com/ccfos/nightingale/v6/models"
	"github.com/ccfos/nightingale/v6/pkg/ctx"
	"github.com/ccfos/nightingale/v6/pkg/poster"
	"github.com/ccfos/nightingale/v6/pkg/prom"
	"github.com/prometheus/client_golang/api"
	"github.com/toolkits/pkg/logger"
)
// NewPromClient creates a PromClientMap with empty client maps bound to ctx and
// starts its background reader refresh through InitReader before returning it.
func NewPromClient(ctx *ctx.Context) *PromClientMap {
	clients := new(PromClientMap)
	clients.ctx = ctx
	clients.ReaderClients = make(map[int64]prom.API)
	clients.WriterClients = make(map[int64]prom.WriterType)

	clients.InitReader()
	return clients
}
// InitReader starts a background goroutine that calls loadFromDatabase in an
// endless loop, sleeping one second between runs. The goroutine has no stop
// mechanism. InitReader itself returns immediately and always returns nil.
func (pc *PromClientMap) InitReader() error {
go func() {
for {
pc.loadFromDatabase()
time.Sleep(time.Second)
}
}()
return nil
}
// loadFromDatabase refreshes the reader/writer clients from the current set of
// Prometheus and Loki datasources.
//
// On a center node the datasources are read with models.GetDatasourcesGetsBy;
// otherwise they are fetched through poster.GetByUrls, decrypted and passed
// through FE2DB. A PromOption is built for every datasource and compared with
// the cached one: clients are (re)created only when the datasource has no
// client yet or its options changed. Finally, clients whose datasource is no
// longer listed are removed.
//
// Any error while fetching the datasource lists is logged and aborts the
// refresh, leaving the existing clients untouched.
func (pc *PromClientMap) loadFromDatabase() {
	var datasources []*models.Datasource
	var err error
	if !pc.ctx.IsCenter {
		datasources, err = poster.GetByUrls[[]*models.Datasource](pc.ctx, "/v1/n9e/datasources?typ="+models.PROMETHEUS)
		if err != nil {
			logger.Errorf("failed to get datasources, error: %v", err)
			return
		}

		lokiDatasource, err := poster.GetByUrls[[]*models.Datasource](pc.ctx, "/v1/n9e/datasources?typ="+models.LOKI)
		if err != nil {
			logger.Errorf("failed to get datasources, error: %v", err)
			return
		}
		datasources = append(datasources, lokiDatasource...)

		for i := 0; i < len(datasources); i++ {
			if err := datasources[i].Decrypt(); err != nil {
				logger.Errorf("decrypt datasource %+v fail: %v", datasources[i], err)
				continue
			}

			datasources[i].FE2DB()
		}
	} else {
		datasources, err = models.GetDatasourcesGetsBy(pc.ctx, models.PROMETHEUS, "", "", "")
		if err != nil {
			logger.Errorf("failed to get datasources, error: %v", err)
			return
		}

		lokiDatasource, err := models.GetDatasourcesGetsBy(pc.ctx, models.LOKI, "", "", "")
		if err != nil {
			logger.Errorf("failed to get datasources, error: %v", err)
			return
		}
		datasources = append(datasources, lokiDatasource...)
	}

	newCluster := make(map[int64]struct{})
	for _, ds := range datasources {
		dsId := ds.Id

		// Flatten the headers in sorted name order. Map iteration order is
		// random, and PromOption.Equal compares Headers positionally, so an
		// unsorted list would make an unchanged datasource look modified and
		// force its clients to be rebuilt on every refresh.
		headerNames := make([]string, 0, len(ds.HTTPJson.Headers))
		for k := range ds.HTTPJson.Headers {
			headerNames = append(headerNames, k)
		}
		sort.Strings(headerNames)

		var header []string
		for _, k := range headerNames {
			header = append(header, k, ds.HTTPJson.Headers[k])
		}

		var writeAddr string
		var internalAddr string
		for k, v := range ds.SettingsJson {
			// Settings values are untyped; ignore anything that is not a
			// string instead of panicking on the type assertion.
			s, ok := v.(string)
			if !ok {
				continue
			}

			if strings.Contains(k, "write_addr") {
				writeAddr = strings.TrimSpace(s)
			} else if strings.Contains(k, "internal_addr") && s != "" {
				internalAddr = strings.TrimSpace(s)
			}
		}

		po := PromOption{
			ClusterName:         ds.Name,
			Url:                 strings.TrimSpace(ds.HTTPJson.Url),
			WriteAddr:           writeAddr,
			BasicAuthUser:       ds.AuthJson.BasicAuthUser,
			BasicAuthPass:       ds.AuthJson.BasicAuthPassword,
			Timeout:             ds.HTTPJson.Timeout,
			DialTimeout:         ds.HTTPJson.DialTimeout,
			MaxIdleConnsPerHost: ds.HTTPJson.MaxIdleConnsPerHost,
			Headers:             header,
			TLS:                 ds.HTTPJson.TLS,
		}

		if internalAddr != "" && !pc.ctx.IsCenter {
			// internal addr is set, use internal addr when edge mode
			po.Url = internalAddr
		}

		newCluster[dsId] = struct{}{}
		if pc.IsNil(dsId) {
			// first time
			if err = pc.setClientFromPromOption(dsId, po); err != nil {
				logger.Errorf("failed to setClientFromPromOption po:%+v err:%v", po, err)
				continue
			}

			logger.Infof("setClientFromPromOption success, datasourceId: %d", dsId)
			PromOptions.Set(dsId, po)
			continue
		}

		// Known datasource: rebuild the clients only when the options differ
		// from the cached ones.
		localPo, has := PromOptions.Get(dsId)
		if !has || !localPo.Equal(po) {
			if err = pc.setClientFromPromOption(dsId, po); err != nil {
				logger.Errorf("failed to setClientFromPromOption: %v", err)
				continue
			}

			PromOptions.Set(dsId, po)
		}
	}

	// delete useless cluster
	oldIds := pc.GetDatasourceIds()
	for _, oldId := range oldIds {
		if _, has := newCluster[oldId]; !has {
			pc.Del(oldId)
			PromOptions.Del(oldId)
			logger.Info("delete cluster: ", oldId)
		}
	}
}
// newReaderClientFromPromOption builds the HTTP API client used for reads. It
// targets po.Url and uses a dedicated transport configured with the option's
// TLS settings, the environment proxy, the dial and response-header timeouts
// (both in milliseconds) and MaxIdleConnsPerHost.
// It fails when the TLS configuration cannot be built or the client cannot be
// created.
func (pc *PromClientMap) newReaderClientFromPromOption(po PromOption) (api.Client, error) {
	tlsConfig, err := po.TLS.TLSConfig()
	if err != nil {
		return nil, fmt.Errorf("failed to create TLS config: %v", err)
	}

	dialer := &net.Dialer{
		Timeout: time.Duration(po.DialTimeout) * time.Millisecond,
	}
	transport := &http.Transport{
		TLSClientConfig:       tlsConfig,
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		ResponseHeaderTimeout: time.Duration(po.Timeout) * time.Millisecond,
		MaxIdleConnsPerHost:   po.MaxIdleConnsPerHost,
	}

	return api.NewClient(api.Config{
		Address:      po.Url,
		RoundTripper: transport,
	})
}
// newWriterClientFromPromOption builds the HTTP API client used for writes. It
// is configured exactly like the reader client except that it targets
// po.WriteAddr instead of po.Url.
// It fails when the TLS configuration cannot be built or the client cannot be
// created.
func (pc *PromClientMap) newWriterClientFromPromOption(po PromOption) (api.Client, error) {
	tlsConfig, err := po.TLS.TLSConfig()
	if err != nil {
		return nil, fmt.Errorf("failed to create TLS config: %v", err)
	}

	dialer := &net.Dialer{
		Timeout: time.Duration(po.DialTimeout) * time.Millisecond,
	}
	transport := &http.Transport{
		TLSClientConfig:       tlsConfig,
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		ResponseHeaderTimeout: time.Duration(po.Timeout) * time.Millisecond,
		MaxIdleConnsPerHost:   po.MaxIdleConnsPerHost,
	}

	return api.NewClient(api.Config{
		Address:      po.WriteAddr,
		RoundTripper: transport,
	})
}
// setClientFromPromOption builds a reader and a writer from po and registers
// both for datasourceId.
//
// It returns an error when datasourceId is negative, when po.Url is empty, or
// when either underlying HTTP client cannot be created; in those cases nothing
// is registered.
func (pc *PromClientMap) setClientFromPromOption(datasourceId int64, po PromOption) error {
	switch {
	case datasourceId < 0:
		return fmt.Errorf("argument clusterName is blank")
	case po.Url == "":
		return fmt.Errorf("prometheus url is blank")
	}

	readerCli, err := pc.newReaderClientFromPromOption(po)
	if err != nil {
		return fmt.Errorf("failed to newClientFromPromOption: %v", err)
	}
	readerOpts := prom.ClientOptions{
		BasicAuthUser: po.BasicAuthUser,
		BasicAuthPass: po.BasicAuthPass,
		Headers:       po.Headers,
	}
	reader := prom.NewAPI(readerCli, readerOpts)

	writerCli, err := pc.newWriterClientFromPromOption(po)
	if err != nil {
		return fmt.Errorf("failed to newClientFromPromOption: %v", err)
	}
	// The writer additionally receives the write address.
	writerOpts := prom.ClientOptions{
		Url:           po.WriteAddr,
		BasicAuthUser: po.BasicAuthUser,
		BasicAuthPass: po.BasicAuthPass,
		Headers:       po.Headers,
	}
	w := prom.NewWriter(writerCli, writerOpts)

	logger.Debugf("setClientFromPromOption: %d, %+v", datasourceId, po)
	pc.Set(datasourceId, reader, w)
	return nil
}
================================================
FILE: pushgw/idents/idents.go
================================================
package idents
import (
"context"
"fmt"
"sync"
"time"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/ccfos/nightingale/v6/storage"
"github.com/toolkits/pkg/concurrent/semaphore"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/slice"
)
// Set collects target idents and periodically flushes them as heartbeat
// updates. The embedded Mutex guards items.
type Set struct {
sync.Mutex
// items holds the idents collected since the last flush.
items map[string]struct{}
redis storage.Redis
ctx *ctx.Context
configs pconf.Pushgw
// sema limits how many asynchronous target-update posts run at once.
sema *semaphore.Semaphore
}
// New creates a Set bound to the given context, Redis client and pushgw
// configuration. The semaphore is sized by
// configs.UpdateTargetByUrlConcurrency, and the background persist loop is
// started via Init before the Set is returned.
func New(ctx *ctx.Context, redis storage.Redis, configs pconf.Pushgw) *Set {
	s := &Set{
		ctx:     ctx,
		redis:   redis,
		configs: configs,
		items:   make(map[string]struct{}),
		sema:    semaphore.NewSemaphore(configs.UpdateTargetByUrlConcurrency),
	}

	s.Init()
	return s
}
// Init starts LoopPersist in a new goroutine.
func (s *Set) Init() {
go s.LoopPersist()
}
// MSet adds every ident in items to the pending set under the lock. Idents
// that are already pending are simply kept.
func (s *Set) MSet(items map[string]struct{}) {
s.Lock()
defer s.Unlock()
for ident := range items {
s.items[ident] = struct{}{}
}
}
// LoopPersist flushes the pending idents forever: it sleeps one second, calls
// persist, and repeats. It never returns.
func (s *Set) LoopPersist() {
for {
time.Sleep(time.Second)
s.persist()
}
}
// persist takes ownership of the pending idents and forwards them to
// updateTimestamp. The pending map is swapped for a fresh one while the lock is
// held, so the flush itself runs without the lock. Nothing happens when no
// idents are pending.
func (s *Set) persist() {
	s.Lock()
	pending := s.items
	if len(pending) == 0 {
		s.Unlock()
		return
	}
	s.items = make(map[string]struct{})
	s.Unlock()

	s.updateTimestamp(pending)
}
// updateTimestamp reports the given idents as alive at the current Unix time,
// calling UpdateTargets in batches of at most 100 idents. Errors from
// UpdateTargets are logged and do not stop the remaining batches.
//
// Every batch gets its own slice: UpdateTargets may hand the slice to a
// goroutine that is still reading it after the call returns, so reusing the
// backing array for the next batch would race with that reader.
func (s *Set) updateTimestamp(items map[string]struct{}) {
	const batchSize = 100

	now := time.Now().Unix()
	flush := func(batch []string) {
		if err := s.UpdateTargets(batch, now); err != nil {
			logger.Errorf("failed to update targets: %v", err)
		}
	}

	batch := make([]string, 0, batchSize)
	for ident := range items {
		batch = append(batch, ident)
		if len(batch) == batchSize {
			flush(batch)
			batch = make([]string, 0, batchSize)
		}
	}

	// Remainder; UpdateTargets ignores an empty list.
	flush(batch)
}
// TargetUpdate is the JSON payload posted to "/v1/n9e/target-update": the
// idents being reported and the Unix timestamp of the report.
type TargetUpdate struct {
Lst []string `json:"lst"`
Now int64 `json:"now"`
}
// UpdateTargets records a heartbeat at time now (Unix seconds) for every ident
// in lst. An empty lst is a no-op.
//
// The heartbeat time is written to Redis only; a Redis failure is logged and
// does not abort the call. What follows depends on the node type:
//
//   - Non-center node: the update is posted to "/v1/n9e/target-update" in a
//     background goroutine, limited by the semaphore. When the concurrency
//     limit is reached the post is skipped. nil is returned in both cases.
//   - Center node: idents that are not yet in the "target" table are inserted.
//     Only a failure of the existence query is returned; individual insert
//     failures are logged.
//
// The background post works on a private copy of lst, so the caller may reuse
// or modify its slice as soon as UpdateTargets returns.
func (s *Set) UpdateTargets(lst []string, now int64) error {
	if len(lst) == 0 {
		return nil
	}

	// The heartbeat time is written to Redis only, no longer to MySQL update_at.
	err := s.updateTargetsUpdateTs(lst, now, s.redis)
	if err != nil {
		logger.Errorf("update_ts: failed to update targets: %v error: %v", lst, err)
	}

	if !s.ctx.IsCenter {
		if !s.sema.TryAcquire() {
			logger.Warningf("update_targets: update target by url concurrency limit, skip update target: %v", lst)
			// Concurrency limit reached: drop this request. Only the machine
			// time shown on the page is not refreshed; target-lost alerting is
			// not affected, so this is an acceptable degradation.
			return nil
		}

		// Detach the payload from the caller's slice before going async.
		t := TargetUpdate{
			Lst: append([]string(nil), lst...),
			Now: now,
		}

		go func() {
			defer s.sema.Release()
			// Sent asynchronously so that slow requests with many machines do
			// not delay the heartbeat time updates.
			err := poster.PostByUrls(s.ctx, "/v1/n9e/target-update", t)
			if err != nil {
				logger.Errorf("failed to post target update: %v", err)
			}
		}()
		return nil
	}

	// New targets still need an INSERT to be registered in MySQL.
	var exists []string
	err = s.ctx.DB.Table("target").Where("ident in ?", lst).Pluck("ident", &exists).Error
	if err != nil {
		return err
	}

	news := slice.SubString(lst, exists)
	for i := 0; i < len(news); i++ {
		err = s.ctx.DB.Exec("INSERT INTO target(ident, update_at) VALUES(?, ?)", news[i], now).Error
		if err != nil {
			logger.Error("upsert_target: failed to insert target:", news[i], "error:", err)
		}
	}

	return nil
}
// updateTargetsUpdateTs builds one models.HostUpdateTime entry per ident, keyed
// by models.WrapIdentUpdateTime(ident), and writes them to Redis through
// updateTargetTsInRedis. With a nil redis it only logs a debug message and
// returns nil.
func (s *Set) updateTargetsUpdateTs(lst []string, now int64, redis storage.Redis) error {
	if redis == nil {
		logger.Debugf("update_ts: redis is nil")
		return nil
	}

	entries := make(map[string]interface{}, len(lst))
	for i := range lst {
		ident := lst[i]
		entries[models.WrapIdentUpdateTime(ident)] = models.HostUpdateTime{
			UpdateTime: now,
			Ident:      ident,
		}
	}

	return s.updateTargetTsInRedis(entries, redis)
}
// updateTargetTsInRedis writes newMap to Redis in batches of
// configs.UpdateTargetBatchSize entries. All batches share one context whose
// timeout is configs.UpdateTargetTimeoutMills.
//
// A map that fits into a single batch is written with one call. Otherwise
// every batch is attempted even if an earlier one failed, and the error of the
// last failing batch is returned.
//
// NOTE(review): the batching arithmetic assumes UpdateTargetBatchSize > 0;
// presumably guaranteed by configuration defaults - verify.
func (s *Set) updateTargetTsInRedis(newMap map[string]interface{}, redis storage.Redis) (err error) {
	if len(newMap) == 0 {
		return nil
	}

	batchSize := s.configs.UpdateTargetBatchSize
	timeout := time.Duration(s.configs.UpdateTargetTimeoutMills) * time.Millisecond

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	// Small enough for a single MSet.
	if len(newMap) <= batchSize {
		return s.writeTargetTsInRedis(ctx, redis, newMap)
	}

	seen := 0
	batch := make(map[string]interface{}, batchSize)
	for key, val := range newMap {
		batch[key] = val
		seen++
		if seen%batchSize == 0 {
			if writeErr := s.writeTargetTsInRedis(ctx, redis, batch); writeErr != nil {
				err = writeErr
			}
			batch = make(map[string]interface{}, batchSize)
		}
	}

	// Remainder that did not fill a whole batch.
	if len(batch) > 0 {
		if writeErr := s.writeTargetTsInRedis(ctx, redis, batch); writeErr != nil {
			err = writeErr
		}
	}
	return err
}
// writeTargetTsInRedis stores content in Redis with storage.MSet and a 24-hour
// expiry, trying up to configs.UpdateTargetRetryCount times and sleeping
// configs.UpdateTargetRetryIntervalMills between attempts.
//
// The latency of the successful attempt is observed with the "success" label.
// If every attempt fails, the latency of the last one is observed with the
// "fail" label and an error naming the affected keys is returned.
func (s *Set) writeTargetTsInRedis(ctx context.Context, redis storage.Redis, content map[string]interface{}) error {
	retryCount := s.configs.UpdateTargetRetryCount
	retryInterval := time.Duration(s.configs.UpdateTargetRetryIntervalMills) * time.Millisecond

	// Keys are collected for log messages only.
	keys := make([]string, 0, len(content))
	for key := range content {
		keys = append(keys, key)
	}

	for attempt := 1; attempt <= retryCount; attempt++ {
		begin := time.Now()
		err := storage.MSet(ctx, redis, content, 24*time.Hour)
		elapsed := time.Since(begin).Seconds()

		logger.Debugf("update_ts: write target ts in redis, keys: %v, retryCount: %d, retryInterval: %v, error: %v", keys, retryCount, retryInterval, err)
		if err == nil {
			pstat.RedisOperationLatency.WithLabelValues("mset_target_ts", "success").Observe(elapsed)
			return nil
		}
		logger.Errorf("update_ts: failed to write target ts in redis: %v, keys: %v, retry %d/%d", err, keys, attempt, retryCount)

		if attempt < retryCount {
			// More attempts follow: wait before the next one.
			time.Sleep(retryInterval)
			continue
		}

		// Final attempt failed: record it, without sleeping.
		pstat.RedisOperationLatency.WithLabelValues("mset_target_ts", "fail").Observe(elapsed)
	}

	return fmt.Errorf("failed to write target ts in redis after %d retries, keys: %v", retryCount, keys)
}
================================================
FILE: pushgw/kafka/producer.go
================================================
package kafka
import (
"fmt"
"github.com/IBM/sarama"
"github.com/prometheus/client_golang/prometheus"
)
// Producer type names accepted by New; they are also used as the
// "producer_type" label value of the producer metrics.
const (
AsyncProducer = "async"
SyncProducer = "sync"
)
// Producer metrics, both labelled with "producer_type".
var (
// KafkaProducerSuccess counts messages that were sent successfully.
KafkaProducerSuccess = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "kafka_producer_message_success_total",
Help: "Total number of successful messages sent to Kafka.",
},
[]string{"producer_type"},
)
// KafkaProducerError counts errors encountered while sending messages.
KafkaProducerError = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "kafka_producer_message_error_total",
Help: "Total number of errors encountered while sending messages to Kafka.",
},
[]string{"producer_type"},
)
)
// init registers both producer counters with the default Prometheus registry.
// MustRegister panics if registration fails.
func init() {
prometheus.MustRegister(
KafkaProducerSuccess,
KafkaProducerError,
)
}
type (
// Producer is the interface shared by the async and sync wrappers:
// send one message, and close the producer.
Producer interface {
Send(*sarama.ProducerMessage) error
Close() error
}
// AsyncProducerWrapper wraps a sarama.AsyncProducer. stop is closed by
// Close to end the errorWorker and successWorker goroutines.
AsyncProducerWrapper struct {
asyncProducer sarama.AsyncProducer
stop chan struct{}
}
// SyncProducerWrapper wraps a sarama.SyncProducer. stop is closed by Close.
SyncProducerWrapper struct {
syncProducer sarama.SyncProducer
stop chan struct{}
}
)
// New creates a Producer of the requested type for the given brokers.
//
//   - AsyncProducer ("async"): wraps a sarama.AsyncProducer and starts two
//     goroutines that drain its Errors and Successes channels into the metrics.
//   - SyncProducer ("sync"): forces config.Producer.Return.Successes to true
//     and wraps a sarama.SyncProducer.
//
// Any other typ yields an error. If the underlying sarama producer cannot be
// created, nil and that error are returned. Both wrappers are created with an
// initialized stop channel, which their Close methods close.
func New(typ string, brokers []string, config *sarama.Config) (Producer, error) {
	switch typ {
	case AsyncProducer:
		p, err := sarama.NewAsyncProducer(brokers, config)
		if err != nil {
			return nil, err
		}
		apw := &AsyncProducerWrapper{
			asyncProducer: p,
			stop:          make(chan struct{}),
		}
		go apw.errorWorker()
		go apw.successWorker()
		return apw, nil
	case SyncProducer:
		config.Producer.Return.Successes = true
		p, err := sarama.NewSyncProducer(brokers, config)
		if err != nil {
			return nil, err
		}
		return &SyncProducerWrapper{
			syncProducer: p,
			stop:         make(chan struct{}),
		}, nil
	default:
		return nil, fmt.Errorf("unknown producer type: %s", typ)
	}
}
// Send hands msg to the async producer's input channel and always returns nil.
// The channel send blocks until the producer accepts the message. The delivery
// outcome is not reported here; it is counted by successWorker and errorWorker.
func (p *AsyncProducerWrapper) Send(msg *sarama.ProducerMessage) error {
p.asyncProducer.Input() <- msg
return nil
}
// Close signals both worker goroutines to stop by closing the stop channel and
// then closes the underlying async producer, returning its error.
// Close must be called at most once: closing an already closed channel panics.
func (p *AsyncProducerWrapper) Close() error {
close(p.stop)
return p.asyncProducer.Close()
}
// errorWorker drains the producer's Errors channel and increments the error
// counter once per received error. It returns when stop is closed or when the
// Errors channel itself is closed, so that receives on a closed channel during
// shutdown are not counted as errors.
func (p *AsyncProducerWrapper) errorWorker() {
	for {
		select {
		case _, ok := <-p.asyncProducer.Errors():
			if !ok {
				return
			}
			KafkaProducerError.WithLabelValues(AsyncProducer).Inc()
		case <-p.stop:
			return
		}
	}
}
// successWorker drains the producer's Successes channel and increments the
// success counter once per delivered message. It returns when stop is closed
// or when the Successes channel itself is closed, so that receives on a closed
// channel during shutdown are not counted as successes.
func (p *AsyncProducerWrapper) successWorker() {
	for {
		select {
		case _, ok := <-p.asyncProducer.Successes():
			if !ok {
				return
			}
			KafkaProducerSuccess.WithLabelValues(AsyncProducer).Inc()
		case <-p.stop:
			return
		}
	}
}
// Send delivers msg synchronously and returns the producer's error, if any.
// The success or error counter (label "sync") is incremented according to the
// outcome.
func (p *SyncProducerWrapper) Send(msg *sarama.ProducerMessage) error {
	_, _, err := p.syncProducer.SendMessage(msg)

	counter := KafkaProducerSuccess
	if err != nil {
		counter = KafkaProducerError
	}
	counter.WithLabelValues(SyncProducer).Inc()

	return err
}
// Close closes the stop channel, when one was set, and then closes the
// underlying sync producer, returning its error.
//
// The nil check matters because a wrapper may have been built without a stop
// channel, and closing a nil channel panics.
func (p *SyncProducerWrapper) Close() error {
	if p.stop != nil {
		close(p.stop)
	}
	return p.syncProducer.Close()
}
================================================
FILE: pushgw/pconf/conf.go
================================================
package pconf
import (
"log"
"net"
"net/http"
"regexp"
"runtime"
"time"
"github.com/ccfos/nightingale/v6/pkg/tlsx"
"github.com/prometheus/common/model"
)
// Pushgw holds the push gateway settings. PreCheck fills in defaults for the
// numeric fields that are left at zero or below and prepares every entry of
// Writers for use.
type Pushgw struct {
UpdateTargetRetryCount int
UpdateTargetRetryIntervalMills int64
UpdateTargetTimeoutMills int64
UpdateTargetBatchSize int
PushConcurrency int
UpdateTargetByUrlConcurrency int
GetHeartbeatFromMetric bool // whether to extract the host heartbeat time from time-series data; defaults to false
BusiGroupLabelKey string
IdentMetrics []string
IdentStatsThreshold int
IdentDropThreshold int // samples of a single ident beyond this per-minute count are dropped
WriteConcurrency int
LabelRewrite bool
ForceUseServerTS bool
DebugSample map[string]string
DropSample []map[string]string
WriterOpt WriterGlobalOpt
Writers []WriterOptions
KafkaWriters []KafkaWriterOptions
}
// WriterGlobalOpt carries the queue and retry settings shared by all writers.
// Pushgw.PreCheck supplies defaults for unset fields and derives
// AllQueueMaxSize from QueueNumber, QueueMaxSize and QueueWaterMark.
type WriterGlobalOpt struct {
QueueMaxSize int
QueuePopSize int
QueueNumber int // fixed number of queues per writer
QueueWaterMark float64 // fill level at which a nearly full queue starts dropping data, e.g. 0.8
AllQueueMaxSize int64 // computed automatically; does not need to be configured
AllQueueMaxSizeInterval int
RetryCount int
RetryInterval int64
OverLimitStatusCode int
}
// WriterOptions configures one HTTP remote-write target. Pushgw.PreCheck
// interprets the timeout-style fields (Timeout, DialTimeout,
// TLSHandshakeTimeout, ExpectContinueTimeout, IdleConnTimeout, KeepAlive) as
// milliseconds when it builds HTTPTransport.
type WriterOptions struct {
Url string
BasicAuthUser string
BasicAuthPass string
AsyncWrite bool // with several forwarding writers, set this to true on the less important ones so they forward asynchronously
Timeout int64
DialTimeout int64
TLSHandshakeTimeout int64
ExpectContinueTimeout int64
IdleConnTimeout int64
KeepAlive int64
MaxConnsPerHost int
MaxIdleConns int
MaxIdleConnsPerHost int
Headers []string
WriteRelabels []*RelabelConfig
tlsx.ClientConfig
// Writers are fixed in the configuration file and cannot be updated
// dynamically, so the transport is initialized once at startup.
// Dynamic updates are unlikely to be needed later either; pushgw may even be
// split out into a standalone process.
HTTPTransport *http.Transport
}
// SASLConfig holds SASL authentication settings; it is referenced by
// KafkaWriterOptions.SASL.
// NOTE(review): presumably mapped onto the Kafka client's SASL options by the
// Kafka writer; confirm against the code that consumes it.
type SASLConfig struct {
Enable bool
User string
Password string
Mechanism string
Version int16
Handshake bool
AuthIdentity string
}
// KafkaWriterOptions configures one Kafka forwarding target.
// NOTE(review): Typ presumably selects the producer kind and Timeout's unit is
// not visible here; confirm against the Kafka writer.
type KafkaWriterOptions struct {
Typ string
Brokers []string
Topic string
Version string
Timeout int64
SASL *SASLConfig
WriteRelabels []*RelabelConfig
}
// RelabelConfig describes one relabel rule. For rules attached to
// Pushgw.Writers, Pushgw.PreCheck applies the defaults Regex "(.*)",
// Separator ";", Action "replace" and Replacement "$1", and stores the
// anchored compilation of Regex ("^(?:" + Regex + ")$") in RegexCompiled.
type RelabelConfig struct {
SourceLabels model.LabelNames `json:"source_labels"`
Separator string `json:"separator"`
Regex string `json:"regex"`
RegexCompiled *regexp.Regexp
If string `json:"if"`
IfRegex *regexp.Regexp
Modulus uint64 `json:"modulus"`
TargetLabel string `json:"target_label"`
Replacement string `json:"replacement"`
Action string `json:"action"`
}
// PreCheck normalizes the configuration in place: every numeric setting that
// is zero or negative receives its default, the aggregate queue limit is
// derived, and each HTTP writer gets its relabel rules compiled and its
// transport built.
//
// An invalid relabel regexp terminates the process through log.Fatalln; a TLS
// configuration error panics.
func (p *Pushgw) PreCheck() {
	positiveInt := func(field *int, fallback int) {
		if *field <= 0 {
			*field = fallback
		}
	}
	positiveInt64 := func(field *int64, fallback int64) {
		if *field <= 0 {
			*field = fallback
		}
	}
	millis := func(v int64) time.Duration {
		return time.Duration(v) * time.Millisecond
	}

	positiveInt(&p.UpdateTargetRetryCount, 3)
	positiveInt64(&p.UpdateTargetRetryIntervalMills, 500)
	positiveInt64(&p.UpdateTargetTimeoutMills, 3000)
	positiveInt(&p.UpdateTargetBatchSize, 20)
	positiveInt(&p.PushConcurrency, 16)
	positiveInt(&p.UpdateTargetByUrlConcurrency, 10)

	if p.BusiGroupLabelKey == "" {
		p.BusiGroupLabelKey = "busigroup"
	}

	opt := &p.WriterOpt
	positiveInt(&opt.QueueMaxSize, 1000_000)
	positiveInt(&opt.QueuePopSize, 1000)

	if opt.QueueNumber <= 0 {
		opt.QueueNumber = 128
		if cpus := runtime.NumCPU(); cpus > 1 {
			opt.QueueNumber = cpus
		}
	}

	if opt.QueueWaterMark <= 0 {
		opt.QueueWaterMark = 0.1
	}

	// Must run after the three inputs above have received their defaults.
	opt.AllQueueMaxSize = int64(float64(opt.QueueNumber*opt.QueueMaxSize) * opt.QueueWaterMark)

	positiveInt(&opt.AllQueueMaxSizeInterval, 200)
	positiveInt(&opt.RetryCount, 1000)
	positiveInt64(&opt.RetryInterval, 1)
	positiveInt(&opt.OverLimitStatusCode, 499)

	positiveInt(&p.WriteConcurrency, 5000)
	positiveInt(&p.IdentStatsThreshold, 1500)
	positiveInt(&p.IdentDropThreshold, 5000000)

	for i := range p.Writers {
		w := &p.Writers[i]

		for _, rule := range w.WriteRelabels {
			if rule.Regex == "" {
				rule.Regex = "(.*)"
			}

			// Anchor the pattern so it has to match the whole input.
			compiled, err := regexp.Compile("^(?:" + rule.Regex + ")$")
			if err != nil {
				log.Fatalln("failed to compile regexp:", rule.Regex, "error:", err)
			}
			rule.RegexCompiled = compiled

			if rule.Separator == "" {
				rule.Separator = ";"
			}
			if rule.Action == "" {
				rule.Action = "replace"
			}
			if rule.Replacement == "" {
				rule.Replacement = "$1"
			}
		}

		tlsConf, err := w.ClientConfig.TLSConfig()
		if err != nil {
			panic(err)
		}

		// Build the HTTP transport once; all durations are configured in milliseconds.
		transport := &http.Transport{
			Proxy: http.ProxyFromEnvironment,
			DialContext: (&net.Dialer{
				Timeout:   millis(w.DialTimeout),
				KeepAlive: millis(w.KeepAlive),
			}).DialContext,
			ResponseHeaderTimeout: millis(w.Timeout),
			TLSHandshakeTimeout:   millis(w.TLSHandshakeTimeout),
			ExpectContinueTimeout: millis(w.ExpectContinueTimeout),
			MaxConnsPerHost:       w.MaxConnsPerHost,
			MaxIdleConns:          w.MaxIdleConns,
			MaxIdleConnsPerHost:   w.MaxIdleConnsPerHost,
			IdleConnTimeout:       millis(w.IdleConnTimeout),
		}
		if tlsConf != nil {
			transport.TLSClientConfig = tlsConf
		}
		w.HTTPTransport = transport
	}
}
================================================
FILE: pushgw/pstat/pstat.go
================================================
package pstat
import "github.com/prometheus/client_golang/prometheus"
// namespace and subsystem are set as the Namespace and Subsystem of every
// metric declared in this package.
const (
namespace = "n9e"
subsystem = "pushgw"
)
// Metrics exported by the push gateway. All of them are registered with the
// default Prometheus registry in init.
var (
// CounterSampleTotal counts received samples, labelled by ingest channel.
CounterSampleTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "samples_received_total",
Help: "Total number samples received.",
}, []string{"channel"})
// CounterDropSampleTotal counts dropped samples.
CounterDropSampleTotal = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "drop_sample_total",
Help: "Number of drop sample.",
})
// CounterSampleReceivedByIdent counts pushed samples per host ident.
CounterSampleReceivedByIdent = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sample_received_by_ident",
Help: "Number of sample push by ident.",
}, []string{"host_ident"})
// RequestDuration observes HTTP request latency in seconds, labelled by
// service, status code, path and method. It uses the client library's
// default buckets because none are set here.
RequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "http_request_duration_seconds",
Help: "HTTP request latencies in seconds.",
}, []string{"service", "code", "path", "method"},
)
// ForwardDuration observes the latency of forwarding samples to a TSDB,
// labelled by target URL.
ForwardDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: []float64{.001, .01, .1, 1, 5, 10},
Name: "forward_duration_seconds",
Help: "Forward samples to TSDB. latencies in seconds.",
}, []string{"url"},
)
// ForwardKafkaDuration observes the latency of forwarding samples to Kafka,
// labelled by the "brokers_topic" value.
ForwardKafkaDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Buckets: []float64{.1, 1, 10},
Name: "forward_kafka_duration_seconds",
Help: "Forward samples to Kafka. latencies in seconds.",
}, []string{"brokers_topic"},
)
// GaugeSampleQueueSize reports the size of each sample queue.
GaugeSampleQueueSize = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sample_queue_size",
Help: "The size of sample queue.",
}, []string{"queueid"},
)
// CounterWriteTotal counts writes per target URL.
CounterWriteTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "write_total",
Help: "Number of write.",
}, []string{"url"})
// CounterWriteErrorTotal counts write errors per target URL.
CounterWriteErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "write_error_total",
Help: "Number of write error.",
}, []string{"url"})
// CounterPushQueueErrorTotal counts queue push errors per queue id.
CounterPushQueueErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "push_queue_error_total",
Help: "Number of push queue error.",
}, []string{"queueid"})
// CounterPushQueueOverLimitTotal counts pushes rejected for being over the
// queue limit.
CounterPushQueueOverLimitTotal = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "push_queue_over_limit_error_total",
Help: "Number of push queue over limit.",
})
// RedisOperationLatency observes Redis operation latency in seconds,
// labelled by operation and status.
RedisOperationLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "redis_operation_latency_seconds",
Help: "Histogram of latencies for Redis operations",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5},
},
[]string{"operation", "status"},
)
// DBOperationLatency observes database operation latency in seconds,
// labelled by operation.
DBOperationLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "db_operation_latency_seconds",
Help: "Histogram of latencies for DB operations",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5},
},
[]string{"operation"},
)
)
// init registers every push gateway metric with the default Prometheus
// registry. MustRegister panics if a registration fails.
func init() {
	collectors := []prometheus.Collector{
		CounterSampleTotal,
		CounterDropSampleTotal,
		CounterSampleReceivedByIdent,
		RequestDuration,
		ForwardDuration,
		ForwardKafkaDuration,
		CounterWriteTotal,
		CounterWriteErrorTotal,
		CounterPushQueueErrorTotal,
		GaugeSampleQueueSize,
		CounterPushQueueOverLimitTotal,
		RedisOperationLatency,
		DBOperationLatency,
	}
	prometheus.MustRegister(collectors...)
}
================================================
FILE: pushgw/pushgw.go
================================================
package pushgw
import (
"context"
"fmt"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/conf"
"github.com/ccfos/nightingale/v6/dscache"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pkg/logx"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/pushgw/router"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/storage"
)
// PushgwProvider bundles the ident set and the HTTP router of a push gateway.
type PushgwProvider struct {
Ident *idents.Set
Router *router.Router
}
// Initialize boots a standalone push gateway: it loads the configuration from
// configDir (cryptoKey is passed through to the config loader), sets up
// logging, optionally connects to Redis when an address is configured, builds
// the caches, writers and router, and starts the HTTP server.
//
// On success it returns a cleanup function that runs the logging cleanup
// followed by the HTTP cleanup. On failure it returns a nil cleanup and the
// error; logging that was already initialized is cleaned up first so nothing
// is left dangling.
func Initialize(configDir string, cryptoKey string) (func(), error) {
	config, err := conf.InitConfig(configDir, cryptoKey)
	if err != nil {
		return nil, fmt.Errorf("failed to init config: %w", err)
	}

	logxClean, err := logx.Init(config.Log)
	if err != nil {
		return nil, err
	}

	ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi)

	var redis storage.Redis
	if config.Redis.Address != "" {
		redis, err = storage.NewRedis(config.Redis)
		if err != nil {
			// Logging is already up; release it before bailing out.
			logxClean()
			return nil, err
		}
	}

	idents := idents.New(ctx, redis, config.Pushgw)
	metas := metas.New(redis)

	stats := memsto.NewSyncStats()

	busiGroupCache := memsto.NewBusiGroupCache(ctx, stats)
	targetCache := memsto.NewTargetCache(ctx, stats, nil)
	configCvalCache := memsto.NewCvalCache(ctx, stats)

	writers := writer.NewWriters(config.Pushgw)

	r := httpx.GinEngine(config.Global.RunMode, config.HTTP, configCvalCache.PrintBodyPaths, configCvalCache.PrintAccessLog)
	rt := router.New(config.HTTP, config.Pushgw, config.Alert, targetCache, busiGroupCache, idents, metas, writers, ctx)
	rt.Config(r)

	dscache.Init(ctx, false)

	httpClean := httpx.Init(config.HTTP, r)

	return func() {
		logxClean()
		httpClean()
	}, nil
}
================================================
FILE: pushgw/router/fns.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
)
// AppendLabels enriches pt with the labels of target.
//
// Target tags whose name is not yet present on the series are appended. A tag
// that collides with an existing label overwrites it only when LabelRewrite is
// enabled and the series did not arrive with the business-group label.
//
// Afterwards the business-group label (BusiGroupLabelKey) is appended from
// bgCache when the target belongs to a group whose label is enabled. This is
// skipped if the series already carries that label, whether it arrived with it
// or a target tag just supplied it, so the same label name is never emitted
// twice.
//
// A nil target leaves pt untouched.
func (rt *Router) AppendLabels(pt *prompb.TimeSeries, target *models.Target, bgCache *memsto.BusiGroupCacheType) {
	if target == nil {
		return
	}

	bgKey := rt.Pushgw.BusiGroupLabelKey

	// Index the labels the series arrived with by name.
	labelKeys := make(map[string]int, len(pt.Labels))
	for j := range pt.Labels {
		labelKeys[pt.Labels[j].Name] = j
	}

	// e.g. busigroup=cloud already present on the incoming series
	_, hasBusiGroup := labelKeys[bgKey]
	// Set when a target tag itself contributes the business-group label.
	busiGroupFromTag := false

	for key, value := range target.TagsMap {
		if index, has := labelKeys[key]; has {
			// Colliding label: overwrite only when allowed, and never when
			// the series came with its own business-group label.
			if !hasBusiGroup && rt.Pushgw.LabelRewrite {
				pt.Labels[index].Value = value
			}
			continue
		}

		pt.Labels = append(pt.Labels, prompb.Label{
			Name:  key,
			Value: value,
		})
		if key == bgKey {
			busiGroupFromTag = true
		}
	}

	// The series already has a business-group label; adding another one would
	// produce a duplicate label name.
	if hasBusiGroup || busiGroupFromTag {
		return
	}

	if target.GroupId <= 0 || len(bgKey) == 0 {
		return
	}

	bg := bgCache.GetByBusiGroupId(target.GroupId)
	if bg == nil || bg.LabelEnable == 0 {
		return
	}

	pt.Labels = append(pt.Labels, prompb.Label{
		Name:  bgKey,
		Value: bg.LabelValue,
	})
}
// func getTs(pt *prompb.TimeSeries) int64 {
// if len(pt.Samples) == 0 {
// return 0
// }
// return pt.Samples[0].Timestamp
// }
// debugSample logs v at debug level when every key/value pair of the
// configured DebugSample filter is present on the series' labels. A nil series
// or an empty filter logs nothing.
func (rt *Router) debugSample(remoteAddr string, v *prompb.TimeSeries) {
	if v == nil || len(rt.Pushgw.DebugSample) == 0 {
		return
	}

	labels := make(map[string]string)
	for _, lbl := range v.Labels {
		labels[lbl.Name] = lbl.Value
	}

	for name, want := range rt.Pushgw.DebugSample {
		if got, ok := labels[name]; !ok || got != want {
			return
		}
	}

	logger.Debugf("--> debug sample from: %s, sample: %s", remoteAddr, v.String())
}
// DropSample reports whether v matches one of the configured drop filters.
func (rt *Router) DropSample(v *prompb.TimeSeries) bool {
	// Fast path: filters that only name a metric are a single map lookup.
	if len(rt.dropByNameOnly) > 0 {
		for i := range v.Labels {
			if v.Labels[i].Name != "__name__" {
				continue
			}
			if _, hit := rt.dropByNameOnly[v.Labels[i].Value]; hit {
				return true
			}
			// __name__ appears only once, so stop scanning after it.
			break
		}
	}

	// Slow path: filters with several conditions.
	if len(rt.dropComplex) == 0 {
		return false
	}

	// Build the label map only when complex filters exist.
	labels := make(map[string]string, len(v.Labels))
	for i := range v.Labels {
		labels[v.Labels[i].Name] = v.Labels[i].Value
	}

	for _, filter := range rt.dropComplex {
		if matchSample(filter, labels) {
			return true
		}
	}
	return false
}
// matchSample reports whether every key of filterMap exists in sampleMap with
// the same value. An empty filter matches any sample.
func matchSample(filterMap, sampleMap map[string]string) bool {
	for name, want := range filterMap {
		if got, ok := sampleMap[name]; !ok || got != want {
			return false
		}
	}
	return true
}
// ForwardToQueue runs v through BeforePush and the drop filters and, if it
// survives both, pushes it onto the writer queue identified by queueid.
// A series removed by either step is not an error; dropped series are counted.
func (rt *Router) ForwardToQueue(clientIP string, queueid string, v *prompb.TimeSeries) error {
	series := rt.BeforePush(clientIP, v)
	switch {
	case series == nil:
		return nil
	case rt.DropSample(series):
		pstat.CounterDropSampleTotal.Inc()
		return nil
	}
	return rt.Writers.PushSample(queueid, *series)
}
// BeforePush emits the optional debug log for v and then returns whatever the
// HandleTS hook returns for it.
func (rt *Router) BeforePush(clientIP string, v *prompb.TimeSeries) *prompb.TimeSeries {
	rt.debugSample(clientIP, v)
	handled := rt.HandleTS(v)
	return handled
}
================================================
FILE: pushgw/router/router.go
================================================
package router
import (
"fmt"
"time"
"github.com/gin-gonic/gin"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
"github.com/ccfos/nightingale/v6/alert/aconf"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/memsto"
"github.com/ccfos/nightingale/v6/pkg/ctx"
"github.com/ccfos/nightingale/v6/pkg/httpx"
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/ccfos/nightingale/v6/pushgw/writer"
"github.com/ccfos/nightingale/v6/pkg/ginx"
)
// HandleTSFunc is a hook applied to each time series before it is queued.
// ForwardToQueue treats a nil result as "discard this series".
type HandleTSFunc func(pt *prompb.TimeSeries) *prompb.TimeSeries
// Router wires the push gateway HTTP endpoints to its caches, ident and meta
// sets and writers. Instances are built with New, which also precompiles the
// DropSample filters.
type Router struct {
HTTP httpx.Config
Pushgw pconf.Pushgw
Aconf aconf.Alert
TargetCache *memsto.TargetCacheType
BusiGroupCache *memsto.BusiGroupCacheType
IdentSet *idents.Set
MetaSet *metas.Set
Writers *writer.WritersType
Ctx *ctx.Context
HandleTS HandleTSFunc
HeartbeatApi string
// Precompiled DropSample filters.
dropByNameOnly map[string]struct{} // fast lookup for filters whose only condition is __name__
dropComplex []map[string]string // filters with several conditions
}
// stat returns a middleware that records the latency of every request in the
// RequestDuration histogram, labelled with the fixed service name "pushgw",
// the response status code, the matched route and the HTTP method.
func stat() gin.HandlerFunc {
	return func(c *gin.Context) {
		begin := time.Now()
		c.Next()

		status := fmt.Sprintf("%d", c.Writer.Status())
		verb := c.Request.Method
		pstat.RequestDuration.
			WithLabelValues("pushgw", status, c.FullPath(), verb).
			Observe(time.Since(begin).Seconds())
	}
}
// New assembles a Router from its dependencies. HandleTS starts out as the
// identity hook, and the DropSample filters are precompiled before the router
// is returned.
func New(httpConfig httpx.Config, pushgw pconf.Pushgw, aconf aconf.Alert, tc *memsto.TargetCacheType, bg *memsto.BusiGroupCacheType,
	idents *idents.Set, metas *metas.Set,
	writers *writer.WritersType, ctx *ctx.Context) *Router {
	passThrough := func(pt *prompb.TimeSeries) *prompb.TimeSeries { return pt }

	rt := &Router{
		HTTP:           httpConfig,
		Pushgw:         pushgw,
		Aconf:          aconf,
		TargetCache:    tc,
		BusiGroupCache: bg,
		IdentSet:       idents,
		MetaSet:        metas,
		Writers:        writers,
		Ctx:            ctx,
		HandleTS:       passThrough,
	}
	rt.initDropSampleFilters()
	return rt
}
// initDropSampleFilters precompiles the configured DropSample filters.
// Filters whose only condition is __name__ go into a set for constant-time
// lookup; all other non-empty filters are kept as they are for the
// multi-condition matching path. Empty filters are ignored.
func (rt *Router) initDropSampleFilters() {
	rt.dropByNameOnly = make(map[string]struct{})
	rt.dropComplex = make([]map[string]string, 0)

	for _, filter := range rt.Pushgw.DropSample {
		switch len(filter) {
		case 0:
			continue
		case 1:
			// A single __name__ condition goes to the fast-lookup set.
			if metric, ok := filter["__name__"]; ok {
				rt.dropByNameOnly[metric] = struct{}{}
				continue
			}
		}
		// Everything else stays on the complex-matching list.
		rt.dropComplex = append(rt.dropComplex, filter)
	}

	logger.Infof("DropSample filters initialized: %d name-only, %d complex",
		len(rt.dropByNameOnly), len(rt.dropComplex))
}
// Config registers the push gateway routes on r.
//
// The service route /v1/n9e/target-update is always registered, behind basic
// auth when service accounts are configured. The agent-facing routes are only
// added when APIForAgent is enabled. The datadog routes are registered without
// the basic-auth middleware. The remaining ingest and heartbeat routes are
// guarded by basic auth when agent accounts are configured, accepting both
// agent and service accounts.
func (rt *Router) Config(r *gin.Engine) {
	service := r.Group("/v1/n9e")
	if svcAccounts := rt.HTTP.APIForService.BasicAuth; len(svcAccounts) > 0 {
		service.Use(gin.BasicAuth(svcAccounts))
	}
	service.POST("/target-update", rt.targetUpdate)

	if !rt.HTTP.APIForAgent.Enable {
		return
	}

	r.Use(stat())

	// datadog url: http://n9e-pushgw.foo.com/datadog
	// use apiKey not basic auth
	r.POST("/datadog/api/v1/series", rt.datadogSeries)
	r.POST("/datadog/api/v1/check_run", datadogCheckRun)
	r.GET("/datadog/api/v1/validate", datadogValidate)
	r.POST("/datadog/api/v1/metadata", datadogMetadata)
	r.POST("/datadog/intake/", datadogIntake)

	// post registers a POST route; it is swapped for a variant that prepends
	// the auth middleware when basic auth is enabled.
	post := func(path string, handler gin.HandlerFunc) {
		r.POST(path, handler)
	}

	if len(rt.HTTP.APIForAgent.BasicAuth) > 0 {
		// enable basic auth
		accounts := make(ginx.Accounts, 0)
		for user, pass := range rt.HTTP.APIForAgent.BasicAuth {
			accounts = append(accounts, ginx.Account{
				User:     user,
				Password: pass,
			})
		}
		for user, pass := range rt.HTTP.APIForService.BasicAuth {
			accounts = append(accounts, ginx.Account{
				User:     user,
				Password: pass,
			})
		}

		auth := ginx.BasicAuth(accounts)
		post = func(path string, handler gin.HandlerFunc) {
			r.POST(path, auth, handler)
		}
	}

	post("/opentsdb/put", rt.openTSDBPut)
	post("/openfalcon/push", rt.falconPush)
	post("/prometheus/v1/write", rt.remoteWrite)
	post("/proxy/v1/write", rt.proxyRemoteWrite)
	post("/v1/n9e/edge/heartbeat", rt.heartbeat)

	if len(rt.Ctx.CenterApi.Addrs) > 0 {
		post("/v1/n9e/heartbeat", rt.heartbeat)
	}
}
================================================
FILE: pushgw/router/router_datadog.go
================================================
package router
import (
"compress/gzip"
"compress/zlib"
"fmt"
"io/ioutil"
"net/http"
"strings"
"sync/atomic"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/gin-gonic/gin"
easyjson "github.com/mailru/easyjson"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// TimeSeries is the JSON body accepted by the datadog series endpoint: an
// object with a "series" array of metrics.
//
//easyjson:json
type TimeSeries struct {
Series []*DatadogMetric `json:"series"`
}
// DatadogMetric is one metric of a datadog series payload. Tags are
// "key:value" strings; ToProm converts the metric into a Prometheus series.
//
//easyjson:json
type DatadogMetric struct {
Metric string `json:"metric"`
Points []DatadogPoint `json:"points"`
Host string `json:"host"`
Tags []string `json:"tags,omitempty"`
}
// DatadogPoint is a single data point. ToProm reads element 0 as the
// timestamp (multiplied by 1000 to obtain milliseconds) and element 1 as the
// value.
//
//easyjson:json
type DatadogPoint [2]float64
// Clean validates the metric; it fails when the metric name is empty.
func (m *DatadogMetric) Clean() error {
	if m.Metric != "" {
		return nil
	}
	return fmt.Errorf("metric is blank")
}
// ToProm converts the metric into a Prometheus time series and returns it
// together with the ident the series belongs to.
//
// '.' and '-' in the metric name and in tag keys are replaced by '_'; the
// sanitized metric name is also written back to m.Metric. Tags without a ':'
// are ignored. The "ident" tag is copied as a label. The "host" tag is only
// remembered, and is overridden by m.Host when that is set. The resulting host
// value is emitted as label "host" if an ident tag exists, otherwise as label
// "ident". The returned ident is the ident tag if present, else the host value.
//
// An error is returned when the sanitized metric name or a sanitized tag key
// is not a valid Prometheus name.
func (m *DatadogMetric) ToProm() (*prompb.TimeSeries, string, error) {
	sanitize := func(s string) string {
		return strings.ReplaceAll(strings.ReplaceAll(s, ".", "_"), "-", "_")
	}

	pt := &prompb.TimeSeries{}
	for _, point := range m.Points {
		pt.Samples = append(pt.Samples, prompb.Sample{
			// use ms
			Timestamp: int64(point[0]) * 1000,
			Value:     point[1],
		})
	}

	m.Metric = sanitize(m.Metric)
	if !model.MetricNameRE.MatchString(m.Metric) {
		return nil, "", fmt.Errorf("invalid metric name: %s", m.Metric)
	}

	pt.Labels = append(pt.Labels, prompb.Label{
		Name:  model.MetricNameLabel,
		Value: m.Metric,
	})

	var identInTag, hostInTag string
	for _, tag := range m.Tags {
		key, value, ok := strings.Cut(tag, ":")
		if !ok {
			continue
		}

		switch key {
		case "ident":
			identInTag = value
			pt.Labels = append(pt.Labels, prompb.Label{
				Name:  key,
				Value: value,
			})
			continue
		case "host":
			hostInTag = value
			continue
		}

		key = sanitize(key)
		if !model.LabelNameRE.MatchString(key) {
			return nil, "", fmt.Errorf("invalid tag name: %s", key)
		}

		pt.Labels = append(pt.Labels, prompb.Label{
			Name:  key,
			Value: value,
		})
	}

	// m.Host has high priority
	if m.Host != "" {
		hostInTag = m.Host
	}

	if hostInTag != "" {
		// The host becomes the ident unless an explicit ident tag exists.
		labelName := "ident"
		if identInTag != "" {
			labelName = "host"
		}
		pt.Labels = append(pt.Labels, prompb.Label{
			Name:  labelName,
			Value: hostInTag,
		})
	}

	if identInTag != "" {
		return pt, identInTag, nil
	}
	return pt, hostInTag, nil
}
// datadogCheckRun is a stub for the datadog check_run endpoint; it always
// answers 200 with "not implemented".
func datadogCheckRun(c *gin.Context) {
	c.String(http.StatusOK, "not implemented")
}
// datadogValidate is a stub for the datadog validate endpoint; it always
// answers 200 with "not implemented".
func datadogValidate(c *gin.Context) {
	c.String(http.StatusOK, "not implemented")
}
// datadogIntake is a stub for the datadog intake endpoint; it always answers
// 200 with "not implemented".
func datadogIntake(c *gin.Context) {
	c.String(http.StatusOK, "not implemented")
}
// datadogMetadata is a stub for the datadog metadata endpoint; the request
// body is not read and the answer is always 200 with "not implemented".
func datadogMetadata(c *gin.Context) {
	// body, err := readDatadogBody(c)
	// fmt.Println("metadata:", string(body), err)
	c.String(http.StatusOK, "not implemented")
}
// readDatadogBody returns the full request body, transparently decompressing
// it when Content-Encoding is "gzip" or "deflate" (zlib). Any other encoding
// is read as-is.
func readDatadogBody(c *gin.Context) ([]byte, error) {
	switch c.GetHeader("Content-Encoding") {
	case "gzip":
		zr, err := gzip.NewReader(c.Request.Body)
		if err != nil {
			return nil, err
		}
		defer zr.Close()
		return ioutil.ReadAll(zr)
	case "deflate":
		zr, err := zlib.NewReader(c.Request.Body)
		if err != nil {
			return nil, err
		}
		defer zr.Close()
		return ioutil.ReadAll(zr)
	default:
		defer c.Request.Body.Close()
		return ioutil.ReadAll(c.Request.Body)
	}
}
// datadogSeries ingests a datadog series payload.
//
// When agent basic-auth accounts are configured, the api_key query parameter
// must equal one of their passwords, otherwise 401 is returned. The body is
// decoded, every metric is converted to a Prometheus series, enriched with the
// labels of the matching target and forwarded to one writer queue chosen
// round-robin per request. Metrics that fail validation or conversion are
// counted as failures. A queue error aborts the request with the configured
// over-limit status code. The response reports the success and failure counts.
func (r *Router) datadogSeries(c *gin.Context) {
	// A missing parameter yields the empty string.
	apiKey, _ := c.GetQuery("api_key")

	if secrets := r.HTTP.APIForAgent.BasicAuth; len(secrets) > 0 {
		authorized := false
		for _, secret := range secrets {
			if secret == apiKey {
				authorized = true
				break
			}
		}
		if !authorized {
			c.String(http.StatusUnauthorized, "unauthorized")
			return
		}
	}

	body, err := readDatadogBody(c)
	if err != nil {
		c.String(400, err.Error())
		return
	}

	var payload TimeSeries
	if err = easyjson.Unmarshal(body, &payload); err != nil {
		c.String(400, err.Error())
		return
	}

	if len(payload.Series) == 0 {
		c.String(400, "series empty")
		return
	}

	queueid := fmt.Sprint(atomic.AddUint64(&globalCounter, 1) % uint64(r.Pushgw.WriterOpt.QueueNumber))

	succ, fail := 0, 0
	ids := make(map[string]struct{})

	for _, item := range payload.Series {
		if item == nil {
			continue
		}

		if item.Clean() != nil {
			fail++
			continue
		}

		pt, ident, convErr := item.ToProm()
		if convErr != nil {
			fail++
			continue
		}

		if ident != "" {
			if r.Pushgw.GetHeartbeatFromMetric {
				// register host
				ids[ident] = struct{}{}
			}

			// fill tags
			if target, has := r.TargetCache.Get(ident); has {
				r.AppendLabels(pt, target, r.BusiGroupCache)
			}

			pstat.CounterSampleReceivedByIdent.WithLabelValues(ident).Inc()
		}

		if pushErr := r.ForwardToQueue(c.ClientIP(), queueid, pt); pushErr != nil {
			c.String(r.Pushgw.WriterOpt.OverLimitStatusCode, pushErr.Error())
			return
		}

		succ++
	}

	if succ > 0 {
		pstat.CounterSampleTotal.WithLabelValues("datadog").Add(float64(succ))
		r.IdentSet.MSet(ids)
	}

	c.JSON(200, gin.H{
		"succ": succ,
		"fail": fail,
		"msg":  "received",
	})
}
================================================
FILE: pushgw/router/router_datadog_easyjson.go
================================================
// Code generated by easyjson for marshaling/unmarshaling. DO NOT EDIT.
package router
import (
json "encoding/json"
easyjson "github.com/mailru/easyjson"
jlexer "github.com/mailru/easyjson/jlexer"
jwriter "github.com/mailru/easyjson/jwriter"
)
// Blank references that keep the imports above in use even when the generated
// code below does not need all of them (suppresses unused-package errors).
var (
_ *json.RawMessage
_ *jlexer.Lexer
_ *jwriter.Writer
_ easyjson.Marshaler
)
// easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter is the
// generated decoder for TimeSeries. It reads a JSON object from in and fills
// out.Series from the "series" key; other keys are skipped, and a key whose
// value is null is skipped without touching the field. An existing Series
// slice is truncated and reused. Null array elements are appended as nil
// pointers.
func easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter(in *jlexer.Lexer, out *TimeSeries) {
isTopLevel := in.IsStart()
if in.IsNull() {
if isTopLevel {
in.Consumed()
}
in.Skip()
return
}
in.Delim('{')
for !in.IsDelim('}') {
key := in.UnsafeFieldName(false)
in.WantColon()
if in.IsNull() {
in.Skip()
in.WantComma()
continue
}
switch key {
case "series":
if in.IsNull() {
in.Skip()
out.Series = nil
} else {
in.Delim('[')
if out.Series == nil {
if !in.IsDelim(']') {
out.Series = make([]*DatadogMetric, 0, 8)
} else {
out.Series = []*DatadogMetric{}
}
} else {
out.Series = (out.Series)[:0]
}
for !in.IsDelim(']') {
var v1 *DatadogMetric
if in.IsNull() {
in.Skip()
v1 = nil
} else {
if v1 == nil {
v1 = new(DatadogMetric)
}
(*v1).UnmarshalEasyJSON(in)
}
out.Series = append(out.Series, v1)
in.WantComma()
}
in.Delim(']')
}
default:
in.SkipRecursive()
}
in.WantComma()
}
in.Delim('}')
if isTopLevel {
in.Consumed()
}
}
// easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter is the
// generated encoder for TimeSeries. It writes {"series":[...]}; a nil Series
// is written as null unless the writer has the NilSliceAsEmpty flag set, and
// nil elements are written as null.
func easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter(out *jwriter.Writer, in TimeSeries) {
out.RawByte('{')
first := true
_ = first
{
const prefix string = ",\"series\":"
// prefix[1:] drops the leading comma because this is the first field.
out.RawString(prefix[1:])
if in.Series == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 {
out.RawString("null")
} else {
out.RawByte('[')
for v2, v3 := range in.Series {
if v2 > 0 {
out.RawByte(',')
}
if v3 == nil {
out.RawString("null")
} else {
(*v3).MarshalEasyJSON(out)
}
}
out.RawByte(']')
}
}
out.RawByte('}')
}
// MarshalJSON supports json.Marshaler interface.
// It encodes v with the generated encoder into a fresh writer and returns the
// built bytes together with the writer's error.
func (v TimeSeries) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON supports easyjson.Marshaler interface.
// It writes v to the caller-supplied writer w using the generated encoder.
func (v TimeSeries) MarshalEasyJSON(w *jwriter.Writer) {
easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter(w, v)
}
// UnmarshalJSON supports json.Unmarshaler interface.
// It decodes data into v with the generated decoder and returns the lexer's
// error.
func (v *TimeSeries) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter(&r, v)
return r.Error()
}
// UnmarshalEasyJSON supports easyjson.Unmarshaler interface.
// It decodes into v from the caller-supplied lexer l; errors stay on l.
func (v *TimeSeries) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter(l, v)
}
// easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter1 is the
// generated decoder for DatadogPoint. It reads a JSON array and stores the
// first two elements as float64 values in out; further elements are skipped.
// A null value leaves out unchanged.
func easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter1(in *jlexer.Lexer, out *DatadogPoint) {
isTopLevel := in.IsStart()
if in.IsNull() {
in.Skip()
} else {
in.Delim('[')
v4 := 0
for !in.IsDelim(']') {
if v4 < 2 {
(*out)[v4] = float64(in.Float64())
v4++
} else {
in.SkipRecursive()
}
in.WantComma()
}
in.Delim(']')
}
if isTopLevel {
in.Consumed()
}
}
// easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter1 is the
// generated encoder for DatadogPoint. It writes both elements as a
// comma-separated JSON array of numbers.
func easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter1(out *jwriter.Writer, in DatadogPoint) {
out.RawByte('[')
for v5 := range in {
if v5 > 0 {
out.RawByte(',')
}
out.Float64(float64((in)[v5]))
}
out.RawByte(']')
}
// MarshalJSON supports json.Marshaler interface.
// It encodes v with the generated encoder into a fresh writer and returns the
// built bytes together with the writer's error.
func (v DatadogPoint) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter1(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON supports easyjson.Marshaler interface.
// It writes v to the caller-supplied writer w using the generated encoder.
func (v DatadogPoint) MarshalEasyJSON(w *jwriter.Writer) {
easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter1(w, v)
}
// UnmarshalJSON supports json.Unmarshaler interface.
// It decodes data into v with the generated decoder and returns the lexer's
// error.
func (v *DatadogPoint) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter1(&r, v)
return r.Error()
}
// UnmarshalEasyJSON supports easyjson.Unmarshaler interface.
// It decodes into v from the caller-supplied lexer l; errors stay on l.
func (v *DatadogPoint) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter1(l, v)
}
// easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter2 is the
// generated decoder for DatadogMetric. It reads a JSON object and fills the
// fields for the keys "metric", "points", "host" and "tags"; other keys are
// skipped, and a key whose value is null is skipped without touching the
// field. Existing Points and Tags slices are truncated and reused.
func easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter2(in *jlexer.Lexer, out *DatadogMetric) {
isTopLevel := in.IsStart()
if in.IsNull() {
if isTopLevel {
in.Consumed()
}
in.Skip()
return
}
in.Delim('{')
for !in.IsDelim('}') {
key := in.UnsafeFieldName(false)
in.WantColon()
if in.IsNull() {
in.Skip()
in.WantComma()
continue
}
switch key {
case "metric":
out.Metric = string(in.String())
case "points":
if in.IsNull() {
in.Skip()
out.Points = nil
} else {
in.Delim('[')
if out.Points == nil {
if !in.IsDelim(']') {
out.Points = make([]DatadogPoint, 0, 4)
} else {
out.Points = []DatadogPoint{}
}
} else {
out.Points = (out.Points)[:0]
}
for !in.IsDelim(']') {
var v6 DatadogPoint
(v6).UnmarshalEasyJSON(in)
out.Points = append(out.Points, v6)
in.WantComma()
}
in.Delim(']')
}
case "host":
out.Host = string(in.String())
case "tags":
if in.IsNull() {
in.Skip()
out.Tags = nil
} else {
in.Delim('[')
if out.Tags == nil {
if !in.IsDelim(']') {
out.Tags = make([]string, 0, 4)
} else {
out.Tags = []string{}
}
} else {
out.Tags = (out.Tags)[:0]
}
for !in.IsDelim(']') {
var v7 string
v7 = string(in.String())
out.Tags = append(out.Tags, v7)
in.WantComma()
}
in.Delim(']')
}
default:
in.SkipRecursive()
}
in.WantComma()
}
in.Delim('}')
if isTopLevel {
in.Consumed()
}
}
// easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter2 is the
// generated encoder for DatadogMetric. It always writes "metric", "points"
// and "host", and writes "tags" only when the slice is non-empty. A nil Points
// slice is written as null unless the writer has the NilSliceAsEmpty flag set.
func easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter2(out *jwriter.Writer, in DatadogMetric) {
out.RawByte('{')
first := true
_ = first
{
const prefix string = ",\"metric\":"
// prefix[1:] drops the leading comma because this is the first field.
out.RawString(prefix[1:])
out.String(string(in.Metric))
}
{
const prefix string = ",\"points\":"
out.RawString(prefix)
if in.Points == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 {
out.RawString("null")
} else {
out.RawByte('[')
for v8, v9 := range in.Points {
if v8 > 0 {
out.RawByte(',')
}
(v9).MarshalEasyJSON(out)
}
out.RawByte(']')
}
}
{
const prefix string = ",\"host\":"
out.RawString(prefix)
out.String(string(in.Host))
}
if len(in.Tags) != 0 {
const prefix string = ",\"tags\":"
out.RawString(prefix)
{
out.RawByte('[')
for v10, v11 := range in.Tags {
if v10 > 0 {
out.RawByte(',')
}
out.String(string(v11))
}
out.RawByte(']')
}
}
out.RawByte('}')
}
// MarshalJSON supports json.Marshaler interface.
// It encodes v with the generated encoder into a fresh writer and returns the
// built bytes together with the writer's error.
func (v DatadogMetric) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter2(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON supports easyjson.Marshaler interface.
// It writes v to the caller-supplied writer w using the generated encoder.
func (v DatadogMetric) MarshalEasyJSON(w *jwriter.Writer) {
easyjsonF301f710EncodeGithubComDidiNightingaleV5SrcServerRouter2(w, v)
}
// UnmarshalJSON supports json.Unmarshaler interface.
// It decodes data into v with the generated decoder and returns the lexer's
// error.
func (v *DatadogMetric) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter2(&r, v)
return r.Error()
}
// UnmarshalEasyJSON supports easyjson.Unmarshaler interface.
// It decodes into v from the caller-supplied lexer l; errors stay on l.
func (v *DatadogMetric) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjsonF301f710DecodeGithubComDidiNightingaleV5SrcServerRouter2(l, v)
}
================================================
FILE: pushgw/router/router_heartbeat.go
================================================
package router
import (
"compress/gzip"
"encoding/json"
"fmt"
"io"
"time"
"github.com/ccfos/nightingale/v6/center/metas"
"github.com/ccfos/nightingale/v6/models"
"github.com/ccfos/nightingale/v6/pkg/poster"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// heartbeat processes an agent heartbeat locally and then forwards it to the
// center, relaying the center's response (or the forwarding error) to the
// caller. The center path defaults to /v1/n9e/center/heartbeat and can be
// overridden through HeartbeatApi.
func (rt *Router) heartbeat(c *gin.Context) {
	gid := ginx.QueryStr(c, "gid", "")
	overwriteGids := ginx.QueryBool(c, "overwrite_gids", false)

	req, err := HandleHeartbeat(c, rt.Aconf.Heartbeat.EngineName, rt.MetaSet)
	if err != nil {
		logger.Warningf("req:%v heartbeat failed to handle heartbeat err:%v", req, err)
		ginx.Dangerous(err)
	}

	api := rt.HeartbeatApi
	if api == "" {
		api = "/v1/n9e/center/heartbeat"
	}
	endpoint := fmt.Sprintf("%s?gid=%s&overwrite_gids=%t", api, gid, overwriteGids)

	ret, err := poster.PostByUrlsWithResp[map[string]interface{}](rt.Ctx, endpoint, req)
	if err != nil {
		logger.Warningf("req:%v heartbeat failed to post to center, centerApi:%v err:%v", req, rt.Ctx.CenterApi, err)
	}

	ginx.NewRender(c).Data(ret, err)
}
// HandleHeartbeat decodes the heartbeat carried by the request body
// (gzip-compressed when Content-Encoding is "gzip"), stamps it with the clock
// offset relative to this server, the client IP and engineName, and stores it
// in metaSet under the reported hostname.
//
// Read and JSON errors are returned. A missing hostname is reported through
// ginx.Dangerous with status 400.
func HandleHeartbeat(c *gin.Context, engineName string, metaSet *metas.Set) (models.HostMeta, error) {
	var req models.HostMeta

	// Pick the reader for the payload, decompressing when required.
	var body io.Reader
	if c.GetHeader("Content-Encoding") == "gzip" {
		zr, err := gzip.NewReader(c.Request.Body)
		if err != nil {
			return req, err
		}
		defer zr.Close()
		body = zr
	} else {
		defer c.Request.Body.Close()
		body = c.Request.Body
	}

	payload, err := io.ReadAll(body)
	if err != nil {
		return req, err
	}

	if err = json.Unmarshal(payload, &req); err != nil {
		return req, err
	}

	if req.Hostname == "" {
		ginx.Dangerous("hostname is required", 400)
	}

	req.Offset = (time.Now().UnixMilli() - req.UnixTime)
	req.RemoteAddr = c.ClientIP()
	req.EngineName = engineName

	metaSet.Set(req.Hostname, req)
	return req, nil
}
================================================
FILE: pushgw/router/router_openfalcon.go
================================================
package router
import (
"compress/gzip"
"fmt"
"io/ioutil"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/gin-gonic/gin"
"github.com/mailru/easyjson"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// FalconMetric is one data point as accepted by the Open-Falcon style push
// endpoint (falconPush).
//
//easyjson:json
type FalconMetric struct {
// Metric is the metric name; Clean rejects a blank name.
Metric string `json:"metric"`
// Endpoint, when non-empty, is used by ToProm to derive the "ident"/"endpoint" labels.
Endpoint string `json:"endpoint"`
// Timestamp is the sample time; Clean divides values above 0xffffffff by 1000.
Timestamp int64 `json:"timestamp"`
// ValueUnTyped holds the decoded JSON "value" before Clean converts it.
ValueUnTyped interface{} `json:"value"`
// Value is the numeric value filled in by Clean; it is never serialized.
Value float64 `json:"-"`
// Tags is split by ToProm on "," into "key=value" pairs.
Tags string `json:"tags"`
}
// FalconMetricArr is a list of FalconMetric; falconPush decodes into it when
// the request body starts with '['.
//
//easyjson:json
type FalconMetricArr []FalconMetric
// Clean validates the metric and normalizes its value and timestamp in place.
//
// It returns an error when the metric name is blank or when ValueUnTyped is
// neither a number (float64, uint64, int64, int) nor a string parseable as a
// float. On success Value holds the numeric value. A timestamp larger than
// 0xffffffff is divided by 1000, and a timestamp more than 300 ahead of ts is
// replaced by ts.
func (m *FalconMetric) Clean(ts int64) error {
	if len(m.Metric) == 0 {
		return fmt.Errorf("metric is blank")
	}

	switch raw := m.ValueUnTyped.(type) {
	case float64:
		m.Value = raw
	case int:
		m.Value = float64(raw)
	case int64:
		m.Value = float64(raw)
	case uint64:
		m.Value = float64(raw)
	case string:
		parsed, err := strconv.ParseFloat(raw, 64)
		if err != nil {
			return fmt.Errorf("unparsable value %v", raw)
		}
		m.Value = parsed
	default:
		return fmt.Errorf("unparsable value %v", raw)
	}

	// A value that does not fit in 32 bits is most likely expressed in
	// milliseconds rather than seconds.
	const maxUint32 = 0xffffffff
	if m.Timestamp > maxUint32 {
		m.Timestamp /= 1000
	}

	// Timestamps too far in the future are replaced with the reference time.
	if m.Timestamp-ts > 300 {
		m.Timestamp = ts
	}

	return nil
}
// ToProm converts the metric into a Prometheus time series with one sample.
//
// Dots and dashes in the metric name and in tag keys are replaced by
// underscores (the metric name is rewritten on m itself). The second return
// value is the ident of the reporting host: empty when Endpoint is empty,
// otherwise the "ident" tag if present, otherwise Endpoint. When Endpoint is
// set, the series additionally carries either an "endpoint" label (ident tag
// present) or an "ident" label (ident tag absent) holding Endpoint.
//
// An error is returned when the normalized metric name or any normalized tag
// key is not a valid Prometheus identifier.
func (m *FalconMetric) ToProm() (*prompb.TimeSeries, string, error) {
	normalize := func(s string) string {
		return strings.ReplaceAll(strings.ReplaceAll(s, ".", "_"), "-", "_")
	}

	series := &prompb.TimeSeries{
		Samples: []prompb.Sample{{
			// Prometheus expects milliseconds.
			Timestamp: m.Timestamp * 1000,
			Value:     m.Value,
		}},
	}

	m.Metric = normalize(m.Metric)
	if !model.MetricNameRE.MatchString(m.Metric) {
		return nil, "", fmt.Errorf("invalid metric name: %s", m.Metric)
	}
	series.Labels = append(series.Labels, prompb.Label{
		Name:  model.MetricNameLabel,
		Value: m.Metric,
	})

	pairs := strings.Split(m.Tags, ",")
	tags := make(map[string]string, len(pairs)+1)
	for _, pair := range pairs {
		k, v, ok := strings.Cut(pair, "=")
		if !ok {
			// Not a key=value pair; ignore it.
			continue
		}
		tags[k] = v
	}

	var ident string
	if m.Endpoint != "" {
		if tagged, ok := tags["ident"]; ok {
			// The explicit ident tag wins; keep the endpoint as its own label.
			ident = tagged
			tags["endpoint"] = m.Endpoint
		} else {
			// No ident tag: the endpoint doubles as the ident.
			ident = m.Endpoint
			tags["ident"] = m.Endpoint
		}
	}

	for name, value := range tags {
		name = normalize(name)
		if !model.LabelNameRE.MatchString(name) {
			return nil, "", fmt.Errorf("invalid tag name: %s", name)
		}
		series.Labels = append(series.Labels, prompb.Label{
			Name:  name,
			Value: value,
		})
	}

	return series, ident, nil
}
// falconPush handles Open-Falcon style metric pushes.
//
// The body (optionally gzip-compressed, signalled by "Content-Encoding: gzip")
// is either a JSON array of metrics or a single metric object. Each metric is
// cleaned, converted to a Prometheus series, enriched with target labels when
// its ident is known to the target cache, and forwarded to a write queue.
// Metrics that fail cleaning or conversion are counted as failures and skipped.
//
// Responses: 400 for an unreadable, empty or malformed body; the configured
// over-limit status code when forwarding to the queue fails; otherwise 200 with
// the succ/fail counters.
func (rt *Router) falconPush(c *gin.Context) {
	var bs []byte
	var err error
	var r *gzip.Reader

	if c.GetHeader("Content-Encoding") == "gzip" {
		r, err = gzip.NewReader(c.Request.Body)
		if err != nil {
			c.String(400, err.Error())
			return
		}
		defer r.Close()
		bs, err = ioutil.ReadAll(r)
	} else {
		defer c.Request.Body.Close()
		bs, err = ioutil.ReadAll(c.Request.Body)
	}

	if err != nil {
		c.String(400, err.Error())
		return
	}

	// Guard the bs[0] peek below: an empty body would otherwise panic with an
	// index-out-of-range error instead of producing a client error.
	if len(bs) == 0 {
		c.String(400, "empty request body")
		return
	}

	var arr FalconMetricArr

	// The first byte tells a batch (JSON array) apart from a single object.
	if bs[0] == '[' {
		err = easyjson.Unmarshal(bs, &arr)
	} else {
		var one FalconMetric
		err = easyjson.Unmarshal(bs, &one)
		arr = []FalconMetric{one}
	}

	if err != nil {
		c.String(400, err.Error())
		return
	}

	// Requests are spread over the write queues via a shared counter.
	queueid := fmt.Sprint(atomic.AddUint64(&globalCounter, 1) % uint64(rt.Pushgw.WriterOpt.QueueNumber))

	var (
		succ int
		fail int
		msg  = "received"
		ts   = time.Now().Unix()
		ids  = make(map[string]struct{})
	)

	for i := 0; i < len(arr); i++ {
		if err := arr[i].Clean(ts); err != nil {
			fail++
			continue
		}

		pt, ident, err := arr[i].ToProm()
		if err != nil {
			fail++
			continue
		}

		if ident != "" {
			if rt.Pushgw.GetHeartbeatFromMetric {
				// register host
				ids[ident] = struct{}{}
			}

			// fill tags
			target, has := rt.TargetCache.Get(ident)
			if has {
				rt.AppendLabels(pt, target, rt.BusiGroupCache)
			}

			pstat.CounterSampleReceivedByIdent.WithLabelValues(ident).Inc()
		}

		err = rt.ForwardToQueue(c.ClientIP(), queueid, pt)
		if err != nil {
			c.String(rt.Pushgw.WriterOpt.OverLimitStatusCode, err.Error())
			return
		}

		succ++
	}

	if succ > 0 {
		pstat.CounterSampleTotal.WithLabelValues("openfalcon").Add(float64(succ))
		rt.IdentSet.MSet(ids)
	}

	c.JSON(200, gin.H{
		"succ": succ,
		"fail": fail,
		"msg":  msg,
	})
}
================================================
FILE: pushgw/router/router_openfalcon_easyjson.go
================================================
// Code generated by easyjson for marshaling/unmarshaling. DO NOT EDIT.
package router
import (
json "encoding/json"
easyjson "github.com/mailru/easyjson"
jlexer "github.com/mailru/easyjson/jlexer"
jwriter "github.com/mailru/easyjson/jwriter"
)
// Blank references to the imported packages so that none of the imports above
// is reported as unused, whatever subset the generated code actually needs.
var (
_ *json.RawMessage
_ *jlexer.Lexer
_ *jwriter.Writer
_ easyjson.Marshaler
)
// easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter decodes a
// JSON array of FalconMetric objects from in into out.
//
// A JSON null sets *out to nil. Otherwise a nil *out is allocated and a non-nil
// *out is truncated to length zero before elements are appended, so an existing
// backing array is reused. When the lexer was at the start of its input, the
// function also checks that the whole input has been consumed. Errors are
// recorded on the lexer rather than returned.
func easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter(in *jlexer.Lexer, out *FalconMetricArr) {
isTopLevel := in.IsStart()
if in.IsNull() {
in.Skip()
*out = nil
} else {
in.Delim('[')
if *out == nil {
if !in.IsDelim(']') {
*out = make(FalconMetricArr, 0, 0)
} else {
*out = FalconMetricArr{}
}
} else {
*out = (*out)[:0]
}
for !in.IsDelim(']') {
var v1 FalconMetric
(v1).UnmarshalEasyJSON(in)
*out = append(*out, v1)
in.WantComma()
}
in.Delim(']')
}
if isTopLevel {
in.Consumed()
}
}
// easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter writes in to
// out as a JSON array. A nil slice is written as null unless the writer has the
// NilSliceAsEmpty flag set, in which case it is written as [].
func easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter(out *jwriter.Writer, in FalconMetricArr) {
if in == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 {
out.RawString("null")
} else {
out.RawByte('[')
for v2, v3 := range in {
// Elements after the first are preceded by a comma.
if v2 > 0 {
out.RawByte(',')
}
(v3).MarshalEasyJSON(out)
}
out.RawByte(']')
}
}
// MarshalJSON implements the json.Marshaler interface using the generated
// encoder; it returns the encoded bytes and any error recorded by the writer.
func (v FalconMetricArr) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON implements the easyjson.Marshaler interface by writing the
// JSON form of v into the caller-supplied writer w.
func (v FalconMetricArr) MarshalEasyJSON(w *jwriter.Writer) {
easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter(w, v)
}
// UnmarshalJSON implements the json.Unmarshaler interface. It runs the
// generated decoder over data and returns the error recorded by the lexer.
func (v *FalconMetricArr) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter(&r, v)
return r.Error()
}
// UnmarshalEasyJSON implements the easyjson.Unmarshaler interface by decoding
// from the caller-supplied lexer l into v. Errors stay recorded on l.
func (v *FalconMetricArr) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter(l, v)
}
// easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter1 decodes one
// JSON object from in into out.
//
// A JSON null leaves out untouched. Fields whose value is null are skipped, and
// unknown keys are skipped recursively. The "value" field is stored as the
// generic result of in.Interface() unless out.ValueUnTyped already holds
// something implementing easyjson.Unmarshaler or json.Unmarshaler, in which
// case decoding is delegated to it. Errors are recorded on the lexer.
func easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter1(in *jlexer.Lexer, out *FalconMetric) {
isTopLevel := in.IsStart()
if in.IsNull() {
if isTopLevel {
in.Consumed()
}
in.Skip()
return
}
in.Delim('{')
for !in.IsDelim('}') {
key := in.UnsafeFieldName(false)
in.WantColon()
// A null value leaves the corresponding field at its current value.
if in.IsNull() {
in.Skip()
in.WantComma()
continue
}
switch key {
case "metric":
out.Metric = string(in.String())
case "endpoint":
out.Endpoint = string(in.String())
case "timestamp":
out.Timestamp = int64(in.Int64())
case "value":
if m, ok := out.ValueUnTyped.(easyjson.Unmarshaler); ok {
m.UnmarshalEasyJSON(in)
} else if m, ok := out.ValueUnTyped.(json.Unmarshaler); ok {
_ = m.UnmarshalJSON(in.Raw())
} else {
out.ValueUnTyped = in.Interface()
}
case "tags":
out.Tags = string(in.String())
default:
in.SkipRecursive()
}
in.WantComma()
}
in.Delim('}')
if isTopLevel {
in.Consumed()
}
}
// easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter1 writes in to
// out as a JSON object with the fields metric, endpoint, timestamp, value and
// tags, in that order. The Value field is not written. The "value" field is
// encoded through the easyjson.Marshaler or json.Marshaler implementation of
// ValueUnTyped when it has one, and through json.Marshal otherwise.
func easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter1(out *jwriter.Writer, in FalconMetric) {
out.RawByte('{')
first := true
_ = first
{
const prefix string = ",\"metric\":"
// First field: drop the leading comma of the prefix.
out.RawString(prefix[1:])
out.String(string(in.Metric))
}
{
const prefix string = ",\"endpoint\":"
out.RawString(prefix)
out.String(string(in.Endpoint))
}
{
const prefix string = ",\"timestamp\":"
out.RawString(prefix)
out.Int64(int64(in.Timestamp))
}
{
const prefix string = ",\"value\":"
out.RawString(prefix)
if m, ok := in.ValueUnTyped.(easyjson.Marshaler); ok {
m.MarshalEasyJSON(out)
} else if m, ok := in.ValueUnTyped.(json.Marshaler); ok {
out.Raw(m.MarshalJSON())
} else {
out.Raw(json.Marshal(in.ValueUnTyped))
}
}
{
const prefix string = ",\"tags\":"
out.RawString(prefix)
out.String(string(in.Tags))
}
out.RawByte('}')
}
// MarshalJSON implements the json.Marshaler interface using the generated
// encoder; it returns the encoded bytes and any error recorded by the writer.
func (v FalconMetric) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter1(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON implements the easyjson.Marshaler interface by writing the
// JSON form of v into the caller-supplied writer w.
func (v FalconMetric) MarshalEasyJSON(w *jwriter.Writer) {
easyjson61ba9b47EncodeGithubComDidiNightingaleV5SrcServerRouter1(w, v)
}
// UnmarshalJSON implements the json.Unmarshaler interface. It runs the
// generated decoder over data and returns the error recorded by the lexer.
func (v *FalconMetric) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter1(&r, v)
return r.Error()
}
// UnmarshalEasyJSON implements the easyjson.Unmarshaler interface by decoding
// from the caller-supplied lexer l into v. Errors stay recorded on l.
func (v *FalconMetric) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjson61ba9b47DecodeGithubComDidiNightingaleV5SrcServerRouter1(l, v)
}
================================================
FILE: pushgw/router/router_opentsdb.go
================================================
package router
import (
"compress/gzip"
"fmt"
"io/ioutil"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/gin-gonic/gin"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
"github.com/mailru/easyjson"
_ "github.com/mailru/easyjson/gen"
)
// HTTPMetric is one data point as accepted by the OpenTSDB style put endpoint
// (openTSDBPut).
//
// easyjson:json
type HTTPMetric struct {
// Metric is the metric name; Clean rejects a blank name.
Metric string `json:"metric"`
// Timestamp is the sample time; Clean divides values above 0xffffffff by 1000.
Timestamp int64 `json:"timestamp"`
// ValueUnTyped holds the decoded JSON "value" before Clean converts it.
ValueUnTyped interface{} `json:"value"`
// Value is the numeric value filled in by Clean; it is never serialized.
Value float64 `json:"-"`
// Tags become the labels of the series; ToProm renames "host" to "ident"
// when no "ident" tag is present.
Tags map[string]string `json:"tags"`
}
// HTTPMetricArr is a list of HTTPMetric; openTSDBPut decodes into it when the
// request body starts with '['.
//
//easyjson:json
type HTTPMetricArr []HTTPMetric
// Clean validates the metric and normalizes its value and timestamp in place.
//
// It returns an error when the metric name is blank or when ValueUnTyped is
// neither a number (float64, uint64, int64, int) nor a string parseable as a
// float. On success Value holds the numeric value. A timestamp larger than
// 0xffffffff is divided by 1000, and a timestamp more than 300 ahead of ts is
// replaced by ts.
func (m *HTTPMetric) Clean(ts int64) error {
	if len(m.Metric) == 0 {
		return fmt.Errorf("metric is blank")
	}

	switch raw := m.ValueUnTyped.(type) {
	case float64:
		m.Value = raw
	case int:
		m.Value = float64(raw)
	case int64:
		m.Value = float64(raw)
	case uint64:
		m.Value = float64(raw)
	case string:
		parsed, err := strconv.ParseFloat(raw, 64)
		if err != nil {
			return fmt.Errorf("unparsable value %v", raw)
		}
		m.Value = parsed
	default:
		return fmt.Errorf("unparsable value %v", raw)
	}

	// A value that does not fit in 32 bits is most likely expressed in
	// milliseconds rather than seconds.
	const maxUint32 = 0xffffffff
	if m.Timestamp > maxUint32 {
		m.Timestamp /= 1000
	}

	// Timestamps too far in the future are replaced with the reference time.
	if m.Timestamp-ts > 300 {
		m.Timestamp = ts
	}

	return nil
}
// ToProm converts the metric into a Prometheus time series with one sample.
//
// Dots and dashes in the metric name and in tag keys are replaced by
// underscores (the metric name is rewritten on m itself). When the tags carry
// no "ident" key but do carry "host", the "host" tag is renamed to "ident" in
// m.Tags. An error is returned when the normalized metric name or any
// normalized tag key is not a valid Prometheus identifier.
func (m *HTTPMetric) ToProm() (*prompb.TimeSeries, error) {
	normalize := func(s string) string {
		return strings.ReplaceAll(strings.ReplaceAll(s, ".", "_"), "-", "_")
	}

	series := &prompb.TimeSeries{
		Samples: []prompb.Sample{{
			// Prometheus expects milliseconds.
			Timestamp: m.Timestamp * 1000,
			Value:     m.Value,
		}},
	}

	m.Metric = normalize(m.Metric)
	if !model.MetricNameRE.MatchString(m.Metric) {
		return nil, fmt.Errorf("invalid metric name: %s", m.Metric)
	}
	series.Labels = append(series.Labels, prompb.Label{
		Name:  model.MetricNameLabel,
		Value: m.Metric,
	})

	// Fall back to the "host" tag as ident when no explicit ident is given.
	if _, hasIdent := m.Tags["ident"]; !hasIdent {
		if host, hasHost := m.Tags["host"]; hasHost {
			delete(m.Tags, "host")
			m.Tags["ident"] = host
		}
	}

	for name, value := range m.Tags {
		name = normalize(name)
		if !model.LabelNameRE.MatchString(name) {
			return nil, fmt.Errorf("invalid tag name: %s", name)
		}
		series.Labels = append(series.Labels, prompb.Label{
			Name:  name,
			Value: value,
		})
	}

	return series, nil
}
// openTSDBPut handles OpenTSDB style metric pushes.
//
// The body (optionally gzip-compressed, signalled by "Content-Encoding: gzip")
// is either a JSON array of metrics or a single metric object. Each metric is
// cleaned, converted to a Prometheus series, enriched with target labels when
// its "ident" tag is known to the target cache, and forwarded to a write queue.
// Metrics that fail cleaning or conversion are counted as failures and skipped;
// the first such error is appended to the response message.
//
// Responses: 400 for an unreadable, empty or malformed body; the configured
// over-limit status code when forwarding to the queue fails; otherwise 200 with
// the succ/fail counters and the message.
func (rt *Router) openTSDBPut(c *gin.Context) {
	var bs []byte
	var err error
	var r *gzip.Reader

	if c.GetHeader("Content-Encoding") == "gzip" {
		r, err = gzip.NewReader(c.Request.Body)
		if err != nil {
			c.String(400, err.Error())
			return
		}
		defer r.Close()
		bs, err = ioutil.ReadAll(r)
	} else {
		defer c.Request.Body.Close()
		bs, err = ioutil.ReadAll(c.Request.Body)
	}

	if err != nil {
		c.String(400, err.Error())
		return
	}

	// Guard the bs[0] peek below: an empty body would otherwise panic with an
	// index-out-of-range error instead of producing a client error.
	if len(bs) == 0 {
		c.String(400, "empty request body")
		return
	}

	var arr HTTPMetricArr

	// The first byte tells a batch (JSON array) apart from a single object.
	if bs[0] == '[' {
		err = easyjson.Unmarshal(bs, &arr)
	} else {
		var one HTTPMetric
		err = easyjson.Unmarshal(bs, &one)
		arr = []HTTPMetric{one}
	}

	if err != nil {
		logger.Debugf("opentsdb msg format error: %s", err.Error())
		c.String(400, err.Error())
		return
	}

	// Requests are spread over the write queues via a shared counter.
	queueid := fmt.Sprint(atomic.AddUint64(&globalCounter, 1) % uint64(rt.Pushgw.WriterOpt.QueueNumber))

	var (
		succ int
		fail int
		msg  = "received"
		ts   = time.Now().Unix()
		ids  = make(map[string]struct{})
	)

	for i := 0; i < len(arr); i++ {
		if err := arr[i].Clean(ts); err != nil {
			logger.Debugf("opentsdb msg clean error: %s", err.Error())
			// Only the first failure is reported back to the client.
			if fail == 0 {
				msg = fmt.Sprintf("%s , Error clean: %s", msg, err.Error())
			}
			fail++
			continue
		}

		pt, err := arr[i].ToProm()
		if err != nil {
			logger.Debugf("opentsdb msg to tsdb error: %s", err.Error())
			if fail == 0 {
				msg = fmt.Sprintf("%s , Error toprom: %s", msg, err.Error())
			}
			fail++
			continue
		}

		host, has := arr[i].Tags["ident"]
		if has {
			if rt.Pushgw.GetHeartbeatFromMetric {
				// register host
				ids[host] = struct{}{}
			}

			// fill tags
			target, has := rt.TargetCache.Get(host)
			if has {
				rt.AppendLabels(pt, target, rt.BusiGroupCache)
			}

			pstat.CounterSampleReceivedByIdent.WithLabelValues(host).Inc()
		}

		err = rt.ForwardToQueue(c.ClientIP(), queueid, pt)
		if err != nil {
			c.String(rt.Pushgw.WriterOpt.OverLimitStatusCode, err.Error())
			return
		}

		succ++
	}

	if succ > 0 {
		pstat.CounterSampleTotal.WithLabelValues("opentsdb").Add(float64(succ))
		rt.IdentSet.MSet(ids)
	}

	c.JSON(200, gin.H{
		"succ": succ,
		"fail": fail,
		"msg":  msg,
	})
}
================================================
FILE: pushgw/router/router_opentsdb_easyjson.go
================================================
// Code generated by easyjson for marshaling/unmarshaling. DO NOT EDIT.
package router
import (
json "encoding/json"
easyjson "github.com/mailru/easyjson"
jlexer "github.com/mailru/easyjson/jlexer"
jwriter "github.com/mailru/easyjson/jwriter"
)
// Blank references to the imported packages so that none of the imports above
// is reported as unused, whatever subset the generated code actually needs.
var (
_ *json.RawMessage
_ *jlexer.Lexer
_ *jwriter.Writer
_ easyjson.Marshaler
)
// easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter decodes a
// JSON array of HTTPMetric objects from in into out.
//
// A JSON null sets *out to nil. Otherwise a nil *out is allocated and a non-nil
// *out is truncated to length zero before elements are appended, so an existing
// backing array is reused. When the lexer was at the start of its input, the
// function also checks that the whole input has been consumed. Errors are
// recorded on the lexer rather than returned.
func easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter(in *jlexer.Lexer, out *HTTPMetricArr) {
isTopLevel := in.IsStart()
if in.IsNull() {
in.Skip()
*out = nil
} else {
in.Delim('[')
if *out == nil {
if !in.IsDelim(']') {
*out = make(HTTPMetricArr, 0, 1)
} else {
*out = HTTPMetricArr{}
}
} else {
*out = (*out)[:0]
}
for !in.IsDelim(']') {
var v1 HTTPMetric
(v1).UnmarshalEasyJSON(in)
*out = append(*out, v1)
in.WantComma()
}
in.Delim(']')
}
if isTopLevel {
in.Consumed()
}
}
// easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter writes in to
// out as a JSON array. A nil slice is written as null unless the writer has the
// NilSliceAsEmpty flag set, in which case it is written as [].
func easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter(out *jwriter.Writer, in HTTPMetricArr) {
if in == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 {
out.RawString("null")
} else {
out.RawByte('[')
for v2, v3 := range in {
// Elements after the first are preceded by a comma.
if v2 > 0 {
out.RawByte(',')
}
(v3).MarshalEasyJSON(out)
}
out.RawByte(']')
}
}
// MarshalJSON implements the json.Marshaler interface using the generated
// encoder; it returns the encoded bytes and any error recorded by the writer.
func (v HTTPMetricArr) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON implements the easyjson.Marshaler interface by writing the
// JSON form of v into the caller-supplied writer w.
func (v HTTPMetricArr) MarshalEasyJSON(w *jwriter.Writer) {
easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter(w, v)
}
// UnmarshalJSON implements the json.Unmarshaler interface. It runs the
// generated decoder over data and returns the error recorded by the lexer.
func (v *HTTPMetricArr) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter(&r, v)
return r.Error()
}
// UnmarshalEasyJSON implements the easyjson.Unmarshaler interface by decoding
// from the caller-supplied lexer l into v. Errors stay recorded on l.
func (v *HTTPMetricArr) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter(l, v)
}
// easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter1 decodes one
// JSON object from in into out.
//
// A JSON null leaves out untouched. Fields whose value is null are skipped, and
// unknown keys are skipped recursively. The "value" field is stored as the
// generic result of in.Interface() unless out.ValueUnTyped already holds
// something implementing easyjson.Unmarshaler or json.Unmarshaler, in which
// case decoding is delegated to it. A "tags" object always replaces out.Tags
// with a freshly allocated map. Errors are recorded on the lexer.
func easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter1(in *jlexer.Lexer, out *HTTPMetric) {
isTopLevel := in.IsStart()
if in.IsNull() {
if isTopLevel {
in.Consumed()
}
in.Skip()
return
}
in.Delim('{')
for !in.IsDelim('}') {
key := in.UnsafeFieldName(false)
in.WantColon()
// A null value leaves the corresponding field at its current value.
if in.IsNull() {
in.Skip()
in.WantComma()
continue
}
switch key {
case "metric":
out.Metric = string(in.String())
case "timestamp":
out.Timestamp = int64(in.Int64())
case "value":
if m, ok := out.ValueUnTyped.(easyjson.Unmarshaler); ok {
m.UnmarshalEasyJSON(in)
} else if m, ok := out.ValueUnTyped.(json.Unmarshaler); ok {
_ = m.UnmarshalJSON(in.Raw())
} else {
out.ValueUnTyped = in.Interface()
}
case "tags":
if in.IsNull() {
in.Skip()
} else {
in.Delim('{')
out.Tags = make(map[string]string)
for !in.IsDelim('}') {
key := string(in.String())
in.WantColon()
var v4 string
v4 = string(in.String())
(out.Tags)[key] = v4
in.WantComma()
}
in.Delim('}')
}
default:
in.SkipRecursive()
}
in.WantComma()
}
in.Delim('}')
if isTopLevel {
in.Consumed()
}
}
// easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter1 writes in to
// out as a JSON object with the fields metric, timestamp, value and tags, in
// that order. The Value field is not written. The "value" field is encoded
// through the easyjson.Marshaler or json.Marshaler implementation of
// ValueUnTyped when it has one, and through json.Marshal otherwise. A nil Tags
// map is written as null unless the writer has the NilMapAsEmpty flag set.
func easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter1(out *jwriter.Writer, in HTTPMetric) {
out.RawByte('{')
first := true
_ = first
{
const prefix string = ",\"metric\":"
// First field: drop the leading comma of the prefix.
out.RawString(prefix[1:])
out.String(string(in.Metric))
}
{
const prefix string = ",\"timestamp\":"
out.RawString(prefix)
out.Int64(int64(in.Timestamp))
}
{
const prefix string = ",\"value\":"
out.RawString(prefix)
if m, ok := in.ValueUnTyped.(easyjson.Marshaler); ok {
m.MarshalEasyJSON(out)
} else if m, ok := in.ValueUnTyped.(json.Marshaler); ok {
out.Raw(m.MarshalJSON())
} else {
out.Raw(json.Marshal(in.ValueUnTyped))
}
}
{
const prefix string = ",\"tags\":"
out.RawString(prefix)
if in.Tags == nil && (out.Flags&jwriter.NilMapAsEmpty) == 0 {
out.RawString(`null`)
} else {
out.RawByte('{')
// v5First tracks whether a separating comma is needed before the next entry.
v5First := true
for v5Name, v5Value := range in.Tags {
if v5First {
v5First = false
} else {
out.RawByte(',')
}
out.String(string(v5Name))
out.RawByte(':')
out.String(string(v5Value))
}
out.RawByte('}')
}
}
out.RawByte('}')
}
// MarshalJSON implements the json.Marshaler interface using the generated
// encoder; it returns the encoded bytes and any error recorded by the writer.
func (v HTTPMetric) MarshalJSON() ([]byte, error) {
w := jwriter.Writer{}
easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter1(&w, v)
return w.Buffer.BuildBytes(), w.Error
}
// MarshalEasyJSON implements the easyjson.Marshaler interface by writing the
// JSON form of v into the caller-supplied writer w.
func (v HTTPMetric) MarshalEasyJSON(w *jwriter.Writer) {
easyjson30864de9EncodeGithubComDidiNightingaleV5SrcServerRouter1(w, v)
}
// UnmarshalJSON implements the json.Unmarshaler interface. It runs the
// generated decoder over data and returns the error recorded by the lexer.
func (v *HTTPMetric) UnmarshalJSON(data []byte) error {
r := jlexer.Lexer{Data: data}
easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter1(&r, v)
return r.Error()
}
// UnmarshalEasyJSON implements the easyjson.Unmarshaler interface by decoding
// from the caller-supplied lexer l into v. Errors stay recorded on l.
func (v *HTTPMetric) UnmarshalEasyJSON(l *jlexer.Lexer) {
easyjson30864de9DecodeGithubComDidiNightingaleV5SrcServerRouter1(l, v)
}
================================================
FILE: pushgw/router/router_proxy_remotewrite.go
================================================
package router
import (
"bytes"
"io"
"net/http"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
)
// proxyRemoteWrite relays a remote-write request to every configured writer
// without decoding it.
//
// The client pushes data to pushgw and pushgw passes it on to Prometheus. The
// request body is not parsed; the raw bytes are forwarded as-is to each writer
// listed in the configuration. Compared with /prometheus/v1/write this needs no
// in-memory queues and therefore performs better.
//
// NOTE: in hindsight this endpoint is not a great fit either and is not
// recommended to users; the queueing logic of /prometheus/v1/write should be
// optimized instead.
//
// A body that cannot be read yields 400 and a request that cannot be built
// yields 500. Transport errors and writer responses with status >= 400 are
// only logged, and delivery continues with the remaining writers.
func (rt *Router) proxyRemoteWrite(c *gin.Context) {
	// Read the request body once; it is replayed for every writer.
	bs, err := c.GetRawData()
	if err != nil {
		c.JSON(400, gin.H{"error": err.Error()})
		return
	}

	// Walk over all configured writers.
	for index := range rt.Pushgw.Writers {
		writer := rt.Pushgw.Writers[index]

		targetUrl := writer.Url
		if c.Request.URL.RawQuery != "" {
			// Carry the incoming query string over to the writer URL.
			if strings.Contains(writer.Url, "?") {
				targetUrl += "&" + c.Request.URL.RawQuery
			} else {
				targetUrl += "?" + c.Request.URL.RawQuery
			}
		}

		// Wrap the raw bytes in a new request sent through the writer's transport.
		req, err := http.NewRequest("POST", targetUrl, bytes.NewReader(bs))
		if err != nil {
			c.JSON(500, gin.H{"error": err.Error()})
			return
		}

		// Forward the relevant headers, falling back to remote-write defaults.
		contentType := c.GetHeader("Content-Type")
		if contentType == "" {
			contentType = "application/x-protobuf"
		}
		req.Header.Set("Content-Type", contentType)

		contentEncoding := c.GetHeader("Content-Encoding")
		if contentEncoding == "" {
			contentEncoding = "snappy"
		}
		req.Header.Set("Content-Encoding", contentEncoding)

		userAgent := c.GetHeader("User-Agent")
		if userAgent == "" {
			userAgent = "n9e"
		} else {
			userAgent += "-n9e"
		}
		req.Header.Set("User-Agent", userAgent)

		rwVersion := c.GetHeader("X-Prometheus-Remote-Write-Version")
		if rwVersion == "" {
			rwVersion = "0.1.0"
		}
		req.Header.Set("X-Prometheus-Remote-Write-Version", rwVersion)

		if writer.BasicAuthUser != "" {
			req.SetBasicAuth(writer.BasicAuthUser, writer.BasicAuthPass)
		}

		// Extra headers are configured as a flat key/value list; an odd-length
		// list is ignored.
		headerCount := len(writer.Headers)
		if headerCount > 0 && headerCount%2 == 0 {
			for i := 0; i < len(writer.Headers); i += 2 {
				req.Header.Add(writer.Headers[i], writer.Headers[i+1])
				if writer.Headers[i] == "Host" {
					req.Host = writer.Headers[i+1]
				}
			}
		}

		client := http.Client{
			Timeout:   time.Duration(writer.Timeout) * time.Millisecond,
			Transport: writer.HTTPTransport,
		}

		res, err := client.Do(req)
		if err != nil {
			logger.Warningf("[forward-timeseries] failed to do request. url=%s error=%v", targetUrl, err)
			continue
		}

		// The body is closed explicitly in each iteration: a defer here would
		// keep every response open until the handler returns.
		if res.StatusCode >= 400 {
			body, readErr := io.ReadAll(res.Body)
			res.Body.Close()
			if readErr != nil {
				logger.Warningf("[forward-timeseries] failed to read response body. url=%s error=%v", targetUrl, readErr)
				continue
			}

			logger.Warningf("[forward-timeseries] response status code ge 400. url=%s status_code=%d response=%s", targetUrl, res.StatusCode, string(body))
			continue
		}

		// Drain what is left so the transport can reuse the connection; a
		// failure while discarding is irrelevant to the outcome.
		_, _ = io.Copy(io.Discard, res.Body)
		res.Body.Close()
	}
}
================================================
FILE: pushgw/router/router_remotewrite.go
================================================
package router
import (
"fmt"
"io"
"io/ioutil"
"net/http"
"sync/atomic"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/gin-gonic/gin"
"github.com/gogo/protobuf/proto"
"github.com/golang/snappy"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
)
// extractMetricFromTimeSeries returns the value of the first "__name__" label
// of s, or an empty string when the series has no such label.
func extractMetricFromTimeSeries(s *prompb.TimeSeries) string {
	for _, lbl := range s.Labels {
		if lbl.Name == "__name__" {
			return lbl.Value
		}
	}

	return ""
}
// extractIdentFromTimeSeries determines the ident of the host that reported s.
//
// The ident is taken from the "ident" label; failing that from
// "agent_hostname"; failing that, and unless ignoreHost is set, from "host".
// A fallback label that is used gets renamed to "ident" inside s.
//
// The second return value tells whether the ident should be written to the
// target table. It is false when no ident was found, when identMetrics is
// non-empty and the series' metric name is not one of them, or when
// ignoreIdent is set.
func extractIdentFromTimeSeries(s *prompb.TimeSeries, ignoreIdent, ignoreHost bool, identMetrics []string) (string, bool) {
	if s == nil {
		return "", false
	}

	// Index of each label by name (the last occurrence wins).
	labelMap := make(map[string]int)
	for i := range s.Labels {
		labelMap[s.Labels[i].Name] = i
	}

	var ident string

	// An explicit ident label is used directly.
	if idx, ok := labelMap["ident"]; ok {
		ident = s.Labels[idx].Value
	}

	// adopt promotes the named label to "ident" when the series carries it.
	adopt := func(name string) {
		if idx, ok := labelMap[name]; ok {
			s.Labels[idx].Name = "ident"
			ident = s.Labels[idx].Value
		}
	}

	if ident == "" {
		// No ident label: try agent_hostname, which is what grafana-agent and
		// categraf send.
		adopt("agent_hostname")
	}

	if ident == "" && !ignoreHost {
		// No agent_hostname either: fall back to host, which covers telegraf
		// (output plugin: http, format: prometheusremotewrite). Metrics scraped
		// from nginx sometimes carry a host label holding a domain name, which
		// must not be used as ident; such senders need ignore_host=true in the URL.
		adopt("host")
	}

	if ident == "" {
		// The reported data carries no ident information at all.
		return "", false
	}

	if len(identMetrics) > 0 {
		matched := false
		if nameIdx, hasName := labelMap["__name__"]; hasName {
			for _, candidate := range identMetrics {
				if s.Labels[nameIdx].Value == candidate {
					matched = true
					break
				}
			}
		}

		if !matched {
			return ident, false
		}
	}

	return ident, !ignoreIdent
}
// duplicateLabelKey reports whether series contains two labels with the same
// name. A nil series has no duplicates.
func duplicateLabelKey(series *prompb.TimeSeries) bool {
	if series == nil {
		return false
	}

	seen := make(map[string]struct{})
	for _, lbl := range series.Labels {
		if _, dup := seen[lbl.Name]; dup {
			return true
		}
		seen[lbl.Name] = struct{}{}
	}

	return false
}
// remoteWrite handles Prometheus remote-write requests.
//
// The request is rejected with the configured over-limit status code when the
// total queue length exceeds AllQueueMaxSize, and with 400 when the body cannot
// be decoded. Series with duplicate label names are skipped. Every other series
// is enriched with target labels when its ident is known to the target cache
// and is then forwarded to a write queue; a forwarding error ends the request
// with the over-limit status code.
//
// Query parameters: "ignore_ident" (default false) and "ignore_host"
// (default true) are passed to extractIdentFromTimeSeries.
func (rt *Router) remoteWrite(c *gin.Context) {
	if queued := rt.Writers.AllQueueLen.Load().(int64); queued > rt.Pushgw.WriterOpt.AllQueueMaxSize {
		err := fmt.Errorf("write queue full, metric count over limit: %d", queued)
		logger.Warning(err)
		pstat.CounterPushQueueOverLimitTotal.Inc()
		c.String(rt.Pushgw.WriterOpt.OverLimitStatusCode, err.Error())
		return
	}

	req, err := DecodeWriteRequest(c.Request.Body)
	if err != nil {
		c.String(http.StatusBadRequest, err.Error())
		return
	}

	total := len(req.Timeseries)
	if total == 0 {
		c.String(200, "")
		return
	}

	// Requests are spread over the write queues via a shared counter.
	queueid := fmt.Sprint(atomic.AddUint64(&globalCounter, 1) % uint64(rt.Pushgw.WriterOpt.QueueNumber))

	ignoreIdent := ginx.QueryBool(c, "ignore_ident", false)
	// The default was switched to true because the old default caused too many
	// support questions; telegraf users have to be told at release time to set
	// ignore_host=false.
	ignoreHost := ginx.QueryBool(c, "ignore_host", true)
	ids := make(map[string]struct{})

	for i := range req.Timeseries {
		series := &req.Timeseries[i]

		if duplicateLabelKey(series) {
			continue
		}

		ident, insertTarget := extractIdentFromTimeSeries(series, ignoreIdent, ignoreHost, rt.Pushgw.IdentMetrics)
		if ident != "" {
			// enrich host labels
			if target, known := rt.TargetCache.Get(ident); known {
				rt.AppendLabels(series, target, rt.BusiGroupCache)
			}

			pstat.CounterSampleReceivedByIdent.WithLabelValues(ident).Inc()
		}

		if rt.Pushgw.GetHeartbeatFromMetric && insertTarget {
			// The series carries an ident (or agent_hostname) tag: register the
			// host in the target table.
			ids[ident] = struct{}{}
		}

		if err := rt.ForwardToQueue(c.ClientIP(), queueid, series); err != nil {
			c.String(rt.Pushgw.WriterOpt.OverLimitStatusCode, err.Error())
			return
		}
	}

	pstat.CounterSampleTotal.WithLabelValues("prometheus").Add(float64(total))
	rt.IdentSet.MSet(ids)

	c.String(200, "")
}
// DecodeWriteRequest reads all of r, snappy-decompresses the bytes and
// unmarshals the result into a prompb.WriteRequest. The first error hit while
// reading, decompressing or unmarshaling is returned with a nil request.
func DecodeWriteRequest(r io.Reader) (*prompb.WriteRequest, error) {
	raw, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, err
	}

	decoded, err := snappy.Decode(nil, raw)
	if err != nil {
		return nil, err
	}

	req := new(prompb.WriteRequest)
	if err = proto.Unmarshal(decoded, req); err != nil {
		return nil, err
	}

	return req, nil
}
================================================
FILE: pushgw/router/router_target.go
================================================
package router
import (
"github.com/ccfos/nightingale/v6/pushgw/idents"
"github.com/ccfos/nightingale/v6/pkg/ginx"
"github.com/gin-gonic/gin"
)
// targetUpdate binds the JSON request body to an idents.TargetUpdate, records
// every ident of its Lst field in rt.IdentSet and renders an empty message.
func (rt *Router) targetUpdate(c *gin.Context) {
	var payload idents.TargetUpdate
	ginx.BindJSON(c, &payload)

	// Collapse the list into a set before handing it over.
	reported := make(map[string]struct{})
	for _, id := range payload.Lst {
		reported[id] = struct{}{}
	}
	rt.IdentSet.MSet(reported)

	ginx.NewRender(c).Message(nil)
}
================================================
FILE: pushgw/router/vars.go
================================================
package router
// globalCounter is incremented atomically once per push request; the push
// handlers take its value modulo the configured queue number to pick the write
// queue for that request.
var globalCounter uint64
================================================
FILE: pushgw/writer/kafka_writer.go
================================================
package writer
import (
"time"
"github.com/IBM/sarama"
"github.com/ccfos/nightingale/v6/pushgw/kafka"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
)
// KafkaWriterType writes time series to a Kafka topic.
type KafkaWriterType struct {
// Opts is the writer configuration; Write uses its Topic, Brokers and
// WriteRelabels fields.
Opts pconf.KafkaWriterOptions
// ForceUseServerTS is handed to beforeWrite unchanged.
ForceUseServerTS bool
// Client is the producer used to send messages.
Client kafka.Producer
// RetryCount bounds the number of send attempts made by Write.
RetryCount int
RetryInterval int64 // pause after a failed attempt, in seconds
}
// Write relabels items, encodes them and sends the result to the configured
// Kafka topic as a single message keyed by key.
//
// Nothing is sent when items is empty, either initially or after relabeling.
// Sending is attempted up to RetryCount times; every failed attempt is counted
// in CounterWriteErrorTotal and logged, and attempts are separated by a pause
// of RetryInterval seconds. The time spent in the call is observed in
// ForwardDuration. The headers argument is not used.
func (w KafkaWriterType) Write(key string, items []prompb.TimeSeries, headers ...map[string]string) {
	if len(items) == 0 {
		return
	}

	items = Relabel(items, w.Opts.WriteRelabels)
	if len(items) == 0 {
		return
	}

	start := time.Now()
	defer func() {
		pstat.ForwardDuration.WithLabelValues(key).Observe(time.Since(start).Seconds())
	}()

	data, err := beforeWrite(key, items, w.ForceUseServerTS, "json")
	if err != nil {
		logger.Warningf("marshal prom data to proto got error: %v, data: %+v", err, items)
		return
	}

	for i := 0; i < w.RetryCount; i++ {
		err := w.Client.Send(&sarama.ProducerMessage{Topic: w.Opts.Topic,
			Key: sarama.StringEncoder(key), Value: sarama.ByteEncoder(data)})
		if err == nil {
			break
		}

		pstat.CounterWriteErrorTotal.WithLabelValues(key).Add(float64(len(items)))
		logger.Warningf("send to kafka got error: %v in %d times, broker: %v, topic: %s",
			err, i, w.Opts.Brokers, w.Opts.Topic)

		// Dump one sample series on the first failure only, to keep logs short.
		if i == 0 {
			logger.Warning("example timeseries:", items[0].String())
		}

		// Back off only when another attempt follows; sleeping after the last
		// attempt would block the caller for no benefit.
		if i < w.RetryCount-1 {
			time.Sleep(time.Duration(w.RetryInterval) * time.Second)
		}
	}
}
================================================
FILE: pushgw/writer/queue.go
================================================
package writer
import (
"container/list"
"sync"
"github.com/prometheus/prometheus/prompb"
)
// SafeList is a doubly linked list whose methods guard access to L with the
// embedded RWMutex.
type SafeList struct {
sync.RWMutex
// L is the underlying list; direct access bypasses the locking done by the methods.
L *list.List
}
// NewSafeList returns a SafeList backed by a freshly initialized, empty list.
func NewSafeList() *SafeList {
	sl := new(SafeList)
	sl.L = list.New()
	return sl
}
// PushFront inserts v at the front of the list under the write lock and
// returns the list element that now holds it.
func (sl *SafeList) PushFront(v interface{}) *list.Element {
	sl.Lock()
	defer sl.Unlock()

	return sl.L.PushFront(v)
}
// PushFrontBatch inserts every element of vs at the front of the list, one
// after another, while holding the write lock once for the whole batch. The
// last element of vs therefore ends up at the very front.
func (sl *SafeList) PushFrontBatch(vs []interface{}) {
	sl.Lock()
	defer sl.Unlock()

	for i := range vs {
		sl.L.PushFront(vs[i])
	}
}
// PopBack removes up to max elements from the back of the list and returns
// those that are prompb.TimeSeries values, oldest first. Removed elements of
// any other type are dropped silently.
//
// An empty, non-nil slice is returned when the list is empty or when max is
// not positive. (A negative max used to panic while the lock was held, leaving
// the list locked forever.)
func (sl *SafeList) PopBack(max int) []prompb.TimeSeries {
	sl.Lock()
	// Deferred so the lock is released on every path, including a panic.
	defer sl.Unlock()

	count := sl.L.Len()
	if count == 0 || max <= 0 {
		return []prompb.TimeSeries{}
	}
	if count > max {
		count = max
	}

	items := make([]prompb.TimeSeries, 0, count)
	for i := 0; i < count; i++ {
		item := sl.L.Remove(sl.L.Back())
		if sample, ok := item.(prompb.TimeSeries); ok {
			items = append(items, sample)
		}
	}

	return items
}
// RemoveAll empties the list by re-initializing it under the write lock.
func (sl *SafeList) RemoveAll() {
	sl.Lock()
	defer sl.Unlock()

	sl.L.Init()
}
// Len returns the current number of elements, read under the read lock.
func (sl *SafeList) Len() int {
	sl.RLock()
	defer sl.RUnlock()

	return sl.L.Len()
}
// SafeListLimited is a SafeList that refuses new elements once it holds
// maxSize elements or more.
type SafeListLimited struct {
// maxSize is the length at which PushFront and PushFrontBatch start rejecting input.
maxSize int
// SL is the underlying list.
SL *SafeList
}
// NewSafeListLimited returns an empty SafeListLimited that accepts pushes
// while it holds fewer than maxSize elements.
func NewSafeListLimited(maxSize int) *SafeListLimited {
	limited := &SafeListLimited{maxSize: maxSize}
	limited.SL = NewSafeList()
	return limited
}
// PopBack removes up to max elements from the back of the underlying list and
// returns them; it delegates to SafeList.PopBack.
func (sll *SafeListLimited) PopBack(max int) []prompb.TimeSeries {
return sll.SL.PopBack(max)
}
// PushFront inserts v at the front of the list and reports true, unless the
// list already holds maxSize elements or more, in which case v is dropped and
// false is reported. The size check and the insertion are two separate locked
// operations on the underlying list.
func (sll *SafeListLimited) PushFront(v interface{}) bool {
	if sll.SL.Len() < sll.maxSize {
		sll.SL.PushFront(v)
		return true
	}

	return false
}
// PushFrontBatch inserts all of vs at the front of the list and reports true,
// unless the list already holds maxSize elements or more, in which case nothing
// is inserted and false is reported. Only the length before the insertion is
// checked, so an accepted batch may push the list beyond maxSize.
func (sll *SafeListLimited) PushFrontBatch(vs []interface{}) bool {
	if sll.SL.Len() < sll.maxSize {
		sll.SL.PushFrontBatch(vs)
		return true
	}

	return false
}
// RemoveAll empties the underlying list; it delegates to SafeList.RemoveAll.
func (sll *SafeListLimited) RemoveAll() {
sll.SL.RemoveAll()
}
// Len returns the current number of elements in the underlying list.
func (sll *SafeListLimited) Len() int {
return sll.SL.Len()
}
================================================
FILE: pushgw/writer/relabel.go
================================================
package writer
import (
"crypto/md5"
"fmt"
"regexp"
"sort"
"strings"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/toolkits/pkg/logger"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// Relabel action names. relabel switches on a rule's Action using these values.
const (
Replace string = "replace"
Keep string = "keep"
Drop string = "drop"
HashMod string = "hashmod"
LabelMap string = "labelmap"
LabelDrop string = "labeldrop"
LabelKeep string = "labelkeep"
Lowercase string = "lowercase"
Uppercase string = "uppercase"
DropIfEqual string = "drop_if_equal"
)
// Process applies the relabel rules cfgs to labels in order and returns the
// resulting label set. As soon as a rule yields nil, processing stops and nil
// is returned. With no rules, labels is returned unchanged.
func Process(labels []prompb.Label, cfgs ...*pconf.RelabelConfig) []prompb.Label {
	current := labels
	for i := range cfgs {
		if current = relabel(current, cfgs[i]); current == nil {
			return nil
		}
	}

	return current
}
// Relabel runs the relabel rules rc over the labels of every series in items
// and returns the series that still have at least one label afterwards, in
// their original order and with their labels replaced by the relabeled set.
// The elements of items themselves are not modified.
func Relabel(items []prompb.TimeSeries, rc []*pconf.RelabelConfig) []prompb.TimeSeries {
	kept := make([]prompb.TimeSeries, 0, len(items))

	for i := range items {
		// Work on a copy so the caller's element keeps its original labels.
		series := items[i]

		relabeled := Process(series.Labels, rc...)
		if len(relabeled) == 0 {
			// Every label was removed: the series is discarded.
			continue
		}

		series.Labels = relabeled
		kept = append(kept, series)
	}

	return kept
}
// getValue returns the value of the first label in ls whose name equals name,
// or the empty string when no such label exists.
func getValue(ls []prompb.Label, name model.LabelName) string {
	want := string(name)
	for i := range ls {
		if ls[i].Name == want {
			return ls[i].Value
		}
	}
	return ""
}
// LabelBuilder accumulates label edits in a name -> value map before they are
// turned back into a label slice.
type LabelBuilder struct {
LabelSet map[string]string // current labels keyed by label name
}
// newBuilder returns a LabelBuilder seeded with the labels in ls. When ls
// contains the same name more than once, the later entry wins.
func newBuilder(ls []prompb.Label) *LabelBuilder {
	b := &LabelBuilder{LabelSet: make(map[string]string, len(ls))}
	for i := range ls {
		b.LabelSet[ls[i].Name] = ls[i].Value
	}
	return b
}
// set stores v under label name k, overwriting any previous value, and
// returns the builder so calls can be chained.
func (l *LabelBuilder) set(k, v string) *LabelBuilder {
	labels := l.LabelSet
	labels[k] = v
	return l
}
// del removes every label named in ns from the builder and returns the
// builder so calls can be chained. Names that are not present are ignored.
func (l *LabelBuilder) del(ns ...string) *LabelBuilder {
	for i := range ns {
		delete(l.LabelSet, ns[i])
	}
	return l
}
// labels materialises the builder's map as a label slice sorted by name in
// descending order. An empty builder yields an empty, non-nil slice.
func (l *LabelBuilder) labels() []prompb.Label {
	out := make([]prompb.Label, 0, len(l.LabelSet))
	for name, value := range l.LabelSet {
		out = append(out, prompb.Label{Name: name, Value: value})
	}
	// Names are unique map keys, so the descending order is fully determined.
	sort.Slice(out, func(a, b int) bool {
		return out[b].Name < out[a].Name
	})
	return out
}
// relabel applies a single relabel rule to lset and returns the resulting
// label set.
//
// The values of cfg.SourceLabels are looked up in lset (missing labels count
// as "") and joined with cfg.Separator; most actions operate on that joined
// value. The regex is cfg.RegexCompiled when set, otherwise cfg.Regex is
// compiled on the spot via compileRegex.
//
// Return value:
//   - nil when the rule drops the series (drop / keep / drop_if_equal);
//   - lset itself, untouched, when the regex cannot be compiled or when a
//     hashmod rule has a zero modulus;
//   - otherwise a freshly built slice from LabelBuilder.labels.
//
// An unknown action is logged and leaves the labels unchanged.
func relabel(lset []prompb.Label, cfg *pconf.RelabelConfig) []prompb.Label {
	values := make([]string, 0, len(cfg.SourceLabels))
	for _, ln := range cfg.SourceLabels {
		values = append(values, getValue(lset, ln))
	}

	regx := cfg.RegexCompiled
	if regx == nil {
		regx = compileRegex(cfg.Regex)
	}
	if regx == nil {
		// Compilation failed (already logged); skip this rule.
		return lset
	}

	val := strings.Join(values, cfg.Separator)
	lb := newBuilder(lset)

	switch cfg.Action {
	case Drop:
		if regx.MatchString(val) {
			return nil
		}
	case Keep:
		if !regx.MatchString(val) {
			return nil
		}
	case Replace:
		return handleReplace(lb, regx, cfg, val, lset)
	case Lowercase:
		lb.set(cfg.TargetLabel, strings.ToLower(val))
	case Uppercase:
		lb.set(cfg.TargetLabel, strings.ToUpper(val))
	case HashMod:
		// A zero modulus would make the % below panic with an integer
		// divide-by-zero, so treat it as a misconfigured rule and skip it.
		if cfg.Modulus == 0 {
			logger.Errorf("relabel: hashmod action requires a non-zero modulus, target_label: %q", cfg.TargetLabel)
			return lset
		}
		mod := sum64(md5.Sum([]byte(val))) % cfg.Modulus
		lb.set(cfg.TargetLabel, fmt.Sprintf("%d", mod))
	case LabelMap:
		for _, l := range lset {
			if regx.MatchString(l.Name) {
				res := regx.ReplaceAllString(l.Name, cfg.Replacement)
				lb.set(res, l.Value)
			}
		}
	case LabelDrop:
		for _, l := range lset {
			if regx.MatchString(l.Name) {
				lb.del(l.Name)
			}
		}
	case LabelKeep:
		for _, l := range lset {
			if !regx.MatchString(l.Name) {
				lb.del(l.Name)
			}
		}
	case DropIfEqual:
		return handleDropIfEqual(lb, cfg, lset)
	default:
		logger.Errorf("relabel: unknown relabel action type %q", cfg.Action)
	}

	return lb.labels()
}
// handleReplace implements the "replace" action on the builder lb and returns
// the resulting labels. val is the source label values joined with the
// configured separator. The cases are tried in this order:
//
//  1. no target label: the labels are returned as they are;
//  2. no source labels: the target label is set to cfg.Replacement;
//  3. empty Replacement with more than one source label: the target label is
//     set to val;
//  4. nil regx: the target label is set to cfg.Replacement;
//  5. otherwise regx is matched against val. Without a match nothing changes.
//     With a match, an invalid target label name is deleted from the set; a
//     valid one receives the expanded replacement, or is deleted when the
//     expansion is empty.
//
// lset is accepted but not used.
func handleReplace(lb *LabelBuilder, regx *regexp.Regexp, cfg *pconf.RelabelConfig, val string, lset []prompb.Label) []prompb.Label {
	switch {
	case len(cfg.TargetLabel) == 0:
		return lb.labels()
	case len(cfg.SourceLabels) == 0:
		lb.set(cfg.TargetLabel, cfg.Replacement)
		return lb.labels()
	case cfg.Replacement == "" && len(cfg.SourceLabels) > 1:
		lb.set(cfg.TargetLabel, val)
		return lb.labels()
	case regx == nil:
		lb.set(cfg.TargetLabel, cfg.Replacement)
		return lb.labels()
	}

	match := regx.FindStringSubmatchIndex(val)
	if match == nil {
		return lb.labels()
	}

	if !model.LabelName(cfg.TargetLabel).IsValid() {
		lb.del(cfg.TargetLabel)
		return lb.labels()
	}

	expanded := regx.ExpandString([]byte{}, cfg.Replacement, val, match)
	if len(expanded) > 0 {
		lb.set(cfg.TargetLabel, string(expanded))
	} else {
		lb.del(cfg.TargetLabel)
	}
	return lb.labels()
}
// handleDropIfEqual implements the "drop_if_equal" action. It returns nil
// (drop the series) when at least two source labels are configured and all of
// them have the same value in lset; labels missing from lset count as "".
// In every other case the builder's labels are returned.
func handleDropIfEqual(lb *LabelBuilder, cfg *pconf.RelabelConfig, lset []prompb.Label) []prompb.Label {
	if len(cfg.SourceLabels) < 2 {
		return lb.labels()
	}

	reference := getValue(lset, cfg.SourceLabels[0])
	for _, name := range cfg.SourceLabels[1:] {
		if getValue(lset, name) != reference {
			return lb.labels()
		}
	}
	return nil
}
// compileRegex compiles expr with regexp.Compile. On failure the error is
// logged and nil is returned.
func compileRegex(expr string) *regexp.Regexp {
	compiled, err := regexp.Compile(expr)
	if err == nil {
		return compiled
	}
	logger.Error("failed to compile regexp:", expr, "error:", err)
	return nil
}
// sum64 folds an MD5 digest into a uint64 by reading its last eight bytes as
// a big-endian integer. The first eight bytes do not influence the result.
func sum64(hash [md5.Size]byte) uint64 {
	var s uint64
	for _, b := range hash[md5.Size-8:] {
		s = s<<8 | uint64(b)
	}
	return s
}
================================================
FILE: pushgw/writer/relabel_test.go
================================================
// @Author: Ciusyan 6/19/24
package writer
import (
"reflect"
"sort"
"testing"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// TestProcess is a table-driven test for Process. Each case feeds a label set
// through one or more relabel configs and compares the outcome with the
// expected labels. Both slices are sorted by label name before the
// comparison, so the order in which Process returns labels is not checked.
func TestProcess(t *testing.T) {
tests := []struct {
name string
labels []prompb.Label
cfgs []*pconf.RelabelConfig
expected []prompb.Label
}{
// 1. Adding new label
{
name: "Adding new label",
labels: []prompb.Label{{Name: "job", Value: "aa"}},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
TargetLabel: "foo",
Replacement: "bar",
},
},
expected: []prompb.Label{{Name: "job", Value: "aa"}, {Name: "foo", Value: "bar"}},
},
// 2. Updating existing label
{
name: "Updating existing label",
labels: []prompb.Label{{Name: "foo", Value: "aaaa"}},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
TargetLabel: "foo",
Replacement: "bar",
},
},
expected: []prompb.Label{{Name: "foo", Value: "bar"}},
},
// 3. Rewriting existing label
{
name: "Rewriting existing label",
labels: []prompb.Label{{Name: "instance", Value: "bar:123"}},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
SourceLabels: model.LabelNames{"instance"},
Regex: "([^:]+):.+",
TargetLabel: "instance",
Replacement: "$1",
},
},
expected: []prompb.Label{{Name: "instance", Value: "bar"}},
},
{
name: "Rewriting existing label",
labels: []prompb.Label{{Name: "instance", Value: "bar:123"}},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
SourceLabels: model.LabelNames{"instance"},
Regex: ":([0-9]+)$",
TargetLabel: "port",
Replacement: "$1",
},
},
expected: []prompb.Label{{Name: "port", Value: "123"}, {Name: "instance", Value: "bar:123"}},
},
// 4. Updating metric name
{
name: "Updating metric name",
labels: []prompb.Label{{Name: "__name__", Value: "foo_suffix"}},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
SourceLabels: model.LabelNames{"__name__"},
Regex: "(.+)_suffix",
TargetLabel: "__name__",
Replacement: "prefix_$1",
},
},
expected: []prompb.Label{{Name: "__name__", Value: "prefix_foo"}},
},
// 5. Removing unneeded labels / keeping needed labels
{
name: "Removing unneeded labels",
labels: []prompb.Label{
{Name: "job", Value: "a"},
{Name: "instance", Value: "xyz"},
{Name: "foobar", Value: "baz"},
{Name: "foox", Value: "aaa"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "labeldrop",
Regex: "foo.+",
},
},
expected: []prompb.Label{
{Name: "job", Value: "a"},
{Name: "instance", Value: "xyz"},
},
},
{
name: "keep needed labels",
labels: []prompb.Label{
{Name: "job", Value: "a"},
{Name: "instance", Value: "xyz"},
{Name: "foobar", Value: "baz"},
{Name: "foox", Value: "aaa"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "labelkeep",
Regex: "foo.+",
},
},
expected: []prompb.Label{
{Name: "foobar", Value: "baz"},
{Name: "foox", Value: "aaa"},
},
},
// 6. Removing the specific label value
{
name: "Removing the specific label value",
labels: []prompb.Label{
{Name: "foo", Value: "bar"},
{Name: "baz", Value: "x"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
SourceLabels: model.LabelNames{"foo"},
Regex: "bar",
TargetLabel: "foo",
Replacement: "",
},
},
expected: []prompb.Label{
{Name: "baz", Value: "x"},
},
},
// 7. Removing unneeded metrics
{
name: "Removing unneeded metrics",
labels: []prompb.Label{
{Name: "instance", Value: "foobar1"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop",
SourceLabels: model.LabelNames{"instance"},
Regex: "foobar.+",
},
},
expected: nil,
},
{
name: "Removing unneeded metrics 2",
labels: []prompb.Label{
{Name: "instance", Value: "foobar2"},
{Name: "job", Value: "xxx"},
{Name: "aaa", Value: "bb"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop",
SourceLabels: model.LabelNames{"instance"},
Regex: "foobar.+",
},
},
expected: nil,
},
{
name: "Removing unneeded metrics 3",
labels: []prompb.Label{
{Name: "instance", Value: "xxx"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop",
SourceLabels: model.LabelNames{"instance"},
Regex: "foobar.+",
},
},
expected: []prompb.Label{
{Name: "instance", Value: "xxx"},
},
},
{
name: "Removing unneeded metrics 4",
labels: []prompb.Label{
{Name: "instance", Value: "abc"},
{Name: "job", Value: "xyz"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop",
SourceLabels: model.LabelNames{"instance"},
Regex: "foobar.+",
},
},
expected: []prompb.Label{
{Name: "instance", Value: "abc"},
{Name: "job", Value: "xyz"},
},
},
{
name: "Removing unneeded metrics with multiple labels",
labels: []prompb.Label{
{Name: "job", Value: "foo"},
{Name: "instance", Value: "bar"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop",
SourceLabels: model.LabelNames{"job", "instance"},
Regex: "foo;bar",
Separator: ";",
},
},
expected: nil,
},
// 8. Dropping metrics on certain condition
{
name: "Dropping metrics on certain condition",
labels: []prompb.Label{
{Name: "real_port", Value: "123"},
{Name: "needed_port", Value: "123"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop_if_equal",
SourceLabels: model.LabelNames{"real_port", "needed_port"},
},
},
expected: nil,
},
{
name: "Dropping metrics on certain condition 2",
labels: []prompb.Label{
{Name: "real_port", Value: "123"},
{Name: "needed_port", Value: "456"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "drop_if_equal",
SourceLabels: model.LabelNames{"real_port", "needed_port"},
},
},
expected: []prompb.Label{
{Name: "real_port", Value: "123"},
{Name: "needed_port", Value: "456"},
},
},
// 9. Modifying label names
{
name: "Modifying label names",
labels: []prompb.Label{
{Name: "foo_xx", Value: "bb"},
{Name: "job", Value: "qq"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "labelmap",
Regex: "foo_(.+)",
Replacement: "bar_$1",
},
},
expected: []prompb.Label{
{Name: "foo_xx", Value: "bb"},
{Name: "bar_xx", Value: "bb"},
{Name: "job", Value: "qq"},
},
},
// 10. Constructing a label from multiple existing labels
{
name: "Constructing a label from multiple existing labels",
labels: []prompb.Label{
{Name: "host", Value: "hostname"},
{Name: "port", Value: "9090"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
SourceLabels: model.LabelNames{"host", "port"},
Separator: ":",
TargetLabel: "address",
},
},
expected: []prompb.Label{
{Name: "host", Value: "hostname"},
{Name: "port", Value: "9090"},
{Name: "address", Value: "hostname:9090"},
},
},
// 11. Chaining relabeling rules
{
name: "Chaining relabeling rules",
labels: []prompb.Label{
{Name: "instance", Value: "hostname:9090"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
TargetLabel: "foo",
Replacement: "bar",
},
{
Action: "replace",
SourceLabels: model.LabelNames{"instance"},
Regex: "([^:]+):.*",
TargetLabel: "instance",
Replacement: "$1",
},
},
expected: []prompb.Label{
{Name: "instance", Value: "hostname"},
{Name: "foo", Value: "bar"},
},
},
// 12. Conditional relabeling
// NOTE(review): these cases set If / IfRegex on the config and expect the
// rule to be skipped when the condition does not match. Confirm that the
// relabel implementation under test actually evaluates these fields.
{
name: "Conditional relabeling matches",
labels: []prompb.Label{
{Name: "label", Value: "x"},
{Name: "foo", Value: "aaa"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
If: `label="x|y"`,
TargetLabel: "foo",
Replacement: "bar",
IfRegex: compileRegex(`label="x|y"`),
},
},
expected: []prompb.Label{
{Name: "label", Value: "x"},
{Name: "foo", Value: "bar"},
},
},
{
name: "Conditional relabeling matches alternative",
labels: []prompb.Label{
{Name: "label", Value: "y"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
If: `label="x|y"`,
TargetLabel: "foo",
Replacement: "bar",
IfRegex: compileRegex(`label="x|y"`),
},
},
expected: []prompb.Label{
{Name: "label", Value: "y"},
{Name: "foo", Value: "bar"},
},
},
{
name: "Conditional relabeling does not match",
labels: []prompb.Label{
{Name: "label", Value: "z"},
},
cfgs: []*pconf.RelabelConfig{
{
Action: "replace",
If: `label="x|y"`,
TargetLabel: "foo",
Replacement: "bar",
IfRegex: compileRegex(`label="x|y"`),
},
},
expected: []prompb.Label{
{Name: "label", Value: "z"},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := Process(tt.labels, tt.cfgs...)
// Sort both slices by name so the comparison ignores label order.
sort.Slice(got, func(i, j int) bool {
return got[i].Name < got[j].Name
})
sort.Slice(tt.expected, func(i, j int) bool {
return tt.expected[i].Name < tt.expected[j].Name
})
if !reflect.DeepEqual(got, tt.expected) {
t.Errorf("Process() = %v, want %v", got, tt.expected)
}
})
}
}
================================================
FILE: pushgw/writer/writer.go
================================================
package writer
import (
"bytes"
"context"
"encoding/json"
"fmt"
"math"
"net/http"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/IBM/sarama"
"github.com/ccfos/nightingale/v6/pkg/fasttime"
"github.com/ccfos/nightingale/v6/pushgw/kafka"
"github.com/ccfos/nightingale/v6/pushgw/pconf"
"github.com/ccfos/nightingale/v6/pushgw/pstat"
"github.com/golang/protobuf/proto"
"github.com/golang/snappy"
"github.com/prometheus/client_golang/api"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
)
// WriterType is the HTTP remote-write backend: it relabels, encodes and posts
// time series to the URL(s) in Opts.
type WriterType struct {
Opts pconf.WriterOptions // per-backend options (URL list, auth, extra headers, relabel rules, ...)
ForceUseServerTS bool // when true, the first sample of each series is stamped with the server time
Client api.Client // client used by Post to execute the HTTP request
RetryCount int // number of Post attempts made by Write
RetryInterval int64 // pause between attempts, in seconds
}
// beforeWrite prepares items for sending and returns the encoded payload.
//
// It adds len(items) to the write-total counter labelled with key. When
// forceUseServerTS is set, the timestamp of the first sample of every series
// that has samples is overwritten in place with the current server time in
// milliseconds. With encodeType "proto" the series are marshalled as a
// prompb.WriteRequest; any other encodeType marshals them as JSON after NaN
// samples have been filtered out of items.
func beforeWrite(key string, items []prompb.TimeSeries, forceUseServerTS bool, encodeType string) ([]byte, error) {
	pstat.CounterWriteTotal.WithLabelValues(key).Add(float64(len(items)))

	if forceUseServerTS {
		nowMs := int64(fasttime.UnixTimestamp()) * 1000
		for idx := range items {
			if len(items[idx].Samples) > 0 {
				items[idx].Samples[0].Timestamp = nowMs
			}
		}
	}

	switch encodeType {
	case "proto":
		return proto.Marshal(&prompb.WriteRequest{Timeseries: items})
	default:
		// JSON output: NaN samples are discarded before encoding.
		return json.Marshal(filterNaNSamples(items))
	}
}
// filterNaNSamples removes samples whose value is NaN from every series in
// items and returns items. The filtering is done in place on each series'
// Samples slice, so the caller's data is modified. Series are never removed:
// one whose samples are all NaN stays in the result with an empty Samples
// slice. When no NaN is present at all, items is returned untouched.
func filterNaNSamples(items []prompb.TimeSeries) []prompb.TimeSeries {
	// Fast path: stop scanning at the first NaN found.
	found := false
scan:
	for i := range items {
		for _, s := range items[i].Samples {
			if math.IsNaN(s.Value) {
				found = true
				break scan
			}
		}
	}
	if !found {
		return items
	}

	for i := range items {
		// Compact the valid samples onto the front of the existing backing
		// array so no new slice is allocated.
		kept := items[i].Samples[:0]
		for _, s := range items[i].Samples {
			if !math.IsNaN(s.Value) {
				kept = append(kept, s)
			}
		}
		items[i].Samples = kept
	}
	return items
}
// Write relabels items with the backend's WriteRelabels rules, encodes what
// is left as a snappy-compressed protobuf write request and posts it.
//
// Nothing happens when items is empty before or after relabelling. The time
// spent from encoding onwards is observed in the forward-duration histogram
// labelled with key. Post is attempted up to w.RetryCount times, pausing
// w.RetryInterval seconds between attempts; with RetryCount <= 0 no request
// is made. Each failed attempt adds len(items) to the write-error counter,
// and the first failure also logs one example series. Failures are only
// logged, never returned.
func (w WriterType) Write(key string, items []prompb.TimeSeries, headers ...map[string]string) {
	if len(items) == 0 {
		return
	}

	items = Relabel(items, w.Opts.WriteRelabels)
	if len(items) == 0 {
		return
	}

	start := time.Now()
	defer func() {
		pstat.ForwardDuration.WithLabelValues(key).Observe(time.Since(start).Seconds())
	}()

	data, err := beforeWrite(key, items, w.ForceUseServerTS, "proto")
	if err != nil {
		logger.Warningf("marshal prom data to proto got error: %v, data: %+v", err, items)
		return
	}

	// The payload is the same for every attempt, so compress it only once.
	payload := snappy.Encode(nil, data)

	for i := 0; i < w.RetryCount; i++ {
		err := w.Post(payload, headers...)
		if err == nil {
			break
		}

		pstat.CounterWriteErrorTotal.WithLabelValues(key).Add(float64(len(items)))
		logger.Warningf("post to %s got error: %v in %d times", w.Opts.Url, err, i)

		if i == 0 {
			logger.Warning("example timeseries:", items[0].String())
		}

		// No attempt follows the last one, so sleeping there would only
		// stall the caller.
		if i < w.RetryCount-1 {
			time.Sleep(time.Duration(w.RetryInterval) * time.Second)
		}
	}
}
// Post sends the snappy-compressed remote-write payload req to the backend.
//
// w.Opts.Url is split on commas and the URLs are tried in order; the loop
// stops at the first URL that answers with a status below 400. For each URL:
//   - a request that cannot be created is logged and skipped without
//     recording an error;
//   - a transport error or a 5xx status is recorded as the error to return,
//     and the next URL is tried;
//   - a 4xx status is logged together with a summary of the payload (at most
//     five series) and the next URL is tried, but no error is recorded for it.
//
// The returned error is therefore the last transport/5xx error seen, or nil
// when a URL succeeded or when only 4xx / request-creation failures occurred.
//
// Headers are applied in this order: the fixed remote-write headers, then the
// entries of headers[0] (overriding via Set), then basic auth when a user is
// configured, then w.Opts.Headers, which is read as a flat key/value list and
// only used when its length is even. A configured "Host" entry also sets the
// request's Host field.
func (w WriterType) Post(req []byte, headers ...map[string]string) error {
urls := strings.Split(w.Opts.Url, ",")
var err error
var newRequestErr error
var httpReq *http.Request
for _, url := range urls {
httpReq, newRequestErr = http.NewRequest("POST", url, bytes.NewReader(req))
if newRequestErr != nil {
logger.Warningf("create remote write:%s request got error: %s", url, newRequestErr.Error())
continue
}
httpReq.Header.Add("Content-Encoding", "snappy")
httpReq.Header.Set("Content-Type", "application/x-protobuf")
httpReq.Header.Set("User-Agent", "n9e")
httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
if len(headers) > 0 {
for k, v := range headers[0] {
httpReq.Header.Set(k, v)
}
}
if w.Opts.BasicAuthUser != "" {
httpReq.SetBasicAuth(w.Opts.BasicAuthUser, w.Opts.BasicAuthPass)
}
headerCount := len(w.Opts.Headers)
if headerCount > 0 && headerCount%2 == 0 {
for i := 0; i < len(w.Opts.Headers); i += 2 {
httpReq.Header.Add(w.Opts.Headers[i], w.Opts.Headers[i+1])
if w.Opts.Headers[i] == "Host" {
httpReq.Host = w.Opts.Headers[i+1]
}
}
}
resp, body, e := w.Client.Do(context.Background(), httpReq)
if e != nil {
logger.Warningf("push data with remote write:%s request got error: %v, response body: %s", url, e, string(body))
err = e
continue
}
if resp.StatusCode >= 400 && resp.StatusCode < 500 {
// Decode and parse req so the rejected metrics can be described in the log.
decoded, decodeErr := snappy.Decode(nil, req)
metricsInfo := "failed to decode request"
if decodeErr == nil {
var writeReq prompb.WriteRequest
if unmarshalErr := proto.Unmarshal(decoded, &writeReq); unmarshalErr == nil {
metricsInfo = fmt.Sprintf("timeseries count: %d", len(writeReq.Timeseries))
logger.Warningf("push data with remote write:%s request got status code: %v, response body: %s, %s", url, resp.StatusCode, string(body), metricsInfo)
// Print only the first few series to avoid flooding the log.
sampleCount := 5
if sampleCount > len(writeReq.Timeseries) {
sampleCount = len(writeReq.Timeseries)
}
for i := 0; i < sampleCount; i++ {
logger.Warningf("push data with remote write:%s timeseries: [%d] %s", url, i, writeReq.Timeseries[i].String())
}
} else {
metricsInfo = fmt.Sprintf("failed to unmarshal: %v", unmarshalErr)
logger.Warningf("push data with remote write:%s request got status code: %v, response body: %s, metrics: %s", url, resp.StatusCode, string(body), metricsInfo)
}
} else {
metricsInfo = fmt.Sprintf("failed to decode: %v", decodeErr)
logger.Warningf("push data with remote write:%s request got status code: %v, response body: %s, metrics: %s", url, resp.StatusCode, string(body), metricsInfo)
}
// A 4xx response does not set err; move on to the next URL.
continue
}
if resp.StatusCode >= 500 {
err = fmt.Errorf("push data with remote write:%s request got status code: %v, response body: %s", url, resp.StatusCode, string(body))
logger.Warning(err)
continue
}
// Success: clear any error left over from an earlier URL and stop.
err = nil
break
}
return err
}
// WritersType owns the configured write backends and the per-queue-id sample
// queues that feed them. The embedded RWMutex is taken around accesses to
// queues.
type WritersType struct {
pushgw pconf.Pushgw // pushgw configuration this instance was built from
backends map[string]Writer // write backends keyed by name (URL, or brokers_topic for Kafka)
queues map[string]*IdentQueue // sample queues keyed by queue id
AllQueueLen atomic.Value // int64 sum of all queue lengths, refreshed by SetAllQueueLen
PushConcurrency atomic.Int64 // number of in-flight writes to non-critical backends
sync.RWMutex
}
// IdentQueue is one bounded sample queue together with the state needed to
// expire it.
type IdentQueue struct {
list *SafeListLimited // buffered samples waiting for the consumer
closeCh chan struct{} // closed by CleanExpQueue to stop the queue's consumer
ts int64 // Unix seconds of the latest push; used by CleanExpQueue to detect idle queues
}
// ReportQueueStats publishes the length of identQueue to the sample-queue-size
// gauge labelled with queueid every 15 seconds. It blocks until the queue's
// closeCh is closed, so it is meant to run in its own goroutine, and then
// returns (nil, false); the return values carry no information.
//
// Stopping on closeCh keeps the goroutine from outliving a queue that
// CleanExpQueue has expired.
func (ws *WritersType) ReportQueueStats(queueid string, identQueue *IdentQueue) (interface{}, bool) {
	ticker := time.NewTicker(15 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-identQueue.closeCh:
			return nil, false
		case <-ticker.C:
			count := identQueue.list.Len()
			pstat.GaugeSampleQueueSize.WithLabelValues(queueid).Set(float64(count))
		}
	}
}
// SetAllQueueLen loops forever: it sums the lengths of all queues under the
// read lock, stores the total in AllQueueLen as an int64, and then sleeps for
// WriterOpt.AllQueueMaxSizeInterval milliseconds. It never returns, so it is
// meant to run in its own goroutine.
func (ws *WritersType) SetAllQueueLen() {
	for {
		total := 0

		ws.RLock()
		for id := range ws.queues {
			total += ws.queues[id].list.Len()
		}
		ws.RUnlock()

		ws.AllQueueLen.Store(int64(total))

		pause := time.Duration(ws.pushgw.WriterOpt.AllQueueMaxSizeInterval) * time.Millisecond
		time.Sleep(pause)
	}
}
// NewWriters builds a WritersType from pushgwConfig, initialises its HTTP and
// Kafka backends and starts the background goroutines that maintain
// AllQueueLen and expire idle queues.
//
// A backend initialisation failure is logged rather than returned, because
// the signature has no error result; the returned value then contains only
// the backends that were registered before the failure.
func NewWriters(pushgwConfig pconf.Pushgw) *WritersType {
	writers := &WritersType{
		backends:    make(map[string]Writer),
		queues:      make(map[string]*IdentQueue),
		pushgw:      pushgwConfig,
		AllQueueLen: atomic.Value{},
	}

	if err := writers.Init(); err != nil {
		logger.Errorf("init pushgw writers got error: %v", err)
	}

	go writers.SetAllQueueLen()
	go writers.CleanExpQueue()
	return writers
}
// Put registers writer under name in the backends map, replacing any backend
// already stored under that name. The map is written without taking the lock.
func (ws *WritersType) Put(name string, writer Writer) {
	registry := ws.backends
	registry[name] = writer
}
// isCriticalBackend reports whether the backend registered under key is a
// critical backend:
//   - unknown key: false;
//   - WriterType (HTTP): true unless its Opts.AsyncWrite is set;
//   - KafkaWriterType: false;
//   - any other type: true, after logging a warning.
func (ws *WritersType) isCriticalBackend(key string) bool {
	backend, ok := ws.backends[key]
	if !ok {
		return false
	}

	switch b := backend.(type) {
	case WriterType:
		// HTTP writers are critical unless configured for async writes.
		return !b.Opts.AsyncWrite
	case KafkaWriterType:
		// Kafka writers are non-critical.
		return false
	default:
		// Unknown type: be conservative and treat it as critical.
		logger.Warningf("Unknown backend type: %T, treating as critical", backend)
		return true
	}
}
// CleanExpQueue loops forever, sweeping the queue map every 600 seconds under
// the write lock. Nil entries are removed with a warning. A queue whose last
// push is more than 3600 seconds old has its closeCh closed and is removed
// from the map. It never returns, so it is meant to run in its own goroutine.
func (ws *WritersType) CleanExpQueue() {
	for {
		ws.Lock()
		for ident, q := range ws.queues {
			switch {
			case q == nil:
				delete(ws.queues, ident)
				logger.Warningf("Write channel(%s) not found", ident)
			case time.Now().Unix()-q.ts > 3600:
				close(q.closeCh)
				delete(ws.queues, ident)
			}
		}
		ws.Unlock()

		time.Sleep(time.Second * 600)
	}
}
// PushSample enqueues v on the queue identified by queueid, creating the
// queue on first use. Creating a queue also starts its stats reporter and its
// consumer goroutine.
//
// When the queue is full the sample is dropped: a warning is logged and the
// push-queue error counter for queueid is incremented. The returned error is
// always nil.
func (ws *WritersType) PushSample(queueid string, v interface{}) error {
	ws.RLock()
	queue := ws.queues[queueid]
	ws.RUnlock()

	if queue == nil {
		ws.Lock()
		// Look again under the write lock: another goroutine may have created
		// the queue between RUnlock and Lock. Without this re-check, two
		// concurrent first pushes would each create a queue and start
		// goroutines, and the overwritten queue's goroutines would never be
		// stopped.
		queue = ws.queues[queueid]
		if queue == nil {
			queue = &IdentQueue{
				list:    NewSafeListLimited(ws.pushgw.WriterOpt.QueueMaxSize),
				closeCh: make(chan struct{}),
				ts:      time.Now().Unix(),
			}
			ws.queues[queueid] = queue

			go ws.ReportQueueStats(queueid, queue)
			go ws.StartConsumer(queue)
		}
		ws.Unlock()
	}

	// Refresh the activity timestamp that CleanExpQueue checks.
	queue.ts = time.Now().Unix()

	succ := queue.list.PushFront(v)
	if !succ {
		logger.Warningf("Write channel(%s) full, current channel size: %d, item: %+v", queueid, queue.list.Len(), v)
		pstat.CounterPushQueueErrorTotal.WithLabelValues(queueid).Inc()
	}
	return nil
}
// Writer is a write backend. Write receives the backend key, the series to
// send and optional extra header maps; it returns nothing, so failures are
// not reported to the caller through this interface.
type Writer interface {
Write(string, []prompb.TimeSeries, ...map[string]string)
}
// StartConsumer drains identQueue until its closeCh is closed. On each pass
// it pops up to WriterOpt.QueuePopSize series; when nothing is available it
// sleeps 400ms and tries again. A popped batch is handed to every registered
// backend: critical backends are written synchronously, the others through
// writeToNonCriticalBackend.
func (ws *WritersType) StartConsumer(identQueue *IdentQueue) {
	for {
		// Non-blocking check for shutdown before each pop.
		select {
		case <-identQueue.closeCh:
			logger.Infof("write queue:%v closed", identQueue)
			return
		default:
		}

		series := identQueue.list.PopBack(ws.pushgw.WriterOpt.QueuePopSize)
		if len(series) == 0 {
			time.Sleep(time.Millisecond * 400)
			continue
		}

		for key, backend := range ws.backends {
			if !ws.isCriticalBackend(key) {
				// Writers such as Kafka are written asynchronously so a slow
				// backend cannot hold up the main flow.
				ws.writeToNonCriticalBackend(key, series)
				continue
			}
			backend.Write(key, series)
		}
	}
}
// writeToNonCriticalBackend writes series to the backend registered under key
// in a new goroutine, bounded by the configured push concurrency.
//
// The in-flight counter is incremented first; if that exceeds
// pushgw.PushConcurrency the increment is undone, the batch is dropped, a
// warning is logged and len(series) is added to the write-error counter.
// Otherwise the goroutine receives a copy of series from deepCopySeries,
// recovers and logs any panic raised by the backend, and decrements the
// counter when it finishes.
func (ws *WritersType) writeToNonCriticalBackend(key string, series []prompb.TimeSeries) {
	// Reserve a slot atomically; give it back right away when over the limit.
	inFlight := ws.PushConcurrency.Add(1)
	if inFlight > int64(ws.pushgw.PushConcurrency) {
		ws.PushConcurrency.Add(-1)
		logger.Warningf("push concurrency limit exceeded, current: %d, limit: %d, dropping %d series for backend: %s",
			inFlight-1, ws.pushgw.PushConcurrency, len(series), key)
		pstat.CounterWriteErrorTotal.WithLabelValues(key).Add(float64(len(series)))
		return
	}

	// The goroutine works on its own copy of the batch.
	data := ws.deepCopySeries(series)

	go func() {
		defer func() {
			ws.PushConcurrency.Add(-1)
			if r := recover(); r != nil {
				logger.Errorf("panic in non-critical backend %s: %v", key, r)
			}
		}()

		ws.backends[key].Write(key, data)
	}()
}
// deepCopySeries returns a copy of series in which every element is copied by
// value and each non-empty Samples slice is duplicated into new backing
// storage. Other slice fields of a series, such as Labels, are copied as
// slice headers only and keep sharing their backing arrays with the input.
func (ws *WritersType) deepCopySeries(series []prompb.TimeSeries) []prompb.TimeSeries {
	out := make([]prompb.TimeSeries, len(series))
	copy(out, series)

	for i := range out {
		if n := len(out[i].Samples); n > 0 {
			out[i].Samples = append(make([]prompb.Sample, 0, n), out[i].Samples...)
		}
	}
	return out
}
// Init resets AllQueueLen to zero and registers the HTTP writers followed by
// the Kafka writers. The first error stops initialisation and is returned;
// Kafka writers are not set up when the HTTP writers fail.
func (ws *WritersType) Init() error {
	ws.AllQueueLen.Store(int64(0))

	err := ws.initWriters()
	if err == nil {
		err = ws.initKafkaWriters()
	}
	return err
}
// initWriters creates one HTTP WriterType per entry in pushgw.Writers and
// registers it under the entry's URL. Each writer gets its own API client
// built from the entry's URL and HTTP transport, plus the global
// ForceUseServerTS, RetryCount and RetryInterval settings. The first client
// creation error aborts the loop and is returned.
func (ws *WritersType) initWriters() error {
	for _, opt := range ws.pushgw.Writers {
		cli, err := api.NewClient(api.Config{
			Address:      opt.Url,
			RoundTripper: opt.HTTPTransport,
		})
		if err != nil {
			return err
		}

		ws.Put(opt.Url, WriterType{
			Opts:             opt,
			Client:           cli,
			ForceUseServerTS: ws.pushgw.ForceUseServerTS,
			RetryCount:       ws.pushgw.WriterOpt.RetryCount,
			RetryInterval:    ws.pushgw.WriterOpt.RetryInterval,
		})
	}
	return nil
}
// initKafkaSASL copies the SASL settings from opt into cfg.Net.SASL. It does
// nothing when opt carries no SASL section or the section is not enabled.
func initKafkaSASL(cfg *sarama.Config, opt pconf.KafkaWriterOptions) {
	if opt.SASL == nil || !opt.SASL.Enable {
		return
	}

	sasl := &cfg.Net.SASL
	sasl.Enable = true
	sasl.User = opt.SASL.User
	sasl.Password = opt.SASL.Password
	sasl.Mechanism = sarama.SASLMechanism(opt.SASL.Mechanism)
	sasl.Version = opt.SASL.Version
	sasl.Handshake = opt.SASL.Handshake
	sasl.AuthIdentity = opt.SASL.AuthIdentity
}
// initKafkaWriters creates one KafkaWriterType per entry in
// pushgw.KafkaWriters and registers it under "<brokers>_<topic>".
//
// For each entry a sarama config is built: SASL settings are applied, a
// non-zero Timeout (seconds) becomes the producer timeout, and a non-empty
// Version is parsed into the config (a parse failure is only logged). An
// empty Typ is replaced in the configuration entry itself by
// kafka.AsyncProducer. The first producer creation error is logged and
// returned, leaving the remaining entries unprocessed.
func (ws *WritersType) initKafkaWriters() error {
	opts := ws.pushgw.KafkaWriters

	for i := range opts {
		opt := &opts[i]

		cfg := sarama.NewConfig()
		initKafkaSASL(cfg, *opt)

		if opt.Timeout != 0 {
			cfg.Producer.Timeout = time.Duration(opt.Timeout) * time.Second
		}

		if opt.Version != "" {
			if kafkaVersion, err := sarama.ParseKafkaVersion(opt.Version); err == nil {
				cfg.Version = kafkaVersion
			} else {
				logger.Warningf("parse kafka version got error: %v", err)
			}
		}

		if opt.Typ == "" {
			opt.Typ = kafka.AsyncProducer
		}

		producer, err := kafka.New(opt.Typ, opt.Brokers, cfg)
		if err != nil {
			logger.Warningf("new kafka producer got error: %v", err)
			return err
		}

		name := fmt.Sprintf("%v_%s", opt.Brokers, opt.Topic)
		ws.Put(name, KafkaWriterType{
			Opts:             *opt,
			ForceUseServerTS: ws.pushgw.ForceUseServerTS,
			Client:           producer,
			RetryCount:       ws.pushgw.WriterOpt.RetryCount,
			RetryInterval:    ws.pushgw.WriterOpt.RetryInterval,
		})
	}
	return nil
}
================================================
FILE: storage/redis.go
================================================
package storage
import (
"context"
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/alicebob/miniredis/v2"
"github.com/ccfos/nightingale/v6/pkg/tlsx"
"github.com/redis/go-redis/v9"
"github.com/toolkits/pkg/logger"
)
// RedisConfig holds the settings NewRedis uses to build a client.
type RedisConfig struct {
Address string // server address; split on commas for the cluster and sentinel types
Username string
Password string
DB int // database index; used by the standalone and sentinel types
tlsx.ClientConfig // TLS settings; applied when UseTLS is set
RedisType string // "standalone" (or empty), "cluster", "sentinel" or "miniredis"
MasterName string // sentinel master name
SentinelUsername string
SentinelPassword string
DialTimeoutMills int // default 5000 ms
ReadTimeoutMills int // default 3000 ms
WriteTimeoutMills int // default 3000 ms
}
// Redis is the command interface shared by all client flavours NewRedis can
// return; it is defined as go-redis' Cmdable.
type Redis redis.Cmdable
// NewRedis builds a Redis client for cfg.RedisType and verifies it with a
// PING.
//
// Zero timeouts fall back to 5000 ms (dial) and 3000 ms (read and write).
// Supported types are "standalone" (also the empty string), "cluster",
// "sentinel" and "miniredis"; the last one starts an in-process miniredis
// server and connects to it with default options. For the first three, TLS is
// configured from cfg when UseTLS is set.
//
// Every failure (TLS setup, miniredis start-up, unknown type, failed ping) is
// printed to stdout and terminates the process with os.Exit(1); the returned
// error is always nil.
func NewRedis(cfg RedisConfig) (Redis, error) {
	if cfg.DialTimeoutMills == 0 {
		cfg.DialTimeoutMills = 5000
	}
	if cfg.ReadTimeoutMills == 0 {
		cfg.ReadTimeoutMills = 3000
	}
	if cfg.WriteTimeoutMills == 0 {
		cfg.WriteTimeoutMills = 3000
	}

	dialTimeout := time.Duration(cfg.DialTimeoutMills) * time.Millisecond
	readTimeout := time.Duration(cfg.ReadTimeoutMills) * time.Millisecond
	writeTimeout := time.Duration(cfg.WriteTimeoutMills) * time.Millisecond

	var redisClient Redis

	switch cfg.RedisType {
	case "standalone", "":
		opts := &redis.Options{
			Addr:         cfg.Address,
			Username:     cfg.Username,
			Password:     cfg.Password,
			DB:           cfg.DB,
			DialTimeout:  dialTimeout,
			ReadTimeout:  readTimeout,
			WriteTimeout: writeTimeout,
		}
		if cfg.UseTLS {
			tlsConfig, err := cfg.TLSConfig()
			if err != nil {
				fmt.Println("failed to init redis tls config:", err)
				os.Exit(1)
			}
			opts.TLSConfig = tlsConfig
		}
		redisClient = redis.NewClient(opts)

	case "cluster":
		opts := &redis.ClusterOptions{
			Addrs:        strings.Split(cfg.Address, ","),
			Username:     cfg.Username,
			Password:     cfg.Password,
			DialTimeout:  dialTimeout,
			ReadTimeout:  readTimeout,
			WriteTimeout: writeTimeout,
		}
		if cfg.UseTLS {
			tlsConfig, err := cfg.TLSConfig()
			if err != nil {
				fmt.Println("failed to init redis tls config:", err)
				os.Exit(1)
			}
			opts.TLSConfig = tlsConfig
		}
		redisClient = redis.NewClusterClient(opts)

	case "sentinel":
		opts := &redis.FailoverOptions{
			MasterName:       cfg.MasterName,
			SentinelAddrs:    strings.Split(cfg.Address, ","),
			Username:         cfg.Username,
			Password:         cfg.Password,
			DB:               cfg.DB,
			SentinelUsername: cfg.SentinelUsername,
			SentinelPassword: cfg.SentinelPassword,
			DialTimeout:      dialTimeout,
			ReadTimeout:      readTimeout,
			WriteTimeout:     writeTimeout,
		}
		if cfg.UseTLS {
			tlsConfig, err := cfg.TLSConfig()
			if err != nil {
				fmt.Println("failed to init redis tls config:", err)
				os.Exit(1)
			}
			opts.TLSConfig = tlsConfig
		}
		redisClient = redis.NewFailoverClient(opts)

	case "miniredis":
		server, err := miniredis.Run()
		if err != nil {
			fmt.Println("failed to init miniredis:", err)
			os.Exit(1)
		}
		redisClient = redis.NewClient(&redis.Options{Addr: server.Addr()})

	default:
		fmt.Println("failed to init redis , redis type is illegal:", cfg.RedisType)
		os.Exit(1)
	}

	if err := redisClient.Ping(context.Background()).Err(); err != nil {
		fmt.Println("failed to ping redis:", err)
		os.Exit(1)
	}
	return redisClient, nil
}
// MGet fetches keys with one pipelined GET per key and returns the values
// that were found, in key order. Missing keys (redis.Nil) are skipped
// silently and keys whose GET failed are skipped after logging the error, so
// the result can be shorter than keys and carries no positional link to it.
// The error returned by the pipeline's Exec is ignored; per-command errors
// are inspected instead.
func MGet(ctx context.Context, r Redis, keys []string) [][]byte {
	pipe := r.Pipeline()
	for i := range keys {
		pipe.Get(ctx, keys[i])
	}
	cmds, _ := pipe.Exec(ctx)

	var vals [][]byte
	for i, key := range keys {
		err := cmds[i].Err()
		switch {
		case errors.Is(err, redis.Nil):
			// Key does not exist.
		case err != nil:
			logger.Errorf("failed to get key: %s, err: %s", key, err)
		default:
			vals = append(vals, []byte(cmds[i].(*redis.StringCmd).Val()))
		}
	}
	return vals
}
// MSet writes every key/value pair of m with a pipelined SET using the given
// expiration and returns the error reported by the pipeline's Exec.
func MSet(ctx context.Context, r Redis, m map[string]interface{}, expiration time.Duration) error {
	pipe := r.Pipeline()
	for key := range m {
		pipe.Set(ctx, key, m[key], expiration)
	}

	_, err := pipe.Exec(ctx)
	return err
}
================================================
FILE: storage/redis_test.go
================================================
package storage
import (
"context"
"testing"
"github.com/alicebob/miniredis/v2"
"github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
)
// TestMiniRedisMGet stores three keys in an in-process miniredis through MSet
// and checks that MGet returns their values in key order while skipping a
// fourth key that was never set.
func TestMiniRedisMGet(t *testing.T) {
	server, err := miniredis.Run()
	if err != nil {
		t.Fatalf("failed to start miniredis: %v", err)
	}
	defer server.Close()

	client := redis.NewClient(&redis.Options{Addr: server.Addr()})
	if err = client.Ping(context.Background()).Err(); err != nil {
		t.Fatalf("failed to ping miniredis: %v", err)
	}

	seed := map[string]interface{}{
		"key1": "value1",
		"key2": "value2",
		"key3": "value3",
	}
	if err = MSet(context.Background(), client, seed, 0); err != nil {
		t.Fatalf("failed to set miniredis value: %v", err)
	}

	got := MGet(context.Background(), client, []string{"key1", "key2", "key3", "key4"})

	want := [][]byte{[]byte("value1"), []byte("value2"), []byte("value3")}
	assert.Equal(t, want, got)
}
================================================
FILE: storage/storage.go
================================================
package storage
import (
"github.com/ccfos/nightingale/v6/pkg/ormx"
"gorm.io/gorm"
)
// New opens a gorm database handle for cfg through ormx.New. On failure it
// returns a nil handle together with the error.
func New(cfg ormx.DBConfig) (*gorm.DB, error) {
	db, err := ormx.New(cfg)
	if err == nil {
		return db, nil
	}
	return nil, err
}