Repository: ccfos/nightingale Branch: main Commit: ea9c52c808cd Files: 802 Total size: 9.6 MB Directory structure: gitextract_wwt_h4rf/ ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── config.yml │ │ ├── enhancement.md │ │ └── question.yml │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── issue-translator.yml │ └── n9e.yml ├── .gitignore ├── .goreleaser.yaml ├── .typos.toml ├── LICENSE ├── Makefile ├── README.md ├── README_zh.md ├── alert/ │ ├── aconf/ │ │ └── conf.go │ ├── alert.go │ ├── astats/ │ │ └── stats.go │ ├── common/ │ │ └── key.go │ ├── dispatch/ │ │ ├── consume.go │ │ ├── dispatch.go │ │ ├── log.go │ │ ├── notify_channel.go │ │ └── notify_target.go │ ├── eval/ │ │ ├── alert_rule.go │ │ ├── eval.go │ │ └── eval_test.go │ ├── mute/ │ │ └── mute.go │ ├── naming/ │ │ ├── hashring.go │ │ ├── heartbeat.go │ │ └── leader.go │ ├── pipeline/ │ │ ├── engine/ │ │ │ └── engine.go │ │ ├── pipeline.go │ │ └── processor/ │ │ ├── aisummary/ │ │ │ ├── ai_summary.go │ │ │ └── ai_summary_test.go │ │ ├── callback/ │ │ │ └── callback.go │ │ ├── common/ │ │ │ └── common.go │ │ ├── eventdrop/ │ │ │ └── event_drop.go │ │ ├── eventupdate/ │ │ │ └── event_update.go │ │ ├── logic/ │ │ │ ├── if.go │ │ │ └── switch.go │ │ ├── relabel/ │ │ │ └── relabel.go │ │ └── utils/ │ │ └── utils.go │ ├── process/ │ │ ├── alert_cur_event.go │ │ └── process.go │ ├── queue/ │ │ └── queue.go │ ├── record/ │ │ ├── prom_rule.go │ │ ├── sample.go │ │ └── scheduler.go │ ├── router/ │ │ ├── router.go │ │ ├── router_alert_eval_detail.go │ │ ├── router_event.go │ │ ├── router_event_detail.go │ │ └── router_trace_logs.go │ └── sender/ │ ├── callback.go │ ├── dingtalk.go │ ├── email.go │ ├── feishu.go │ ├── feishucard.go │ ├── global_webhook.go │ ├── global_webhook_test.go │ ├── ibex.go │ ├── lark.go │ ├── larkcard.go │ ├── mm.go │ ├── notify_record_queue.go │ ├── plugin.go │ ├── plugin_cmd_unix.go │ ├── plugin_cmd_windows.go │ ├── sender.go │ ├── telegram.go │ ├── webhook.go │ ├── 
webhook_event_queue.go │ ├── webhook_event_queue_test.go │ ├── webhook_queue.go │ └── wecom.go ├── center/ │ ├── cconf/ │ │ ├── conf.go │ │ ├── event_example.go │ │ ├── metric.go │ │ ├── ops.go │ │ ├── plugin.go │ │ ├── rsa/ │ │ │ └── rsa_conf.go │ │ └── sql_tpl.go │ ├── center.go │ ├── cstats/ │ │ └── stats.go │ ├── integration/ │ │ └── init.go │ ├── metas/ │ │ └── metas.go │ ├── router/ │ │ ├── router.go │ │ ├── router_alert_aggr_view.go │ │ ├── router_alert_cur_event.go │ │ ├── router_alert_eval_detail.go │ │ ├── router_alert_his_event.go │ │ ├── router_alert_rule.go │ │ ├── router_alert_subscribe.go │ │ ├── router_board.go │ │ ├── router_builtin.go │ │ ├── router_builtin_component.go │ │ ├── router_builtin_metric_filter.go │ │ ├── router_builtin_metrics.go │ │ ├── router_builtin_payload.go │ │ ├── router_busi_group.go │ │ ├── router_captcha.go │ │ ├── router_chart_share.go │ │ ├── router_config.go │ │ ├── router_configs.go │ │ ├── router_crypto.go │ │ ├── router_dash_annotation.go │ │ ├── router_dashboard.go │ │ ├── router_datasource.go │ │ ├── router_datasource_db.go │ │ ├── router_embedded.go │ │ ├── router_es.go │ │ ├── router_es_index_pattern.go │ │ ├── router_event_detail.go │ │ ├── router_event_pipeline.go │ │ ├── router_funcs.go │ │ ├── router_heartbeat.go │ │ ├── router_login.go │ │ ├── router_message_template.go │ │ ├── router_metric_desc.go │ │ ├── router_metric_view.go │ │ ├── router_mute.go │ │ ├── router_mw.go │ │ ├── router_notification_record.go │ │ ├── router_notify_channel.go │ │ ├── router_notify_channel_test.go │ │ ├── router_notify_config.go │ │ ├── router_notify_rule.go │ │ ├── router_notify_tpl.go │ │ ├── router_opensearch.go │ │ ├── router_proxy.go │ │ ├── router_query.go │ │ ├── router_recording_rule.go │ │ ├── router_role.go │ │ ├── router_role_operation.go │ │ ├── router_saved_view.go │ │ ├── router_self.go │ │ ├── router_server.go │ │ ├── router_source_token.go │ │ ├── router_target.go │ │ ├── router_task.go │ │ ├── router_task_tpl.go 
│ │ ├── router_tdengine.go │ │ ├── router_trace_logs.go │ │ ├── router_user.go │ │ ├── router_user_group.go │ │ └── router_user_variable_config.go │ └── sso/ │ ├── init.go │ └── sync.go ├── cli/ │ ├── cli.go │ └── upgrade/ │ ├── config.go │ ├── readme.md │ ├── upgrade.go │ └── upgrade.sql ├── cmd/ │ ├── alert/ │ │ └── main.go │ ├── center/ │ │ └── main.go │ ├── cli/ │ │ └── main.go │ ├── edge/ │ │ ├── edge.go │ │ └── main.go │ └── pushgw/ │ └── main.go ├── conf/ │ ├── conf.go │ └── crypto.go ├── cron/ │ ├── clean_notify_record.go │ └── clean_pipeline_execution.go ├── datasource/ │ ├── ck/ │ │ └── clickhouse.go │ ├── commons/ │ │ └── eslike/ │ │ └── eslike.go │ ├── datasource.go │ ├── doris/ │ │ └── doris.go │ ├── es/ │ │ └── es.go │ ├── mysql/ │ │ └── mysql.go │ ├── opensearch/ │ │ └── opensearch.go │ ├── postgresql/ │ │ └── postgresql.go │ ├── prom/ │ │ └── prom.go │ ├── tdengine/ │ │ └── tdengine.go │ └── victorialogs/ │ └── victorialogs.go ├── doc/ │ ├── README.bak.md │ ├── active-contributors.md │ ├── committers.md │ ├── community-governance.md │ ├── contributors.md │ ├── end-users.md │ ├── pmc.md │ └── server-dash.json ├── docker/ │ ├── .dockerignore │ ├── Dockerfile.goreleaser │ ├── Dockerfile.goreleaser.arm64 │ ├── build.sh │ ├── compose-bridge/ │ │ ├── docker-compose.yaml │ │ ├── etc-categraf/ │ │ │ ├── config.toml │ │ │ ├── input.cpu/ │ │ │ │ └── cpu.toml │ │ │ ├── input.disk/ │ │ │ │ └── disk.toml │ │ │ ├── input.diskio/ │ │ │ │ └── diskio.toml │ │ │ ├── input.kernel/ │ │ │ │ └── kernel.toml │ │ │ ├── input.mem/ │ │ │ │ └── mem.toml │ │ │ ├── input.mysql/ │ │ │ │ └── mysql.toml │ │ │ ├── input.net/ │ │ │ │ └── net.toml │ │ │ ├── input.netstat/ │ │ │ │ └── netstat.toml │ │ │ ├── input.processes/ │ │ │ │ └── processes.toml │ │ │ ├── input.prometheus/ │ │ │ │ └── prometheus.toml │ │ │ ├── input.redis/ │ │ │ │ └── redis.toml │ │ │ └── input.system/ │ │ │ └── system.toml │ │ ├── etc-mysql/ │ │ │ └── my.cnf │ │ └── etc-nightingale/ │ │ ├── config.toml │ │ ├── 
metrics.yaml │ │ └── script/ │ │ ├── notify.bak.py │ │ ├── notify.py │ │ ├── notify_feishu.py │ │ └── rule_converter.py │ ├── compose-host-network/ │ │ ├── docker-compose.yaml │ │ ├── etc-categraf/ │ │ │ ├── config.toml │ │ │ ├── input.cpu/ │ │ │ │ └── cpu.toml │ │ │ ├── input.disk/ │ │ │ │ └── disk.toml │ │ │ ├── input.diskio/ │ │ │ │ └── diskio.toml │ │ │ ├── input.kernel/ │ │ │ │ └── kernel.toml │ │ │ ├── input.mem/ │ │ │ │ └── mem.toml │ │ │ ├── input.net/ │ │ │ │ └── net.toml │ │ │ ├── input.netstat/ │ │ │ │ └── netstat.toml │ │ │ ├── input.processes/ │ │ │ │ └── processes.toml │ │ │ └── input.system/ │ │ │ └── system.toml │ │ ├── etc-mysql/ │ │ │ └── my.cnf │ │ ├── etc-nightingale/ │ │ │ ├── config.toml │ │ │ ├── metrics.yaml │ │ │ └── script/ │ │ │ ├── notify.bak.py │ │ │ ├── notify.py │ │ │ ├── notify_feishu.py │ │ │ └── rule_converter.py │ │ └── etc-prometheus/ │ │ └── prometheus.yml │ ├── compose-host-network-metric-log/ │ │ ├── docker-compose.yaml │ │ ├── etc-categraf/ │ │ │ ├── config.toml │ │ │ ├── input.cpu/ │ │ │ │ └── cpu.toml │ │ │ ├── input.disk/ │ │ │ │ └── disk.toml │ │ │ ├── input.diskio/ │ │ │ │ └── diskio.toml │ │ │ ├── input.kernel/ │ │ │ │ └── kernel.toml │ │ │ ├── input.mem/ │ │ │ │ └── mem.toml │ │ │ ├── input.net/ │ │ │ │ └── net.toml │ │ │ ├── input.netstat/ │ │ │ │ └── netstat.toml │ │ │ ├── input.processes/ │ │ │ │ └── processes.toml │ │ │ ├── input.system/ │ │ │ │ └── system.toml │ │ │ └── logs.toml │ │ ├── etc-logstash/ │ │ │ └── logstash.yaml │ │ ├── etc-mysql/ │ │ │ └── my.cnf │ │ ├── etc-nightingale/ │ │ │ ├── config.toml │ │ │ ├── metrics.yaml │ │ │ └── script/ │ │ │ ├── notify.bak.py │ │ │ ├── notify.py │ │ │ ├── notify_feishu.py │ │ │ └── rule_converter.py │ │ └── etc-prometheus/ │ │ └── prometheus.yml │ ├── compose-postgres/ │ │ ├── categraf/ │ │ │ └── conf/ │ │ │ ├── config.toml │ │ │ ├── input.cpu/ │ │ │ │ └── cpu.toml │ │ │ ├── input.disk/ │ │ │ │ └── disk.toml │ │ │ ├── input.diskio/ │ │ │ │ └── diskio.toml │ │ │ ├── 
input.docker/ │ │ │ │ └── docker.toml │ │ │ ├── input.kernel/ │ │ │ │ └── kernel.toml │ │ │ ├── input.mem/ │ │ │ │ └── mem.toml │ │ │ ├── input.net/ │ │ │ │ └── net.toml │ │ │ ├── input.netstat/ │ │ │ │ └── netstat.toml │ │ │ ├── input.processes/ │ │ │ │ └── processes.toml │ │ │ ├── input.system/ │ │ │ │ └── system.toml │ │ │ └── prometheus.toml │ │ ├── docker-compose.yaml │ │ ├── initsql_for_postgres/ │ │ │ ├── a-n9e-for-Postgres.sql │ │ │ └── b-ibex-for-Postgres.sql │ │ ├── n9eetc_pg/ │ │ │ ├── config.toml │ │ │ └── metrics.yaml │ │ └── prometc_vm/ │ │ ├── prometheus.yml │ │ └── targets.json │ ├── initsql/ │ │ ├── a-n9e.sql │ │ └── c-init.sql │ ├── migratesql/ │ │ └── migrate.sql │ └── sqlite.sql ├── dscache/ │ ├── cache.go │ └── sync.go ├── dskit/ │ ├── clickhouse/ │ │ ├── clickhouse.go │ │ ├── clickhouse_test.go │ │ └── timeseries.go │ ├── doris/ │ │ ├── doris.go │ │ ├── logs.go │ │ ├── sql_analyzer.go │ │ ├── sql_analyzer_test.go │ │ ├── template.md │ │ └── timeseries.go │ ├── mysql/ │ │ ├── mysql.go │ │ ├── mysql_test.go │ │ ├── timeseries.go │ │ └── timeseries_test.go │ ├── pool/ │ │ └── pool.go │ ├── postgres/ │ │ ├── postgres.go │ │ └── timeseries.go │ ├── sqlbase/ │ │ ├── base.go │ │ ├── timeseries.go │ │ └── timeseries_test.go │ ├── tdengine/ │ │ └── tdengine.go │ ├── types/ │ │ ├── timeseries.go │ │ └── types.go │ └── victorialogs/ │ ├── victorialogs.go │ └── victorialogs_test.go ├── dumper/ │ ├── dumper.go │ └── sync.go ├── etc/ │ ├── config.toml │ ├── edge/ │ │ └── edge.toml │ ├── metrics.yaml │ └── script/ │ ├── notify.bak.py │ ├── notify.py │ ├── notify_feishu.py │ └── rule_converter.py ├── fe.sh ├── go.mod ├── go.sum ├── integrations/ │ ├── AMD_ROCm_SMI/ │ │ ├── collect/ │ │ │ └── amd_rocm_smi/ │ │ │ └── rocm.toml │ │ └── markdown/ │ │ └── README.md │ ├── AliYun/ │ │ ├── collect/ │ │ │ └── aliyun/ │ │ │ └── cloud.toml │ │ ├── dashboards/ │ │ │ ├── arms-api.json │ │ │ ├── arms-application.json │ │ │ ├── arms-db.json │ │ │ ├── arms-jvm-service.json │ 
│ │ ├── arms-machine.json │ │ │ ├── arms_jvm.json │ │ │ ├── cdn.json │ │ │ ├── ecs.json │ │ │ ├── mongodb.json │ │ │ ├── mse.json │ │ │ ├── mysql.json │ │ │ ├── nat.json │ │ │ ├── oss.json │ │ │ ├── polardb_mysql.json │ │ │ ├── rds.json │ │ │ ├── rds_new.json │ │ │ ├── redis.json │ │ │ ├── redis_cluster.json │ │ │ ├── redis_new.json │ │ │ ├── redis_standard.json │ │ │ ├── slb.json │ │ │ ├── slb_new.json │ │ │ └── waf.json │ │ └── markdown/ │ │ └── README.md │ ├── AppDynamics/ │ │ ├── collect/ │ │ │ └── appdynamics/ │ │ │ └── app.toml │ │ └── markdown/ │ │ └── README.md │ ├── AutoMQ/ │ │ ├── alerts/ │ │ │ └── 常用告警规则.json │ │ ├── collect/ │ │ │ └── prometheus/ │ │ │ └── 采集OTEL-COLLECTOR的样例.toml │ │ ├── dashboards/ │ │ │ ├── broker_metrics.json │ │ │ ├── cluster_overview.json │ │ │ ├── detailed_metrics.json │ │ │ ├── group_metrics.json │ │ │ └── topic_metrics.json │ │ ├── markdown/ │ │ │ └── overview.md │ │ └── metrics/ │ │ └── exporter.json │ ├── Bind/ │ │ ├── collect/ │ │ │ └── bind/ │ │ │ └── bind.toml │ │ └── markdown/ │ │ └── README.md │ ├── Canal/ │ │ ├── dashboards/ │ │ │ └── canal_by_categraf.json │ │ └── markdown/ │ │ └── README.md │ ├── Ceph/ │ │ ├── alerts/ │ │ │ └── ceph_by_categraf.json │ │ ├── dashboards/ │ │ │ └── ceph_by_categraf.json │ │ └── markdown/ │ │ └── README.md │ ├── ClickHouse/ │ │ ├── alerts/ │ │ │ ├── clickhouse_by_categraf.json │ │ │ └── clickhouse_by_exporter.json │ │ ├── collect/ │ │ │ └── clickhouse/ │ │ │ └── clickhouse.toml │ │ ├── dashboards/ │ │ │ ├── clickhouse_by_categraf.json │ │ │ └── clickhouse_by_exporter.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ ├── clickhouse_by_categraf.json │ │ └── clickhouse_by_exporter.json │ ├── CloudWatch/ │ │ ├── collect/ │ │ │ └── cloudwatch/ │ │ │ └── cloud.toml │ │ ├── dashboards/ │ │ │ └── dashboard-by-aws-rds.json │ │ └── markdown/ │ │ └── README.md │ ├── Consul/ │ │ ├── collect/ │ │ │ └── consul/ │ │ │ └── consul.toml │ │ └── markdown/ │ │ └── README.md │ ├── Dns_Query/ │ │ 
├── collect/ │ │ │ └── dns_query/ │ │ │ └── dns_query.toml │ │ └── markdown/ │ │ └── README.md │ ├── Docker/ │ │ ├── collect/ │ │ │ └── docker/ │ │ │ └── docker.toml │ │ └── markdown/ │ │ └── README.md │ ├── Doris/ │ │ ├── alerts/ │ │ │ └── doris_by_categraf.json │ │ ├── collect/ │ │ │ └── prometheus/ │ │ │ └── collect_doris_examples.toml │ │ ├── dashboards/ │ │ │ └── Doris_Overview.json │ │ └── markdown/ │ │ └── README.md │ ├── Elasticsearch/ │ │ ├── alerts/ │ │ │ ├── elasticsearch_by_categraf.json │ │ │ └── elasticsearch_by_exporter.json │ │ ├── collect/ │ │ │ └── elasticsearch/ │ │ │ └── elasticsearch.toml │ │ ├── dashboards/ │ │ │ ├── elasticsearch_by_categraf.json │ │ │ ├── elasticsearch_by_categraf_0.3.102.json │ │ │ ├── elasticsearch_by_categraf_a.json │ │ │ ├── elasticsearch_by_categraf_b.json │ │ │ └── elasticsearch_by_exporter.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ └── categraf-base.json │ ├── Exec/ │ │ ├── collect/ │ │ │ └── exec/ │ │ │ └── exec.toml │ │ └── markdown/ │ │ └── README.md │ ├── Filecount/ │ │ ├── collect/ │ │ │ └── filecount/ │ │ │ └── filecount.toml │ │ └── markdown/ │ │ └── README.md │ ├── Gitlab/ │ │ ├── alerts/ │ │ │ └── gitlab_by_categraf.json │ │ ├── dashboards/ │ │ │ ├── MachinePerformance.json │ │ │ ├── NGINXVTS.json │ │ │ ├── Overview.json │ │ │ ├── PostgreSQL.json │ │ │ └── Redis.json │ │ └── markdown/ │ │ └── README.md │ ├── GoogleCloud/ │ │ ├── collect/ │ │ │ └── googlecloud/ │ │ │ └── gcp.toml │ │ └── markdown/ │ │ └── README.md │ ├── HAProxy/ │ │ ├── collect/ │ │ │ └── haproxy/ │ │ │ └── haproxy.toml │ │ ├── dashboards/ │ │ │ └── dashboard.json │ │ └── markdown/ │ │ └── README.md │ ├── HTTP_Response/ │ │ ├── alerts/ │ │ │ └── http_response_by_categraf.json │ │ ├── collect/ │ │ │ └── http_response/ │ │ │ └── http_response.toml │ │ ├── dashboards/ │ │ │ └── http_response_by_categraf.json │ │ ├── markdown/ │ │ │ └── http.md │ │ └── metrics/ │ │ └── categraf.json │ ├── IPMI/ │ │ ├── alerts/ │ │ │ └── 
alerts.json │ │ ├── collect/ │ │ │ └── ipmi/ │ │ │ └── conf.toml │ │ ├── dashboards/ │ │ │ ├── IPMI.json │ │ │ ├── IPMI_by_categraf.json │ │ │ └── IPMI_by_prometheus.json │ │ └── markdown/ │ │ └── README.md │ ├── IPVS/ │ │ ├── collect/ │ │ │ └── ipvs/ │ │ │ └── ipvs.toml │ │ └── markdown/ │ │ └── README.md │ ├── Java/ │ │ └── dashboards/ │ │ ├── jmx_by_exporter.json │ │ ├── jmx_by_kubernetes.json │ │ └── jvm_by_opentelementry.json │ ├── Jenkins/ │ │ ├── collect/ │ │ │ └── jenkins/ │ │ │ └── jenkins.toml │ │ └── markdown/ │ │ └── README.md │ ├── Jolokia_Agent/ │ │ ├── collect/ │ │ │ └── jolokia_agent/ │ │ │ ├── activemq.toml │ │ │ ├── bitbucket.toml │ │ │ ├── cassandra.toml │ │ │ ├── hadoop-hdfs.toml │ │ │ ├── java.toml │ │ │ ├── jboss.toml │ │ │ ├── kafka-connect.toml │ │ │ ├── kafka.toml │ │ │ ├── tomcat.toml │ │ │ ├── weblogic.toml │ │ │ └── zookeeper.toml │ │ └── markdown/ │ │ └── README.md │ ├── Kafka/ │ │ ├── alerts/ │ │ │ ├── kafka_by_categraf.json │ │ │ └── kafka_by_exporter.json │ │ ├── collect/ │ │ │ └── kafka/ │ │ │ └── kafka.toml │ │ ├── dashboards/ │ │ │ ├── kafka_by_categraf.json │ │ │ └── kafka_by_exporter.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ └── categraf-base.json │ ├── Kubernetes/ │ │ ├── alerts/ │ │ │ ├── apiserver.json │ │ │ ├── kube-controller-plane.json │ │ │ ├── kubelet.json │ │ │ ├── node-exporter.json │ │ │ ├── prometheus-operator.json │ │ │ └── prometheus.json │ │ ├── dashboards/ │ │ │ ├── APIServer.json │ │ │ ├── ControllerManager.json │ │ │ ├── DeploymentContainer.json │ │ │ ├── KubeStateMetrics.json │ │ │ ├── KubeletMetrics.json │ │ │ ├── Pod.json │ │ │ ├── Scheduler.json │ │ │ └── StatefulsetContainer.json │ │ ├── markdown/ │ │ │ └── README.md │ │ ├── metrics/ │ │ │ ├── k8s-node.json │ │ │ └── k8s-pod.json │ │ └── record-rules/ │ │ ├── kube-controller-plane.json │ │ └── node-exporter.json │ ├── Ldap/ │ │ ├── collect/ │ │ │ └── ldap/ │ │ │ └── ldap.toml │ │ └── markdown/ │ │ └── README.md │ ├── Linux/ │ │ ├── 
alerts/ │ │ │ ├── CommonAlertingRules-Categraf.json │ │ │ ├── linux_by_categraf.json │ │ │ ├── linux_by_exporter.json │ │ │ ├── linux_by_telegraf.json │ │ │ └── 常用中文告警规则-采集器Categraf.json │ │ ├── collect/ │ │ │ ├── arp_packet/ │ │ │ │ └── arp_packet.toml │ │ │ ├── kernel_vmstat/ │ │ │ │ └── kernel_vmstat.toml │ │ │ ├── netstat/ │ │ │ │ └── netstat.toml │ │ │ ├── ntp/ │ │ │ │ └── ntp.toml │ │ │ └── processes/ │ │ │ └── processes.toml │ │ ├── dashboards/ │ │ │ ├── categraf-detail.json │ │ │ ├── categraf-overview.json │ │ │ ├── categraf-processes.json │ │ │ ├── categraf-table-ng.json │ │ │ └── exporter-detail.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ ├── categraf-base.json │ │ └── exporter-base.json │ ├── Logstash/ │ │ ├── collect/ │ │ │ └── logstash/ │ │ │ └── logstash.toml │ │ ├── dashboards/ │ │ │ └── logstash-dash.json │ │ └── markdown/ │ │ └── README.md │ ├── MinIO/ │ │ ├── alerts/ │ │ │ └── minio_by_categraf.json │ │ ├── dashboards/ │ │ │ ├── minio_by_categraf.json │ │ │ └── new-version.json │ │ └── markdown/ │ │ └── README.md │ ├── MongoDB/ │ │ ├── alerts/ │ │ │ └── mongo_by_exporter.json │ │ ├── collect/ │ │ │ └── mongodb/ │ │ │ └── mongodb.toml │ │ ├── dashboards/ │ │ │ └── mongo_by_exporter.json │ │ └── markdown/ │ │ └── README.md │ ├── Mtail/ │ │ ├── collect/ │ │ │ └── mtail/ │ │ │ └── mtail.toml │ │ └── markdown/ │ │ └── README.md │ ├── MySQL/ │ │ ├── alerts/ │ │ │ ├── mysql_by_categraf.json │ │ │ └── mysql_by_exporter.json │ │ ├── collect/ │ │ │ └── mysql/ │ │ │ └── mysql.toml │ │ ├── dashboards/ │ │ │ ├── MySQL-by-address.json │ │ │ ├── MySQL仪表盘-远端.json │ │ │ ├── MySQL仪表盘.json │ │ │ ├── mysql_by_categraf.json │ │ │ ├── mysql_by_categraf_ident.json │ │ │ ├── mysql_by_categraf_instance.json │ │ │ └── mysql_by_exporter.json │ │ ├── markdown/ │ │ │ ├── README.md │ │ │ └── mysql.md │ │ └── metrics/ │ │ └── categraf-base.json │ ├── N9E/ │ │ ├── dashboards/ │ │ │ ├── n9e_server.json │ │ │ ├── n9e_v6.json │ │ │ └── n9e_v8.json │ │ └── 
markdown/ │ │ └── README.md │ ├── NFSClient/ │ │ ├── collect/ │ │ │ └── nfsclient/ │ │ │ └── nfsclient.toml │ │ └── markdown/ │ │ └── README.md │ ├── NSQ/ │ │ ├── collect/ │ │ │ └── nsq/ │ │ │ └── nsq.toml │ │ └── markdown/ │ │ └── README.md │ ├── NVIDIA/ │ │ ├── collect/ │ │ │ └── nvidia_smi/ │ │ │ └── nvidia_smi.toml │ │ ├── dashboards/ │ │ │ └── nvidia-gpu-metrics-by-categraf.json │ │ └── markdown/ │ │ └── README.md │ ├── Net_Response/ │ │ ├── alerts/ │ │ │ └── net_response_by_categraf.json │ │ ├── collect/ │ │ │ └── net_response/ │ │ │ └── net_response.toml │ │ ├── dashboards/ │ │ │ ├── dashboard-by-ziv.json │ │ │ └── net_response_by_categraf.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ └── categraf.json │ ├── Netstat_Filter/ │ │ ├── collect/ │ │ │ └── netstat_filter/ │ │ │ └── netstat_filter.toml │ │ └── markdown/ │ │ └── README.md │ ├── Nginx/ │ │ ├── collect/ │ │ │ ├── nginx/ │ │ │ │ └── nginx.toml │ │ │ └── nginx_upstream_check/ │ │ │ └── nginx_upstream_check.toml │ │ ├── dashboards/ │ │ │ ├── nginx_stub_status.json │ │ │ ├── nginx_upstream_check.json │ │ │ └── nginx_vts.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ └── categraf.json │ ├── Oracle/ │ │ ├── alerts/ │ │ │ └── oracle_alert.json │ │ ├── collect/ │ │ │ └── oracle/ │ │ │ └── oracle.toml │ │ ├── dashboards/ │ │ │ └── oracle_by_categraf.json │ │ └── markdown/ │ │ └── README.md │ ├── PHP/ │ │ ├── collect/ │ │ │ └── phpfpm/ │ │ │ └── phpfpm.toml │ │ └── markdown/ │ │ └── README.md │ ├── Ping/ │ │ ├── alerts/ │ │ │ └── ping_by_categraf.json │ │ ├── collect/ │ │ │ └── ping/ │ │ │ └── ping.toml │ │ ├── dashboards/ │ │ │ ├── ping_by_categraf_a.json │ │ │ └── ping_by_categraf_b.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ └── categraf.json │ ├── PostgreSQL/ │ │ ├── alerts/ │ │ │ └── postgresql_by_categraf.json │ │ ├── collect/ │ │ │ └── postgresql/ │ │ │ └── postgresql.toml │ │ ├── dashboards/ │ │ │ └── postgresql_by_categraf.json │ │ └── markdown/ │ 
│ └── README.md │ ├── Procstat/ │ │ ├── alerts/ │ │ │ └── categraf-procstat.json │ │ ├── collect/ │ │ │ └── procstat/ │ │ │ └── procstat.toml │ │ ├── dashboards/ │ │ │ └── categraf-procstat.json │ │ ├── markdown/ │ │ │ └── readme.md │ │ └── metrics/ │ │ └── categraf.json │ ├── Prometheus/ │ │ ├── collect/ │ │ │ └── prometheus/ │ │ │ └── prometheus.toml │ │ └── markdown/ │ │ └── README.md │ ├── RabbitMQ/ │ │ ├── alerts/ │ │ │ └── alerts.json │ │ ├── collect/ │ │ │ └── rabbitmq/ │ │ │ └── rabbitmq.toml │ │ ├── dashboards/ │ │ │ ├── rabbitmq_CN_v3.8_gt.json │ │ │ ├── rabbitmq_by_categraf.json │ │ │ ├── rabbitmq_v3.8_gt.json │ │ │ └── rabbitmq_v3.8_lt.json │ │ └── markdown/ │ │ └── README.md │ ├── Redis/ │ │ ├── alerts/ │ │ │ ├── redis_by_categraf.json │ │ │ └── redis_by_exporter.json │ │ ├── collect/ │ │ │ ├── redis/ │ │ │ │ └── redis.toml │ │ │ └── redis_sentinel/ │ │ │ └── redis_sentinel.toml │ │ ├── dashboards/ │ │ │ ├── FilterByAddress.json │ │ │ ├── redis_by_categraf.json │ │ │ └── redis_by_exporter.json │ │ └── markdown/ │ │ └── README.md │ ├── SMART/ │ │ ├── collect/ │ │ │ └── smart/ │ │ │ └── smart.toml │ │ ├── dashboards/ │ │ │ └── smart.json │ │ └── markdown/ │ │ └── README.md │ ├── SNMP/ │ │ ├── collect/ │ │ │ └── snmp/ │ │ │ ├── Cisco.toml │ │ │ ├── snmp.toml │ │ │ └── snmp.toml.example │ │ ├── dashboards/ │ │ │ ├── dashboards.json │ │ │ ├── switch branch.json │ │ │ └── switch main.json │ │ └── markdown/ │ │ └── README.md │ ├── SQLServer/ │ │ ├── collect/ │ │ │ └── sqlserver/ │ │ │ └── sqlserver.toml │ │ ├── dashboards/ │ │ │ └── sqlserver.json │ │ └── markdown/ │ │ └── README.md │ ├── SpringBoot/ │ │ ├── alerts/ │ │ │ └── alerts.json │ │ ├── dashboards/ │ │ │ ├── JVM(Actuator)withapplicationname.json │ │ │ └── JVM.json │ │ └── markdown/ │ │ └── README.md │ ├── Switch_Legacy/ │ │ ├── collect/ │ │ │ └── switch_legacy/ │ │ │ └── switch_legacy.toml │ │ ├── dashboards/ │ │ │ └── dashboard.json │ │ └── markdown/ │ │ └── README.md │ ├── Systemd/ │ │ ├── collect/ 
│ │ │ └── systemd/ │ │ │ └── systemd.toml │ │ └── markdown/ │ │ └── README.md │ ├── TDEngine/ │ │ ├── dashboards/ │ │ │ └── tasokeeper3.x.json │ │ └── markdown/ │ │ └── README.md │ ├── TiDB/ │ │ ├── alerts/ │ │ │ └── tidb-alerts.json │ │ └── dashboards/ │ │ └── tidb-dashboard.json │ ├── Tomcat/ │ │ ├── collect/ │ │ │ └── tomcat/ │ │ │ └── tomcat.toml │ │ ├── dashboards/ │ │ │ └── tomcat_by_categraf.json │ │ └── markdown/ │ │ └── README.md │ ├── VictoriaMetrics/ │ │ ├── alerts/ │ │ │ └── alerts.json │ │ ├── dashboards/ │ │ │ ├── victoriametrics-cluster.json │ │ │ └── victoriametrics-single.json │ │ └── markdown/ │ │ └── README.md │ ├── Whois/ │ │ ├── collect/ │ │ │ └── whois/ │ │ │ └── whois.toml │ │ └── markdown/ │ │ └── README.md │ ├── Windows/ │ │ ├── alerts/ │ │ │ ├── windows_by_categraf.json │ │ │ └── windows_by_exporter.json │ │ ├── dashboards/ │ │ │ ├── windows_by_categraf.json │ │ │ └── windows_by_exporter.json │ │ └── markdown/ │ │ └── README.md │ ├── XSKYApi/ │ │ ├── collect/ │ │ │ └── xskyapi/ │ │ │ └── xskyapi.toml │ │ └── markdown/ │ │ └── README.md │ ├── ZooKeeper/ │ │ ├── alerts/ │ │ │ └── zookeeper_by_exporter.json │ │ ├── collect/ │ │ │ └── zookeeper/ │ │ │ └── zookeeper.toml │ │ ├── dashboards/ │ │ │ └── zookeeper_by_exporter.json │ │ └── markdown/ │ │ └── README.md │ ├── cAdvisor/ │ │ ├── collect/ │ │ │ └── cadvisor/ │ │ │ └── cadvisor.toml │ │ ├── dashboards/ │ │ │ └── dashboard.json │ │ ├── markdown/ │ │ │ └── README.md │ │ └── metrics/ │ │ └── exporter-base.json │ └── vSphere/ │ ├── alerts/ │ │ └── alerts.json │ ├── collect/ │ │ └── vsphere/ │ │ └── vsphere.toml │ ├── dashboards/ │ │ ├── vmware_by_vsphere-monitor.json │ │ └── vsphere.json │ └── markdown/ │ └── README.md ├── memsto/ │ ├── alert_mute_cache.go │ ├── alert_rule_cache.go │ ├── alert_subscribe_cache.go │ ├── busi_group_cache.go │ ├── config_cache.go │ ├── config_cval_cache.go │ ├── datasource_cache.go │ ├── drop_ident.go │ ├── es_index_pattern.go │ ├── event_processor_cache.go │ ├── 
host_alert_rule_targets.go │ ├── memsto.go │ ├── message_template_cache.go │ ├── notify_channel_cache.go │ ├── notify_config.go │ ├── notify_rule_cache.go │ ├── recording_rule_cache.go │ ├── stat.go │ ├── target_cache.go │ ├── task_tpl_cache.go │ ├── user_cache.go │ ├── user_group_cache.go │ └── user_token_cache.go ├── models/ │ ├── alert_aggr_view.go │ ├── alert_cur_event.go │ ├── alert_his_event.go │ ├── alert_mute.go │ ├── alert_rule.go │ ├── alert_subscribe.go │ ├── alerting_engine.go │ ├── anomaly_point.go │ ├── board.go │ ├── board_busi.go │ ├── board_payload.go │ ├── builtin_cate.go │ ├── builtin_component.go │ ├── builtin_metrics.go │ ├── builtin_metrics_filter.go │ ├── builtin_payload.go │ ├── busi_group.go │ ├── busi_group_member.go │ ├── chart.go │ ├── chart_group.go │ ├── chart_share.go │ ├── common.go │ ├── configs.go │ ├── dash_annotation.go │ ├── dashboard.go │ ├── datasource.go │ ├── embedded_product.go │ ├── es_index_pattern.go │ ├── event_pipeline.go │ ├── event_pipeline_execution.go │ ├── event_processor.go │ ├── host_meta.go │ ├── message_tpl.go │ ├── metric_view.go │ ├── migrate/ │ │ ├── migrate.go │ │ ├── migrate_es_index_pattern.go │ │ └── migrate_test.go │ ├── notification_record.go │ ├── notify_channel.go │ ├── notify_channel_test.go │ ├── notify_config.go │ ├── notify_rule.go │ ├── notify_tpl.go │ ├── prom_alert_rule.go │ ├── prom_alert_rule_test.go │ ├── recording_rule.go │ ├── role.go │ ├── role_operation.go │ ├── saved_view.go │ ├── source_token.go │ ├── sso_config.go │ ├── target.go │ ├── target_busi_group.go │ ├── task_record.go │ ├── task_tpl.go │ ├── ts.go │ ├── user.go │ ├── user_group.go │ ├── user_group_member.go │ ├── user_token.go │ └── workflow.go ├── pkg/ │ ├── aop/ │ │ ├── log.go │ │ └── rec.go │ ├── cas/ │ │ └── cas.go │ ├── cfg/ │ │ ├── cfg.go │ │ └── scan.go │ ├── choice/ │ │ └── choice.go │ ├── cmdx/ │ │ ├── cmd_notwindows.go │ │ ├── cmd_windows.go │ │ └── cmdx.go │ ├── ctx/ │ │ └── ctx.go │ ├── dingtalk/ │ │ ├── 
dingtalk.go │ │ └── user/ │ │ └── client.go │ ├── fasttime/ │ │ └── fasttime.go │ ├── feishu/ │ │ └── feishu.go │ ├── flashduty/ │ │ ├── post.go │ │ ├── sync_user.go │ │ ├── sync_user_group.go │ │ └── sync_user_test.go │ ├── ginx/ │ │ ├── auth.go │ │ ├── bytesconv.go │ │ ├── errorx.go │ │ ├── funcs.go │ │ ├── param.go │ │ └── render.go │ ├── hash/ │ │ ├── hash.go │ │ ├── hash_fnv.go │ │ └── hash_md5.go │ ├── httpx/ │ │ └── httpx.go │ ├── i18nx/ │ │ ├── i18n.go │ │ └── var.go │ ├── ibex/ │ │ └── ibex.go │ ├── ldapx/ │ │ ├── ldapx.go │ │ └── user_sync.go │ ├── loggrep/ │ │ └── loggrep.go │ ├── logx/ │ │ └── logx.go │ ├── macros/ │ │ └── macros.go │ ├── oauth2x/ │ │ └── oauth2x.go │ ├── oidcx/ │ │ └── oidc.go │ ├── ormx/ │ │ ├── database_init.go │ │ ├── database_init_test.go │ │ ├── ormx.go │ │ └── types.go │ ├── osx/ │ │ └── osx.go │ ├── parser/ │ │ ├── calc.go │ │ └── calc_test.go │ ├── poster/ │ │ ├── post.go │ │ └── post_test.go │ ├── prom/ │ │ ├── client_option.go │ │ ├── conv.go │ │ ├── conv_test.go │ │ ├── reader.go │ │ └── writer.go │ ├── promql/ │ │ ├── parser.go │ │ ├── perser_test.go │ │ └── promql.go │ ├── secu/ │ │ ├── aes.go │ │ └── rsa.go │ ├── slice/ │ │ └── contains.go │ ├── strx/ │ │ └── verify.go │ ├── tlsx/ │ │ ├── common.go │ │ └── config.go │ ├── tplx/ │ │ ├── conv.go │ │ ├── fns.go │ │ ├── tpl_test.go │ │ └── tplx.go │ ├── unit/ │ │ ├── unit_convert.go │ │ └── unit_convert_test.go │ └── version/ │ └── version.go ├── prom/ │ ├── client.go │ ├── option.go │ └── reader.go ├── pushgw/ │ ├── idents/ │ │ └── idents.go │ ├── kafka/ │ │ └── producer.go │ ├── pconf/ │ │ └── conf.go │ ├── pstat/ │ │ └── pstat.go │ ├── pushgw.go │ ├── router/ │ │ ├── fns.go │ │ ├── router.go │ │ ├── router_datadog.go │ │ ├── router_datadog_easyjson.go │ │ ├── router_heartbeat.go │ │ ├── router_openfalcon.go │ │ ├── router_openfalcon_easyjson.go │ │ ├── router_opentsdb.go │ │ ├── router_opentsdb_easyjson.go │ │ ├── router_proxy_remotewrite.go │ │ ├── router_remotewrite.go │ 
│ ├── router_target.go │ │ └── vars.go │ └── writer/ │ ├── kafka_writer.go │ ├── queue.go │ ├── relabel.go │ ├── relabel_test.go │ └── writer.go └── storage/ ├── redis.go ├── redis_test.go └── storage.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.css linguist-language=go *.less linguist-language=go *.js linguist-language=go *.tsx linguist-language=go *.html linguist-language=go ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: Nightingale docs url: https://n9e.github.io/ about: You may want to read through the document before asking questions. ================================================ FILE: .github/ISSUE_TEMPLATE/enhancement.md ================================================ --- name: Enhancement Request about: Suggest an enhancement to the nightingale project labels: kind/feature --- **What would you like to be added**: **Why is this needed**: ================================================ FILE: .github/ISSUE_TEMPLATE/question.yml ================================================ name: Bug Report & Usage Question description: Reporting a bug or asking a question about how to use Nightingale labels: [] body: - type: markdown attributes: value: | The more detail you provide in this form, the easier it will be to solve the problem. 提供的信息越详细,问题解决的可能性就越大。另外, 提问之前请先搜索历史 issue (包括 close 的), 以免重复提问。 - type: textarea id: question attributes: label: Question and Steps to reproduce description: Describe your question and steps to reproduce the bug. 描述问题以及复现步骤 validations: required: true - type: textarea id: logs attributes: label: Relevant logs and configurations description: Relevant logs and configurations. 
报错日志([查看方法](https://flashcat.cloud/docs/content/flashcat-monitor/nightingale-v6/faq/how-to-check-logs/))以及各个相关组件的配置信息 render: text validations: required: true - type: textarea id: system-info attributes: label: Version description: Include nightingale version, operating system, and other relevant details. 请告知夜莺的版本、操作系统的版本、CPU架构等信息 validations: required: true ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ **What type of PR is this?** **What this PR does / why we need it**: **Which issue(s) this PR fixes**: Fixes # **Special notes for your reviewer**: ================================================ FILE: .github/workflows/issue-translator.yml ================================================ name: 'Issue Translator' on: issues: types: [opened] jobs: translate: runs-on: ubuntu-latest permissions: issues: write contents: read steps: - name: Translate Issues uses: usthe/issues-translate-action@v2.7 with: # 是否翻译 issue 标题 IS_MODIFY_TITLE: true # GitHub Token BOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # 自定义翻译标注(可选) # CUSTOM_BOT_NOTE: "Translation by bot" ================================================ FILE: .github/workflows/n9e.yml ================================================ name: Release on: push: tags: - 'v*' env: GO_VERSION: 1.23 jobs: goreleaser: runs-on: ubuntu-latest steps: - name: Checkout Source Code uses: actions/checkout@v3 with: fetch-depth: 0 - name: Setup Go Environment uses: actions/setup-go@v3 with: go-version: ${{ env.GO_VERSION }} - uses: docker/login-action@v2 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Run GoReleaser uses: goreleaser/goreleaser-action@v3 with: distribution: goreleaser version: '~> v1' args: release --rm-dist env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ *.exe *.exe~ *.dll 
*.dylib *.test *.out *.prof *.log *.o *.a *.so *.db *.sw[po] *.tar.gz *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go _obj _test /log* /bin /out /build /dist /etc/*.local.yml /etc/*.local.conf /etc/rsa/* /etc/plugins/*.local.yml /etc/script/rules.yaml /etc/script/alert-rules.json /etc/script/record-rules.json /data* /tarball /run /vendor /tmp /pub /n9e /docker/pub /docker/n9e /docker/compose-bridge/mysqldata /docker/compose-host-network/mysqldata /docker/compose-host-network-metric-log/mysqldata /docker/compose-host-network-metric-log/n9e-logs /docker/compose-postgres/pgdata /etc.local* /front/statik/statik.go /docker/compose-bridge/etc-nightingale/rsa/ .alerts .idea .index .vscode .issue .issue/* .cursor .claude .DS_Store .cache-loader .payload queries.active /n9e-* n9e.sql !/datasource .env.json ================================================ FILE: .goreleaser.yaml ================================================ before: hooks: # You may remove this if you don't use go modules. 
- go mod tidy - go install github.com/rakyll/statik snapshot: name_template: '{{ .Tag }}' checksum: name_template: 'checksums.txt' changelog: skip: true builds: - id: build hooks: pre: - cmd: sh -x ./fe.sh output: true main: ./cmd/center/ binary: n9e env: - CGO_ENABLED=0 goos: - linux goarch: - amd64 - arm64 ldflags: - -s -w - -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}} - id: build-cli main: ./cmd/cli/ binary: n9e-cli env: - CGO_ENABLED=0 goos: - linux goarch: - amd64 - arm64 ldflags: - -s -w - -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}} - id: build-edge main: ./cmd/edge/ binary: n9e-edge env: - CGO_ENABLED=0 goos: - linux goarch: - amd64 - arm64 ldflags: - -s -w - -X github.com/ccfos/nightingale/v6/pkg/version.Version={{ .Tag }}-{{.Commit}} archives: - id: n9e builds: - build - build-cli - build-edge format: tar.gz format_overrides: - goos: windows format: zip name_template: "n9e-v{{ .Version }}-{{ .Os }}-{{ .Arch }}" wrap_in_directory: false files: - docker/* - etc/* - integrations/* - cli/* - n9e.sql release: github: owner: ccfos name: nightingale name_template: "v{{ .Version }}" dockers: - image_templates: - flashcatcloud/nightingale:{{ .Version }}-amd64 goos: linux goarch: amd64 ids: - build dockerfile: docker/Dockerfile.goreleaser extra_files: - etc - integrations use: buildx build_flag_templates: - "--platform=linux/amd64" - image_templates: - flashcatcloud/nightingale:{{ .Version }}-arm64v8 goos: linux goarch: arm64 ids: - build dockerfile: docker/Dockerfile.goreleaser.arm64 extra_files: - etc - integrations use: buildx build_flag_templates: - "--platform=linux/arm64/v8" docker_manifests: - name_template: flashcatcloud/nightingale:{{ .Version }} image_templates: - flashcatcloud/nightingale:{{ .Version }}-amd64 - flashcatcloud/nightingale:{{ .Version }}-arm64v8 - name_template: flashcatcloud/nightingale:latest image_templates: - flashcatcloud/nightingale:{{ .Version }}-amd64 - 
flashcatcloud/nightingale:{{ .Version }}-arm64v8 ================================================ FILE: .typos.toml ================================================ # Configuration for typos tool [files] extend-exclude = [ # Ignore auto-generated easyjson files "*_easyjson.go", # Ignore binary files "*.gz", "*.tar", "n9e", "n9e-*" ] [default.extend-identifiers] # Didi is a company name (DiDi), not a typo Didi = "Didi" # datas is intentionally used as plural of data (slice variable) datas = "datas" # pendings is intentionally used as plural pendings = "pendings" pendingsUseByRecover = "pendingsUseByRecover" pendingsUseByRecoverMap = "pendingsUseByRecoverMap" # typs is intentionally used as shorthand for types (parameter name) typs = "typs" [default.extend-words] # Some false positives ba = "ba" # Specific corrections for ambiguous typos contigious = "contiguous" onw = "own" componet = "component" Patten = "Pattern" Requets = "Requests" Mis = "Miss" exporer = "exporter" soruce = "source" verison = "version" Configations = "Configurations" emmited = "emitted" Utlization = "Utilization" serie = "series" ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright CCF ODC. 
# Every target in this Makefile is a command, not a file to be produced.
# Declaring them all phony keeps make from skipping a target when a file or
# directory with the same name exists (for example a `run` directory, which
# .gitignore lists as `/run`).
.PHONY: all prebuild build build-edge build-alert build-pushgw build-cli \
	run run-alert run-pushgw release

ROOT:=$(shell pwd -P)
GIT_COMMIT:=$(shell git --work-tree ${ROOT} rev-parse 'HEAD^{commit}')
_GIT_VERSION:=$(shell git --work-tree ${ROOT} describe --tags --abbrev=14 "${GIT_COMMIT}^{commit}" 2>/dev/null)
# TAG is the first dash-separated field of the `git describe` output.
TAG=$(shell echo "${_GIT_VERSION}" | awk -F"-" '{print $$1}')
# RELEASE_VERSION is "<tag>-<commit>" and is injected into pkg/version.Version
# through -ldflags by every build target below.
RELEASE_VERSION:="$(TAG)-$(GIT_COMMIT)"

all: prebuild build

# prebuild runs fe.sh to download and embed the front-end files.
prebuild:
	echo "begin download and embed the front-end file..."
	sh fe.sh
	echo "front-end file download and embedding completed."

build:
	go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e ./cmd/center/main.go

build-edge:
	go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-edge ./cmd/edge/

build-alert:
	go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-alert ./cmd/alert/main.go

build-pushgw:
	go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-pushgw ./cmd/pushgw/main.go

build-cli:
	go build -ldflags "-w -s -X github.com/ccfos/nightingale/v6/pkg/version.Version=$(RELEASE_VERSION)" -o n9e-cli ./cmd/cli/main.go

# The run targets start the binary in the background and redirect
# stdout/stderr to a log file in the current directory.
run:
	nohup ./n9e > n9e.log 2>&1 &

run-alert:
	nohup ./n9e-alert > n9e-alert.log 2>&1 &

run-pushgw:
	nohup ./n9e-pushgw > n9e-pushgw.log 2>&1 &

release:
	goreleaser --skip-validate --skip-publish --snapshot

nightingale - cloud native monitoring

Open-Source Alerting Expert

Docs Docker pulls GitHub contributors GitHub Repo stars GitHub forks
GitHub Repo issues GitHub Repo issues closed GitHub latest release License GitHub contributors

[English](./README.md) | [中文](./README_zh.md) ## 🎯 What is Nightingale Nightingale is an open-source monitoring project that focuses on alerting. Similar to Grafana, Nightingale also connects with various existing data sources. However, while Grafana emphasizes visualization, Nightingale places greater emphasis on the alerting engine, as well as the processing and distribution of alarms. > 💡 Nightingale has now officially launched the [MCP-Server](https://github.com/n9e/n9e-mcp-server/). This MCP Server enables AI assistants to interact with the Nightingale API using natural language, facilitating alert management, monitoring, and observability tasks. > > The Nightingale project was initially developed and open-sourced by DiDi Inc. On May 11, 2022, it was donated to the Open Source Development Technology Committee of the China Computer Federation (CCF ODTC). ![](https://n9e.github.io/img/global/arch-bg.png) ## 💡 How Nightingale Works Many users have already collected metrics and log data. In this case, you can connect your storage repositories (such as VictoriaMetrics, ElasticSearch, etc.) as data sources in Nightingale. This allows you to configure alerting rules and notification rules within Nightingale, enabling the generation and distribution of alarms. ![Nightingale Product Architecture](doc/img/readme/20240221152601.png) Nightingale itself does not provide monitoring data collection capabilities. We recommend using [Categraf](https://github.com/flashcatcloud/categraf) as the collector, which integrates seamlessly with Nightingale. [Categraf](https://github.com/flashcatcloud/categraf) can collect monitoring data from operating systems, network devices, various middleware, and databases. It pushes this data to Nightingale via the `Prometheus Remote Write` protocol. Nightingale then stores the monitoring data in a time-series database (such as Prometheus, VictoriaMetrics, etc.) and provides alerting and visualization capabilities.
For certain edge data centers with poor network connectivity to the central Nightingale server, we offer a distributed deployment mode for the alerting engine. In this mode, even if the network is disconnected, the alerting functionality remains unaffected. ![Edge Deployment Mode](doc/img/readme/multi-region-arch.png) > In the above diagram, Data Center A has good network connectivity to the central data center, so it uses the Nightingale process in the central data center as the alerting engine. Data Center B has poor network connectivity to the central data center, so it deploys `n9e-edge` as the alerting engine to handle alerting for its own data sources. ## 🔕 Alert Noise Reduction, Escalation, and Collaboration Nightingale focuses on being an alerting engine, responsible for generating alarms and flexibly distributing them based on rules. It supports 20 built-in notification media (such as phone calls, SMS, email, DingTalk, Slack, etc.). If you have more advanced requirements, such as: - Want to consolidate events from multiple monitoring systems into one platform for unified noise reduction, response handling, and data analysis. - Want to support personnel scheduling, practice on-call culture, and support alert escalation (to avoid missing alerts) and collaborative handling. Then Nightingale is not suitable. It is recommended that you choose on-call products such as PagerDuty and FlashDuty. These products are simple and easy to use. ## 🗨️ Communication Channels - **Report Bugs:** It is highly recommended to submit issues via the [Nightingale GitHub Issue tracker](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&projects=&template=bug_report.yml). - **Documentation:** For more information, we recommend thoroughly browsing the [Nightingale Documentation Site](https://n9e.github.io/).
## 🔑 Key Features ![Nightingale Alerting rules](doc/img/readme/alerting-rules-en.png) - Nightingale supports alerting rules, mute rules, subscription rules, and notification rules. It natively supports 20 types of notification media and allows customization of message templates. - It supports event pipelines for processing alarms, facilitating automated integration with in-house systems. For example, it can append metadata to alarms or perform relabeling on events. - It introduces the concept of business groups and a permission system to manage various rules in a categorized manner. - Many databases and middleware come with built-in alert rules that can be directly imported and used. It also supports direct import of Prometheus alerting rules. - It supports alert self-healing, which automatically triggers a script to execute predefined logic after an alarm is generated—such as cleaning up disk space or capturing the current system state. ![Nightingale Alarm Dashboard](doc/img/readme/active-events-en.png) - Nightingale archives historical alarms and supports multi-dimensional queries and statistics. - It supports flexible aggregation grouping, allowing a clear view of the distribution of alarms across the company. ![Nightingale Integration Center](doc/img/readme/integration-components-en.png) - Nightingale has built-in metric descriptions, dashboards, and alerting rules for common operating systems, middleware, and databases, which are contributed by the community with varying quality. - It directly receives data via multiple protocols such as Remote Write, OpenTSDB, Datadog, and Falcon, so it integrates with a wide range of agents. - It supports data sources like Prometheus, ElasticSearch, Loki, ClickHouse, MySQL, and Postgres, allowing alerting based on data from these sources. - Nightingale can be easily embedded into internal enterprise systems (e.g. Grafana, CMDB), and even supports configuring menu visibility for these embedded systems.
![Nightingale dashboards](doc/img/readme/dashboard-en.png) - Nightingale supports dashboard functionality, including common chart types, and comes with pre-built dashboards. The image above is a screenshot of one of these dashboards. - If you are already accustomed to Grafana, it is recommended to continue using Grafana for visualization, as Grafana has deeper expertise in this area. - For machine-related monitoring data collected by Categraf, it is advisable to use Nightingale's built-in dashboards for viewing. This is because Categraf's metric naming follows Telegraf's convention, which differs from that of Node Exporter. - Due to Nightingale's concept of business groups (where machines can belong to different groups), there may be scenarios where you only want to view machines within the current business group on the dashboard. Thus, Nightingale's dashboards can be linked with business groups for interactive filtering. ## 🌟 Stargazers over time [![Stargazers over time](https://api.star-history.com/svg?repos=ccfos/nightingale&type=Date)](https://star-history.com/#ccfos/nightingale&Date) ## 🔥 Users ![User Logos](doc/img/readme/logos.png) ## 🤝 Community Co-Building - ❇️ Please read the [Nightingale Open Source Project and Community Governance Draft](./doc/community-governance.md). We sincerely welcome every user, developer, company, and organization to use Nightingale, actively report bugs, submit feature requests, share best practices, and help build a professional and active open-source community. - ❤️ Nightingale Contributors ## 📜 License - [Apache License V2.0](https://github.com/ccfos/nightingale/blob/main/LICENSE) ================================================ FILE: README_zh.md ================================================

nightingale - cloud native monitoring

开源监控告警管理专家

Docs Docker pulls GitHub contributors GitHub Repo stars GitHub forks
GitHub Repo issues GitHub Repo issues closed GitHub latest release License GitHub contributors

[English](./README.md) | [中文](./README_zh.md) ## 夜莺是什么 夜莺 Nightingale 是一款开源云原生监控告警工具,是中国计算机学会接受捐赠并托管的第一个开源项目,在 GitHub 上有超过 12000 颗星,广受关注和使用。夜莺的统一告警引擎,可以对接 Prometheus、Elasticsearch、ClickHouse、Loki、MySQL 等多种数据源,提供全面的告警判定、丰富的事件处理和灵活的告警分发及通知能力。 夜莺侧重于监控告警,类似于 Grafana 的数据源集成方式,夜莺也是对接多种既有的数据源,不过 Grafana 侧重于可视化,夜莺则是侧重于告警引擎、告警事件的处理和分发。 > - 💡夜莺正式推出了 [MCP-Server](https://github.com/n9e/n9e-mcp-server/),此 MCP Server 允许 AI 助手通过自然语言与夜莺 API 交互,实现告警管理、监控和可观测性任务。 > - 夜莺监控项目,最初由滴滴开发和开源,并于 2022 年 5 月 11 日,捐赠予中国计算机学会开源发展技术委员会(CCF ODTC),为 CCF ODTC 成立后接受捐赠的第一个开源项目。 ![](https://n9e.github.io/img/global/arch-bg.png) ## 夜莺的工作逻辑 很多用户已经自行采集了指标、日志数据,此时就把存储库(VictoriaMetrics、ElasticSearch等)作为数据源接入夜莺,即可在夜莺里配置告警规则、通知规则,完成告警事件的生成和派发。 ![夜莺产品架构](doc/img/readme/20240221152601.png) 夜莺项目本身不提供监控数据采集能力。推荐您使用 [Categraf](https://github.com/flashcatcloud/categraf) 作为采集器,可以和夜莺丝滑对接。 [Categraf](https://github.com/flashcatcloud/categraf) 可以采集操作系统、网络设备、各类中间件、数据库的监控数据,通过 Remote Write 协议推送给夜莺,夜莺把监控数据转存到时序库(如 Prometheus、VictoriaMetrics 等),并提供告警和可视化能力。 对于个别边缘机房,如果和中心夜莺服务端网络链路不好,希望提升告警可用性,夜莺也提供边缘机房告警引擎下沉部署模式,这个模式下,即便边缘和中心端网络割裂,告警功能也不受影响。 ![边缘部署模式](doc/img/readme/20240222102119.png) > 上图中,机房A和中心机房的网络链路很好,所以直接由中心端的夜莺进程做告警引擎,机房B和中心机房的网络链路不好,所以在机房B部署了 `n9e-edge` 做告警引擎,对机房B的数据源做告警判定。 ## 告警降噪、升级、协同 夜莺的侧重点是做告警引擎,即负责产生告警事件,并根据规则做灵活派发,内置支持 20 种通知媒介(电话、短信、邮件、钉钉、飞书、企微、Slack 等)。 如果您有更高级的需求,比如: - 想要把公司的多套监控系统产生的事件聚拢到一个平台,统一做收敛降噪、响应处理、数据分析 - 想要支持人员的排班,践行 On-call 文化,想要支持告警认领、升级(避免遗漏)、协同处理 那夜莺是不合适的,推荐您选用 [FlashDuty](https://flashcat.cloud/product/flashcat-duty/) 这样的 On-call 产品,产品简单易用,也有免费套餐。 ## 相关资料 & 交流渠道 - 📚 [夜莺介绍PPT](https://mp.weixin.qq.com/s/Mkwx_46xrltSq8NLqAIYow) 对您了解夜莺各项关键特性会有帮助(PPT链接在文末) - 👉 [文档中心](https://flashcat.cloud/docs/) 为了更快的访问速度,站点托管在 [FlashcatCloud](https://flashcat.cloud) - ❤️ [报告 Bug](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=&projects=&template=question.yml) 写清楚问题描述、复现步骤、截图等信息,更容易得到答案 - 💡 前后端代码分离,前端代码仓库:[https://github.com/n9e/fe](https://github.com/n9e/fe) - 🎯 
// Configuration types of the alert package. Field names and field order are
// part of the contract with the configuration loader and with callers that
// build these structs, so they must not be reordered.
type (
	// Alert is the top-level alert configuration; PreCheck fills in the
	// defaults for its zero-valued fields.
	Alert struct {
		Disable     bool
		EngineDelay int64 // 0 is replaced by 30 in PreCheck; unit is not visible here
		Heartbeat   HeartbeatConfig
		Alerting    Alerting
	}

	// SMTPConfig holds the connection and sender settings of an SMTP server.
	SMTPConfig struct {
		Host               string
		Port               int
		User, Pass, From   string
		InsecureSkipVerify bool
		Batch              int
	}

	// HeartbeatConfig holds the heartbeat settings embedded in Alert.
	HeartbeatConfig struct {
		IP         string
		Interval   int64 // 0 is replaced by 1000 in PreCheck; presumably milliseconds — TODO confirm
		Endpoint   string
		EngineName string
	}

	// Alerting holds the notification-side settings embedded in Alert.
	Alerting struct {
		Timeout           int64
		TemplatesDir      string // empty is replaced by "<configDir>/template" in PreCheck
		NotifyConcurrency int    // 0 is replaced by 10 in PreCheck
		WebhookBatchSend  bool
		GlobalWebhook     GlobalWebhook
	}

	// GlobalWebhook describes a single statically configured webhook target.
	GlobalWebhook struct {
		Enable        bool
		Url           string
		BasicAuthUser string
		BasicAuthPass string
		Timeout       int
		Headers       []string
		SkipVerify    bool
	}

	// CallPlugin describes an external notification plugin.
	CallPlugin struct {
		Enable     bool
		PluginPath string
		Caller     string
	}

	// RedisPub describes publishing to a Redis channel.
	RedisPub struct {
		Enable        bool
		ChannelPrefix string
		ChannelKey    string
	}
)

// PreCheck fills in defaults for settings that were left at their zero
// value. configDir is the configuration directory; it is only used to derive
// the default template directory. Values that are already set are kept.
func (a *Alert) PreCheck(configDir string) {
	const (
		defaultNotifyConcurrency = 10
		defaultHeartbeatInterval = 1000
		defaultEngineDelay       = 30
	)

	alerting := &a.Alerting
	if len(alerting.TemplatesDir) == 0 {
		alerting.TemplatesDir = path.Join(configDir, "template")
	}
	if alerting.NotifyConcurrency == 0 {
		alerting.NotifyConcurrency = defaultNotifyConcurrency
	}

	if hb := &a.Heartbeat; hb.Interval == 0 {
		hb.Interval = defaultHeartbeatInterval
	}

	if a.EngineDelay == 0 {
		a.EngineDelay = defaultEngineDelay
	}
}
"github.com/flashcatcloud/ibex/src/cmd/ibex" ) func Initialize(configDir string, cryptoKey string) (func(), error) { config, err := conf.InitConfig(configDir, cryptoKey) if err != nil { return nil, fmt.Errorf("failed to init config: %v", err) } logxClean, err := logx.Init(config.Log) if err != nil { return nil, err } ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi) var redis storage.Redis redis, err = storage.NewRedis(config.Redis) if err != nil { return nil, err } syncStats := memsto.NewSyncStats() alertStats := astats.NewSyncStats() configCache := memsto.NewConfigCache(ctx, syncStats, nil, "") targetCache := memsto.NewTargetCache(ctx, syncStats, redis) busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats) alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats) alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats) notifyConfigCache := memsto.NewNotifyConfigCache(ctx, configCache) dsCache := memsto.NewDatasourceCache(ctx, syncStats) userCache := memsto.NewUserCache(ctx, syncStats) userGroupCache := memsto.NewUserGroupCache(ctx, syncStats) taskTplsCache := memsto.NewTaskTplCache(ctx) configCvalCache := memsto.NewCvalCache(ctx, syncStats) notifyRuleCache := memsto.NewNotifyRuleCache(ctx, syncStats) notifyChannelCache := memsto.NewNotifyChannelCache(ctx, syncStats) messageTemplateCache := memsto.NewMessageTemplateCache(ctx, syncStats) promClients := prom.NewPromClient(ctx) dispatch.InitRegisterQueryFunc(promClients) externalProcessors := process.NewExternalProcessors() macros.RegisterMacro(macros.MacroInVain) dscache.Init(ctx, false) Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, taskTplsCache, dsCache, ctx, promClients, userCache, userGroupCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, configCvalCache) r := httpx.GinEngine(config.Global.RunMode, config.HTTP, configCvalCache.PrintBodyPaths, 
configCvalCache.PrintAccessLog) rt := router.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors, config.Log.Dir) if config.Ibex.Enable { ibex.ServerStart(false, nil, redis, config.HTTP.APIForService.BasicAuth, config.Alert.Heartbeat, &config.CenterApi, r, nil, config.Ibex, config.HTTP.Port) } rt.Config(r) dumper.ConfigRouter(r) httpClean := httpx.Init(config.HTTP, r) return func() { logxClean() httpClean() }, nil } func Start(alertc aconf.Alert, pushgwc pconf.Pushgw, syncStats *memsto.Stats, alertStats *astats.Stats, externalProcessors *process.ExternalProcessorsType, targetCache *memsto.TargetCacheType, busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, alertRuleCache *memsto.AlertRuleCacheType, notifyConfigCache *memsto.NotifyConfigCacheType, taskTplsCache *memsto.TaskTplCache, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, promClients *prom.PromClientMap, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType, notifyRuleCache *memsto.NotifyRuleCacheType, notifyChannelCache *memsto.NotifyChannelCacheType, messageTemplateCache *memsto.MessageTemplateCacheType, configCvalCache *memsto.CvalCache) { alertSubscribeCache := memsto.NewAlertSubscribeCache(ctx, syncStats) recordingRuleCache := memsto.NewRecordingRuleCache(ctx, syncStats) targetsOfAlertRulesCache := memsto.NewTargetOfAlertRuleCache(ctx, alertc.Heartbeat.EngineName, syncStats) go models.InitNotifyConfig(ctx, alertc.Alerting.TemplatesDir) go models.InitNotifyChannel(ctx) go models.InitMessageTemplate(ctx) naming := naming.NewNaming(ctx, alertc.Heartbeat, alertStats) writers := writer.NewWriters(pushgwc) record.NewScheduler(alertc, recordingRuleCache, promClients, writers, alertStats, datasourceCache) eval.NewScheduler(alertc, externalProcessors, alertRuleCache, targetCache, targetsOfAlertRulesCache, busiGroupCache, alertMuteCache, datasourceCache, promClients, naming, ctx, 
alertStats) eventProcessorCache := memsto.NewEventProcessorCache(ctx, syncStats) sender.InitStaticGlobalWebhook(alertc.Alerting.GlobalWebhook) dp := dispatch.NewDispatch(alertRuleCache, userCache, userGroupCache, alertSubscribeCache, targetCache, notifyConfigCache, taskTplsCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, eventProcessorCache, configCvalCache, alertc.Alerting, ctx, alertStats) consumer := dispatch.NewConsumer(alertc.Alerting, ctx, dp, promClients, alertMuteCache) notifyRecordConsumer := sender.NewNotifyRecordConsumer(ctx) go dp.ReloadTpls() go consumer.LoopConsume() go notifyRecordConsumer.LoopConsume() go queue.ReportQueueSize(alertStats) go sender.ReportNotifyRecordQueueSize(alertStats) go sender.InitEmailSender(ctx, notifyConfigCache) } ================================================ FILE: alert/astats/stats.go ================================================ package astats import ( "github.com/prometheus/client_golang/prometheus" ) const ( namespace = "n9e" subsystem = "alert" ) type Stats struct { AlertNotifyTotal *prometheus.CounterVec AlertNotifyErrorTotal *prometheus.CounterVec CounterAlertsTotal *prometheus.CounterVec GaugeAlertQueueSize prometheus.Gauge CounterRuleEval *prometheus.CounterVec CounterQueryDataErrorTotal *prometheus.CounterVec CounterQueryDataTotal *prometheus.CounterVec CounterVarFillingQuery *prometheus.CounterVec CounterRecordEval *prometheus.CounterVec CounterRecordEvalErrorTotal *prometheus.CounterVec CounterMuteTotal *prometheus.CounterVec CounterRuleEvalErrorTotal *prometheus.CounterVec CounterHeartbeatErrorTotal *prometheus.CounterVec CounterSubEventTotal *prometheus.CounterVec GaugeQuerySeriesCount *prometheus.GaugeVec GaugeRuleEvalDuration *prometheus.GaugeVec GaugeNotifyRecordQueueSize prometheus.Gauge } func NewSyncStats() *Stats { CounterRuleEval := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "rule_eval_total", Help: "Number of rule eval.", 
}, []string{}) CounterRuleEvalErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "rule_eval_error_total", Help: "Number of rule eval error.", }, []string{"datasource", "stage", "busi_group", "rule_id"}) CounterQueryDataErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "query_data_error_total", Help: "Number of rule eval query data error.", }, []string{"datasource"}) CounterQueryDataTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "query_data_total", Help: "Number of rule eval query data.", }, []string{"datasource", "rule_id"}) CounterRecordEval := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "record_eval_total", Help: "Number of record eval.", }, []string{"datasource"}) CounterRecordEvalErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "record_eval_error_total", Help: "Number of record eval error.", }, []string{"datasource"}) AlertNotifyTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "alert_notify_total", Help: "Number of send msg.", }, []string{"channel"}) AlertNotifyErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "alert_notify_error_total", Help: "Number of send msg.", }, []string{"channel"}) // 产生的告警总量 CounterAlertsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "alerts_total", Help: "Total number alert events.", }, []string{"cluster", "type", "busi_group"}) // 内存中的告警事件队列的长度 GaugeAlertQueueSize := prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, Name: "alert_queue_size", Help: "The size of alert queue.", }) CounterMuteTotal := 
prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "mute_total", Help: "Number of mute.", }, []string{"group", "rule_id", "mute_rule_id", "datasource_id"}) CounterSubEventTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "sub_event_total", Help: "Number of sub event.", }, []string{"group"}) CounterHeartbeatErrorTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "heartbeat_error_count", Help: "Number of heartbeat error.", }, []string{}) GaugeQuerySeriesCount := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, Name: "eval_query_series_count", Help: "Number of series retrieved from data source after query.", }, []string{"rule_id", "datasource_id", "ref"}) // 通知记录队列的长度 GaugeNotifyRecordQueueSize := prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, Name: "notify_record_queue_size", Help: "The size of notify record queue.", }) GaugeRuleEvalDuration := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, Name: "rule_eval_duration_ms", Help: "Duration of rule eval in milliseconds.", }, []string{"rule_id", "datasource_id"}) CounterVarFillingQuery := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, Name: "var_filling_query_total", Help: "Number of var filling query.", }, []string{"rule_id", "datasource_id", "ref", "typ"}) prometheus.MustRegister( CounterAlertsTotal, GaugeAlertQueueSize, AlertNotifyTotal, AlertNotifyErrorTotal, CounterRuleEval, CounterQueryDataTotal, CounterQueryDataErrorTotal, CounterRecordEval, CounterRecordEvalErrorTotal, CounterMuteTotal, CounterRuleEvalErrorTotal, CounterHeartbeatErrorTotal, CounterSubEventTotal, GaugeQuerySeriesCount, GaugeRuleEvalDuration, GaugeNotifyRecordQueueSize, CounterVarFillingQuery, ) return &Stats{ 
CounterAlertsTotal: CounterAlertsTotal, GaugeAlertQueueSize: GaugeAlertQueueSize, AlertNotifyTotal: AlertNotifyTotal, AlertNotifyErrorTotal: AlertNotifyErrorTotal, CounterRuleEval: CounterRuleEval, CounterQueryDataTotal: CounterQueryDataTotal, CounterQueryDataErrorTotal: CounterQueryDataErrorTotal, CounterRecordEval: CounterRecordEval, CounterRecordEvalErrorTotal: CounterRecordEvalErrorTotal, CounterMuteTotal: CounterMuteTotal, CounterRuleEvalErrorTotal: CounterRuleEvalErrorTotal, CounterHeartbeatErrorTotal: CounterHeartbeatErrorTotal, CounterSubEventTotal: CounterSubEventTotal, GaugeQuerySeriesCount: GaugeQuerySeriesCount, GaugeRuleEvalDuration: GaugeRuleEvalDuration, GaugeNotifyRecordQueueSize: GaugeNotifyRecordQueueSize, CounterVarFillingQuery: CounterVarFillingQuery, } } ================================================ FILE: alert/common/key.go ================================================ package common import ( "encoding/json" "fmt" "strings" "github.com/ccfos/nightingale/v6/models" ) func RuleKey(datasourceId, id int64) string { return fmt.Sprintf("alert-%d-%d", datasourceId, id) } func MatchTags(eventTagsMap map[string]string, itags []models.TagFilter) bool { for _, filter := range itags { // target_group in和not in优先特殊处理:匹配通过则继续下一个 filter,匹配失败则整组不匹配 if filter.Key == "target_group" { // target 字段从 event.JsonTagsAndValue() 中获取的 v, ok := eventTagsMap["target"] if !ok { return false } if !targetGroupMatch(v, filter) { return false } continue } // 普通标签按原逻辑处理 value, has := eventTagsMap[filter.Key] if !has { return false } if !matchTag(value, filter) { return false } } return true } func MatchGroupsName(groupName string, groupFilter []models.TagFilter) bool { for _, filter := range groupFilter { if !matchTag(groupName, filter) { return false } } return true } func matchTag(value string, filter models.TagFilter) bool { switch filter.Func { case "==": return strings.TrimSpace(fmt.Sprintf("%v", filter.Value)) == strings.TrimSpace(value) case "!=": return 
strings.TrimSpace(fmt.Sprintf("%v", filter.Value)) != strings.TrimSpace(value) case "in": _, has := filter.Vset[value] return has case "not in": _, has := filter.Vset[value] return !has case "=~": return filter.Regexp.MatchString(value) case "!~": return !filter.Regexp.MatchString(value) } // unexpected func return false } // targetGroupMatch 处理 target_group 的特殊匹配逻辑 func targetGroupMatch(value string, filter models.TagFilter) bool { var valueMap map[string]interface{} if err := json.Unmarshal([]byte(value), &valueMap); err != nil { return false } switch filter.Func { case "in", "not in": // float64 类型的 id 切片 filterValueIds, ok := filter.Value.([]interface{}) if !ok { return false } filterValueIdsMap := make(map[float64]struct{}) for _, id := range filterValueIds { filterValueIdsMap[id.(float64)] = struct{}{} } // float64 类型的 groupIds 切片 groupIds, ok := valueMap["group_ids"].([]interface{}) if !ok { return false } // in 只要 groupIds 中有一个在 filterGroupIds 中出现,就返回 true // not in 则相反 found := false for _, gid := range groupIds { if _, found = filterValueIdsMap[gid.(float64)]; found { break } } if filter.Func == "in" { return found } // filter.Func == "not in" return !found case "=~", "!~": // 正则满足一个就认为 matched groupNames, ok := valueMap["group_names"].([]interface{}) if !ok { return false } matched := false for _, gname := range groupNames { if filter.Regexp.MatchString(fmt.Sprintf("%v", gname)) { matched = true break } } if filter.Func == "=~" { return matched } // "!~": 只要有一个匹配就返回 false,否则返回 true return !matched default: return false } } ================================================ FILE: alert/dispatch/consume.go ================================================ package dispatch import ( "context" "encoding/json" "fmt" "strings" "time" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/alert/queue" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" 
"github.com/ccfos/nightingale/v6/pkg/poster" promsdk "github.com/ccfos/nightingale/v6/pkg/prom" "github.com/ccfos/nightingale/v6/pkg/tplx" "github.com/ccfos/nightingale/v6/prom" "github.com/prometheus/common/model" "github.com/toolkits/pkg/concurrent/semaphore" "github.com/toolkits/pkg/logger" ) type Consumer struct { alerting aconf.Alerting ctx *ctx.Context dispatch *Dispatch promClients *prom.PromClientMap alertMuteCache *memsto.AlertMuteCacheType } type EventMuteHookFunc func(event *models.AlertCurEvent) bool var EventMuteHook EventMuteHookFunc = func(event *models.AlertCurEvent) bool { return false } func InitRegisterQueryFunc(promClients *prom.PromClientMap) { tplx.RegisterQueryFunc(func(datasourceID int64, promql string) model.Value { if promClients.IsNil(datasourceID) { return nil } readerClient := promClients.GetCli(datasourceID) value, _, _ := readerClient.Query(context.Background(), promql, time.Now()) return value }) } // 创建一个 Consumer 实例 func NewConsumer(alerting aconf.Alerting, ctx *ctx.Context, dispatch *Dispatch, promClients *prom.PromClientMap, alertMuteCache *memsto.AlertMuteCacheType) *Consumer { return &Consumer{ alerting: alerting, ctx: ctx, dispatch: dispatch, promClients: promClients, alertMuteCache: alertMuteCache, } } func (e *Consumer) LoopConsume() { sema := semaphore.NewSemaphore(e.alerting.NotifyConcurrency) duration := time.Duration(100) * time.Millisecond for { events := queue.EventQueue.PopBackBy(100) if len(events) == 0 { time.Sleep(duration) continue } e.consume(events, sema) } } func (e *Consumer) consume(events []interface{}, sema *semaphore.Semaphore) { for i := range events { if events[i] == nil { continue } event := events[i].(*models.AlertCurEvent) sema.Acquire() go func(event *models.AlertCurEvent) { defer sema.Release() e.consumeOne(event) }(event) } } func (e *Consumer) consumeOne(event *models.AlertCurEvent) { LogEvent(event, "consume") eventType := "alert" if event.IsRecovered { eventType = "recovery" } 
e.dispatch.Astats.CounterAlertsTotal.WithLabelValues(event.Cluster, eventType, event.GroupName).Inc() if err := event.ParseRule("rule_name"); err != nil { logger.Warningf("alert_eval_%d datasource_%d failed to parse rule name: %v", event.RuleId, event.DatasourceId, err) event.RuleName = fmt.Sprintf("failed to parse rule name: %v", err) } if err := event.ParseRule("annotations"); err != nil { logger.Warningf("alert_eval_%d datasource_%d failed to parse annotations: %v", event.RuleId, event.DatasourceId, err) event.Annotations = fmt.Sprintf("failed to parse annotations: %v", err) event.AnnotationsJSON["error"] = event.Annotations } e.queryRecoveryVal(event) if err := event.ParseRule("rule_note"); err != nil { logger.Warningf("alert_eval_%d datasource_%d failed to parse rule note: %v", event.RuleId, event.DatasourceId, err) event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err) } e.persist(event) e.dispatch.HandleEventNotify(event, false) } func (e *Consumer) persist(event *models.AlertCurEvent) { if event.Status != 0 { return } if !e.ctx.IsCenter { event.DB2FE() var err error event.Id, err = poster.PostByUrlsWithResp[int64](e.ctx, "/v1/n9e/event-persist", event) if err != nil { logger.Errorf("event:%s persist err:%v", event.Hash, err) e.dispatch.Astats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", event.DatasourceId), "persist_event", event.GroupName, fmt.Sprintf("%v", event.RuleId)).Inc() } return } err := models.EventPersist(e.ctx, event) if err != nil { logger.Errorf("event:%s persist err:%v", event.Hash, err) e.dispatch.Astats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", event.DatasourceId), "persist_event", event.GroupName, fmt.Sprintf("%v", event.RuleId)).Inc() } } func (e *Consumer) queryRecoveryVal(event *models.AlertCurEvent) { if !event.IsRecovered { return } // If the event is a recovery event, execute the recovery_promql query promql, ok := event.AnnotationsJSON["recovery_promql"] if !ok { return } promql = 
strings.TrimSpace(promql) if promql == "" { logger.Warningf("alert_eval_%d datasource_%d promql is blank", event.RuleId, event.DatasourceId) return } if e.promClients.IsNil(event.DatasourceId) { logger.Warningf("alert_eval_%d datasource_%d error reader client is nil", event.RuleId, event.DatasourceId) return } readerClient := e.promClients.GetCli(event.DatasourceId) var warnings promsdk.Warnings value, warnings, err := readerClient.Query(e.ctx.Ctx, promql, time.Now()) if err != nil { logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", event.RuleId, event.DatasourceId, promql, err) event.AnnotationsJSON["recovery_promql_error"] = fmt.Sprintf("promql:%s error:%v", promql, err) b, err := json.Marshal(event.AnnotationsJSON) if err != nil { event.AnnotationsJSON = make(map[string]string) event.AnnotationsJSON["error"] = fmt.Sprintf("failed to parse annotations: %v", err) } else { event.Annotations = string(b) } return } if len(warnings) > 0 { logger.Errorf("alert_eval_%d datasource_%d promql:%s, warnings:%v", event.RuleId, event.DatasourceId, promql, warnings) } anomalyPoints := models.ConvertAnomalyPoints(value) if len(anomalyPoints) == 0 { logger.Warningf("alert_eval_%d datasource_%d promql:%s, result is empty", event.RuleId, event.DatasourceId, promql) event.AnnotationsJSON["recovery_promql_error"] = fmt.Sprintf("promql:%s error:%s", promql, "result is empty") } else { event.AnnotationsJSON["recovery_value"] = fmt.Sprintf("%v", anomalyPoints[0].Value) } b, err := json.Marshal(event.AnnotationsJSON) if err != nil { event.AnnotationsJSON = make(map[string]string) event.AnnotationsJSON["error"] = fmt.Sprintf("failed to parse annotations: %v", err) } else { event.Annotations = string(b) } } ================================================ FILE: alert/dispatch/dispatch.go ================================================ package dispatch import ( "bytes" "encoding/json" "errors" "fmt" "html/template" "net/url" "strconv" "strings" "sync" "time" 
"github.com/ccfos/nightingale/v6/alert/aconf"
	"github.com/ccfos/nightingale/v6/alert/astats"
	"github.com/ccfos/nightingale/v6/alert/common"
	"github.com/ccfos/nightingale/v6/alert/pipeline"
	"github.com/ccfos/nightingale/v6/alert/pipeline/engine"
	"github.com/ccfos/nightingale/v6/alert/sender"
	"github.com/ccfos/nightingale/v6/memsto"
	"github.com/ccfos/nightingale/v6/models"
	"github.com/ccfos/nightingale/v6/pkg/ctx"

	"github.com/toolkits/pkg/logger"
)

// ShouldSkipNotify decides whether notification for an event under a given
// notify rule id should be skipped. init sets it to shouldSkipNotify; being a
// package variable, it can be replaced.
var ShouldSkipNotify func(*ctx.Context, *models.AlertCurEvent, int64) bool

// SendByNotifyRule delivers events through one notify config of a notify rule.
// init sets it to SendNotifyRuleMessage; being a package variable, it can be
// replaced.
var SendByNotifyRule func(*ctx.Context, *memsto.UserCacheType, *memsto.UserGroupCacheType, *memsto.NotifyChannelCacheType, *memsto.CvalCache,
	[]*models.AlertCurEvent, int64, *models.NotifyConfig, *models.NotifyChannelConfig, *models.MessageTemplate)

// EventProcessorCache is the package-level handle to the event processor
// cache; NewDispatch assigns it.
var EventProcessorCache *memsto.EventProcessorCacheType

// init installs the default implementations of the overridable hooks.
func init() {
	ShouldSkipNotify = shouldSkipNotify
	SendByNotifyRule = SendNotifyRuleMessage
}

// Dispatch holds the caches, senders and templates needed to turn alert
// events into notifications.
type Dispatch struct {
	alertRuleCache      *memsto.AlertRuleCacheType
	userCache           *memsto.UserCacheType
	userGroupCache      *memsto.UserGroupCacheType
	alertSubscribeCache *memsto.AlertSubscribeCacheType
	targetCache         *memsto.TargetCacheType
	notifyConfigCache   *memsto.NotifyConfigCacheType
	taskTplsCache       *memsto.TaskTplCache
	configCvalCache     *memsto.CvalCache

	notifyRuleCache      *memsto.NotifyRuleCacheType
	notifyChannelCache   *memsto.NotifyChannelCacheType
	messageTemplateCache *memsto.MessageTemplateCacheType
	eventProcessorCache  *memsto.EventProcessorCacheType

	alerting aconf.Alerting

	// Senders, CallBacks and tpls are rebuilt by reloadTpls under RwLock.
	Senders          map[string]sender.Sender
	CallBacks        map[string]sender.CallBacker
	tpls             map[string]*template.Template
	ExtraSenders     map[string]sender.Sender
	BeforeSenderHook func(*models.AlertCurEvent) bool

	ctx    *ctx.Context
	Astats *astats.Stats

	RwLock sync.RWMutex
}

// NewDispatch creates a Dispatch instance from the given caches and
// configuration.
//
// Besides building the struct it has package-level side effects: it calls
// pipeline.Init, stores eventProcessorCache in the EventProcessorCache
// variable and installs sender.NotifyRecord as the notify-record callback of
// notifyChannelCache. CallBacks is left nil until the first reloadTpls.
func NewDispatch(alertRuleCache *memsto.AlertRuleCacheType, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType,
	alertSubscribeCache *memsto.AlertSubscribeCacheType, targetCache *memsto.TargetCacheType, notifyConfigCache *memsto.NotifyConfigCacheType,
	taskTplsCache *memsto.TaskTplCache, notifyRuleCache *memsto.NotifyRuleCacheType, notifyChannelCache *memsto.NotifyChannelCacheType,
	messageTemplateCache *memsto.MessageTemplateCacheType, eventProcessorCache *memsto.EventProcessorCacheType, configCvalCache *memsto.CvalCache, alerting aconf.Alerting, c *ctx.Context, astats *astats.Stats) *Dispatch {
	notify := &Dispatch{
		alertRuleCache:       alertRuleCache,
		userCache:            userCache,
		userGroupCache:       userGroupCache,
		alertSubscribeCache:  alertSubscribeCache,
		targetCache:          targetCache,
		notifyConfigCache:    notifyConfigCache,
		taskTplsCache:        taskTplsCache,
		notifyRuleCache:      notifyRuleCache,
		notifyChannelCache:   notifyChannelCache,
		messageTemplateCache: messageTemplateCache,
		eventProcessorCache:  eventProcessorCache,
		configCvalCache:      configCvalCache,

		alerting: alerting,

		Senders:          make(map[string]sender.Sender),
		tpls:             make(map[string]*template.Template),
		ExtraSenders:     make(map[string]sender.Sender),
		BeforeSenderHook: func(*models.AlertCurEvent) bool { return true },

		ctx:    c,
		Astats: astats,
	}

	pipeline.Init()
	EventProcessorCache = eventProcessorCache

	// Install the notify-record callback.
	notifyChannelCache.SetNotifyRecordFunc(sender.NotifyRecord)

	return notify
}

// ReloadTpls reloads templates and senders once immediately and then every
// 9 seconds, logging failures. The loop has no exit, so the declared error
// result is never actually returned; run it in its own goroutine.
func (e *Dispatch) ReloadTpls() error {
	err := e.reloadTpls()
	if err != nil {
		logger.Errorf("failed to reload tpls: %v", err)
	}

	duration := time.Duration(9000) * time.Millisecond
	for {
		time.Sleep(duration)
		if err := e.reloadTpls(); err != nil {
			logger.Warning("failed to reload tpls:", err)
		}
	}
}

// reloadTpls lists the templates via models.ListTpls, builds a fresh sender
// for every built-in channel and a fresh callbacker for every built-in
// domain, overlays ExtraSenders on the sender map and then swaps the new
// maps into the Dispatch under the write lock.
func (e *Dispatch) reloadTpls() error {
	tmpTpls, err := models.ListTpls(e.ctx)
	if err != nil {
		return err
	}

	smtp := e.notifyConfigCache.GetSMTP()

	senders := map[string]sender.Sender{
		models.Email:      sender.NewSender(models.Email, tmpTpls, smtp),
		models.Dingtalk:   sender.NewSender(models.Dingtalk, tmpTpls),
		models.Wecom:      sender.NewSender(models.Wecom, tmpTpls),
		models.Feishu:     sender.NewSender(models.Feishu, tmpTpls),
		models.Mm:         sender.NewSender(models.Mm, tmpTpls),
		models.Telegram:   sender.NewSender(models.Telegram, tmpTpls),
		models.FeishuCard: sender.NewSender(models.FeishuCard, tmpTpls),
		models.Lark:       sender.NewSender(models.Lark, tmpTpls),
		models.LarkCard:   sender.NewSender(models.LarkCard, tmpTpls),
	}

	// domain -> Callback()
	callbacks := map[string]sender.CallBacker{
		models.DingtalkDomain:   sender.NewCallBacker(models.DingtalkDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.WecomDomain:      sender.NewCallBacker(models.WecomDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.FeishuDomain:     sender.NewCallBacker(models.FeishuDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.TelegramDomain:   sender.NewCallBacker(models.TelegramDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.FeishuCardDomain: sender.NewCallBacker(models.FeishuCardDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.IbexDomain:       sender.NewCallBacker(models.IbexDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.LarkDomain:       sender.NewCallBacker(models.LarkDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.DefaultDomain:    sender.NewCallBacker(models.DefaultDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
		models.LarkCardDomain:   sender.NewCallBacker(models.LarkCardDomain, e.targetCache, e.userCache, e.taskTplsCache, tmpTpls),
	}

	// Extra senders override built-in ones that share the same channel name.
	e.RwLock.RLock()
	for channelName, extraSender := range e.ExtraSenders {
		senders[channelName] = extraSender
	}
	e.RwLock.RUnlock()

	e.RwLock.Lock()
	e.tpls = tmpTpls
	e.Senders = senders
	e.CallBacks = callbacks
	e.RwLock.Unlock()
	return nil
}

// HandleEventWithNotifyRule sends eventOrigin through every notify rule
// listed in eventOrigin.NotifyRuleIds.
//
// For each enabled rule found in the cache it works on a deep copy of the
// event, runs the rule's pipelines, applies ShouldSkipNotify and then, for
// every notify config that passes NotifyRuleMatchCheck and has a usable
// channel and template, starts SendByNotifyRule in a new goroutine. A missing
// channel or template is recorded through sender.NotifyRecord.
func (e *Dispatch) HandleEventWithNotifyRule(eventOrigin *models.AlertCurEvent) {
	for _, notifyRuleId := range eventOrigin.NotifyRuleIds {
		// Deep-copy the event so rules do not modify each other's event concurrently.
		eventCopy := eventOrigin.DeepCopy()

		logger.Infof("notify rule ids: %v, event: %s", notifyRuleId, eventCopy.Hash)
		notifyRule := e.notifyRuleCache.Get(notifyRuleId)
		if notifyRule == nil {
			continue
		}

		if !notifyRule.Enable {
			continue
		}
		eventCopy.NotifyRuleId = notifyRuleId
		eventCopy.NotifyRuleName = notifyRule.Name

		eventCopy = HandleEventPipeline(notifyRule.PipelineConfigs, eventOrigin, eventCopy, e.eventProcessorCache, e.ctx, notifyRuleId, "notify_rule")
		if eventCopy == nil {
			continue
		}

		if ShouldSkipNotify(e.ctx, eventCopy, notifyRuleId) {
			logger.Infof("notify_id: %d, event:%s, should skip notify", notifyRuleId, eventCopy.Hash)
			continue
		}

		// notify
		for i := range notifyRule.NotifyConfigs {
			// Take the element's address (not a copy) so the callee sees the
			// cached config, exactly as the slice index expression would.
			notifyConfig := &notifyRule.NotifyConfigs[i]

			err := NotifyRuleMatchCheck(notifyConfig, eventCopy)
			if err != nil {
				logger.Errorf("notify_id: %d, event:%s, channel_id:%d, template_id: %d, notify_config:%+v, err:%v", notifyRuleId, eventCopy.Hash, notifyConfig.ChannelID, notifyConfig.TemplateID, *notifyConfig, err)
				continue
			}

			notifyChannel := e.notifyChannelCache.Get(notifyConfig.ChannelID)
			messageTemplate := e.messageTemplateCache.Get(notifyConfig.TemplateID)
			if notifyChannel == nil {
				sender.NotifyRecord(e.ctx, []*models.AlertCurEvent{eventCopy}, notifyRuleId, fmt.Sprintf("notify_channel_id:%d", notifyConfig.ChannelID), "", "", errors.New("notify_channel not found"))
				logger.Warningf("notify_id: %d, event:%s, channel_id:%d, template_id: %d, notify_channel not found", notifyRuleId, eventCopy.Hash, notifyConfig.ChannelID, notifyConfig.TemplateID)
				continue
			}

			// flashduty and pagerduty channels are allowed to have no message template.
			if notifyChannel.RequestType != "flashduty" && notifyChannel.RequestType != "pagerduty" && messageTemplate == nil {
				logger.Warningf("notify_id: %d, channel_name: %v, event:%s, template_id: %d, message_template not found", notifyRuleId, notifyChannel.Ident, eventCopy.Hash, notifyConfig.TemplateID)
				sender.NotifyRecord(e.ctx, []*models.AlertCurEvent{eventCopy}, notifyRuleId, notifyChannel.Name, "", "", errors.New("message_template not found"))

				continue
			}

			go SendByNotifyRule(e.ctx, e.userCache, e.userGroupCache, e.notifyChannelCache, e.configCvalCache, []*models.AlertCurEvent{eventCopy}, notifyRuleId, notifyConfig, notifyChannel, messageTemplate)
		}
	}
}

// shouldSkipNotify is the default ShouldSkipNotify implementation. It returns
// true for a nil event (the event was dropped by a processor) and for a
// recovery event whose NotifyRecovered is 0.
func shouldSkipNotify(ctx *ctx.Context, event *models.AlertCurEvent, notifyRuleId int64) bool {
	if event == nil {
		// A nil event means a processor dropped it; do not notify.
		return true
	}

	if event.IsRecovered && event.NotifyRecovered == 0 {
		// Recovery event with recovery notification turned off; do not notify.
		return true
	}
	return false
}

// HandleEventPipeline runs event through every enabled pipeline in
// pipelineConfigs, in order, feeding each pipeline the previous one's result.
//
// Pipelines that are missing from the cache, not applicable to the event or
// that fail to execute are skipped. If a pipeline yields a nil event, the
// event is considered dropped: nil is returned and, when from is
// "notify_rule", the drop is recorded through sender.NotifyRecord against
// eventOrigin. Otherwise the final event is returned after FE2DB and
// FillTagsMap. id and from identify the caller in logs and in the trigger
// context.
func HandleEventPipeline(pipelineConfigs []models.PipelineConfig, eventOrigin, event *models.AlertCurEvent, eventProcessorCache *memsto.EventProcessorCacheType, ctx *ctx.Context, id int64, from string) *models.AlertCurEvent {
	workflowEngine := engine.NewWorkflowEngine(ctx)

	for _, pipelineConfig := range pipelineConfigs {
		if !pipelineConfig.Enable {
			continue
		}

		eventPipeline := eventProcessorCache.Get(pipelineConfig.PipelineId)
		if eventPipeline == nil {
			logger.Warningf("processor_by_%s_id:%d pipeline_id:%d, event pipeline not found, event: %s", from, id, pipelineConfig.PipelineId, event.Hash)
			continue
		}

		if !PipelineApplicable(eventPipeline, event) {
			logger.Debugf("processor_by_%s_id:%d pipeline_id:%d, event pipeline not applicable, event: %s", from, id, pipelineConfig.PipelineId, event.Hash)
			continue
		}

		// Always execute through the workflow engine (per the original
		// comment, it covers both linear mode and workflow mode).
		triggerCtx := &models.WorkflowTriggerContext{
			Mode:      models.TriggerModeEvent,
			TriggerBy: from + "_" + strconv.FormatInt(id, 10),
		}

		resultEvent, result, err := workflowEngine.Execute(eventPipeline, event, triggerCtx)
		if err != nil {
			logger.Errorf("processor_by_%s_id:%d pipeline_id:%d, pipeline execute error: %v", from, id, pipelineConfig.PipelineId, err)
			continue
		}

		if resultEvent == nil {
			logger.Infof("processor_by_%s_id:%d pipeline_id:%d, event dropped, event: %s", from, id, pipelineConfig.PipelineId, eventOrigin.Hash)
			if from == "notify_rule" {
				sender.NotifyRecord(ctx, []*models.AlertCurEvent{eventOrigin}, id, "", "", result.Message, fmt.Errorf("processor_by_%s_id:%d pipeline_id:%d, drop by pipeline", from, id, pipelineConfig.PipelineId))
			}
			return nil
		}

		event = resultEvent
		logger.Infof("processor_by_%s_id:%d pipeline_id:%d, pipeline executed, status:%s, message:%s", from, id, pipelineConfig.PipelineId, result.Status, result.Message)
	}

	event.FE2DB()
	event.FillTagsMap()
	return event
}

// PipelineApplicable reports whether pipeline should run for event.
//
// A nil pipeline or one with FilterEnable unset always applies. Otherwise the
// event must satisfy both the pipeline's LabelFilters (matched against
// event.TagsMap) and its AttrFilters (matched against
// event.JsonTagsAndValue()); an empty filter list counts as satisfied. A
// filter that fails to parse makes the pipeline not applicable.
func PipelineApplicable(pipeline *models.EventPipeline, event *models.AlertCurEvent) bool {
	if pipeline == nil {
		return true
	}

	if !pipeline.FilterEnable {
		return true
	}

	tagMatch := true
	if len(pipeline.LabelFilters) > 0 {
		// Work on a copy of the filter slice so that filling in Func below
		// (and whatever ParseTagFilter writes into the elements) does not
		// touch the cached pipeline's slice. This copies the struct values
		// only; it is not a deep copy.
		labelFiltersCopy := make([]models.TagFilter, len(pipeline.LabelFilters))
		copy(labelFiltersCopy, pipeline.LabelFilters)
		for i := range labelFiltersCopy {
			// Fall back to Op when Func is not set.
			if labelFiltersCopy[i].Func == "" {
				labelFiltersCopy[i].Func = labelFiltersCopy[i].Op
			}
		}

		tagFilters, err := models.ParseTagFilter(labelFiltersCopy)
		if err != nil {
			logger.Errorf("pipeline applicable failed to parse tag filter: %v event:%s pipeline:%+v", err, event.Hash, pipeline)
			return false
		}
		tagMatch = common.MatchTags(event.TagsMap, tagFilters)
	}

	attributesMatch := true
	if len(pipeline.AttrFilters) > 0 {
		// Same slice copy as above, to keep the cached pipeline untouched.
		attrFiltersCopy := make([]models.TagFilter, len(pipeline.AttrFilters))
		copy(attrFiltersCopy, pipeline.AttrFilters)

		tagFilters, err := models.ParseTagFilter(attrFiltersCopy)
		if err != nil {
			logger.Errorf("pipeline applicable failed to parse tag filter: %v event:%s pipeline:%+v err:%v", tagFilters, event.Hash, pipeline, err)
			return false
		}

		attributesMatch = common.MatchTags(event.JsonTagsAndValue(), tagFilters)
	}

	return tagMatch && attributesMatch
}

// NotifyRuleMatchCheck checks whether event passes the filters of
// notifyConfig, in this order: time ranges, severities, label filters
// (against event.TagsMap) and attribute filters (against
// event.JsonTagsAndValue()).
//
// It returns nil when every filter passes and an error naming the first
// filter that does not (or a tag-filter parse failure). Empty TimeRanges,
// LabelKeys or Attributes count as passing; an empty Severities list never
// matches.
func NotifyRuleMatchCheck(notifyConfig *models.NotifyConfig, event *models.AlertCurEvent) error {
	tm := time.Unix(event.TriggerTime, 0)
	triggerTime := tm.Format("15:04")
	triggerWeek := int(tm.Weekday())

	timeMatch := false

	if len(notifyConfig.TimeRanges) == 0 {
		timeMatch = true
	}
	for j := range notifyConfig.TimeRanges {
		if timeMatch {
			break
		}
		enableStime := notifyConfig.TimeRanges[j].Start
		enableEtime := notifyConfig.TimeRanges[j].End
		enableDaysOfWeek := notifyConfig.TimeRanges[j].Week
		length := len(enableDaysOfWeek)
		// Look for the trigger weekday among the range's enabled weekdays;
		// "HH:MM" strings are compared lexicographically.
		for i := 0; i < length; i++ {
			if enableDaysOfWeek[i] != triggerWeek {
				continue
			}

			if enableStime < enableEtime {
				if enableEtime == "23:59" {
					// e.g. 02:00-23:59: special-cased so the end is inclusive.
					if triggerTime < enableStime {
						// outside the range, not effective
						continue
					}
				} else {
					// e.g. 02:00-04:00 or 02:00-24:00
					if triggerTime < enableStime || triggerTime >= enableEtime {
						// outside the range, not effective
						continue
					}
				}
			} else if enableStime > enableEtime {
				// e.g. 21:00-09:00 (wraps past midnight)
				if triggerTime < enableStime && triggerTime >= enableEtime {
					// outside the range, not effective
					continue
				}
			}

			// The trigger time falls inside this effective time range.
			timeMatch = true
			break
		}
	}

	if !timeMatch {
		return fmt.Errorf("event time not match time filter")
	}

	severityMatch := false
	for i := range notifyConfig.Severities {
		if notifyConfig.Severities[i] == event.Severity {
			severityMatch = true
			break
		}
	}

	if !severityMatch {
		return fmt.Errorf("event severity not match severity filter")
	}

	tagMatch := true
	if len(notifyConfig.LabelKeys) > 0 {
		// Work on a copy of the filter slice so the cached notify config's
		// elements are not written to (struct-value copy, not a deep copy).
		labelKeysCopy := make([]models.TagFilter, len(notifyConfig.LabelKeys))
		copy(labelKeysCopy, notifyConfig.LabelKeys)
		for i := range labelKeysCopy {
			// Fall back to Op when Func is not set.
			if labelKeysCopy[i].Func == "" {
				labelKeysCopy[i].Func = labelKeysCopy[i].Op
			}
		}

		tagFilters, err := models.ParseTagFilter(labelKeysCopy)
		if err != nil {
			logger.Errorf("notify send failed to parse tag filter: %v event:%s notify_config:%+v", err, event.Hash, notifyConfig)
			return fmt.Errorf("failed to parse tag filter: %v", err)
		}
		tagMatch = common.MatchTags(event.TagsMap, tagFilters)
	}

	if !tagMatch {
		return fmt.Errorf("event tag not match tag filter")
	}

	attributesMatch := true
	if len(notifyConfig.Attributes) > 0 {
		// Same slice copy as above, to keep the cached config untouched.
		attributesCopy := make([]models.TagFilter, len(notifyConfig.Attributes))
		copy(attributesCopy, notifyConfig.Attributes)

		tagFilters, err := models.ParseTagFilter(attributesCopy)
		if err != nil {
			logger.Errorf("notify send failed to parse tag filter: %v event:%s notify_config:%+v err:%v", tagFilters, event.Hash, notifyConfig, err)
			return fmt.Errorf("failed to parse tag filter: %v", err)
		}

		attributesMatch = common.MatchTags(event.JsonTagsAndValue(), tagFilters)
	}

	if !attributesMatch {
		return fmt.Errorf("event attributes not match attributes filter")
	}

	logger.Infof("notify send timeMatch:%v severityMatch:%v tagMatch:%v attributesMatch:%v event:%s notify_config:%+v", timeMatch, severityMatch, tagMatch, attributesMatch, event.Hash, notifyConfig)
	return nil
}

// GetNotifyConfigParams splits notifyConfig.Params into the pieces a sender
// needs and resolves the recipients.
//
// It returns, in order:
//   - the contact values of the resolved users (one per distinct user with a
//     non-empty contact; "phone" and "email" read the user's fields, any other
//     contactKey goes through user.ExtractToken);
//   - the FlashDuty channel ids taken from the "ids" param;
//   - the PagerDuty routing keys taken from "pagerduty_integration_keys";
//   - every remaining param rendered as a string.
//
// Users come from "user_ids" plus the members of "user_group_ids". The
// "pagerduty_integration_ids" param is ignored (the original comment says it
// only serves as a front-end marker). Params whose JSON shape does not match
// are silently ignored.
func GetNotifyConfigParams(notifyConfig *models.NotifyConfig, contactKey string, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType) ([]string, []int64, []string, map[string]string) {
	customParams := make(map[string]string)
	var flashDutyChannelIDs []int64
	var pagerDutyRoutingKeys []string
	var userInfoParams models.CustomParams

	for key, value := range notifyConfig.Params {
		switch key {
		case "user_ids", "user_group_ids", "ids":
			// Round-trip through JSON to coerce the loosely typed value into []int64.
			if data, err := json.Marshal(value); err == nil {
				var ids []int64
				if json.Unmarshal(data, &ids) == nil {
					switch key {
					case "user_ids":
						userInfoParams.UserIDs = ids
					case "user_group_ids":
						userInfoParams.UserGroupIDs = ids
					case "ids":
						flashDutyChannelIDs = ids
					}
				}
			}
		case "pagerduty_integration_keys", "pagerduty_integration_ids":
			if key == "pagerduty_integration_ids" {
				// Ids are not processed here; skip them.
				continue
			}
			if data, err := json.Marshal(value); err == nil {
				var keys []string
				if json.Unmarshal(data, &keys) == nil {
					pagerDutyRoutingKeys = keys
				}
			}
		default:
			// A bare value.(string) panics for numbers, booleans and nil;
			// accept any type and normalize it to a string instead.
			switch v := value.(type) {
			case string:
				customParams[key] = v
			case nil:
				customParams[key] = ""
			default:
				customParams[key] = fmt.Sprintf("%v", v)
			}
		}
	}

	if len(userInfoParams.UserIDs) == 0 && len(userInfoParams.UserGroupIDs) == 0 {
		return []string{}, flashDutyChannelIDs, pagerDutyRoutingKeys, customParams
	}

	userIds := make([]int64, 0)
	userIds = append(userIds, userInfoParams.UserIDs...)

	if len(userInfoParams.UserGroupIDs) > 0 {
		userGroups := userGroupCache.GetByUserGroupIds(userInfoParams.UserGroupIDs)
		for _, userGroup := range userGroups {
			userIds = append(userIds, userGroup.UserIds...)
		}
	}

	users := userCache.GetByUserIds(userIds)
	visited := make(map[int64]bool)
	sendtos := make([]string, 0)
	for _, user := range users {
		if visited[user.Id] {
			continue
		}
		var sendto string
		if contactKey == "phone" {
			sendto = user.Phone
		} else if contactKey == "email" {
			sendto = user.Email
		} else {
			sendto, _ = user.ExtractToken(contactKey)
		}

		if sendto == "" {
			continue
		}
		sendtos = append(sendtos, sendto)
		visited[user.Id] = true
	}

	return sendtos, flashDutyChannelIDs, pagerDutyRoutingKeys, customParams
}

func SendNotifyRuleMessage(ctx *ctx.Context, userCache *memsto.UserCacheType, userGroupCache *memsto.UserGroupCacheType, notifyChannelCache *memsto.NotifyChannelCacheType, configCvalCache *memsto.CvalCache,
	events []*models.AlertCurEvent, notifyRuleId int64, notifyConfig *models.NotifyConfig, notifyChannel *models.NotifyChannelConfig, messageTemplate *models.MessageTemplate) {
	if len(events) == 0 {
		logger.Errorf("notify_id: %d events is empty", notifyRuleId)
		return
	}

	siteInfo := configCvalCache.GetSiteInfo()
	tplContent := make(map[string]interface{})
	if notifyChannel.RequestType != "flashduty" {
		tplContent = messageTemplate.RenderEvent(events, siteInfo.SiteUrl)
	}

	var contactKey string
	if notifyChannel.ParamConfig != nil && notifyChannel.ParamConfig.UserInfo != nil {
		contactKey = notifyChannel.ParamConfig.UserInfo.ContactKey
	}

	sendtos, flashDutyChannelIDs, pagerdutyRoutingKeys, customParams := GetNotifyConfigParams(notifyConfig, contactKey, userCache, userGroupCache)

	switch notifyChannel.RequestType {
	case "flashduty":
		if len(flashDutyChannelIDs) == 0 {
			flashDutyChannelIDs = []int64{0} // 如果 flashduty 通道没有配置,则使用 0, 给 SendFlashDuty
判断使用, 不给 flashduty 传 channel_id 参数 } for i := range flashDutyChannelIDs { start := time.Now() respBody, err := notifyChannel.SendFlashDuty(events, flashDutyChannelIDs[i], notifyChannelCache.GetHttpClient(notifyChannel.ID)) respBody = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), respBody) logger.Infof("duty_sender notify_id: %d, channel_name: %v, event:%s, IntegrationUrl: %v dutychannel_id: %v, respBody: %v, err: %v", notifyRuleId, notifyChannel.Name, events[0].Hash, notifyChannel.RequestConfig.FlashDutyRequestConfig.IntegrationUrl, flashDutyChannelIDs[i], respBody, err) sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, strconv.FormatInt(flashDutyChannelIDs[i], 10), respBody, err) } case "pagerduty": for _, routingKey := range pagerdutyRoutingKeys { start := time.Now() respBody, err := notifyChannel.SendPagerDuty(events, routingKey, siteInfo.SiteUrl, notifyChannelCache.GetHttpClient(notifyChannel.ID)) respBody = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), respBody) logger.Infof("pagerduty_sender notify_id: %d, channel_name: %v, event:%s, respBody: %v, err: %v", notifyRuleId, notifyChannel.Name, events[0].Hash, respBody, err) sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, "", respBody, err) } case "http": // 使用队列模式处理 http 通知 // 创建通知任务 task := &memsto.NotifyTask{ Events: events, NotifyRuleId: notifyRuleId, NotifyChannel: notifyChannel, TplContent: tplContent, CustomParams: customParams, Sendtos: sendtos, } // 将任务加入队列 success := notifyChannelCache.EnqueueNotifyTask(task) if !success { logger.Errorf("failed to enqueue notify task for channel %d, notify_id: %d", notifyChannel.ID, notifyRuleId) // 如果入队失败,记录错误通知 sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, getSendTarget(customParams, sendtos), "", errors.New("failed to enqueue notify task, queue is full")) } 
case "smtp": notifyChannel.SendEmail(notifyRuleId, events, tplContent, sendtos, notifyChannelCache.GetSmtpClient(notifyChannel.ID)) case "script": start := time.Now() target, res, err := notifyChannel.SendScript(events, tplContent, customParams, sendtos) res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res) logger.Infof("script_sender notify_id: %d, channel_name: %v, event:%s, tplContent:%s, customParams:%v, target:%s, res:%s, err:%v", notifyRuleId, notifyChannel.Name, events[0].Hash, tplContent, customParams, target, res, err) sender.NotifyRecord(ctx, events, notifyRuleId, notifyChannel.Name, target, res, err) default: logger.Warningf("notify_id: %d, channel_name: %v, event:%s send type not found", notifyRuleId, notifyChannel.Name, events[0].Hash) } } func NeedBatchContacts(requestConfig *models.HTTPRequestConfig) bool { b, _ := json.Marshal(requestConfig) return strings.Contains(string(b), "$sendtos") } // HandleEventNotify 处理event事件的主逻辑 // event: 告警/恢复事件 // isSubscribe: 告警事件是否由subscribe的配置产生 func (e *Dispatch) HandleEventNotify(event *models.AlertCurEvent, isSubscribe bool) { go e.HandleEventWithNotifyRule(event) if event.IsRecovered && event.NotifyRecovered == 0 { return } if !isSubscribe { go sender.SendStaticGlobalWebhook(e.ctx, event.DeepCopy(), e.Astats) } rule := e.alertRuleCache.Get(event.RuleId) if rule == nil { return } fillUsers(event, e.userCache, e.userGroupCache) var ( // 处理事件到 notifyTarget 关系,处理的notifyTarget用OrMerge进行合并 handlers []NotifyTargetDispatch // 额外去掉一些订阅,处理的notifyTarget用AndMerge进行合并, 如设置 channel=false,合并后不通过这个channel发送 // 如果实现了相关 Dispatch,可以添加到interceptors中 interceptorHandlers []NotifyTargetDispatch ) if isSubscribe { handlers = []NotifyTargetDispatch{NotifyGroupDispatch, EventCallbacksDispatch} } else { handlers = []NotifyTargetDispatch{NotifyGroupDispatch, GlobalWebhookDispatch, EventCallbacksDispatch} } notifyTarget := NewNotifyTarget() // 处理订阅关系使用OrMerge 
for _, handler := range handlers { notifyTarget.OrMerge(handler(rule, event, notifyTarget, e)) } // 处理移除订阅关系的逻辑,比如员工离职,临时静默某个通道的策略等 for _, handler := range interceptorHandlers { notifyTarget.AndMerge(handler(rule, event, notifyTarget, e)) } go e.Send(rule, event, notifyTarget, isSubscribe) // 如果是不是订阅规则出现的event, 则需要处理订阅规则的event if !isSubscribe { e.handleSubs(event) } } func (e *Dispatch) handleSubs(event *models.AlertCurEvent) { // handle alert subscribes subscribes := make([]*models.AlertSubscribe, 0) // rule specific subscribes if subs, has := e.alertSubscribeCache.Get(event.RuleId); has { subscribes = append(subscribes, subs...) } // global subscribes if subs, has := e.alertSubscribeCache.Get(0); has { subscribes = append(subscribes, subs...) } for _, sub := range subscribes { e.handleSub(sub, *event) } } // handleSub 处理订阅规则的event,注意这里event要使用值传递,因为后面会修改event的状态 func (e *Dispatch) handleSub(sub *models.AlertSubscribe, event models.AlertCurEvent) { if sub.IsDisabled() { return } if !sub.MatchCluster(event.DatasourceId) { return } if !sub.MatchProd(event.RuleProd) { return } if !sub.MatchCate(event.Cate) { return } if !common.MatchTags(event.TagsMap, sub.ITags) { return } // event BusiGroups filter if !common.MatchGroupsName(event.GroupName, sub.IBusiGroups) { return } if sub.ForDuration > (event.TriggerTime - event.FirstTriggerTime) { return } if len(sub.SeveritiesJson) != 0 { match := false for _, s := range sub.SeveritiesJson { if s == event.Severity || s == 0 { match = true break } } if !match { return } } e.Astats.CounterSubEventTotal.WithLabelValues(event.GroupName).Inc() sub.ModifyEvent(&event) event.SubRuleId = sub.Id LogEvent(&event, "subscribe") e.HandleEventNotify(&event, true) } func (e *Dispatch) Send(rule *models.AlertRule, event *models.AlertCurEvent, notifyTarget *NotifyTarget, isSubscribe bool) { needSend := e.BeforeSenderHook(event) if needSend { for channel, uids := range notifyTarget.ToChannelUserMap() { msgCtx := 
sender.BuildMessageContext(e.ctx, rule, []*models.AlertCurEvent{event}, uids, e.userCache, e.Astats) e.RwLock.RLock() s := e.Senders[channel] e.RwLock.RUnlock() if s == nil { logger.Debugf("no sender for channel: %s", channel) continue } var event *models.AlertCurEvent if len(msgCtx.Events) > 0 { event = msgCtx.Events[0] } logger.Debugf("send to channel:%s event:%s users:%+v", channel, event.Hash, msgCtx.Users) s.Send(msgCtx) } } // handle event callbacks e.SendCallbacks(rule, notifyTarget, event) // handle global webhooks if !event.OverrideGlobalWebhook() { if e.alerting.WebhookBatchSend { sender.BatchSendWebhooks(e.ctx, notifyTarget.ToWebhookMap(), event, e.Astats) } else { sender.SingleSendWebhooks(e.ctx, notifyTarget.ToWebhookMap(), event, e.Astats) } } // handle plugin call go sender.MayPluginNotify(e.ctx, e.genNoticeBytes(event), e.notifyConfigCache. GetNotifyScript(), e.Astats, event) if !isSubscribe { // handle ibex callbacks e.HandleIbex(rule, event) } } func (e *Dispatch) SendCallbacks(rule *models.AlertRule, notifyTarget *NotifyTarget, event *models.AlertCurEvent) { uids := notifyTarget.ToUidList() urls := notifyTarget.ToCallbackList() whMap := notifyTarget.ToWebhookMap() ogw := event.OverrideGlobalWebhook() for _, urlStr := range urls { if len(urlStr) == 0 { continue } cbCtx := sender.BuildCallBackContext(e.ctx, urlStr, rule, []*models.AlertCurEvent{event}, uids, e.userCache, e.alerting.WebhookBatchSend, e.Astats) if wh, ok := whMap[cbCtx.CallBackURL]; !ogw && ok && wh.Enable { logger.Debugf("SendCallbacks: webhook[%s] is in global conf.", cbCtx.CallBackURL) continue } if strings.HasPrefix(urlStr, "${ibex}") { e.CallBacks[models.IbexDomain].CallBack(cbCtx) continue } if !(strings.HasPrefix(urlStr, "http://") || strings.HasPrefix(urlStr, "https://")) { cbCtx.CallBackURL = "http://" + urlStr } parsedURL, err := url.Parse(urlStr) if err != nil { logger.Errorf("SendCallbacks: failed to url.Parse(urlStr=%s): %v", urlStr, err) continue } // process feishu 
card if parsedURL.Host == models.FeishuDomain && parsedURL.Query().Get("card") == "1" { e.CallBacks[models.FeishuCardDomain].CallBack(cbCtx) continue } // process lark card if parsedURL.Host == models.LarkDomain && parsedURL.Query().Get("card") == "1" { e.CallBacks[models.LarkCardDomain].CallBack(cbCtx) continue } callBacker, ok := e.CallBacks[parsedURL.Host] if ok { callBacker.CallBack(cbCtx) } else { e.CallBacks[models.DefaultDomain].CallBack(cbCtx) } } } func (e *Dispatch) HandleIbex(rule *models.AlertRule, event *models.AlertCurEvent) { // 解析 RuleConfig 字段 var ruleConfig struct { TaskTpls []*models.Tpl `json:"task_tpls"` } json.Unmarshal([]byte(rule.RuleConfig), &ruleConfig) if event.IsRecovered { // 恢复事件不需要走故障自愈的逻辑 return } for _, t := range ruleConfig.TaskTpls { if t.TplId == 0 { continue } if len(t.Host) == 0 { sender.CallIbex(e.ctx, t.TplId, event.TargetIdent, e.taskTplsCache, e.targetCache, e.userCache, event, "") continue } for _, host := range t.Host { sender.CallIbex(e.ctx, t.TplId, host, e.taskTplsCache, e.targetCache, e.userCache, event, "") } } } type Notice struct { Event *models.AlertCurEvent `json:"event"` Tpls map[string]string `json:"tpls"` } func (e *Dispatch) genNoticeBytes(event *models.AlertCurEvent) []byte { // build notice body with templates ntpls := make(map[string]string) e.RwLock.RLock() defer e.RwLock.RUnlock() for filename, tpl := range e.tpls { var body bytes.Buffer if err := tpl.Execute(&body, event); err != nil { ntpls[filename] = err.Error() } else { ntpls[filename] = body.String() } } notice := Notice{Event: event, Tpls: ntpls} stdinBytes, err := json.Marshal(notice) if err != nil { logger.Errorf("event_notify: failed to marshal notice: %v", err) return nil } return stdinBytes } // for alerting func fillUsers(ce *models.AlertCurEvent, uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType) { gids := make([]int64, 0, len(ce.NotifyGroupsJSON)) for i := 0; i < len(ce.NotifyGroupsJSON); i++ { gid, err := 
// getSendTarget builds the target string stored with a notify record.
//
// Without custom parameters it is the comma-joined list of sendtos. Otherwise
// it is the comma-joined list of the custom parameter values, where every
// value longer than four runes has its last four runes replaced by "****".
// The values are joined in map iteration order.
func getSendTarget(customParams map[string]string, sendtos []string) string {
	if len(customParams) > 0 {
		masked := make([]string, 0, len(customParams))
		for _, raw := range customParams {
			// Work on runes so multi-byte characters are never cut in half.
			if chars := []rune(raw); len(chars) > 4 {
				raw = string(chars[:len(chars)-4]) + "****"
			}
			masked = append(masked, raw)
		}
		return strings.Join(masked, ",")
	}

	return strings.Join(sendtos, ",")
}
:= make(NotifyChannels) for _, ch := range channels { nc[ch] = true } return nc } func (nc NotifyChannels) OrMerge(other NotifyChannels) { nc.merge(other, func(a, b bool) bool { return a || b }) } func (nc NotifyChannels) AndMerge(other NotifyChannels) { nc.merge(other, func(a, b bool) bool { return a && b }) } func (nc NotifyChannels) merge(other NotifyChannels, f func(bool, bool) bool) { if other == nil { return } for k, v := range other { if curV, has := nc[k]; has { nc[k] = f(curV, v) } else { nc[k] = v } } } ================================================ FILE: alert/dispatch/notify_target.go ================================================ package dispatch import ( "strconv" "github.com/ccfos/nightingale/v6/models" ) // NotifyTarget 维护所有需要发送的目标 用户-通道/回调/钩子信息,用map维护的数据结构具有去重功能 type NotifyTarget struct { userMap map[int64]NotifyChannels webhooks map[string]*models.Webhook callbacks map[string]struct{} } func NewNotifyTarget() *NotifyTarget { return &NotifyTarget{ userMap: make(map[int64]NotifyChannels), webhooks: make(map[string]*models.Webhook), callbacks: make(map[string]struct{}), } } // OrMerge 将 channelMap 按照 or 的方式合并,方便实现多种组合的策略,比如根据某个 tag 进行路由等 func (s *NotifyTarget) OrMerge(other *NotifyTarget) { s.merge(other, NotifyChannels.OrMerge) } // AndMerge 将 channelMap 中的 bool 值按照 and 的逻辑进行合并,可以单独将人/通道维度的通知移除 // 常用的场景有: // 1. 人员离职了不需要发送告警了 // 2. 某个告警通道进行维护,暂时不需要发送告警了 // 3. 
业务值班的重定向逻辑,将高等级的告警额外发送给应急人员等 // 可以结合业务需求自己实现router func (s *NotifyTarget) AndMerge(other *NotifyTarget) { s.merge(other, NotifyChannels.AndMerge) } func (s *NotifyTarget) merge(other *NotifyTarget, f func(NotifyChannels, NotifyChannels)) { if other == nil { return } for k, v := range other.userMap { if curV, has := s.userMap[k]; has { f(curV, v) } else { s.userMap[k] = v } } for k, v := range other.webhooks { s.webhooks[k] = v } for k, v := range other.callbacks { s.callbacks[k] = v } } // ToChannelUserMap userMap(map[uid][channel]bool) 转换为 map[channel][]uid 的结构 func (s *NotifyTarget) ToChannelUserMap() map[string][]int64 { m := make(map[string][]int64) for uid, nc := range s.userMap { for ch, send := range nc { if send { m[ch] = append(m[ch], uid) } } } return m } func (s *NotifyTarget) ToCallbackList() []string { callbacks := make([]string, 0, len(s.callbacks)) for cb := range s.callbacks { callbacks = append(callbacks, cb) } return callbacks } func (s *NotifyTarget) ToWebhookMap() map[string]*models.Webhook { return s.webhooks } func (s *NotifyTarget) ToUidList() []int64 { uids := make([]int64, 0, len(s.userMap)) for uid, _ := range s.userMap { uids = append(uids, uid) } return uids } // Dispatch 抽象由告警事件到信息接收者的路由策略 // rule: 告警规则 // event: 告警事件 // prev: 前一次路由结果, Dispatch 的实现可以直接修改 prev, 也可以返回一个新的 NotifyTarget 用于 AndMerge/OrMerge type NotifyTargetDispatch func(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget // GroupDispatch 处理告警规则的组订阅关系 func NotifyGroupDispatch(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget { groupIds := make([]int64, 0, len(event.NotifyGroupsJSON)) for _, groupId := range event.NotifyGroupsJSON { gid, err := strconv.ParseInt(groupId, 10, 64) if err != nil { continue } groupIds = append(groupIds, gid) } groups := dispatch.userGroupCache.GetByUserGroupIds(groupIds) NotifyTarget := NewNotifyTarget() for _, group := range groups { 
for _, userId := range group.UserIds { NotifyTarget.userMap[userId] = NewNotifyChannels(event.NotifyChannelsJSON) } } return NotifyTarget } func GlobalWebhookDispatch(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget { webhooks := dispatch.notifyConfigCache.GetWebhooks() NotifyTarget := NewNotifyTarget() for _, webhook := range webhooks { if !webhook.Enable { continue } NotifyTarget.webhooks[webhook.Url] = webhook } return NotifyTarget } func EventCallbacksDispatch(rule *models.AlertRule, event *models.AlertCurEvent, prev *NotifyTarget, dispatch *Dispatch) *NotifyTarget { for _, c := range event.CallbacksJSON { if c == "" { continue } prev.callbacks[c] = struct{}{} } return nil } ================================================ FILE: alert/eval/alert_rule.go ================================================ package eval import ( "context" "fmt" "strconv" "time" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/alert/naming" "github.com/ccfos/nightingale/v6/alert/process" "github.com/ccfos/nightingale/v6/datasource/commons/eslike" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/prom" "github.com/toolkits/pkg/logger" ) type Scheduler struct { // key: hash alertRules map[string]*AlertRuleWorker ExternalProcessors *process.ExternalProcessorsType aconf aconf.Alert alertRuleCache *memsto.AlertRuleCacheType targetCache *memsto.TargetCacheType targetsOfAlertRuleCache *memsto.TargetsOfAlertRuleCacheType busiGroupCache *memsto.BusiGroupCacheType alertMuteCache *memsto.AlertMuteCacheType datasourceCache *memsto.DatasourceCacheType promClients *prom.PromClientMap naming *naming.Naming ctx *ctx.Context stats *astats.Stats } func NewScheduler(aconf aconf.Alert, externalProcessors *process.ExternalProcessorsType, arc *memsto.AlertRuleCacheType, targetCache *memsto.TargetCacheType, 
toarc *memsto.TargetsOfAlertRuleCacheType, busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType, promClients *prom.PromClientMap, naming *naming.Naming, ctx *ctx.Context, stats *astats.Stats) *Scheduler { scheduler := &Scheduler{ aconf: aconf, alertRules: make(map[string]*AlertRuleWorker), ExternalProcessors: externalProcessors, alertRuleCache: arc, targetCache: targetCache, targetsOfAlertRuleCache: toarc, busiGroupCache: busiGroupCache, alertMuteCache: alertMuteCache, datasourceCache: datasourceCache, promClients: promClients, naming: naming, ctx: ctx, stats: stats, } eslike.SetEsIndexPatternCacheType(memsto.NewEsIndexPatternCacheType(ctx)) go scheduler.LoopSyncRules(context.Background()) return scheduler } func (s *Scheduler) LoopSyncRules(ctx context.Context) { time.Sleep(time.Duration(s.aconf.EngineDelay) * time.Second) duration := 9000 * time.Millisecond for { select { case <-ctx.Done(): return case <-time.After(duration): s.syncAlertRules() } } } func (s *Scheduler) syncAlertRules() { ids := s.alertRuleCache.GetRuleIds() alertRuleWorkers := make(map[string]*AlertRuleWorker) externalRuleWorkers := make(map[string]*process.Processor) for _, id := range ids { rule := s.alertRuleCache.Get(id) if rule == nil { continue } ruleType := rule.GetRuleType() if rule.IsPrometheusRule() || rule.IsInnerRule() { datasourceIds := s.datasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries) for _, dsId := range datasourceIds { if !naming.DatasourceHashRing.IsHit(strconv.FormatInt(dsId, 10), fmt.Sprintf("%d", rule.Id), s.aconf.Heartbeat.Endpoint) { continue } ds := s.datasourceCache.GetById(dsId) if ds == nil { logger.Debugf("alert_eval_%d datasource %d not found", rule.Id, dsId) continue } if ds.PluginType != ruleType { logger.Debugf("alert_eval_%d datasource %d category is %s not %s", rule.Id, dsId, ds.PluginType, ruleType) continue } if ds.Status != "enabled" { 
logger.Debugf("alert_eval_%d datasource %d status is %s", rule.Id, dsId, ds.Status) continue } processor := process.NewProcessor(s.aconf.Heartbeat.EngineName, rule, dsId, s.alertRuleCache, s.targetCache, s.targetsOfAlertRuleCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats) alertRule := NewAlertRuleWorker(rule, dsId, processor, s.promClients, s.ctx) alertRuleWorkers[alertRule.Hash()] = alertRule } } else if rule.IsHostRule() { // all host rule will be processed by center instance if !naming.DatasourceHashRing.IsHit(s.aconf.Heartbeat.EngineName, strconv.FormatInt(rule.Id, 10), s.aconf.Heartbeat.Endpoint) { continue } processor := process.NewProcessor(s.aconf.Heartbeat.EngineName, rule, 0, s.alertRuleCache, s.targetCache, s.targetsOfAlertRuleCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats) alertRule := NewAlertRuleWorker(rule, 0, processor, s.promClients, s.ctx) alertRuleWorkers[alertRule.Hash()] = alertRule } else { // 如果 rule 不是通过 prometheus engine 来告警的,则创建为 externalRule // if rule is not processed by prometheus engine, create it as externalRule dsIds := s.datasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries) for _, dsId := range dsIds { ds := s.datasourceCache.GetById(dsId) if ds == nil { logger.Debugf("alert_eval_%d datasource %d not found", rule.Id, dsId) continue } if ds.Status != "enabled" { logger.Debugf("alert_eval_%d datasource %d status is %s", rule.Id, dsId, ds.Status) continue } processor := process.NewProcessor(s.aconf.Heartbeat.EngineName, rule, dsId, s.alertRuleCache, s.targetCache, s.targetsOfAlertRuleCache, s.busiGroupCache, s.alertMuteCache, s.datasourceCache, s.ctx, s.stats) externalRuleWorkers[processor.Key()] = processor } } } for hash, rule := range alertRuleWorkers { if _, has := s.alertRules[hash]; !has { rule.Prepare() time.Sleep(time.Duration(20) * time.Millisecond) rule.Start() s.alertRules[hash] = rule } } for hash, rule := range s.alertRules { if _, has := 
alertRuleWorkers[hash]; !has { rule.Stop() delete(s.alertRules, hash) } } s.ExternalProcessors.ExternalLock.Lock() for key, processor := range externalRuleWorkers { if curProcessor, has := s.ExternalProcessors.Processors[key]; has { // rule存在,且hash一致,认为没有变更,这里可以根据需求单独实现一个关联数据更多的hash函数 if processor.Hash() == curProcessor.Hash() { continue } } // 现有规则中没有rule以及有rule但hash不一致的场景,需要触发rule的update processor.RecoverAlertCurEventFromDb() s.ExternalProcessors.Processors[key] = processor } for key := range s.ExternalProcessors.Processors { if _, has := externalRuleWorkers[key]; !has { delete(s.ExternalProcessors.Processors, key) } } s.ExternalProcessors.ExternalLock.Unlock() } ================================================ FILE: alert/eval/eval.go ================================================ package eval import ( "context" "encoding/json" "errors" "fmt" "math" "reflect" "sort" "strconv" "strings" "sync" "text/template" "time" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/alert/common" "github.com/ccfos/nightingale/v6/alert/process" "github.com/ccfos/nightingale/v6/dscache" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/hash" "github.com/ccfos/nightingale/v6/pkg/parser" "github.com/ccfos/nightingale/v6/pkg/poster" promsdk "github.com/ccfos/nightingale/v6/pkg/prom" promql2 "github.com/ccfos/nightingale/v6/pkg/promql" "github.com/ccfos/nightingale/v6/pkg/tplx" "github.com/ccfos/nightingale/v6/pkg/unit" "github.com/ccfos/nightingale/v6/prom" "github.com/prometheus/common/model" "github.com/robfig/cron/v3" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/str" ) type AlertRuleWorker struct { DatasourceId int64 Quit chan struct{} Inhibit bool Rule *models.AlertRule Processor *process.Processor PromClients *prom.PromClientMap Ctx *ctx.Context Scheduler *cron.Cron HostAndDeviceIdentCache sync.Map LastSeriesStore map[uint64]models.DataResp DeviceIdentHook func(arw 
*AlertRuleWorker, paramQuery models.ParamQuery) ([]string, error) } const ( GET_RULE_CONFIG = "get_rule_config" GET_Processor = "get_Processor" CHECK_QUERY = "check_query_config" GET_CLIENT = "get_client" QUERY_DATA = "query_data" EXEC_TEMPLATE = "exec_template" ) const ( JoinMark = "@@" ) type JoinType string const ( Left JoinType = "left" Right JoinType = "right" Inner JoinType = "inner" ) func NewAlertRuleWorker(rule *models.AlertRule, datasourceId int64, Processor *process.Processor, promClients *prom.PromClientMap, ctx *ctx.Context) *AlertRuleWorker { arw := &AlertRuleWorker{ DatasourceId: datasourceId, Quit: make(chan struct{}), Rule: rule, Processor: Processor, PromClients: promClients, Ctx: ctx, HostAndDeviceIdentCache: sync.Map{}, DeviceIdentHook: func(arw *AlertRuleWorker, paramQuery models.ParamQuery) ([]string, error) { return nil, nil }, LastSeriesStore: make(map[uint64]models.DataResp), } interval := rule.PromEvalInterval if interval <= 0 { interval = 10 } if rule.CronPattern == "" { rule.CronPattern = fmt.Sprintf("@every %ds", interval) } arw.Scheduler = cron.New(cron.WithSeconds(), cron.WithChain(cron.SkipIfStillRunning(cron.DefaultLogger))) entryID, err := arw.Scheduler.AddFunc(rule.CronPattern, func() { arw.Eval() }) if err != nil { logger.Errorf("alert_eval_%d datasource_%d add cron pattern error: %v", arw.Rule.Id, arw.DatasourceId, err) } Processor.ScheduleEntry = arw.Scheduler.Entry(entryID) Processor.PromEvalInterval = getPromEvalInterval(Processor.ScheduleEntry.Schedule) return arw } func getPromEvalInterval(schedule cron.Schedule) int { now := time.Now() next1 := schedule.Next(now) next2 := schedule.Next(next1) return int(next2.Sub(next1).Seconds()) } func (arw *AlertRuleWorker) Key() string { return common.RuleKey(arw.DatasourceId, arw.Rule.Id) } func (arw *AlertRuleWorker) Hash() string { return str.MD5(fmt.Sprintf("%d_%s_%s_%d", arw.Rule.Id, arw.Rule.CronPattern, arw.Rule.RuleConfig, arw.DatasourceId, )) } func (arw *AlertRuleWorker) 
Prepare() { arw.Processor.RecoverAlertCurEventFromDb() } func (arw *AlertRuleWorker) Start() { arw.Scheduler.Start() } func (arw *AlertRuleWorker) Eval() { begin := time.Now() var message string defer func() { if len(message) == 0 { logger.Infof("alert_eval_%d datasource_%d finished, duration:%v", arw.Rule.Id, arw.DatasourceId, time.Since(begin)) } else { logger.Warningf("alert_eval_%d datasource_%d finished, duration:%v, message:%s", arw.Rule.Id, arw.DatasourceId, time.Since(begin), message) } }() if arw.Processor.PromEvalInterval == 0 { arw.Processor.PromEvalInterval = getPromEvalInterval(arw.Processor.ScheduleEntry.Schedule) } cachedRule := arw.Rule if cachedRule == nil { message = "rule not found" return } arw.Processor.Stats.CounterRuleEval.WithLabelValues().Inc() arw.HostAndDeviceIdentCache = sync.Map{} typ := cachedRule.GetRuleType() var ( anomalyPoints []models.AnomalyPoint recoverPoints []models.AnomalyPoint err error ) switch typ { case models.PROMETHEUS: anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig) case models.HOST: anomalyPoints, err = arw.GetHostAnomalyPoint(cachedRule.RuleConfig) case models.LOKI: anomalyPoints, err = arw.GetPromAnomalyPoint(cachedRule.RuleConfig) default: anomalyPoints, recoverPoints, err = arw.GetAnomalyPoint(cachedRule, arw.Processor.DatasourceId()) } if err != nil { message = fmt.Sprintf("failed to get anomaly points: %v", err) return } if arw.Processor == nil { message = "processor is nil" return } if arw.Inhibit { pointsMap := make(map[string]models.AnomalyPoint) for _, point := range recoverPoints { // 对于恢复的事件,合并处理 tagHash := process.TagHash(point) p, exists := pointsMap[tagHash] if !exists { pointsMap[tagHash] = point continue } if p.Severity > point.Severity { hash := process.Hash(cachedRule.Id, arw.Processor.DatasourceId(), p) arw.Processor.DeleteProcessEvent(hash) models.AlertCurEventDelByHash(arw.Ctx, hash) pointsMap[tagHash] = point } } now := time.Now().Unix() for _, point := range pointsMap { str 
:= fmt.Sprintf("%v", point.Value) arw.Processor.RecoverSingle(true, process.Hash(cachedRule.Id, arw.Processor.DatasourceId(), point), now, &str) } } else { now := time.Now().Unix() for _, point := range recoverPoints { str := fmt.Sprintf("%v", point.Value) arw.Processor.RecoverSingle(true, process.Hash(cachedRule.Id, arw.Processor.DatasourceId(), point), now, &str) } } arw.Processor.Handle(anomalyPoints, "inner", arw.Inhibit) } func (arw *AlertRuleWorker) Stop() { logger.Infof("alert_eval_%d datasource_%d stopped", arw.Rule.Id, arw.DatasourceId) close(arw.Quit) c := arw.Scheduler.Stop() <-c.Done() } func (arw *AlertRuleWorker) GetPromAnomalyPoint(ruleConfig string) ([]models.AnomalyPoint, error) { var lst []models.AnomalyPoint start := time.Now() defer func() { arw.Processor.Stats.GaugeRuleEvalDuration.WithLabelValues(fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId())).Set(float64(time.Since(start).Milliseconds())) }() var rule *models.PromRuleConfig if err := json.Unmarshal([]byte(ruleConfig), &rule); err != nil { logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:%v", arw.Rule.Id, arw.DatasourceId, ruleConfig, err) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) return lst, err } if rule == nil { logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:rule is nil", arw.Rule.Id, arw.DatasourceId, ruleConfig) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() 
arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) return lst, errors.New("rule is nil") } arw.Inhibit = rule.Inhibit for i, query := range rule.Queries { readerClient := arw.PromClients.GetCli(arw.DatasourceId) if readerClient == nil { logger.Warningf("alert_eval_%d datasource_%d error reader client is nil", arw.Rule.Id, arw.DatasourceId) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(-2) continue } if query.VarEnabled && strings.Contains(query.PromQl, "$") { var anomalyPoints []models.AnomalyPoint if hasLabelLossAggregator(query) || notExactMatch(query) { // 若有聚合函数或非精确匹配则需要先填充变量然后查询,这个方式效率较低 anomalyPoints = arw.VarFillingBeforeQuery(query, readerClient) arw.Processor.Stats.CounterVarFillingQuery.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), "BeforeQuery", ).Inc() } else { // 先查询再过滤变量,效率较高,但无法处理有聚合函数的情况 anomalyPoints = arw.VarFillingAfterQuery(query, readerClient) arw.Processor.Stats.CounterVarFillingQuery.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), "AfterQuery", ).Inc() } lst = append(lst, anomalyPoints...) 
} else { // 无变量 promql := strings.TrimSpace(query.PromQl) if promql == "" { logger.Warningf("alert_eval_%d datasource_%d promql is blank", arw.Rule.Id, arw.DatasourceId) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), CHECK_QUERY, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() continue } if arw.PromClients.IsNil(arw.DatasourceId) { logger.Warningf("alert_eval_%d datasource_%d error reader client is nil", arw.Rule.Id, arw.DatasourceId) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() continue } var warnings promsdk.Warnings arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", arw.Rule.Id)).Inc() value, warnings, err := readerClient.Query(context.Background(), promql, time.Now()) if err != nil { logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", arw.Rule.Id, arw.DatasourceId, promql, err) arw.Processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId)).Inc() arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(-1) return lst, err } if len(warnings) > 0 { logger.Errorf("alert_eval_%d datasource_%d promql:%s, warnings:%v", arw.Rule.Id, arw.DatasourceId, promql, warnings) arw.Processor.Stats.CounterQueryDataErrorTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId)).Inc() 
arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() } logger.Infof("alert_eval_%d datasource_%d query:%+v, value:%v", arw.Rule.Id, arw.DatasourceId, query, value) points := models.ConvertAnomalyPoints(value) arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(float64(len(points))) for i := 0; i < len(points); i++ { points[i].Severity = query.Severity points[i].Query = promql points[i].ValuesUnit = map[string]unit.FormattedValue{ "v": unit.ValueFormatter(query.Unit, 2, points[i].Value), } } lst = append(lst, points...) } arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(float64(len(lst))) } return lst, nil } type sample struct { Metric model.Metric `json:"metric"` Value model.SampleValue `json:"value"` Timestamp model.Time } // VarFillingAfterQuery 填充变量,先查询再填充变量 // 公式: mem_used_percent{host="$host"} > $val 其中 $host 为参数变量,$val 为值变量 // 实现步骤: // 依次遍历参数配置节点,保证同一参数变量的子筛选可以覆盖上一层筛选 // 每个节点先查询无参数的 query, 即 mem_used_percent{} > curVal, 得到满足值变量的所有结果 // 结果中有满足本节点参数变量的值,加入异常点列表 // 参数变量的值不满足的组合,需要覆盖上层筛选中产生的异常点 func (arw *AlertRuleWorker) VarFillingAfterQuery(query models.PromQuery, readerClient promsdk.API) []models.AnomalyPoint { varToLabel := ExtractVarMapping(query.PromQl) fullQuery := removeVal(query.PromQl) // 存储所有的异常点,key 为参数变量的组合,可以实现子筛选对上一层筛选的覆盖 anomalyPointsMap := make(map[string]models.AnomalyPoint) // 统一变量配置格式 VarConfigForCalc := &models.ChildVarConfig{ ParamVal: make([]map[string]models.ParamQuery, 1), ChildVarConfigs: query.VarConfig.ChildVarConfigs, } VarConfigForCalc.ParamVal[0] = make(map[string]models.ParamQuery) for _, p := range query.VarConfig.ParamVal { 
VarConfigForCalc.ParamVal[0][p.Name] = models.ParamQuery{ ParamType: p.ParamType, Query: p.Query, } } // 使用一个统一的参数变量顺序 var ParamKeys []string for val, valQuery := range VarConfigForCalc.ParamVal[0] { if valQuery.ParamType == "threshold" { continue } ParamKeys = append(ParamKeys, val) } sort.Slice(ParamKeys, func(i, j int) bool { return ParamKeys[i] < ParamKeys[j] }) // 遍历变量配置链表 curNode := VarConfigForCalc for curNode != nil { for _, param := range curNode.ParamVal { // curQuery 当前节点的无参数 query,用于时序库查询 curQuery := fullQuery // realQuery 当前节点产生异常点的 query,用于告警展示 realQuery := query.PromQl // 取出阈值变量 valMap := make(map[string]string) for val, valQuery := range param { if valQuery.ParamType == "threshold" { valMap[val] = getString(valQuery.Query) } } // 替换值变量 for key, val := range valMap { curQuery = strings.Replace(curQuery, fmt.Sprintf("$%s", key), val, -1) realQuery = strings.Replace(realQuery, fmt.Sprintf("$%s", key), val, -1) } // 得到满足值变量的所有结果 arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", arw.Rule.Id)).Inc() value, _, err := readerClient.Query(context.Background(), curQuery, time.Now()) if err != nil { logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", arw.Rule.Id, arw.DatasourceId, curQuery, err) continue } seqVals := getSamples(value) // 得到参数变量的所有组合 paramPermutation, err := arw.getParamPermutation(param, ParamKeys, varToLabel, query.PromQl, readerClient) if err != nil { logger.Errorf("alert_eval_%d datasource_%d paramPermutation error:%v", arw.Rule.Id, arw.DatasourceId, err) continue } // 判断哪些参数值符合条件 for i := range seqVals { curRealQuery := realQuery var cur []string for _, paramKey := range ParamKeys { val := string(seqVals[i].Metric[model.LabelName(varToLabel[paramKey])]) cur = append(cur, val) curRealQuery = fillVar(curRealQuery, paramKey, val) } if _, ok := paramPermutation[strings.Join(cur, JoinMark)]; ok { anomalyPointsMap[strings.Join(cur, JoinMark)] = models.AnomalyPoint{ Key: 
// removeVal strips every label matcher whose text contains a '$' (i.e. a
// parameter variable) from the `{...}` selectors of promql. Text outside the
// selectors, including threshold variables such as `$val`, is copied through
// unchanged.
//
//	mem{test1="$test1",test2="test2"} > $val1 and mem{test3="test3",test4="$test4"} > $val2
//	==> mem{test2="test2"} > $val1 and mem{test3="test3"} > $val2
//
// The scan is byte-wise and purely lexical: ',' and '}' always terminate a
// matcher, even when they appear inside a quoted label value.
func removeVal(promql string) string {
	var sb strings.Builder
	sb.Grow(len(promql))

	inSelector := false // true while scanning between '{' and '}'
	hasVar := false     // true once the current matcher has been seen to contain '$'
	segStart := 0       // index of the delimiter ('{' or ',') preceding the current matcher

	for i := 0; i < len(promql); i++ {
		c := promql[i]

		if !inSelector {
			if c == '{' {
				inSelector = true
				segStart = i
			}
			// Copy the raw byte. Converting it to a rune first would re-encode
			// every byte >= 0x80 and corrupt multi-byte UTF-8 sequences.
			sb.WriteByte(c)
			continue
		}

		if c == '$' {
			hasVar = true
		}
		if c != ',' && c != '}' {
			continue
		}

		// End of a matcher: keep it only when it holds no variable.
		if !hasVar {
			// When nothing has been emitted after the opening brace, skip the
			// leading delimiter so the selector does not start with ','.
			if written := sb.String(); written[len(written)-1] == '{' {
				segStart++
			}
			sb.WriteString(promql[segStart:i])
		}
		hasVar = false
		if c == '}' {
			inSelector = false
			sb.WriteByte(c)
		}
		segStart = i
	}
	return sb.String()
}
func (arw *AlertRuleWorker) getParamPermutation(paramVal map[string]models.ParamQuery, paramKeys []string, varToLabel map[string]string, originPromql string, readerClient promsdk.API) (map[string]struct{}, error) { // 参数变量查询,得到参数变量值 paramMap := make(map[string][]string) for _, paramKey := range paramKeys { var params []string paramQuery, ok := paramVal[paramKey] if !ok { return nil, fmt.Errorf("param key not found: %s", paramKey) } switch paramQuery.ParamType { case "host": hostIdents, err := arw.getHostIdents(paramQuery) if err != nil { logger.Errorf("alert_eval_%d datasource_%d fail to get host idents, error:%v", arw.Rule.Id, arw.DatasourceId, err) break } params = hostIdents case "device": deviceIdents, err := arw.getDeviceIdents(paramQuery) if err != nil { logger.Errorf("alert_eval_%d datasource_%d fail to get device idents, error:%v", arw.Rule.Id, arw.DatasourceId, err) break } params = deviceIdents case "enum": q, _ := json.Marshal(paramQuery.Query) var query []string err := json.Unmarshal(q, &query) if err != nil { logger.Errorf("alert_eval_%d datasource_%d query:%s fail to unmarshalling into string slice, error:%v", arw.Rule.Id, arw.DatasourceId, paramQuery.Query, err) } if len(query) == 0 { paramsKeyAllLabel, err := getParamKeyAllLabel(varToLabel[paramKey], originPromql, readerClient, arw.DatasourceId, arw.Rule.Id, arw.Processor.Stats) if err != nil { logger.Errorf("alert_eval_%d datasource_%d fail to getParamKeyAllLabel, error:%v query:%s", arw.Rule.Id, arw.DatasourceId, err, paramQuery.Query) } params = paramsKeyAllLabel } else { params = query } default: return nil, fmt.Errorf("unknown param type: %s", paramQuery.ParamType) } if len(params) == 0 { return nil, fmt.Errorf("param key: %s, params is empty", paramKey) } logger.Infof("alert_eval_%d datasource_%d paramKey: %s, params: %v", arw.Rule.Id, arw.DatasourceId, paramKey, params) paramMap[paramKey] = params } // 得到以 paramKeys 为顺序的所有参数组合 permutation := mapPermutation(paramKeys, paramMap) res := 
make(map[string]struct{}) for i := range permutation { res[strings.Join(permutation[i], JoinMark)] = struct{}{} } return res, nil } func getParamKeyAllLabel(paramKey string, promql string, client promsdk.API, dsId int64, rid int64, stats *astats.Stats) ([]string, error) { labels, metricName, err := promql2.GetLabelsAndMetricNameWithReplace(promql, "$") if err != nil { return nil, fmt.Errorf("promql:%s, get labels error:%v", promql, err) } labelstrs := make([]string, 0) for _, label := range labels { if strings.HasPrefix(label.Value, "$") { continue } labelstrs = append(labelstrs, label.Name+label.Op+label.Value) } pr := metricName + "{" + strings.Join(labelstrs, ",") + "}" stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", dsId), fmt.Sprintf("%d", rid)).Inc() value, _, err := client.Query(context.Background(), pr, time.Now()) if err != nil { return nil, fmt.Errorf("promql: %s query error: %v", pr, err) } labelValuesMap := make(map[string]struct{}) switch value.Type() { case model.ValVector: vector := value.(model.Vector) for _, sample := range vector { for labelName, labelValue := range sample.Metric { // 只处理ParamKeys中指定的label if string(labelName) == paramKey { labelValuesMap[string(labelValue)] = struct{}{} } } } case model.ValMatrix: matrix := value.(model.Matrix) for _, series := range matrix { for labelName, labelValue := range series.Metric { // 只处理ParamKeys中指定的label if string(labelName) == paramKey { labelValuesMap[string(labelValue)] = struct{}{} } } } } result := make([]string, 0) for labelValue, _ := range labelValuesMap { result = append(result, labelValue) } return result, nil } func (arw *AlertRuleWorker) getHostIdents(paramQuery models.ParamQuery) ([]string, error) { var params []string q, _ := json.Marshal(paramQuery.Query) cacheKey := "Host_" + string(q) value, hit := arw.HostAndDeviceIdentCache.Load(cacheKey) if idents, ok := value.([]string); hit && ok { params = idents return params, nil } var queries []models.HostQuery err := 
// mapPermutation returns the Cartesian product of the value lists in paraMap,
// taking the keys in paramKeys order. Each element of the result holds one
// value per key, at the same position as the key in paramKeys. The result is
// nil when any key has no values.
func mapPermutation(paramKeys []string, paraMap map[string][]string) [][]string {
	var combos [][]string
	scratch := make([]string, len(paramKeys))
	combine(paramKeys, paraMap, 0, scratch, &combos)
	return combos
}

// combine is the recursive step of mapPermutation. It fills current[index]
// with each value of paramKeys[index] in turn and recurses on the next key;
// once every position is filled, a copy of current is appended to *result.
func combine(paramKeys []string, paraMap map[string][]string, index int, current []string, result *[][]string) {
	if index != len(paramKeys) {
		for _, v := range paraMap[paramKeys[index]] {
			current[index] = v
			combine(paramKeys, paraMap, index+1, current, result)
		}
		return
	}

	// All keys assigned: snapshot the scratch buffer, which later iterations
	// will overwrite.
	snapshot := make([]string, len(current))
	copy(snapshot, current)
	*result = append(*result, snapshot)
}
datasource_%d rule_config:%s, error:%v", arw.Rule.Id, arw.DatasourceId, ruleConfig, err) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) return lst, err } if rule == nil { logger.Errorf("alert_eval_%d datasource_%d rule_config:%s, error:rule is nil", arw.Rule.Id, arw.DatasourceId, ruleConfig) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) return lst, errors.New("rule is nil") } arw.Inhibit = rule.Inhibit now := time.Now().Unix() for _, trigger := range rule.Triggers { switch trigger.Type { case "target_miss": t := now - int64(trigger.Duration) var idents, engineIdents, missEngineIdents []string var exists bool if arw.Ctx.IsCenter { // 如果是中心节点, 将不再上报数据的主机 engineName 为空的机器,也加入到 targets 中 missEngineIdents, exists = arw.Processor.TargetsOfAlertRuleCache.Get("", arw.Rule.Id) if !exists { logger.Debugf("alert_eval_%d datasource_%d targets not found engineName:%s", arw.Rule.Id, arw.DatasourceId, arw.Processor.EngineName) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() } } idents = append(idents, missEngineIdents...) 
engineIdents, exists = arw.Processor.TargetsOfAlertRuleCache.Get(arw.Processor.EngineName, arw.Rule.Id) if !exists { logger.Warningf("alert_eval_%d datasource_%d targets not found engineName:%s", arw.Rule.Id, arw.DatasourceId, arw.Processor.EngineName) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() } idents = append(idents, engineIdents...) if len(idents) == 0 { arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) continue } var missTargets []string targetUpdateTimeMap := arw.Processor.TargetCache.GetHostUpdateTime(idents) for ident, updateTime := range targetUpdateTimeMap { if updateTime < t { missTargets = append(missTargets, ident) } } arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(float64(len(missTargets))) logger.Debugf("alert_eval_%d datasource_%d missTargets:%v", arw.Rule.Id, arw.DatasourceId, missTargets) targets := arw.Processor.TargetCache.Gets(missTargets) for _, target := range targets { m := make(map[string]string) for k, v := range target.TagsMap { m[k] = v } m["ident"] = target.Ident lst = append(lst, models.NewAnomalyPoint(trigger.Type, m, now, float64(now-target.BeatTime), trigger.Severity)) } case "offset": idents, exists := arw.Processor.TargetsOfAlertRuleCache.Get(arw.Processor.EngineName, arw.Rule.Id) if !exists { arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) logger.Warningf("alert_eval_%d datasource_%d targets not found", arw.Rule.Id, arw.DatasourceId) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", 
arw.Processor.DatasourceId()), QUERY_DATA, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() continue } targets := arw.Processor.TargetCache.Gets(idents) targetMap := make(map[string]*models.Target) for _, target := range targets { targetMap[target.Ident] = target } offsetIdents := make(map[string]int64) targetsMeta := arw.Processor.TargetCache.GetHostMetas(targets) for ident, meta := range targetsMeta { if meta.CpuNum <= 0 { // means this target is not collect by categraf, do not check offset continue } if target, exists := targetMap[ident]; exists { if now-target.BeatTime > 120 { // means this target is not a active host, do not check offset continue } } offset := meta.Offset if math.Abs(float64(offset)) > float64(trigger.Duration) { offsetIdents[ident] = offset } } logger.Debugf("alert_eval_%d datasource_%d offsetIdents:%v", arw.Rule.Id, arw.DatasourceId, offsetIdents) arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(float64(len(offsetIdents))) for host, offset := range offsetIdents { m := make(map[string]string) target, exists := arw.Processor.TargetCache.Get(host) if exists { for k, v := range target.TagsMap { m[k] = v } } m["ident"] = host lst = append(lst, models.NewAnomalyPoint(trigger.Type, m, now, float64(offset), trigger.Severity)) } case "pct_target_miss": t := now - int64(trigger.Duration) idents, exists := arw.Processor.TargetsOfAlertRuleCache.Get(arw.Processor.EngineName, arw.Rule.Id) if !exists { arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) logger.Warningf("alert_eval_%d datasource_%d targets not found", arw.Rule.Id, arw.DatasourceId) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), QUERY_DATA, 
arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() continue } var missTargets []string targetUpdateTimeMap := arw.Processor.TargetCache.GetHostUpdateTime(idents) for ident, updateTime := range targetUpdateTimeMap { if updateTime < t { missTargets = append(missTargets, ident) } } logger.Debugf("alert_eval_%d datasource_%d missTargets:%v", arw.Rule.Id, arw.DatasourceId, missTargets) arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(float64(len(missTargets))) pct := float64(len(missTargets)) / float64(len(idents)) * 100 if pct >= float64(trigger.Percent) { lst = append(lst, models.NewAnomalyPoint(trigger.Type, nil, now, pct, trigger.Severity)) } } } return lst, nil } func flatten(rehashed map[uint64][][]uint64) map[uint64][]uint64 { seriesTagIndex := make(map[uint64][]uint64) var i uint64 for _, HashTagIndex := range rehashed { for u := range HashTagIndex { seriesTagIndex[i] = HashTagIndex[u] i++ } } return seriesTagIndex } // onJoin 组合两个经过 rehash 之后的集合 // 如查询 A,经过 on data_base rehash 分组后 // [[A1{data_base=1, table=alert},A2{data_base=1, table=alert}],[A5{data_base=1, table=board}]] // [[A3{data_base=2, table=board}],[A4{data_base=2, table=alert}]] // 查询 B,经过 on data_base rehash 分组后 // [[B1{data_base=1, table=alert}]] // [[B2{data_base=2, table=alert}]] // 内联得到 // [[A1{data_base=1, table=alert},A2{data_base=1, table=alert},B1{data_base=1, table=alert}],[A5{data_base=1, table=board},[B1{data_base=1, table=alert}]] // [[A3{data_base=2, table=board},B2{data_base=2, table=alert}],[A4{data_base=2, table=alert},B2{data_base=2, table=alert}]] func onJoin(reHashTagIndex1 map[uint64][][]uint64, reHashTagIndex2 map[uint64][][]uint64, joinType JoinType) map[uint64][][]uint64 { reHashTagIndex := make(map[uint64][][]uint64) for rehash := range reHashTagIndex1 { if _, ok := reHashTagIndex2[rehash]; ok { // 若有 rehash 相同的记录,两两合并 
for i1 := range reHashTagIndex1[rehash] { for i2 := range reHashTagIndex2[rehash] { reHashTagIndex[rehash] = append(reHashTagIndex[rehash], mergeNewArray(reHashTagIndex1[rehash][i1], reHashTagIndex2[rehash][i2])) } } } else { // 合并方式不为 inner 时,需要保留 reHashTagIndex1 中未匹配的记录 if joinType != Inner { reHashTagIndex[rehash] = reHashTagIndex1[rehash] } } } return reHashTagIndex } // rehashSet 重新 hash 分组 // 如当前查询 A 有五条记录 // A1{data_base=1, table=alert} // A2{data_base=1, table=alert} // A3{data_base=2, table=board} // A4{data_base=2, table=alert} // A5{data_base=1, table=board} // 经过预处理(按曲线分组,此步已在进入 GetAnomalyPoint 函数前完成)后,分为 4 组, // [A1{data_base=1, table=alert},A2{data_base=1, table=alert}] // [A3{data_base=2, table=board}] // [A4{data_base=2, table=alert}] // [A5{data_base=1, table=board}] // 若 rehashSet 按 data_base 重新分组,此时会得到按 rehash 值分的二维数组,即不会将 rehash 值相同的记录完全合并 // [[A1{data_base=1, table=alert},A2{data_base=1, table=alert}],[A5{data_base=1, table=board}]] // [[A3{data_base=2, table=board}],[A4{data_base=2, table=alert}]] func rehashSet(seriesTagIndex1 map[uint64][]uint64, seriesStore map[uint64]models.DataResp, on []string) map[uint64][][]uint64 { reHashTagIndex := make(map[uint64][][]uint64) for _, seriesHashes := range seriesTagIndex1 { if len(seriesHashes) == 0 { continue } series, exists := seriesStore[seriesHashes[0]] if !exists { continue } rehash := hash.GetTargetTagHash(series.Metric, on) if _, ok := reHashTagIndex[rehash]; !ok { reHashTagIndex[rehash] = make([][]uint64, 0) } reHashTagIndex[rehash] = append(reHashTagIndex[rehash], seriesHashes) } return reHashTagIndex } // 笛卡尔积,查询的结果两两合并 func cartesianJoin(seriesTagIndex1 map[uint64][]uint64, seriesTagIndex2 map[uint64][]uint64) map[uint64][]uint64 { var index uint64 seriesTagIndex := make(map[uint64][]uint64) for _, seriesHashes1 := range seriesTagIndex1 { for _, seriesHashes2 := range seriesTagIndex2 { seriesTagIndex[index] = mergeNewArray(seriesHashes1, seriesHashes2) index++ } } return seriesTagIndex } // 
// noneJoin concatenates the two indexes without any matching: every group of
// seriesTagIndex1, then every group of seriesTagIndex2, is stored under a
// running counter starting at 0.
func noneJoin(seriesTagIndex1 map[uint64][]uint64, seriesTagIndex2 map[uint64][]uint64) map[uint64][]uint64 {
	joined := make(map[uint64][]uint64)
	next := uint64(0)
	for _, src := range [2]map[uint64][]uint64{seriesTagIndex1, seriesTagIndex2} {
		for _, group := range src {
			joined[next] = group
			next++
		}
	}
	return joined
}

// originalJoin is the original grouping scheme: groups that share the same
// key (i.e. whose labels are all identical) are concatenated into one group.
// The returned slices are fresh copies and do not alias the inputs.
func originalJoin(seriesTagIndex1 map[uint64][]uint64, seriesTagIndex2 map[uint64][]uint64) map[uint64][]uint64 {
	merged := make(map[uint64][]uint64)
	for _, src := range [2]map[uint64][]uint64{seriesTagIndex1, seriesTagIndex2} {
		for tagHash, group := range src {
			if existing, seen := merged[tagHash]; seen {
				merged[tagHash] = append(existing, group...)
				continue
			}
			// First time this key is seen: start from a private copy.
			fresh := make([]uint64, 0)
			merged[tagHash] = append(fresh, group...)
		}
	}
	return merged
}

// exclude is a left anti-join: it keeps the entries of reHashTagIndex1 whose
// key does not occur in reHashTagIndex2.
func exclude(reHashTagIndex1 map[uint64][][]uint64, reHashTagIndex2 map[uint64][][]uint64) map[uint64][][]uint64 {
	kept := make(map[uint64][][]uint64)
	for key, groups := range reHashTagIndex1 {
		if _, dropped := reHashTagIndex2[key]; dropped {
			continue
		}
		kept[key] = groups
	}
	return kept
}
// ProcessJoins combines the per-query series indexes of a rule into a single
// index according to the join configuration of trigger.
//
// seriesTagIndexes maps a query ref to that query's index (group key -> series
// hashes); seriesStore maps a series hash to the series itself and is only
// passed on to rehashSet.
//
// Behaviour:
//   - no indexes at all: an empty, non-nil map is returned;
//   - no joins configured: all indexes are folded together with originalJoin
//     (the first one ranged over is used as-is, not copied);
//   - joins configured but fewer than len(Joins)+1 indexes: an error is logged
//     and nil is returned;
//   - otherwise the index of trigger.JoinRef is the starting point and each
//     join is applied in order; an unsupported join type is logged and skipped.
func ProcessJoins(ruleId int64, trigger models.Trigger, seriesTagIndexes map[string]map[uint64][]uint64, seriesStore map[uint64]models.DataResp) map[uint64][]uint64 {
	last := make(map[uint64][]uint64)
	if len(seriesTagIndexes) == 0 {
		return last
	}

	if len(trigger.Joins) == 0 {
		idx := 0
		for _, seriesTagIndex := range seriesTagIndexes {
			if idx == 0 {
				last = seriesTagIndex
			} else {
				last = originalJoin(last, seriesTagIndex)
			}
			idx++
		}
		return last
	}

	// Join conditions are present: merge the queries one join at a time.
	if len(seriesTagIndexes) < len(trigger.Joins)+1 {
		logger.Errorf("alert_eval_%d queries' count: %d not match join condition's count: %d", ruleId, len(seriesTagIndexes), len(trigger.Joins))
		return nil
	}

	last = seriesTagIndexes[trigger.JoinRef]
	// NOTE(review): lastRehashed is seeded with the On labels of the first
	// join only, and is not recomputed after "original"/"none"/"cartesian"
	// steps or when a later join uses different On labels — confirm this is
	// the intended behaviour for mixed join chains.
	lastRehashed := rehashSet(last, seriesStore, trigger.Joins[0].On)
	for i := range trigger.Joins {
		cur := seriesTagIndexes[trigger.Joins[i].Ref]
		switch trigger.Joins[i].JoinType {
		case "original":
			last = originalJoin(last, cur)
		case "none":
			last = noneJoin(last, cur)
		case "cartesian":
			last = cartesianJoin(last, cur)
		case "inner_join":
			curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
			lastRehashed = onJoin(lastRehashed, curRehashed, Inner)
			last = flatten(lastRehashed)
		case "left_join":
			curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
			lastRehashed = onJoin(lastRehashed, curRehashed, Left)
			last = flatten(lastRehashed)
		case "right_join":
			// A right join is a left join with the operands swapped.
			curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
			lastRehashed = onJoin(curRehashed, lastRehashed, Right)
			last = flatten(lastRehashed)
		case "left_exclude":
			curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
			lastRehashed = exclude(lastRehashed, curRehashed)
			last = flatten(lastRehashed)
		case "right_exclude":
			// A right exclude is a left exclude with the operands swapped.
			curRehashed := rehashSet(cur, seriesStore, trigger.Joins[i].On)
			lastRehashed = exclude(curRehashed, lastRehashed)
			last = flatten(lastRehashed)
		default:
			logger.Warningf("alert_eval_%d join type:%s not support", ruleId, trigger.Joins[i].JoinType)
		}
	}
	return last
}
// GetQueryRef extracts the "ref" identifier of a query.
//
// Two shapes are accepted:
//   - map[string]interface{}: the "ref" entry must exist and be a string;
//   - a struct, or a pointer to one: the exported field Ref must exist and be
//     of kind string.
//
// Any other input yields an error.
func GetQueryRef(query interface{}) (string, error) {
	// Map form first.
	if m, isMap := query.(map[string]interface{}); isMap {
		ref, exists := m["ref"]
		if !exists {
			return "", fmt.Errorf("query 中没有找到 ref 字段")
		}
		refStr, isString := ref.(string)
		if !isString {
			return "", fmt.Errorf("ref 字段不是字符串类型")
		}
		return refStr, nil
	}

	// Otherwise fall back to reflection on a struct (or pointer to struct).
	v := reflect.ValueOf(query)
	if v.Kind() == reflect.Ptr {
		v = v.Elem()
	}
	if v.Kind() != reflect.Struct {
		return "", fmt.Errorf("query not a struct or map")
	}

	refField := v.FieldByName("Ref")
	if !refField.IsValid() {
		return "", fmt.Errorf("not find ref field")
	}
	if refField.Kind() != reflect.String {
		return "", fmt.Errorf("ref not a string")
	}
	return refField.String(), nil
}

// getString renders a threshold value as a string. Strings are returned
// unchanged; floating-point and integer values (signed or unsigned, any
// width) and json.Number are formatted in plain decimal notation. Any other
// type, including nil, yields "".
func getString(query interface{}) string {
	switch v := query.(type) {
	case string:
		return v
	case float64:
		return strconv.FormatFloat(v, 'f', -1, 64)
	case float32:
		return strconv.FormatFloat(float64(v), 'f', -1, 32)
	case json.Number:
		return v.String()
	}

	// Remaining numeric kinds are handled through reflection so that every
	// integer width is covered without one case per type.
	rv := reflect.ValueOf(query)
	switch rv.Kind() {
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return strconv.FormatInt(rv.Int(), 10)
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return strconv.FormatUint(rv.Uint(), 10)
	default:
		return ""
	}
}

// GetQueryRefAndUnit extracts the "ref" and "unit" JSON fields of a query by
// round-tripping it through encoding/json.
//
// Only a marshalling failure is reported. Decoding is best-effort: fields that
// are absent or cannot be decoded into a string are returned as "" and the
// error is nil.
func GetQueryRefAndUnit(query interface{}) (string, string, error) {
	type refAndUnit struct {
		Ref  string `json:"ref"`
		Unit string `json:"unit"`
	}

	raw, err := json.Marshal(query)
	if err != nil {
		return "", "", err
	}

	var decoded refAndUnit
	// Best-effort decode: Unmarshal fills the fields it can and skips the
	// rest, so its error is intentionally not propagated.
	_ = json.Unmarshal(raw, &decoded)
	return decoded.Ref, decoded.Unit, nil
}
VarConfigForCalc.ParamVal[0] = make(map[string]models.ParamQuery) for _, p := range query.VarConfig.ParamVal { VarConfigForCalc.ParamVal[0][p.Name] = models.ParamQuery{ ParamType: p.ParamType, Query: p.Query, } } // 使用一个统一的参数变量顺序 var ParamKeys []string for val, valQuery := range VarConfigForCalc.ParamVal[0] { if valQuery.ParamType == "threshold" { continue } ParamKeys = append(ParamKeys, val) } sort.Slice(ParamKeys, func(i, j int) bool { return ParamKeys[i] < ParamKeys[j] }) // 遍历变量配置链表 curNode := VarConfigForCalc for curNode != nil { for _, param := range curNode.ParamVal { curPromql := query.PromQl // 取出阈值变量 valMap := make(map[string]string) for val, valQuery := range param { if valQuery.ParamType == "threshold" { valMap[val] = getString(valQuery.Query) } } // 替换阈值变量 for key, val := range valMap { curPromql = strings.Replace(curPromql, fmt.Sprintf("$%s", key), val, -1) } // 得到参数变量的所有组合 paramPermutation, err := arw.getParamPermutation(param, ParamKeys, varToLabel, query.PromQl, readerClient) if err != nil { logger.Errorf("alert_eval_%d datasource_%d paramPermutation error:%v", arw.Rule.Id, arw.DatasourceId, err) continue } keyToPromql := make(map[string]string) for paramPermutationKeys, _ := range paramPermutation { realPromql := curPromql split := strings.Split(paramPermutationKeys, JoinMark) for j := range ParamKeys { realPromql = fillVar(realPromql, ParamKeys[j], split[j]) } keyToPromql[paramPermutationKeys] = realPromql } // 并发查询 wg := sync.WaitGroup{} semaphore := make(chan struct{}, 200) for key, promql := range keyToPromql { wg.Add(1) semaphore <- struct{}{} go func(key, promql string) { defer func() { <-semaphore wg.Done() }() arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", arw.Rule.Id)).Inc() value, _, err := readerClient.Query(context.Background(), promql, time.Now()) if err != nil { logger.Errorf("alert_eval_%d datasource_%d promql:%s, error:%v", arw.Rule.Id, arw.DatasourceId, promql, err) 
// ExtractVarMapping scans the `{...}` selectors of promql and returns, for
// every label matcher whose value is a variable (starts with '$'), a mapping
// from the variable name to the label name. It is used to put label values
// back into the right place of the promql after a query.
//
//	input:  sum(rate(mem_used_percent{host="$my_host"})) by (instance) + avg(node_load1{region="$region"}) > $val
//	output: map[string]string{"my_host": "host", "region": "region"}
//
// Parsing is purely lexical: matchers are split on ',' and on the first
// operator found among "!=", "=~", "!~", "="; a matcher that does not split
// into exactly two parts is ignored. Scanning stops at the first '{' that has
// no closing '}'.
func ExtractVarMapping(promql string) map[string]string {
	varMapping := make(map[string]string)

	// Walk over every brace pair.
	for {
		start := strings.Index(promql, "{")
		if start == -1 {
			break
		}
		// Look for the closing brace only after the opening one; a stray '}'
		// earlier in the string would otherwise yield end < start and make
		// the slice expression below panic.
		rel := strings.Index(promql[start+1:], "}")
		if rel == -1 {
			break
		}
		end := start + 1 + rel

		// Split the selector into matchers.
		for _, pair := range strings.Split(promql[start+1:end], ",") {
			var kv []string
			switch {
			case strings.Contains(pair, "!="):
				kv = strings.Split(pair, "!=")
			case strings.Contains(pair, "=~"):
				kv = strings.Split(pair, "=~")
			case strings.Contains(pair, "!~"):
				kv = strings.Split(pair, "!~")
			default:
				kv = strings.Split(pair, "=")
			}
			if len(kv) != 2 {
				continue
			}

			key := strings.TrimSpace(kv[0])
			value := strings.Trim(strings.TrimSpace(kv[1]), "\"")
			value = strings.Trim(value, "'")

			// A value starting with '$' is a variable reference.
			if strings.HasPrefix(value, "$") {
				varMapping[value[1:]] = key
			}
		}

		// Continue with the remainder of the expression.
		promql = promql[end+1:]
	}
	return varMapping
}

// fillVar replaces every quoted occurrence of the variable paramKey in
// curRealQuery ('$name' or "$name") with val, keeping the surrounding quotes.
// Unquoted occurrences, and quoted variables that merely share a prefix with
// paramKey, are left untouched.
func fillVar(curRealQuery string, paramKey string, val string) string {
	curRealQuery = strings.ReplaceAll(curRealQuery, fmt.Sprintf("'$%s'", paramKey), fmt.Sprintf("'%s'", val))
	curRealQuery = strings.ReplaceAll(curRealQuery, fmt.Sprintf("\"$%s\"", paramKey), fmt.Sprintf("\"%s\"", val))
	return curRealQuery
}
arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), "", ).Set(0) return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d ruleConfig is blank", rule.Id, dsId) } var ruleQuery models.RuleQuery err := json.Unmarshal([]byte(ruleConfig), &ruleQuery) if err != nil { logger.Warningf("alert_eval_%d datasource_%d promql parse error:%s", rule.Id, dsId, err.Error()) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_RULE_CONFIG, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d promql parse error:%s", rule.Id, dsId, err.Error()) } arw.Inhibit = ruleQuery.Inhibit if len(ruleQuery.Queries) > 0 { seriesStore := make(map[uint64]models.DataResp) seriesTagIndexes := make(map[string]map[uint64][]uint64, 0) for i, query := range ruleQuery.Queries { seriesTagIndex := make(map[uint64][]uint64) plug, exists := dscache.DsCache.Get(rule.Cate, dsId) if !exists { logger.Warningf("alert_eval_%d datasource_%d not exists", rule.Id, dsId) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(-2) return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d not exists", rule.Id, dsId) } if err = ExecuteQueryTemplate(rule.Cate, query, nil); err != nil { logger.Warningf("alert_eval_%d datasource_%d execute query template error: %v", rule.Id, dsId, err) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", 
arw.Processor.DatasourceId()), EXEC_TEMPLATE, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(-3) } ctx := context.WithValue(context.Background(), "delay", int64(rule.Delay)) series, err := plug.QueryData(ctx, query) arw.Processor.Stats.CounterQueryDataTotal.WithLabelValues(fmt.Sprintf("%d", arw.DatasourceId), fmt.Sprintf("%d", rule.Id)).Inc() if err != nil { logger.Warningf("alert_eval_%d datasource_%d query data error: %v", rule.Id, dsId, err) arw.Processor.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", arw.Processor.DatasourceId()), GET_CLIENT, arw.Processor.BusiGroupCache.GetNameByBusiGroupId(arw.Rule.GroupId), fmt.Sprintf("%v", arw.Rule.Id)).Inc() arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(-1) return points, recoverPoints, fmt.Errorf("alert_eval_%d datasource_%d query data error: %v", rule.Id, dsId, err) } arw.Processor.Stats.GaugeQuerySeriesCount.WithLabelValues( fmt.Sprintf("%v", arw.Rule.Id), fmt.Sprintf("%v", arw.Processor.DatasourceId()), fmt.Sprintf("%v", i), ).Set(float64(len(series))) // 此条日志很重要,是告警判断的现场值 logger.Infof("alert_eval_%d datasource_%d req:%+v resp:%v", rule.Id, dsId, query, series) for i := 0; i < len(series); i++ { seriesHash := hash.GetHash(series[i].Metric, series[i].Ref) tagHash := hash.GetTagHash(series[i].Metric) seriesStore[seriesHash] = series[i] // 将曲线按照相同的 tag 分组 if _, exists := seriesTagIndex[tagHash]; !exists { seriesTagIndex[tagHash] = make([]uint64, 0) } seriesTagIndex[tagHash] = append(seriesTagIndex[tagHash], seriesHash) } ref, err := GetQueryRef(query) if err != nil { logger.Warningf("alert_eval_%d datasource_%d query:%+v get ref error:%s", rule.Id, dsId, 
query, err.Error()) continue } seriesTagIndexes[ref] = seriesTagIndex } unitMap := make(map[string]string) for _, query := range ruleQuery.Queries { ref, unit, err := GetQueryRefAndUnit(query) if err != nil { logger.Warningf("alert_eval_%d datasource_%d query:%+v get ref and unit error:%s", rule.Id, dsId, query, err.Error()) continue } unitMap[ref] = unit } if !ruleQuery.ExpTriggerDisable { for _, trigger := range ruleQuery.Triggers { seriesTagIndex := ProcessJoins(rule.Id, trigger, seriesTagIndexes, seriesStore) for _, seriesHash := range seriesTagIndex { valuesUnitMap := make(map[string]unit.FormattedValue) sort.Slice(seriesHash, func(i, j int) bool { return seriesHash[i] < seriesHash[j] }) m := make(map[string]interface{}) var ts int64 var sample models.DataResp var value float64 for _, seriesHash := range seriesHash { series, exists := seriesStore[seriesHash] if !exists { logger.Warningf("alert_eval_%d datasource_%d series:%+v not found", rule.Id, dsId, series) continue } t, v, exists := series.Last() if !exists { logger.Warningf("alert_eval_%d datasource_%d series:%+v value not found", rule.Id, dsId, series) continue } if !strings.Contains(trigger.Exp, "$"+series.Ref) { // 表达式中不包含该变量 continue } m["$"+series.Ref] = v m["$"+series.Ref+"."+series.MetricName()] = v for k, v := range series.Metric { if k == "__name__" { continue } if !strings.Contains(trigger.Exp, "$"+series.Ref+"."+string(k)) { // 过滤掉表达式中不包含的标签 continue } m["$"+series.Ref+"."+string(k)] = string(v) } if u, exists := unitMap[series.Ref]; exists { valuesUnitMap["$"+series.Ref+"."+series.MetricName()] = unit.ValueFormatter(u, 2, v) } ts = int64(t) sample = series value = v logger.Infof("alert_eval_%d datasource_%d origin series labels:%+v", rule.Id, dsId, series.Metric) } isTriggered := parser.CalcWithRid(trigger.Exp, m, rule.Id) // 此条日志很重要,是告警判断的现场值 logger.Infof("alert_eval_%d datasource_%d trigger:%+v exp:%s res:%v m:%v", rule.Id, dsId, trigger, trigger.Exp, isTriggered, m) var values string for k, 
v := range m { if !strings.Contains(k, ".") { continue } if u, exists := valuesUnitMap[k]; exists { // 配置了单位,优先用配置了单位的值 values += fmt.Sprintf("%s:%s ", k, u.Text) } else { switch v.(type) { case float64: values += fmt.Sprintf("%s:%.3f ", k, v) case string: values += fmt.Sprintf("%s:%s ", k, v) } } } queries := ruleQuery.Queries if sample.Query != "" { queries = []interface{}{sample.Query} } point := models.AnomalyPoint{ Key: sample.MetricName(), Labels: sample.Metric, Timestamp: int64(ts), Value: value, Values: values, Severity: trigger.Severity, Triggered: isTriggered, Query: fmt.Sprintf("query:%+v trigger:%+v", queries, trigger), RecoverConfig: trigger.RecoverConfig, ValuesUnit: valuesUnitMap, } if isTriggered { points = append(points, point) } else { switch trigger.RecoverConfig.JudgeType { case models.Origin: // do nothing case models.RecoverOnCondition: fulfill := parser.CalcWithRid(trigger.RecoverConfig.RecoverExp, m, rule.Id) if !fulfill { continue } } recoverPoints = append(recoverPoints, point) } } } } if ruleQuery.NodataTrigger.Enable { now := time.Now().Unix() // 使用 arw.LastSeriesStore 检查上次查询结果 if len(arw.LastSeriesStore) > 0 { // 遍历上次的曲线数据 for hash, lastSeries := range arw.LastSeriesStore { if ruleQuery.NodataTrigger.ResolveAfterEnable { lastTs, _, exists := lastSeries.Last() if !exists { continue } // 检查是否超过 resolve_after 时间 if now-int64(lastTs) > int64(ruleQuery.NodataTrigger.ResolveAfter) { logger.Infof("alert_eval_%d datasource_%d series:%+v resolve after %d seconds now:%d lastTs:%d", rule.Id, dsId, lastSeries, ruleQuery.NodataTrigger.ResolveAfter, now, int64(lastTs)) delete(arw.LastSeriesStore, hash) continue } } // 检查是否在本次查询结果中存在 if _, exists := seriesStore[hash]; !exists { // 生成无数据告警点 point := models.AnomalyPoint{ Key: lastSeries.MetricName(), Labels: lastSeries.Metric, Timestamp: now, Value: 0, Values: fmt.Sprintf("nodata since %v", time.Unix(now, 0).Format("2006-01-02 15:04:05")), Severity: ruleQuery.NodataTrigger.Severity, Triggered: true, 
Query: fmt.Sprintf("nodata check for %s", lastSeries.LabelsString()), TriggerType: models.TriggerTypeNodata, } points = append(points, point) logger.Infof("alert_eval_%d datasource_%d nodata point:%+v", rule.Id, dsId, point) } } } // 更新 arw.LastSeriesStore for hash, series := range seriesStore { arw.LastSeriesStore[hash] = series } } } return points, recoverPoints, nil } // ExecuteQueryTemplate 根据数据源类型对 Query 进行模板渲染处理 // cate: 数据源类别,如 "mysql", "pgsql" 等 // query: 查询对象,如果是数据库类型的数据源,会处理其中的 sql 字段 // data: 模板数据对象,如果为 nil 则使用空结构体(不支持变量渲染),如果不为 nil 则使用传入的数据(支持变量渲染) func ExecuteQueryTemplate(cate string, query interface{}, data interface{}) error { // 检查 query 是否是 map,且包含 sql 字段 queryMap, ok := query.(map[string]interface{}) if !ok { return nil } sqlVal, exists := queryMap["sql"] if !exists { return nil } sqlStr, ok := sqlVal.(string) if !ok { return nil } // 调用 ExecuteSqlTemplate 处理 sql 字段 processedSQL, err := ExecuteSqlTemplate(sqlStr, data) if err != nil { return fmt.Errorf("execute sql template error: %w", err) } // 更新 query 中的 sql 字段 queryMap["sql"] = processedSQL return nil } // ExecuteSqlTemplate 执行 query 中的 golang 模板语法函数 // query: 要处理的 query 字符串 // data: 模板数据对象,如果为 nil 则使用空结构体(不支持变量渲染),如果不为 nil 则使用传入的数据(支持变量渲染) func ExecuteSqlTemplate(query string, data interface{}) (string, error) { if !strings.Contains(query, "{{") || !strings.Contains(query, "}}") { return query, nil } tmpl, err := template.New("query").Funcs(tplx.TemplateFuncMap).Parse(query) if err != nil { return "", fmt.Errorf("query tmpl parse error: %w", err) } var buf strings.Builder templateData := data if templateData == nil { templateData = struct{}{} } if err := tmpl.Execute(&buf, templateData); err != nil { return "", fmt.Errorf("query tmpl execute error: %w", err) } return buf.String(), nil } ================================================ FILE: alert/eval/eval_test.go ================================================ package eval import ( "reflect" "testing" "golang.org/x/exp/slices" ) var ( 
reHashTagIndex1 = map[uint64][][]uint64{ 1: { {1, 2}, {3, 4}, }, 2: { {5, 6}, {7, 8}, }, } reHashTagIndex2 = map[uint64][][]uint64{ 1: { {9, 10}, {11, 12}, }, 3: { {13, 14}, {15, 16}, }, } seriesTagIndex1 = map[uint64][]uint64{ 1: {1, 2, 3, 4}, 2: {5, 6, 7, 8}, } seriesTagIndex2 = map[uint64][]uint64{ 1: {9, 10, 11, 12}, 3: {13, 14, 15, 16}, } ) func Test_originalJoin(t *testing.T) { type args struct { seriesTagIndex1 map[uint64][]uint64 seriesTagIndex2 map[uint64][]uint64 } tests := []struct { name string args args want map[uint64][]uint64 }{ { name: "original join", args: args{ seriesTagIndex1: map[uint64][]uint64{ 1: {1, 2, 3, 4}, 2: {5, 6, 7, 8}, }, seriesTagIndex2: map[uint64][]uint64{ 1: {9, 10, 11, 12}, 3: {13, 14, 15, 16}, }, }, want: map[uint64][]uint64{ 1: {1, 2, 3, 4, 9, 10, 11, 12}, 2: {5, 6, 7, 8}, 3: {13, 14, 15, 16}, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := originalJoin(tt.args.seriesTagIndex1, tt.args.seriesTagIndex2); !reflect.DeepEqual(got, tt.want) { t.Errorf("originalJoin() = %v, want %v", got, tt.want) } }) } } func Test_exclude(t *testing.T) { type args struct { reHashTagIndex1 map[uint64][][]uint64 reHashTagIndex2 map[uint64][][]uint64 } tests := []struct { name string args args want map[uint64][]uint64 }{ { name: "left exclude", args: args{ reHashTagIndex1: reHashTagIndex1, reHashTagIndex2: reHashTagIndex2, }, want: map[uint64][]uint64{ 0: {5, 6}, 1: {7, 8}, }, }, { name: "right exclude", args: args{ reHashTagIndex1: reHashTagIndex2, reHashTagIndex2: reHashTagIndex1, }, want: map[uint64][]uint64{ 3: {13, 14}, 4: {15, 16}, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := exclude(tt.args.reHashTagIndex1, tt.args.reHashTagIndex2); !allValueDeepEqual(flatten(got), tt.want) { t.Errorf("exclude() = %v, want %v", got, tt.want) } }) } } func Test_noneJoin(t *testing.T) { type args struct { seriesTagIndex1 map[uint64][]uint64 seriesTagIndex2 map[uint64][]uint64 } tests := 
[]struct { name string args args want map[uint64][]uint64 }{ { name: "none join, direct splicing", args: args{ seriesTagIndex1: seriesTagIndex1, seriesTagIndex2: seriesTagIndex2, }, want: map[uint64][]uint64{ 0: {1, 2, 3, 4}, 1: {5, 6, 7, 8}, 2: {9, 10, 11, 12}, 3: {13, 14, 15, 16}, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := noneJoin(tt.args.seriesTagIndex1, tt.args.seriesTagIndex2); !allValueDeepEqual(got, tt.want) { t.Errorf("noneJoin() = %v, want %v", got, tt.want) } }) } } func Test_cartesianJoin(t *testing.T) { type args struct { seriesTagIndex1 map[uint64][]uint64 seriesTagIndex2 map[uint64][]uint64 } tests := []struct { name string args args want map[uint64][]uint64 }{ { name: "cartesian join", args: args{ seriesTagIndex1: seriesTagIndex1, seriesTagIndex2: seriesTagIndex2, }, want: map[uint64][]uint64{ 0: {1, 2, 3, 4, 9, 10, 11, 12}, 1: {5, 6, 7, 8, 9, 10, 11, 12}, 2: {5, 6, 7, 8, 13, 14, 15, 16}, 3: {1, 2, 3, 4, 13, 14, 15, 16}, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := cartesianJoin(tt.args.seriesTagIndex1, tt.args.seriesTagIndex2); !allValueDeepEqual(got, tt.want) { t.Errorf("cartesianJoin() = %v, want %v", got, tt.want) } }) } } func Test_onJoin(t *testing.T) { type args struct { reHashTagIndex1 map[uint64][][]uint64 reHashTagIndex2 map[uint64][][]uint64 joinType JoinType } tests := []struct { name string args args want map[uint64][]uint64 }{ { name: "left join", args: args{ reHashTagIndex1: reHashTagIndex1, reHashTagIndex2: reHashTagIndex2, joinType: Left, }, want: map[uint64][]uint64{ 1: {1, 2, 9, 10}, 2: {3, 4, 9, 10}, 3: {1, 2, 11, 12}, 4: {3, 4, 11, 12}, 5: {5, 6}, 6: {7, 8}, }, }, { name: "right join", args: args{ reHashTagIndex1: reHashTagIndex2, reHashTagIndex2: reHashTagIndex1, joinType: Right, }, want: map[uint64][]uint64{ 1: {1, 2, 9, 10}, 2: {3, 4, 9, 10}, 3: {1, 2, 11, 12}, 4: {3, 4, 11, 12}, 5: {13, 14}, 6: {15, 16}, }, }, { name: "inner join", args: args{ 
// allValueDeepEqual reports whether got and want hold the same collection of
// values, ignoring the map keys and the order of elements inside each value.
//
// The comparison is a true multiset match: every value of want can be matched
// by at most one value of got, so duplicated values on one side cannot stand
// in for distinct values on the other. The inputs are not modified; sorting is
// done on copies.
func allValueDeepEqual(got, want map[uint64][]uint64) bool {
	if len(got) != len(want) {
		return false
	}

	// Sorted copies of the expected values that have not been matched yet.
	remaining := make([][]uint64, 0, len(want))
	for _, v := range want {
		sorted := slices.Clone(v)
		slices.Sort(sorted)
		remaining = append(remaining, sorted)
	}

	for _, v := range got {
		sorted := slices.Clone(v)
		slices.Sort(sorted)

		matched := -1
		for i, candidate := range remaining {
			if reflect.DeepEqual(sorted, candidate) {
				matched = i
				break
			}
		}
		if matched < 0 {
			return false
		}

		// Consume the matched entry so it cannot be matched a second time.
		last := len(remaining) - 1
		remaining[matched] = remaining[last]
		remaining = remaining[:last]
	}

	return true
}
{ name: "removeVal1", args: args{ promql: "mem{test1=\"$test1\",test2=\"$test2\",test3=\"$test3\"} > $val", }, want: "mem{} > $val", }, { name: "removeVal2", args: args{ promql: "mem{test1=\"test1\",test2=\"$test2\",test3=\"$test3\"} > $val", }, want: "mem{test1=\"test1\"} > $val", }, { name: "removeVal3", args: args{ promql: "mem{test1=\"$test1\",test2=\"test2\",test3=\"$test3\"} > $val", }, want: "mem{test2=\"test2\"} > $val", }, { name: "removeVal4", args: args{ promql: "mem{test1=\"$test1\",test2=\"$test2\",test3=\"test3\"} > $val", }, want: "mem{test3=\"test3\"} > $val", }, { name: "removeVal5", args: args{ promql: "mem{test1=\"$test1\",test2=\"test2\",test3=\"test3\"} > $val", }, want: "mem{test2=\"test2\",test3=\"test3\"} > $val", }, { name: "removeVal6", args: args{ promql: "mem{test1=\"test1\",test2=\"$test2\",test3=\"test3\"} > $val", }, want: "mem{test1=\"test1\",test3=\"test3\"} > $val", }, { name: "removeVal7", args: args{ promql: "mem{test1=\"test1\",test2=\"test2\",test3='$test3'} > $val", }, want: "mem{test1=\"test1\",test2=\"test2\"} > $val", }, { name: "removeVal8", args: args{ promql: "mem{test1=\"test1\",test2=\"test2\",test3=\"test3\"} > $val", }, want: "mem{test1=\"test1\",test2=\"test2\",test3=\"test3\"} > $val", }, { name: "removeVal9", args: args{ promql: "mem{test1=\"$test1\",test2=\"test2\"} > $val1 and mem{test3=\"test3\",test4=\"test4\"} > $val2", }, want: "mem{test2=\"test2\"} > $val1 and mem{test3=\"test3\",test4=\"test4\"} > $val2", }, { name: "removeVal10", args: args{ promql: "mem{test1=\"test1\",test2='$test2'} > $val1 and mem{test3=\"test3\",test4=\"test4\"} > $val2", }, want: "mem{test1=\"test1\"} > $val1 and mem{test3=\"test3\",test4=\"test4\"} > $val2", }, { name: "removeVal11", args: args{ promql: "mem{test1='test1',test2=\"test2\"} > $val1 and mem{test3=\"$test3\",test4=\"test4\"} > $val2", }, want: "mem{test1='test1',test2=\"test2\"} > $val1 and mem{test4=\"test4\"} > $val2", }, { name: "removeVal12", args: args{ promql: 
"mem{test1=\"test1\",test2=\"test2\"} > $val1 and mem{test3=\"test3\",test4=\"$test4\"} > $val2", }, want: "mem{test1=\"test1\",test2=\"test2\"} > $val1 and mem{test3=\"test3\"} > $val2", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := removeVal(tt.args.promql); got != tt.want { t.Errorf("removeVal() = %v, want %v", got, tt.want) } }) } } func TestExtractVarMapping(t *testing.T) { tests := []struct { name string promql string want map[string]string }{ { name: "单个花括号单个变量", promql: `mem_used_percent{host="$my_host"} > $val`, want: map[string]string{"my_host": "host"}, }, { name: "单个花括号多个变量", promql: `mem_used_percent{host="$my_host",region="$region",env="prod"} > $val`, want: map[string]string{"my_host": "host", "region": "region"}, }, { name: "多个花括号多个变量", promql: `sum(rate(mem_used_percent{host="$my_host"})) by (instance) + avg(node_load1{region="$region"}) > $val`, want: map[string]string{"my_host": "host", "region": "region"}, }, { name: "相同变量出现多次", promql: `sum(rate(mem_used_percent{host="$my_host"})) + avg(node_load1{host="$my_host"}) > $val`, want: map[string]string{"my_host": "host"}, }, { name: "没有变量", promql: `mem_used_percent{host="localhost",region="cn"} > 80`, want: map[string]string{}, }, { name: "没有花括号", promql: `80 > $val`, want: map[string]string{}, }, { name: "格式不规范的标签", promql: `mem_used_percent{host=$my_host,region = $region} > $val`, want: map[string]string{"my_host": "host", "region": "region"}, }, { name: "空花括号", promql: `mem_used_percent{} > $val`, want: map[string]string{}, }, { name: "不完整的花括号", promql: `mem_used_percent{host="$my_host"`, want: map[string]string{}, }, { name: "复杂表达式", promql: `sum(rate(http_requests_total{handler="$handler",code="$code"}[5m])) by (handler) / sum(rate(http_requests_total{handler="$handler"}[5m])) by (handler) * 100 > $threshold`, want: map[string]string{"handler": "handler", "code": "code"}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got := 
ExtractVarMapping(tt.promql) if !reflect.DeepEqual(got, tt.want) { t.Errorf("ExtractVarMapping() = %v, want %v", got, tt.want) } }) } } ================================================ FILE: alert/mute/mute.go ================================================ package mute import ( "slices" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/alert/common" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/pkg/errors" "github.com/toolkits/pkg/logger" ) func IsMuted(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType, alertMuteCache *memsto.AlertMuteCacheType) (bool, string, int64) { if rule.Disabled == 1 { return true, "rule disabled", 0 } if TimeSpanMuteStrategy(rule, event) { return true, "rule is not effective for period of time", 0 } if IdentNotExistsMuteStrategy(rule, event, targetCache) { return true, "ident not exists mute", 0 } if BgNotMatchMuteStrategy(rule, event, targetCache) { return true, "bg not match mute", 0 } hit, muteId := EventMuteStrategy(event, alertMuteCache) if hit { return true, "match mute rule", muteId } return false, "", 0 } // TimeSpanMuteStrategy 根据规则配置的告警生效时间段过滤,如果产生的告警不在规则配置的告警生效时间段内,则不告警,即被mute // 时间范围,左闭右开,默认范围:00:00-24:00 // 如果规则配置了时区,则在该时区下进行时间判断;如果时区为空,则使用系统时区 func TimeSpanMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent) bool { // 确定使用的时区 var targetLoc *time.Location var err error timezone := rule.TimeZone if timezone == "" { // 如果时区为空,使用系统时区(保持原有逻辑) targetLoc = time.Local } else { // 加载规则配置的时区 targetLoc, err = time.LoadLocation(timezone) if err != nil { // 如果时区加载失败,记录错误并使用系统时区 logger.Warningf("Failed to load timezone %s for rule %d, using system timezone: %v", timezone, rule.Id, err) targetLoc = time.Local } } // 将触发时间转换到目标时区 tm := time.Unix(event.TriggerTime, 0).In(targetLoc) triggerTime := tm.Format("15:04") triggerWeek := strconv.Itoa(int(tm.Weekday())) if rule.EnableDaysOfWeek == "" { // 如果规则没有配置生效时间,则默认全天生效 return false } 
enableStime := strings.Fields(rule.EnableStime) enableEtime := strings.Fields(rule.EnableEtime) enableDaysOfWeek := strings.Split(rule.EnableDaysOfWeek, ";") length := len(enableDaysOfWeek) // enableStime,enableEtime,enableDaysOfWeek三者长度肯定相同,这里循环一个即可 for i := 0; i < length; i++ { enableDaysOfWeek[i] = strings.Replace(enableDaysOfWeek[i], "7", "0", 1) if !strings.Contains(enableDaysOfWeek[i], triggerWeek) { continue } if enableStime[i] < enableEtime[i] { if enableEtime[i] == "23:59" { // 02:00-23:59,这种情况做个特殊处理,相当于左闭右闭区间了 if triggerTime < enableStime[i] { // mute, 即没生效 continue } } else { // 02:00-04:00 或者 02:00-24:00 if triggerTime < enableStime[i] || triggerTime >= enableEtime[i] { // mute, 即没生效 continue } } } else if enableStime[i] > enableEtime[i] { // 21:00-09:00 if triggerTime < enableStime[i] && triggerTime >= enableEtime[i] { // mute, 即没生效 continue } } // 到这里说明当前时刻在告警规则的某组生效时间范围内,即没有 mute,直接返回 false return false } return true } // IdentNotExistsMuteStrategy 根据ident是否存在过滤,如果ident不存在,则target_up的告警直接过滤掉 func IdentNotExistsMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType) bool { ident, has := event.TagsMap["ident"] if !has { return false } _, exists := targetCache.Get(ident) // 如果是target_up的告警,且ident已经不存在了,直接过滤掉 // 这里的判断有点太粗暴了,但是目前没有更好的办法 if !exists && strings.Contains(rule.PromQl, "target_up") { logger.Debugf("alert_eval_%d [IdentNotExistsMuteStrategy] mute: cluster:%s ident:%s", rule.Id, event.Cluster, ident) return true } return false } // BgNotMatchMuteStrategy 当规则开启只在bg内部告警时,对于非bg内部的机器过滤 func BgNotMatchMuteStrategy(rule *models.AlertRule, event *models.AlertCurEvent, targetCache *memsto.TargetCacheType) bool { // 没有开启BG内部告警,直接不过滤 if rule.EnableInBG == 0 { return false } ident, has := event.TagsMap["ident"] if !has { return false } target, exists := targetCache.Get(ident) // 对于包含ident的告警事件,check一下ident所属bg和rule所属bg是否相同 // 如果告警规则选择了只在本BG生效,那其他BG的机器就不能因此规则产生告警 if exists && !target.MatchGroupId(rule.GroupId) { 
logger.Debugf("alert_eval_%d [BgNotMatchMuteStrategy] mute: cluster:%s", rule.Id, event.Cluster) return true } return false } func EventMuteStrategy(event *models.AlertCurEvent, alertMuteCache *memsto.AlertMuteCacheType) (bool, int64) { mutes, has := alertMuteCache.Gets(event.GroupId) if !has || len(mutes) == 0 { return false, 0 } for i := 0; i < len(mutes); i++ { matched, _ := MatchMute(event, mutes[i]) if matched { return true, mutes[i].Id } } return false, 0 } // MatchMute 如果传入了clock这个可选参数,就表示使用这个clock表示的时间,否则就从event的字段中取TriggerTime func MatchMute(event *models.AlertCurEvent, mute *models.AlertMute, clock ...int64) (bool, error) { if mute.Disabled == 1 { return false, errors.New("mute is disabled") } // 如果不是全局的,判断 匹配的 datasource id if len(mute.DatasourceIdsJson) != 0 && mute.DatasourceIdsJson[0] != 0 && event.DatasourceId != 0 { if !slices.Contains(mute.DatasourceIdsJson, event.DatasourceId) { return false, errors.New("datasource id not match") } } if mute.MuteTimeType == models.TimeRange { if !mute.IsWithinTimeRange(event.TriggerTime) { return false, errors.New("event trigger time not within mute time range") } } else if mute.MuteTimeType == models.Periodic { ts := event.TriggerTime if len(clock) > 0 { ts = clock[0] } if !mute.IsWithinPeriodicMute(ts) { return false, errors.New("event trigger time not within periodic mute range") } } else { logger.Warningf("mute time type invalid, %d", mute.MuteTimeType) return false, errors.New("mute time type invalid") } var matchSeverity bool if len(mute.SeveritiesJson) > 0 { for _, s := range mute.SeveritiesJson { if event.Severity == s || s == 0 { matchSeverity = true break } } } else { matchSeverity = true } if !matchSeverity { return false, errors.New("event severity not match mute severity") } if len(mute.ITags) == 0 { return true, nil } if !common.MatchTags(event.TagsMap, mute.ITags) { return false, errors.New("event tags not match mute tags") } return true, nil } ================================================ FILE: 
alert/naming/hashring.go ================================================ package naming import ( "errors" "sync" "github.com/toolkits/pkg/consistent" "github.com/toolkits/pkg/logger" ) const NodeReplicas = 500 type DatasourceHashRingType struct { sync.RWMutex Rings map[string]*consistent.Consistent } // for alert_rule sharding var HostDatasource int64 = 99999999 var DatasourceHashRing = DatasourceHashRingType{Rings: make(map[string]*consistent.Consistent)} func NewConsistentHashRing(replicas int32, nodes []string) *consistent.Consistent { ret := consistent.New() ret.NumberOfReplicas = int(replicas) for i := 0; i < len(nodes); i++ { ret.Add(nodes[i]) } return ret } func RebuildConsistentHashRing(datasourceId string, nodes []string) { r := consistent.New() r.NumberOfReplicas = NodeReplicas for i := 0; i < len(nodes); i++ { r.Add(nodes[i]) } DatasourceHashRing.Set(datasourceId, r) logger.Infof("hash ring %s rebuild %+v", datasourceId, r.Members()) } func (chr *DatasourceHashRingType) GetNode(datasourceId string, pk string) (string, error) { chr.Lock() defer chr.Unlock() _, exists := chr.Rings[datasourceId] if !exists { chr.Rings[datasourceId] = NewConsistentHashRing(int32(NodeReplicas), []string{}) } return chr.Rings[datasourceId].Get(pk) } func (chr *DatasourceHashRingType) IsHit(datasourceId string, pk string, currentNode string) bool { node, err := chr.GetNode(datasourceId, pk) if err != nil { if !errors.Is(err, consistent.ErrEmptyCircle) { logger.Errorf("rule id:%s is not work, datasource id:%s failed to get node from hashring:%v", pk, datasourceId, err) } return false } return node == currentNode } func (chr *DatasourceHashRingType) Set(datasourceId string, r *consistent.Consistent) { chr.Lock() defer chr.Unlock() chr.Rings[datasourceId] = r } func (chr *DatasourceHashRingType) Del(datasourceId string) { chr.Lock() defer chr.Unlock() delete(chr.Rings, datasourceId) } func (chr *DatasourceHashRingType) Clear(engineName string) { chr.Lock() defer chr.Unlock() for 
id := range chr.Rings { if id == engineName { continue } delete(chr.Rings, id) } } ================================================ FILE: alert/naming/heartbeat.go ================================================ package naming import ( "fmt" "sort" "strings" "time" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/poster" "github.com/toolkits/pkg/logger" ) type Naming struct { ctx *ctx.Context heartbeatConfig aconf.HeartbeatConfig astats *astats.Stats } func NewNaming(ctx *ctx.Context, heartbeat aconf.HeartbeatConfig, alertStats *astats.Stats) *Naming { naming := &Naming{ ctx: ctx, heartbeatConfig: heartbeat, astats: alertStats, } naming.Heartbeats() return naming } // local servers var localss map[int64]string var localHostServers map[string]string func (n *Naming) Heartbeats() error { localss = make(map[int64]string) localHostServers = make(map[string]string) if err := n.heartbeat(); err != nil { fmt.Println("failed to heartbeat:", err) return err } go n.loopHeartbeat() go n.loopDeleteInactiveInstances() return nil } func (n *Naming) loopDeleteInactiveInstances() { if !n.ctx.IsCenter { return } interval := time.Duration(10) * time.Minute for { time.Sleep(interval) n.DeleteInactiveInstances() } } func (n *Naming) DeleteInactiveInstances() { err := models.DB(n.ctx).Where("clock < ?", time.Now().Unix()-600).Delete(new(models.AlertingEngines)).Error if err != nil { logger.Errorf("delete inactive instances err:%v", err) } } func (n *Naming) loopHeartbeat() { interval := time.Duration(n.heartbeatConfig.Interval) * time.Millisecond for { time.Sleep(interval) if err := n.heartbeat(); err != nil { logger.Warning(err) } } } func (n *Naming) heartbeat() error { var datasourceIds []int64 var err error // 在页面上维护实例和集群的对应关系 datasourceIds, err = models.GetDatasourceIdsByEngineName(n.ctx, n.heartbeatConfig.EngineName) 
if err != nil { return err } if len(datasourceIds) == 0 { err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, 0) if err != nil { logger.Warningf("heartbeat with cluster %s err:%v", "", err) n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc() } } else { for i := 0; i < len(datasourceIds); i++ { err := models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, datasourceIds[i]) if err != nil { logger.Warningf("heartbeat with cluster %d err:%v", datasourceIds[i], err) n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc() } } } if len(datasourceIds) == 0 { DatasourceHashRing.Clear(n.heartbeatConfig.EngineName) for dsId := range localss { delete(localss, dsId) } } newDatasource := make(map[int64]struct{}) for i := 0; i < len(datasourceIds); i++ { newDatasource[datasourceIds[i]] = struct{}{} servers, err := n.ActiveServers(datasourceIds[i]) if err != nil { logger.Warningf("heartbeat %d get active server err:%v", datasourceIds[i], err) n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc() continue } sort.Strings(servers) newss := strings.Join(servers, " ") oldss, exists := localss[datasourceIds[i]] if exists && oldss == newss { continue } RebuildConsistentHashRing(fmt.Sprintf("%d", datasourceIds[i]), servers) localss[datasourceIds[i]] = newss } for dsId := range localss { if _, exists := newDatasource[dsId]; !exists { delete(localss, dsId) DatasourceHashRing.Del(fmt.Sprintf("%d", dsId)) } } // host 告警使用的是 hash ring err = models.AlertingEngineHeartbeatWithCluster(n.ctx, n.heartbeatConfig.Endpoint, n.heartbeatConfig.EngineName, HostDatasource) if err != nil { logger.Warningf("heartbeat with cluster %s err:%v", "", err) n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc() } servers, err := n.ActiveServersByEngineName() if err != nil { logger.Warningf("heartbeat %d get active server err:%v", HostDatasource, err) 
n.astats.CounterHeartbeatErrorTotal.WithLabelValues().Inc() return nil } sort.Strings(servers) newss := strings.Join(servers, " ") oldss, exists := localHostServers[n.heartbeatConfig.EngineName] if exists && oldss == newss { return nil } RebuildConsistentHashRing(n.heartbeatConfig.EngineName, servers) localHostServers[n.heartbeatConfig.EngineName] = newss return nil } func (n *Naming) ActiveServers(datasourceId int64) ([]string, error) { if datasourceId == -1 { return nil, fmt.Errorf("cluster is empty") } if !n.ctx.IsCenter { lst, err := poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?dsid="+fmt.Sprintf("%d", datasourceId)) return lst, err } // 30秒内有心跳,就认为是活的 return models.AlertingEngineGetsInstances(n.ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30) } func (n *Naming) ActiveServersByEngineName() ([]string, error) { if !n.ctx.IsCenter { lst, err := poster.GetByUrls[[]string](n.ctx, "/v1/n9e/servers-active?engine_name="+n.heartbeatConfig.EngineName) return lst, err } // 30秒内有心跳,就认为是活的 return models.AlertingEngineGetsInstances(n.ctx, "engine_cluster = ? 
and clock > ?", n.heartbeatConfig.EngineName, time.Now().Unix()-30) } ================================================ FILE: alert/naming/leader.go ================================================ package naming import ( "sort" "github.com/toolkits/pkg/logger" ) func (n *Naming) IamLeader() bool { if !n.ctx.IsCenter { return false } servers, err := n.ActiveServersByEngineName() if err != nil { logger.Errorf("failed to get active servers: %v", err) return false } if len(servers) == 0 { logger.Errorf("active servers empty") return false } sort.Strings(servers) return n.heartbeatConfig.Endpoint == servers[0] } ================================================ FILE: alert/pipeline/engine/engine.go ================================================ package engine import ( "fmt" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/google/uuid" "github.com/toolkits/pkg/logger" ) type WorkflowEngine struct { ctx *ctx.Context } func NewWorkflowEngine(c *ctx.Context) *WorkflowEngine { return &WorkflowEngine{ctx: c} } func (e *WorkflowEngine) Execute(pipeline *models.EventPipeline, event *models.AlertCurEvent, triggerCtx *models.WorkflowTriggerContext) (*models.AlertCurEvent, *models.WorkflowResult, error) { startTime := time.Now() wfCtx := e.initWorkflowContext(pipeline, event, triggerCtx) nodes := pipeline.GetWorkflowNodes() connections := pipeline.GetWorkflowConnections() if len(nodes) == 0 { return event, &models.WorkflowResult{ Event: event, Status: models.ExecutionStatusSuccess, Message: "no nodes to execute", }, nil } nodeMap := make(map[string]*models.WorkflowNode) for i := range nodes { if nodes[i].RetryInterval == 0 { nodes[i].RetryInterval = 1 } if nodes[i].MaxRetries == 0 { nodes[i].MaxRetries = 1 } nodeMap[nodes[i].ID] = &nodes[i] } result := e.executeDAG(nodeMap, connections, wfCtx) result.Event = wfCtx.Event duration := time.Since(startTime).Milliseconds() if triggerCtx != nil && triggerCtx.Mode != "" { 
// executeDAG runs the workflow nodes in topological order using Kahn's
// algorithm (in-degree counting plus a FIFO queue).
//
// nodeMap maps node ID to node, connections lists the outgoing edges of each
// node grouped by output index, and wfCtx is the context handed to every node.
//
// The returned result starts as "success" and collects one NodeExecutionResult
// per executed node. Execution stops early when a node produces streaming
// output (status "streaming") or reports "terminated". Successors are only
// released along the output branches accepted by shouldFollowBranch.
func (e *WorkflowEngine) executeDAG(nodeMap map[string]*models.WorkflowNode, connections models.Connections, wfCtx *models.WorkflowContext) *models.WorkflowResult {
	result := &models.WorkflowResult{
		Status:      models.ExecutionStatusSuccess,
		NodeResults: make([]*models.NodeExecutionResult, 0),
		Stream:      wfCtx.Stream, // inherit the streaming flag from the context
	}

	// Start every known node with an in-degree of zero.
	inDegree := make(map[string]int)
	for nodeID := range nodeMap {
		inDegree[nodeID] = 0
	}

	// Walk all connections and count the incoming edges of every target.
	// Targets that are missing from nodeMap also get an entry here; they are
	// skipped later when dequeued.
	for _, nodeConns := range connections {
		for _, targets := range nodeConns.Main {
			for _, target := range targets {
				inDegree[target.Node]++
			}
		}
	}

	// Nodes with in-degree zero are the start nodes.
	// NOTE(review): the queue is filled by ranging over a map, so the relative
	// order of several start nodes is not fixed between runs.
	queue := make([]string, 0)
	for nodeID, degree := range inDegree {
		if degree == 0 {
			queue = append(queue, nodeID)
		}
	}

	// No start node at all means every node has an incoming edge, i.e. a cycle.
	// NOTE(review): a cycle that coexists with at least one start node is not
	// detected here; its nodes simply never reach in-degree zero.
	if len(queue) == 0 && len(nodeMap) > 0 {
		result.Status = models.ExecutionStatusFailed
		result.Message = "workflow has circular dependency"
		return result
	}

	// Nodes that have already run.
	executed := make(map[string]bool)
	// Branch index chosen by each branching node.
	branchResults := make(map[string]*int)

	for len(queue) > 0 {
		// Pop the node at the head of the queue.
		nodeID := queue[0]
		queue = queue[1:]

		// Never run the same node twice.
		if executed[nodeID] {
			continue
		}

		node, exists := nodeMap[nodeID]
		if !exists {
			continue
		}

		// Run the node.
		nodeResult, nodeOutput := e.executeNode(node, wfCtx)
		result.NodeResults = append(result.NodeResults, nodeResult)

		if nodeOutput != nil && nodeOutput.Stream && nodeOutput.StreamChan != nil {
			// A streaming node is normally the last node of the workflow.
			// Hand its StreamChan to the WorkflowResult without waiting on it.
			result.Stream = true
			result.StreamChan = nodeOutput.StreamChan
			result.Event = wfCtx.Event
			result.Status = "streaming"
			result.Message = fmt.Sprintf("streaming output from node: %s", node.Name)

			// Mark the node itself as streaming.
			nodeResult.Status = "streaming"
			nodeResult.Message = "streaming in progress"

			// Return immediately so the API layer can serve the stream.
			return result
		}

		executed[nodeID] = true

		// Remember which branch the node selected.
		if nodeResult.BranchIndex != nil {
			branchResults[nodeID] = nodeResult.BranchIndex
		}

		// Record a failure on the workflow result.
		// NOTE(review): the loop does not return here, so successors of a failed
		// node still run even when ContinueOnFail is false — confirm this is intended.
		if nodeResult.Status == "failed" {
			if !node.ContinueOnFail {
				result.Status = models.ExecutionStatusFailed
				result.ErrorNode = nodeID
				result.Message = fmt.Sprintf("node %s failed: %s", node.Name, nodeResult.Error)
			}
		}

		// Stop the whole workflow when the node asked for termination.
		if nodeResult.Status == "terminated" {
			result.Message = fmt.Sprintf("workflow terminated at node %s", node.Name)
			return result
		}

		// Decrease the in-degree of the successors and enqueue the ones that
		// have no remaining unmet inputs.
		if nodeConns, ok := connections[nodeID]; ok {
			for outputIndex, targets := range nodeConns.Main {
				// Skip outputs that belong to a branch that was not selected.
				if !e.shouldFollowBranch(nodeID, outputIndex, branchResults) {
					continue
				}
				for _, target := range targets {
					inDegree[target.Node]--
					if inDegree[target.Node] == 0 {
						queue = append(queue, target.Node)
					}
				}
			}
		}
	}
	return result
}
跳过禁用的节点 if node.Disabled { nodeResult.Status = "skipped" nodeResult.Message = "node is disabled" nodeResult.FinishedAt = time.Now().Unix() nodeResult.DurationMs = time.Since(startTime).Milliseconds() return nodeResult, nil } // 获取处理器 processor, err := models.GetProcessorByType(node.Type, node.Config) if err != nil { nodeResult.Status = "failed" nodeResult.Error = fmt.Sprintf("failed to get processor: %v", err) nodeResult.FinishedAt = time.Now().Unix() nodeResult.DurationMs = time.Since(startTime).Milliseconds() return nodeResult, nil } // 执行处理器(带重试) var retries int maxRetries := node.MaxRetries if !node.RetryOnFail { maxRetries = 0 } for retries <= maxRetries { // 检查是否为分支处理器 if branchProcessor, ok := processor.(models.BranchProcessor); ok { output, err := branchProcessor.ProcessWithBranch(e.ctx, wfCtx) if err != nil { if retries < maxRetries { retries++ time.Sleep(time.Duration(node.RetryInterval) * time.Second) continue } nodeResult.Status = "failed" nodeResult.Error = err.Error() } else { nodeResult.Status = "success" if output != nil { nodeOutput = output if output.WfCtx != nil { wfCtx = output.WfCtx } nodeResult.Message = output.Message nodeResult.BranchIndex = output.BranchIndex if output.Terminate { nodeResult.Status = "terminated" } } } break } // 普通处理器 newWfCtx, msg, err := processor.Process(e.ctx, wfCtx) if err != nil { if retries < maxRetries { retries++ time.Sleep(time.Duration(node.RetryInterval) * time.Second) continue } nodeResult.Status = "failed" nodeResult.Error = err.Error() } else { nodeResult.Status = "success" nodeResult.Message = msg if newWfCtx != nil { wfCtx = newWfCtx // 检测流式输出标记 if newWfCtx.Stream && newWfCtx.StreamChan != nil { nodeOutput = &models.NodeOutput{ WfCtx: newWfCtx, Message: msg, Stream: true, StreamChan: newWfCtx.StreamChan, } } } // 如果事件被 drop(返回 nil 或 Event 为 nil),标记为终止 if newWfCtx == nil || newWfCtx.Event == nil { nodeResult.Status = "terminated" nodeResult.Message = msg } } break } nodeResult.FinishedAt = time.Now().Unix() 
nodeResult.DurationMs = time.Since(startTime).Milliseconds() logger.Infof("workflow: executed node %s (type=%s) status=%s msg=%s duration=%dms", node.Name, node.Type, nodeResult.Status, nodeResult.Message, nodeResult.DurationMs) return nodeResult, nodeOutput } // shouldFollowBranch 判断是否应该走某个分支 func (e *WorkflowEngine) shouldFollowBranch(nodeID string, outputIndex int, branchResults map[string]*int) bool { branchIndex, hasBranch := branchResults[nodeID] if !hasBranch { // 没有分支结果,说明不是分支节点,只走第一个输出 return outputIndex == 0 } if branchIndex == nil { // branchIndex 为 nil,走默认分支(通常是最后一个) return true } // 只走选中的分支 return outputIndex == *branchIndex } func (e *WorkflowEngine) saveExecutionRecord(pipeline *models.EventPipeline, wfCtx *models.WorkflowContext, result *models.WorkflowResult, triggerCtx *models.WorkflowTriggerContext, startTime int64, duration int64) { executionID := triggerCtx.RequestID if executionID == "" { executionID = uuid.New().String() } execution := &models.EventPipelineExecution{ ID: executionID, PipelineID: pipeline.ID, PipelineName: pipeline.Name, Mode: triggerCtx.Mode, Status: result.Status, ErrorMessage: result.Message, ErrorNode: result.ErrorNode, CreatedAt: startTime, FinishedAt: time.Now().Unix(), DurationMs: duration, TriggerBy: triggerCtx.TriggerBy, } if wfCtx.Event != nil { execution.EventID = wfCtx.Event.Id } if err := execution.SetNodeResults(result.NodeResults); err != nil { logger.Errorf("workflow: failed to set node results: pipeline_id=%d, error=%v", pipeline.ID, err) } if err := execution.SetInputsSnapshot(wfCtx.Inputs); err != nil { logger.Errorf("workflow: failed to set inputs snapshot: pipeline_id=%d, error=%v", pipeline.ID, err) } if err := models.CreateEventPipelineExecution(e.ctx, execution); err != nil { logger.Errorf("workflow: failed to save execution record: pipeline_id=%d, error=%v", pipeline.ID, err) } } ================================================ FILE: alert/pipeline/pipeline.go 
================================================ package pipeline import ( _ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/aisummary" _ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback" _ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/eventdrop" _ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/eventupdate" _ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/logic" _ "github.com/ccfos/nightingale/v6/alert/pipeline/processor/relabel" ) func Init() { } ================================================ FILE: alert/pipeline/processor/aisummary/ai_summary.go ================================================ package aisummary import ( "bytes" "crypto/tls" "encoding/json" "fmt" "io" "net/http" "net/url" "strconv" "strings" "text/template" "time" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/common" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/tplx" ) const ( HTTP_STATUS_SUCCESS_MAX = 299 ) // AISummaryConfig 配置结构体 type AISummaryConfig struct { callback.HTTPConfig ModelName string `json:"model_name"` APIKey string `json:"api_key"` PromptTemplate string `json:"prompt_template"` CustomParams map[string]interface{} `json:"custom_params"` } type Message struct { Role string `json:"role"` Content string `json:"content"` } type ChatCompletionResponse struct { Choices []struct { Message struct { Content string `json:"content"` } `json:"message"` } `json:"choices"` } func init() { models.RegisterProcessor("ai_summary", &AISummaryConfig{}) } func (c *AISummaryConfig) Init(settings interface{}) (models.Processor, error) { result, err := common.InitProcessor[*AISummaryConfig](settings) return result, err } func (c *AISummaryConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) { event := wfCtx.Event if c.Client == nil { if 
err := c.initHTTPClient(); err != nil { return wfCtx, "", fmt.Errorf("failed to initialize HTTP client: %v processor: %v", err, c) } } // 准备告警事件信息 eventInfo, err := c.prepareEventInfo(wfCtx) if err != nil { return wfCtx, "", fmt.Errorf("failed to prepare event info: %v processor: %v", err, c) } // 调用AI模型生成总结 summary, err := c.generateAISummary(eventInfo) if err != nil { return wfCtx, "", fmt.Errorf("failed to generate AI summary: %v processor: %v", err, c) } // 将总结添加到annotations字段 if event.AnnotationsJSON == nil { event.AnnotationsJSON = make(map[string]string) } event.AnnotationsJSON["ai_summary"] = summary // 更新Annotations字段 b, err := json.Marshal(event.AnnotationsJSON) if err != nil { return wfCtx, "", fmt.Errorf("failed to marshal annotations: %v processor: %v", err, c) } event.Annotations = string(b) return wfCtx, "", nil } func (c *AISummaryConfig) initHTTPClient() error { transport := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: c.SkipSSLVerify}, } if c.Proxy != "" { proxyURL, err := url.Parse(c.Proxy) if err != nil { return fmt.Errorf("failed to parse proxy url: %v", err) } transport.Proxy = http.ProxyURL(proxyURL) } c.Client = &http.Client{ Timeout: time.Duration(c.Timeout) * time.Millisecond, Transport: transport, } return nil } func (c *AISummaryConfig) prepareEventInfo(wfCtx *models.WorkflowContext) (string, error) { var defs = []string{ "{{$event := .Event}}", "{{$inputs := .Inputs}}", } text := strings.Join(append(defs, c.PromptTemplate), "") t, err := template.New("prompt").Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(text) if err != nil { return "", fmt.Errorf("failed to parse prompt template: %v", err) } var body bytes.Buffer err = t.Execute(&body, wfCtx) if err != nil { return "", fmt.Errorf("failed to execute prompt template: %v", err) } return body.String(), nil } func (c *AISummaryConfig) generateAISummary(eventInfo string) (string, error) { // 构建基础请求参数 reqParams := map[string]interface{}{ "model": c.ModelName, 
// convertCustomParam converts a custom request parameter into the most
// specific JSON-friendly type.
//
// Non-string values (including nil) are returned unchanged. A string is tried,
// in order, as:
//
//  1. a base-10 integer            -> int64 (exact, no float round trip)
//  2. a finite floating point value -> int64 when integral and in range,
//     otherwise float64
//  3. a boolean accepted by strconv.ParseBool -> bool
//  4. a JSON array ("[...]")        -> []interface{}
//  5. a JSON object ("{...}")       -> map[string]interface{}
//
// Anything else stays a string. Strings such as "NaN" or "Inf" are kept as
// strings: strconv.ParseFloat accepts them, but encoding/json cannot marshal
// non-finite floats, so converting them would make the whole request fail.
//
// The returned error is always nil; it is kept for interface stability.
func convertCustomParam(value interface{}) (interface{}, error) {
	str, isString := value.(string)
	if !isString {
		return value, nil
	}

	// Integers first, so large values are not rounded by a float64 detour.
	if i, err := strconv.ParseInt(str, 10, 64); err == nil {
		return i, nil
	}

	// f*0 is 0 only for finite f (NaN*0 and Inf*0 are NaN).
	if f, err := strconv.ParseFloat(str, 64); err == nil && f*0 == 0 {
		// Converting an out-of-range float to int64 is implementation
		// defined, so check the range before testing for an integral value.
		const int64Limit = float64(1 << 63)
		if f >= -int64Limit && f < int64Limit && f == float64(int64(f)) {
			return int64(f), nil
		}
		return f, nil
	}

	if b, err := strconv.ParseBool(str); err == nil {
		return b, nil
	}

	trimmed := strings.TrimSpace(str)

	if strings.HasPrefix(trimmed, "[") {
		var arr []interface{}
		if err := json.Unmarshal([]byte(str), &arr); err == nil {
			return arr, nil
		}
	}

	if strings.HasPrefix(trimmed, "{") {
		var obj map[string]interface{}
		if err := json.Unmarshal([]byte(str), &obj); err == nil {
			return obj, nil
		}
	}

	return value, nil
}
t.Logf("严重程度: %d", result.Event.Severity) t.Logf("标签: %v", result.Event.TagsMap) t.Logf("原始注释: %v", result.Event.AnnotationsJSON["description"]) t.Logf("AI总结: %s", result.Event.AnnotationsJSON["ai_summary"]) } func TestConvertCustomParam(t *testing.T) { tests := []struct { name string input interface{} expected interface{} hasError bool }{ { name: "nil value", input: nil, expected: nil, hasError: false, }, { name: "string number to int64", input: "123", expected: int64(123), hasError: false, }, { name: "string float to float64", input: "123.45", expected: 123.45, hasError: false, }, { name: "string boolean to bool", input: "true", expected: true, hasError: false, }, { name: "string false to bool", input: "false", expected: false, hasError: false, }, { name: "JSON array string to slice", input: `["a", "b", "c"]`, expected: []interface{}{"a", "b", "c"}, hasError: false, }, { name: "JSON object string to map", input: `{"key": "value", "num": 123}`, expected: map[string]interface{}{"key": "value", "num": float64(123)}, hasError: false, }, { name: "plain string remains string", input: "hello world", expected: "hello world", hasError: false, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { converted, err := convertCustomParam(test.input) if test.hasError { assert.Error(t, err) return } assert.NoError(t, err) assert.Equal(t, test.expected, converted) }) } } ================================================ FILE: alert/pipeline/processor/callback/callback.go ================================================ package callback import ( "crypto/tls" "encoding/json" "fmt" "io" "net/http" "net/url" "strings" "time" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/common" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/utils" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/logger" ) type HTTPConfig struct { URL string `json:"url"` Method string `json:"method,omitempty"` Body string 
`json:"body,omitempty"` Headers map[string]string `json:"header"` AuthUsername string `json:"auth_username"` AuthPassword string `json:"auth_password"` Timeout int `json:"timeout"` // 单位:ms SkipSSLVerify bool `json:"skip_ssl_verify"` Proxy string `json:"proxy"` Client *http.Client `json:"-"` } // RelabelConfig type CallbackConfig struct { HTTPConfig } func init() { models.RegisterProcessor("callback", &CallbackConfig{}) } func (c *CallbackConfig) Init(settings interface{}) (models.Processor, error) { result, err := common.InitProcessor[*CallbackConfig](settings) return result, err } func (c *CallbackConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) { event := wfCtx.Event if c.Client == nil { transport := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: c.SkipSSLVerify}, } if c.Proxy != "" { proxyURL, err := url.Parse(c.Proxy) if err != nil { return wfCtx, "", fmt.Errorf("failed to parse proxy url: %v processor: %v", err, c) } else { transport.Proxy = http.ProxyURL(proxyURL) } } c.Client = &http.Client{ Timeout: time.Duration(c.Timeout) * time.Millisecond, Transport: transport, } } headers := make(map[string]string) headers["Content-Type"] = "application/json" for k, v := range c.Headers { headers[k] = v } url, err := utils.TplRender(wfCtx, c.URL) if err != nil { return wfCtx, "", fmt.Errorf("failed to render url template: %v processor: %v", err, c) } body, err := json.Marshal(event) if err != nil { return wfCtx, "", fmt.Errorf("failed to marshal event: %v processor: %v", err, c) } req, err := http.NewRequest("POST", url, strings.NewReader(string(body))) if err != nil { return wfCtx, "", fmt.Errorf("failed to create request: %v processor: %v", err, c) } for k, v := range headers { req.Header.Set(k, v) } if c.AuthUsername != "" && c.AuthPassword != "" { req.SetBasicAuth(c.AuthUsername, c.AuthPassword) } resp, err := c.Client.Do(req) if err != nil { return wfCtx, "", fmt.Errorf("failed to send 
request: %v processor: %v", err, c) } b, err := io.ReadAll(resp.Body) if err != nil { return wfCtx, "", fmt.Errorf("failed to read response body: %v processor: %v", err, c) } logger.Debugf("callback processor response body: %s", string(b)) return wfCtx, "callback success", nil } ================================================ FILE: alert/pipeline/processor/common/common.go ================================================ package common import ( "encoding/json" ) // InitProcessor 是一个通用的初始化处理器的方法 // 使用泛型简化处理器初始化逻辑 // T 必须是 models.Processor 接口的实现 func InitProcessor[T any](settings interface{}) (T, error) { var zero T b, err := json.Marshal(settings) if err != nil { return zero, err } var result T err = json.Unmarshal(b, &result) if err != nil { return zero, err } return result, nil } ================================================ FILE: alert/pipeline/processor/eventdrop/event_drop.go ================================================ package eventdrop import ( "bytes" "fmt" "strings" texttemplate "text/template" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/common" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/tplx" "github.com/toolkits/pkg/logger" ) type EventDropConfig struct { Content string `json:"content"` } func init() { models.RegisterProcessor("event_drop", &EventDropConfig{}) } func (c *EventDropConfig) Init(settings interface{}) (models.Processor, error) { result, err := common.InitProcessor[*EventDropConfig](settings) return result, err } func (c *EventDropConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) { // 使用背景是可以根据此处理器,实现对事件进行更加灵活的过滤的逻辑 // 在标签过滤和属性过滤都不满足需求时可以使用 // 如果模板执行结果为 true,则删除该事件 event := wfCtx.Event var defs = []string{ "{{ $event := .Event }}", "{{ $labels := .Event.TagsMap }}", "{{ $value := .Event.TriggerValue }}", "{{ $inputs := .Inputs }}", } text := strings.Join(append(defs, c.Content), "") tpl, err 
:= texttemplate.New("eventdrop").Funcs(tplx.TemplateFuncMap).Parse(text) if err != nil { return wfCtx, "", fmt.Errorf("processor failed to parse template: %v processor: %v", err, c) } var body bytes.Buffer if err = tpl.Execute(&body, wfCtx); err != nil { return wfCtx, "", fmt.Errorf("processor failed to execute template: %v processor: %v", err, c) } result := strings.TrimSpace(body.String()) logger.Infof("processor eventdrop result: %v", result) if result == "true" { wfCtx.Event = nil logger.Infof("processor eventdrop drop event: %s", event.Hash) return wfCtx, "drop event success", nil } return wfCtx, "drop event failed", nil } ================================================ FILE: alert/pipeline/processor/eventupdate/event_update.go ================================================ package eventupdate import ( "crypto/tls" "encoding/json" "fmt" "io" "net/http" "net/url" "strings" "time" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/callback" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/common" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/logger" ) // RelabelConfig type EventUpdateConfig struct { callback.HTTPConfig } func init() { models.RegisterProcessor("event_update", &EventUpdateConfig{}) } func (c *EventUpdateConfig) Init(settings interface{}) (models.Processor, error) { result, err := common.InitProcessor[*EventUpdateConfig](settings) return result, err } func (c *EventUpdateConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) { event := wfCtx.Event if c.Client == nil { transport := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: c.SkipSSLVerify}, } if c.Proxy != "" { proxyURL, err := url.Parse(c.Proxy) if err != nil { return wfCtx, "", fmt.Errorf("failed to parse proxy url: %v processor: %v", err, c) } else { transport.Proxy = http.ProxyURL(proxyURL) } } c.Client = &http.Client{ Timeout: 
time.Duration(c.Timeout) * time.Millisecond, Transport: transport, } } headers := make(map[string]string) headers["Content-Type"] = "application/json" for k, v := range c.Headers { headers[k] = v } body, err := json.Marshal(event) if err != nil { return wfCtx, "", fmt.Errorf("failed to marshal event: %v processor: %v", err, c) } req, err := http.NewRequest("POST", c.URL, strings.NewReader(string(body))) if err != nil { return wfCtx, "", fmt.Errorf("failed to create request: %v processor: %v", err, c) } for k, v := range headers { req.Header.Set(k, v) } if c.AuthUsername != "" && c.AuthPassword != "" { req.SetBasicAuth(c.AuthUsername, c.AuthPassword) } resp, err := c.Client.Do(req) if err != nil { return wfCtx, "", fmt.Errorf("failed to send request: %v processor: %v", err, c) } b, err := io.ReadAll(resp.Body) if err != nil { return nil, "", fmt.Errorf("failed to read response body: %v processor: %v", err, c) } logger.Debugf("event update processor response body: %s", string(b)) err = json.Unmarshal(b, &event) if err != nil { return wfCtx, "", fmt.Errorf("failed to unmarshal response body: %v processor: %v", err, c) } return wfCtx, "", nil } ================================================ FILE: alert/pipeline/processor/logic/if.go ================================================ package logic import ( "bytes" "fmt" "strings" "text/template" alertCommon "github.com/ccfos/nightingale/v6/alert/common" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/common" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/tplx" ) // 判断模式常量 const ( ConditionModeExpression = "expression" // 表达式模式(默认) ConditionModeTags = "tags" // 标签/属性模式 ) // IfConfig If 条件处理器配置 type IfConfig struct { // 判断模式:expression(表达式)或 tags(标签/属性) Mode string `json:"mode,omitempty"` // 表达式模式配置 // 条件表达式(支持 Go 模板语法) // 例如:{{ if eq .Severity 1 }}true{{ end }} Condition string `json:"condition,omitempty"` // 标签/属性模式配置 LabelKeys 
[]models.TagFilter `json:"label_keys,omitempty"` // 适用标签 Attributes []models.TagFilter `json:"attributes,omitempty"` // 适用属性 // 内部使用,解析后的过滤器 parsedLabelKeys []models.TagFilter `json:"-"` parsedAttributes []models.TagFilter `json:"-"` } func init() { models.RegisterProcessor("logic.if", &IfConfig{}) } func (c *IfConfig) Init(settings interface{}) (models.Processor, error) { result, err := common.InitProcessor[*IfConfig](settings) if err != nil { return nil, err } // 解析标签过滤器 if len(result.LabelKeys) > 0 { // Deep copy to avoid concurrent map writes on cached objects labelKeysCopy := make([]models.TagFilter, len(result.LabelKeys)) copy(labelKeysCopy, result.LabelKeys) for i := range labelKeysCopy { if labelKeysCopy[i].Func == "" { labelKeysCopy[i].Func = labelKeysCopy[i].Op } } result.parsedLabelKeys, err = models.ParseTagFilter(labelKeysCopy) if err != nil { return nil, fmt.Errorf("failed to parse label_keys: %v", err) } } // 解析属性过滤器 if len(result.Attributes) > 0 { // Deep copy to avoid concurrent map writes on cached objects attributesCopy := make([]models.TagFilter, len(result.Attributes)) copy(attributesCopy, result.Attributes) for i := range attributesCopy { if attributesCopy[i].Func == "" { attributesCopy[i].Func = attributesCopy[i].Op } } result.parsedAttributes, err = models.ParseTagFilter(attributesCopy) if err != nil { return nil, fmt.Errorf("failed to parse attributes: %v", err) } } return result, nil } // Process 实现 Processor 接口(兼容旧模式) func (c *IfConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) { result, err := c.evaluateCondition(wfCtx) if err != nil { return wfCtx, "", fmt.Errorf("if processor: failed to evaluate condition: %v", err) } if result { return wfCtx, "condition matched (true branch)", nil } return wfCtx, "condition not matched (false branch)", nil } // ProcessWithBranch 实现 BranchProcessor 接口 func (c *IfConfig) ProcessWithBranch(ctx *ctx.Context, wfCtx *models.WorkflowContext) 
(*models.NodeOutput, error) { result, err := c.evaluateCondition(wfCtx) if err != nil { return nil, fmt.Errorf("if processor: failed to evaluate condition: %v", err) } output := &models.NodeOutput{ WfCtx: wfCtx, } if result { // 条件为 true,走输出 0(true 分支) branchIndex := 0 output.BranchIndex = &branchIndex output.Message = "condition matched (true branch)" } else { // 条件为 false,走输出 1(false 分支) branchIndex := 1 output.BranchIndex = &branchIndex output.Message = "condition not matched (false branch)" } return output, nil } // evaluateCondition 评估条件 func (c *IfConfig) evaluateCondition(wfCtx *models.WorkflowContext) (bool, error) { mode := c.Mode if mode == "" { mode = ConditionModeExpression // 默认表达式模式 } switch mode { case ConditionModeTags: return c.evaluateTagsCondition(wfCtx.Event) default: return c.evaluateExpressionCondition(wfCtx) } } // evaluateExpressionCondition 评估表达式条件 func (c *IfConfig) evaluateExpressionCondition(wfCtx *models.WorkflowContext) (bool, error) { if c.Condition == "" { return true, nil } // 构建模板数据 var defs = []string{ "{{ $event := .Event }}", "{{ $labels := .Event.TagsMap }}", "{{ $value := .Event.TriggerValue }}", "{{ $inputs := .Inputs }}", } text := strings.Join(append(defs, c.Condition), "") tpl, err := template.New("if_condition").Funcs(tplx.TemplateFuncMap).Parse(text) if err != nil { return false, err } var buf bytes.Buffer if err = tpl.Execute(&buf, wfCtx); err != nil { return false, err } result := strings.TrimSpace(strings.ToLower(buf.String())) return result == "true" || result == "1", nil } // evaluateTagsCondition 评估标签/属性条件 func (c *IfConfig) evaluateTagsCondition(event *models.AlertCurEvent) (bool, error) { // 如果没有配置任何过滤条件,默认返回 true if len(c.parsedLabelKeys) == 0 && len(c.parsedAttributes) == 0 { return true, nil } // 匹配标签 (TagsMap) if len(c.parsedLabelKeys) > 0 { tagsMap := event.TagsMap if tagsMap == nil { tagsMap = make(map[string]string) } if !alertCommon.MatchTags(tagsMap, c.parsedLabelKeys) { return false, nil } } // 匹配属性 
(JsonTagsAndValue - 所有 JSON 字段) if len(c.parsedAttributes) > 0 { attributesMap := event.JsonTagsAndValue() if !alertCommon.MatchTags(attributesMap, c.parsedAttributes) { return false, nil } } return true, nil } ================================================ FILE: alert/pipeline/processor/logic/switch.go ================================================ package logic import ( "bytes" "fmt" "strings" "text/template" alertCommon "github.com/ccfos/nightingale/v6/alert/common" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/common" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/tplx" ) // SwitchCase Switch 分支定义 type SwitchCase struct { // 判断模式:expression(表达式)或 tags(标签/属性) Mode string `json:"mode,omitempty"` // 表达式模式配置 // 条件表达式(支持 Go 模板语法) Condition string `json:"condition,omitempty"` // 标签/属性模式配置 LabelKeys []models.TagFilter `json:"label_keys,omitempty"` // 适用标签 Attributes []models.TagFilter `json:"attributes,omitempty"` // 适用属性 // 分支名称(可选,用于日志) Name string `json:"name,omitempty"` // 内部使用,解析后的过滤器 parsedLabelKeys []models.TagFilter `json:"-"` parsedAttributes []models.TagFilter `json:"-"` } // SwitchConfig Switch 多分支处理器配置 type SwitchConfig struct { // 分支条件列表 // 按顺序匹配,第一个为 true 的分支将被选中 Cases []SwitchCase `json:"cases"` // 是否允许多个分支同时匹配(默认 false,只走第一个匹配的) AllowMultiple bool `json:"allow_multiple,omitempty"` } func init() { models.RegisterProcessor("logic.switch", &SwitchConfig{}) } func (c *SwitchConfig) Init(settings interface{}) (models.Processor, error) { result, err := common.InitProcessor[*SwitchConfig](settings) if err != nil { return nil, err } // 解析每个 case 的标签和属性过滤器 for i := range result.Cases { if len(result.Cases[i].LabelKeys) > 0 { // Deep copy to avoid concurrent map writes on cached objects labelKeysCopy := make([]models.TagFilter, len(result.Cases[i].LabelKeys)) copy(labelKeysCopy, result.Cases[i].LabelKeys) for j := range labelKeysCopy { if labelKeysCopy[j].Func == "" { 
labelKeysCopy[j].Func = labelKeysCopy[j].Op } } result.Cases[i].parsedLabelKeys, err = models.ParseTagFilter(labelKeysCopy) if err != nil { return nil, fmt.Errorf("failed to parse label_keys for case[%d]: %v", i, err) } } if len(result.Cases[i].Attributes) > 0 { // Deep copy to avoid concurrent map writes on cached objects attributesCopy := make([]models.TagFilter, len(result.Cases[i].Attributes)) copy(attributesCopy, result.Cases[i].Attributes) for j := range attributesCopy { if attributesCopy[j].Func == "" { attributesCopy[j].Func = attributesCopy[j].Op } } result.Cases[i].parsedAttributes, err = models.ParseTagFilter(attributesCopy) if err != nil { return nil, fmt.Errorf("failed to parse attributes for case[%d]: %v", i, err) } } } return result, nil } // Process 实现 Processor 接口(兼容旧模式) func (c *SwitchConfig) Process(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.WorkflowContext, string, error) { index, caseName, err := c.evaluateCases(wfCtx) if err != nil { return wfCtx, "", fmt.Errorf("switch processor: failed to evaluate cases: %v", err) } if index >= 0 { if caseName != "" { return wfCtx, fmt.Sprintf("matched case[%d]: %s", index, caseName), nil } return wfCtx, fmt.Sprintf("matched case[%d]", index), nil } // 走默认分支(最后一个输出) return wfCtx, "no case matched, using default branch", nil } // ProcessWithBranch 实现 BranchProcessor 接口 func (c *SwitchConfig) ProcessWithBranch(ctx *ctx.Context, wfCtx *models.WorkflowContext) (*models.NodeOutput, error) { index, caseName, err := c.evaluateCases(wfCtx) if err != nil { return nil, fmt.Errorf("switch processor: failed to evaluate cases: %v", err) } output := &models.NodeOutput{ WfCtx: wfCtx, } if index >= 0 { output.BranchIndex = &index if caseName != "" { output.Message = fmt.Sprintf("matched case[%d]: %s", index, caseName) } else { output.Message = fmt.Sprintf("matched case[%d]", index) } } else { // 默认分支的索引是 cases 数量(即最后一个输出端口) defaultIndex := len(c.Cases) output.BranchIndex = &defaultIndex output.Message = "no 
case matched, using default branch" } return output, nil } // evaluateCases 评估所有分支条件 // 返回匹配的分支索引和分支名称,如果没有匹配返回 -1 func (c *SwitchConfig) evaluateCases(wfCtx *models.WorkflowContext) (int, string, error) { for i := range c.Cases { matched, err := c.evaluateCaseCondition(&c.Cases[i], wfCtx) if err != nil { return -1, "", fmt.Errorf("case[%d] evaluation error: %v", i, err) } if matched { return i, c.Cases[i].Name, nil } } return -1, "", nil } // evaluateCaseCondition 评估单个分支条件 func (c *SwitchConfig) evaluateCaseCondition(caseItem *SwitchCase, wfCtx *models.WorkflowContext) (bool, error) { mode := caseItem.Mode if mode == "" { mode = ConditionModeExpression // 默认表达式模式 } switch mode { case ConditionModeTags: return c.evaluateTagsCondition(caseItem, wfCtx.Event) default: return c.evaluateExpressionCondition(caseItem.Condition, wfCtx) } } // evaluateExpressionCondition 评估表达式条件 func (c *SwitchConfig) evaluateExpressionCondition(condition string, wfCtx *models.WorkflowContext) (bool, error) { if condition == "" { return false, nil } var defs = []string{ "{{ $event := .Event }}", "{{ $labels := .Event.TagsMap }}", "{{ $value := .Event.TriggerValue }}", "{{ $inputs := .Inputs }}", } text := strings.Join(append(defs, condition), "") tpl, err := template.New("switch_condition").Funcs(tplx.TemplateFuncMap).Parse(text) if err != nil { return false, err } var buf bytes.Buffer if err = tpl.Execute(&buf, wfCtx); err != nil { return false, err } result := strings.TrimSpace(strings.ToLower(buf.String())) return result == "true" || result == "1", nil } // evaluateTagsCondition 评估标签/属性条件 func (c *SwitchConfig) evaluateTagsCondition(caseItem *SwitchCase, event *models.AlertCurEvent) (bool, error) { // 如果没有配置任何过滤条件,默认返回 false(不匹配) if len(caseItem.parsedLabelKeys) == 0 && len(caseItem.parsedAttributes) == 0 { return false, nil } // 匹配标签 (TagsMap) if len(caseItem.parsedLabelKeys) > 0 { tagsMap := event.TagsMap if tagsMap == nil { tagsMap = make(map[string]string) } if 
const (
	// REPLACE_DOT is the placeholder that stands in for "." inside label
	// names while relabeling runs: Process and EventRelabel swap "." for it
	// beforehand, and EventRelabel swaps it back on the resulting labels.
	REPLACE_DOT = "___"
)

// RelabelConfig is the settings object of the "relabel" pipeline processor.
// Process copies every field into a single pconf.RelabelConfig and applies it
// to the event's tags through EventRelabel.
type RelabelConfig struct {
	// SourceLabels are the label names fed to the rule; "." in a name is
	// replaced by REPLACE_DOT before the rule is applied.
	SourceLabels []string `json:"source_labels"`
	// Separator is defaulted to ";" by EventRelabel when empty.
	Separator string `json:"separator"`
	// Regex is defaulted to "(.*)" by EventRelabel when empty.
	Regex string `json:"regex"`
	// RegexCompiled has no JSON tag and is not assigned anywhere in this
	// file; presumably compiled downstream from Regex — TODO confirm.
	RegexCompiled *regexp.Regexp
	If            string `json:"if"`
	// IfRegex has no JSON tag and is not assigned anywhere in this file;
	// presumably compiled downstream from If — TODO confirm.
	IfRegex     *regexp.Regexp
	Modulus     uint64 `json:"modulus"`
	TargetLabel string `json:"target_label"`
	// Replacement is defaulted to "$1" by EventRelabel when empty.
	Replacement string `json:"replacement"`
	Action      string `json:"action"`
}

// init registers this processor under the type name "relabel".
func init() {
	models.RegisterProcessor("relabel", &RelabelConfig{})
}

// Init builds a RelabelConfig from settings via common.InitProcessor and
// returns its result and error unchanged.
func (r *RelabelConfig) Init(settings interface{}) (models.Processor, error) {
	result, err := common.InitProcessor[*RelabelConfig](settings)
	return result, err
}
r.TargetLabel, Replacement: r.Replacement, Action: r.Action, }, } EventRelabel(wfCtx.Event, relabelConfigs) return wfCtx, "", nil } func EventRelabel(event *models.AlertCurEvent, relabelConfigs []*pconf.RelabelConfig) { labels := make([]prompb.Label, len(event.TagsJSON)) event.OriginalTagsJSON = make([]string, len(event.TagsJSON)) for i, tag := range event.TagsJSON { label := strings.SplitN(tag, "=", 2) if len(label) != 2 { continue } event.OriginalTagsJSON[i] = tag label[0] = strings.ReplaceAll(string(label[0]), ".", REPLACE_DOT) labels[i] = prompb.Label{Name: label[0], Value: label[1]} } for i := 0; i < len(relabelConfigs); i++ { if relabelConfigs[i].Replacement == "" { relabelConfigs[i].Replacement = "$1" } if relabelConfigs[i].Separator == "" { relabelConfigs[i].Separator = ";" } if relabelConfigs[i].Regex == "" { relabelConfigs[i].Regex = "(.*)" } } gotLabels := writer.Process(labels, relabelConfigs...) event.TagsJSON = make([]string, len(gotLabels)) event.TagsMap = make(map[string]string, len(gotLabels)) for i, label := range gotLabels { label.Name = strings.ReplaceAll(string(label.Name), REPLACE_DOT, ".") event.TagsJSON[i] = fmt.Sprintf("%s=%s", label.Name, label.Value) event.TagsMap[label.Name] = label.Value } event.Tags = strings.Join(event.TagsJSON, ",,") } ================================================ FILE: alert/pipeline/processor/utils/utils.go ================================================ package utils import ( "bytes" "fmt" "strings" "text/template" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/tplx" ) func TplRender(wfCtx *models.WorkflowContext, content string) (string, error) { var defs = []string{ "{{ $event := .Event }}", "{{ $labels := .Event.TagsMap }}", "{{ $value := .Event.TriggerValue }}", "{{ $inputs := .Inputs }}", } text := strings.Join(append(defs, content), "") tpl, err := template.New("tpl").Funcs(tplx.TemplateFuncMap).Parse(text) if err != nil { return "", fmt.Errorf("failed to parse template: 
%v", err) } var body bytes.Buffer if err = tpl.Execute(&body, wfCtx); err != nil { return "", fmt.Errorf("failed to execute template: %v", err) } return strings.TrimSpace(body.String()), nil } ================================================ FILE: alert/process/alert_cur_event.go ================================================ package process import ( "sync" "github.com/ccfos/nightingale/v6/models" ) type AlertCurEventMap struct { sync.RWMutex Data map[string]*models.AlertCurEvent } func NewAlertCurEventMap(data map[string]*models.AlertCurEvent) *AlertCurEventMap { if data == nil { return &AlertCurEventMap{ Data: make(map[string]*models.AlertCurEvent), } } return &AlertCurEventMap{ Data: data, } } func (a *AlertCurEventMap) SetAll(data map[string]*models.AlertCurEvent) { a.Lock() defer a.Unlock() a.Data = data } func (a *AlertCurEventMap) Set(key string, value *models.AlertCurEvent) { a.Lock() defer a.Unlock() a.Data[key] = value } func (a *AlertCurEventMap) Get(key string) (*models.AlertCurEvent, bool) { a.RLock() defer a.RUnlock() event, exists := a.Data[key] return event, exists } func (a *AlertCurEventMap) UpdateLastEvalTime(key string, lastEvalTime int64) { a.Lock() defer a.Unlock() event, exists := a.Data[key] if !exists { return } event.LastEvalTime = lastEvalTime } func (a *AlertCurEventMap) Delete(key string) { a.Lock() defer a.Unlock() delete(a.Data, key) } func (a *AlertCurEventMap) Keys() []string { a.RLock() defer a.RUnlock() keys := make([]string, 0, len(a.Data)) for k := range a.Data { keys = append(keys, k) } return keys } func (a *AlertCurEventMap) GetAll() map[string]*models.AlertCurEvent { a.RLock() defer a.RUnlock() return a.Data } ================================================ FILE: alert/process/process.go ================================================ package process import ( "bytes" "encoding/json" "fmt" "html/template" "sort" "strings" "sync" "time" "github.com/ccfos/nightingale/v6/alert/astats" 
"github.com/ccfos/nightingale/v6/alert/common" "github.com/ccfos/nightingale/v6/alert/dispatch" "github.com/ccfos/nightingale/v6/alert/mute" "github.com/ccfos/nightingale/v6/alert/pipeline/processor/relabel" "github.com/ccfos/nightingale/v6/alert/queue" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/tplx" "github.com/robfig/cron/v3" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/str" ) type ExternalProcessorsType struct { ExternalLock sync.RWMutex Processors map[string]*Processor } var ExternalProcessors ExternalProcessorsType func NewExternalProcessors() *ExternalProcessorsType { return &ExternalProcessorsType{ Processors: make(map[string]*Processor), } } func (e *ExternalProcessorsType) GetExternalAlertRule(datasourceId, id int64) (*Processor, bool) { e.ExternalLock.RLock() defer e.ExternalLock.RUnlock() processor, has := e.Processors[common.RuleKey(datasourceId, id)] return processor, has } type HandleEventFunc func(event *models.AlertCurEvent) type Processor struct { datasourceId int64 EngineName string rule *models.AlertRule fires *AlertCurEventMap pendings *AlertCurEventMap pendingsUseByRecover *AlertCurEventMap inhibit bool tagsMap map[string]string tagsArr []string groupName string alertRuleCache *memsto.AlertRuleCacheType TargetCache *memsto.TargetCacheType TargetsOfAlertRuleCache *memsto.TargetsOfAlertRuleCacheType BusiGroupCache *memsto.BusiGroupCacheType alertMuteCache *memsto.AlertMuteCacheType datasourceCache *memsto.DatasourceCacheType ctx *ctx.Context Stats *astats.Stats HandleFireEventHook HandleEventFunc HandleRecoverEventHook HandleEventFunc ScheduleEntry cron.Entry PromEvalInterval int } func (p *Processor) Key() string { return common.RuleKey(p.datasourceId, p.rule.Id) } func (p *Processor) DatasourceId() int64 { return p.datasourceId } func (p *Processor) Hash() string { return str.MD5(fmt.Sprintf("%d_%s_%s_%d", 
p.rule.Id, p.rule.CronPattern, p.rule.RuleConfig, p.datasourceId, )) } func NewProcessor(engineName string, rule *models.AlertRule, datasourceId int64, alertRuleCache *memsto.AlertRuleCacheType, targetCache *memsto.TargetCacheType, targetsOfAlertRuleCache *memsto.TargetsOfAlertRuleCacheType, busiGroupCache *memsto.BusiGroupCacheType, alertMuteCache *memsto.AlertMuteCacheType, datasourceCache *memsto.DatasourceCacheType, ctx *ctx.Context, stats *astats.Stats) *Processor { p := &Processor{ EngineName: engineName, datasourceId: datasourceId, rule: rule, TargetCache: targetCache, TargetsOfAlertRuleCache: targetsOfAlertRuleCache, BusiGroupCache: busiGroupCache, alertMuteCache: alertMuteCache, alertRuleCache: alertRuleCache, datasourceCache: datasourceCache, ctx: ctx, Stats: stats, HandleFireEventHook: func(event *models.AlertCurEvent) {}, HandleRecoverEventHook: func(event *models.AlertCurEvent) {}, } p.mayHandleGroup() return p } func (p *Processor) Handle(anomalyPoints []models.AnomalyPoint, from string, inhibit bool) { // 有可能rule的一些配置已经发生变化,比如告警接收人、callbacks等 // 这些信息的修改是不会引起worker restart的,但是确实会影响告警处理逻辑 // 所以,这里直接从memsto.AlertRuleCache中获取并覆盖 p.inhibit = inhibit cachedRule := p.alertRuleCache.Get(p.rule.Id) if cachedRule == nil { logger.Warningf("alert_eval_%d datasource_%d handle error: rule not found, maybe rule has been deleted, anomalyPoints:%+v", p.rule.Id, p.datasourceId, anomalyPoints) p.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", p.DatasourceId()), "handle_event", p.BusiGroupCache.GetNameByBusiGroupId(p.rule.GroupId), fmt.Sprintf("%v", p.rule.Id)).Inc() return } // 在 rule 变化之前取到 ruleHash ruleHash := p.rule.Hash() p.rule = cachedRule now := time.Now().Unix() alertingKeys := map[string]struct{}{} // 根据 event 的 tag 将 events 分组,处理告警抑制的情况 eventsMap := make(map[string][]*models.AlertCurEvent) for _, anomalyPoint := range anomalyPoints { event := p.BuildEvent(anomalyPoint, from, now, ruleHash) event.NotifyRuleIds = cachedRule.NotifyRuleIds // 如果 
// BuildEvent turns one anomaly point into a new AlertCurEvent for p.rule.
//
// It first rebuilds p.tagsMap / p.tagsArr through fillTags, then copies rule
// and anomaly-point data onto the event, applies the rule's relabel
// configuration, derives the target ident from the (relabeled) tags and, when
// the ident is found in TargetCache, attaches that target.
//
// from selects the evaluation timestamp: "inner" uses now, anything else uses
// the anomaly point's trigger time. ruleHash is stored on the event as-is.
func (p *Processor) BuildEvent(anomalyPoint models.AnomalyPoint, from string, now int64, ruleHash string) *models.AlertCurEvent {
	p.fillTags(anomalyPoint)

	hash := Hash(p.rule.Id, p.datasourceId, anomalyPoint)
	ds := p.datasourceCache.GetById(p.datasourceId)
	var dsName string
	if ds != nil {
		dsName = ds.Name
	}

	event := p.rule.GenerateNewEvent(p.ctx)

	bg := p.BusiGroupCache.GetByBusiGroupId(p.rule.GroupId)
	if bg != nil {
		event.GroupName = bg.Name
	}

	event.TriggerTime = anomalyPoint.Timestamp
	event.TagsMap = p.tagsMap
	event.DatasourceId = p.datasourceId
	event.Cluster = dsName
	event.Hash = hash
	event.TriggerValue = anomalyPoint.ReadableValue()
	event.TriggerValues = anomalyPoint.Values
	event.TriggerValuesJson = models.EventTriggerValues{ValuesWithUnit: anomalyPoint.ValuesUnit}
	event.TagsJSON = p.tagsArr
	event.Tags = strings.Join(p.tagsArr, ",,")
	event.IsRecovered = false
	event.Callbacks = p.rule.Callbacks
	event.CallbacksJSON = p.rule.CallbacksJSON
	event.Annotations = p.rule.Annotations
	event.RuleConfig = p.rule.RuleConfig
	event.RuleConfigJson = p.rule.RuleConfigJson
	event.Severity = anomalyPoint.Severity
	event.ExtraConfig = p.rule.ExtraConfigJSON
	event.PromQl = anomalyPoint.Query
	event.RecoverConfig = anomalyPoint.RecoverConfig
	event.RuleHash = ruleHash

	// For no-data triggers the event carries a copy of the rule config with
	// TriggerType overridden, and a fixed "nodata" trigger value.
	if anomalyPoint.TriggerType == models.TriggerTypeNodata {
		event.TriggerValue = "nodata"
		ruleConfig := models.RuleQuery{}
		// NOTE(review): both the Unmarshal and the Marshal error are
		// ignored; an unparsable RuleConfig is replaced on the event by the
		// marshalled, mostly empty, RuleQuery.
		json.Unmarshal([]byte(p.rule.RuleConfig), &ruleConfig)
		ruleConfig.TriggerType = anomalyPoint.TriggerType
		b, _ := json.Marshal(ruleConfig)
		event.RuleConfig = string(b)
	}

	if err := json.Unmarshal([]byte(p.rule.Annotations), &event.AnnotationsJSON); err != nil {
		event.AnnotationsJSON = make(map[string]string) // fall back to an empty map when parsing fails
		logger.Warningf("alert_eval_%d datasource_%d unmarshal annotations json failed: %v", p.rule.Id, p.datasourceId, err)
	}

	if event.TriggerValues != "" && strings.Count(event.TriggerValues, "$") > 1 {
		// TriggerValues holds more than one variable: expose all of them
		// through TriggerValue.
		event.TriggerValue = event.TriggerValues
	}

	if from == "inner" {
		event.LastEvalTime = now
	} else {
		event.LastEvalTime = event.TriggerTime
	}

	// Relabel right after the event has been populated.
	Relabel(p.rule, event)

	// Must run after Relabel so that an "ident" tag that only appears once
	// relabeling is done is still picked up.
	p.mayHandleIdent(event)

	if event.TargetIdent != "" {
		if pt, exist := p.TargetCache.Get(event.TargetIdent); exist {
			// NOTE(review): this writes GroupNames on the value returned by
			// the cache; whether that value is shared is not visible here.
			pt.GroupNames = p.BusiGroupCache.GetNamesByBusiGroupIds(pt.GroupIds)
			event.Target = pt
		} else {
			logger.Infof("alert_eval_%d datasource_%d fill event target error, ident: %s doesn't exist in cache.", p.rule.Id, p.datasourceId, event.TargetIdent)
		}
	}

	return event
}
// handleEvent processes the events of one tag group: it refreshes the
// recover-tracking map, decides which events have been pending long enough to
// fire, and hands those to inhibitEvent together with the smallest Severity
// value seen among them.
func (p *Processor) handleEvent(events []*models.AlertCurEvent) {
	var fireEvents []*models.AlertCurEvent
	// severity starts at the lowest priority, so any firing event with a
	// higher priority (smaller Severity value) replaces it.
	severity := models.SeverityLowest
	for _, event := range events {
		if event == nil {
			continue
		}

		// Track the most recent evaluation time of every anomalous event;
		// RecoverSingle reads this map when a recover duration is configured.
		if _, has := p.pendingsUseByRecover.Get(event.Hash); has {
			p.pendingsUseByRecover.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
		} else {
			p.pendingsUseByRecover.Set(event.Hash, event)
		}

		event.PromEvalInterval = p.PromEvalInterval
		// No "for" duration configured: the event fires immediately.
		if p.rule.PromForDuration == 0 {
			fireEvents = append(fireEvents, event)
			if severity > event.Severity {
				severity = event.Severity
			}
			continue
		}

		var preEvalTime int64 // evaluation time of the first pending event
		preEvent, has := p.pendings.Get(event.Hash)
		if has {
			p.pendings.UpdateLastEvalTime(event.Hash, event.LastEvalTime)
			preEvalTime = preEvent.FirstEvalTime
		} else {
			// First sighting: this event becomes the pending entry.
			event.FirstEvalTime = event.LastEvalTime
			p.pendings.Set(event.Hash, event)
			preEvalTime = event.FirstEvalTime
		}

		// Fire once the time spent pending, plus one evaluation interval,
		// reaches the configured "for" duration.
		if event.LastEvalTime-preEvalTime+int64(event.PromEvalInterval) >= int64(p.rule.PromForDuration) {
			fireEvents = append(fireEvents, event)
			if severity > event.Severity {
				severity = event.Severity
			}
			continue
		}
	}

	p.inhibitEvent(fireEvents, severity)
}
notify_repeat_step_matched(%d >= %d + %d * 60) notify_max_number_matched(#%d / %d)", event.LastEvalTime, fired.LastSentTime, cachedRule.NotifyRepeatStep, event.NotifyCurNumber, cachedRule.NotifyMaxNumber) p.pushEventToQueue(event) } } } else { message = fmt.Sprintf("stalled, notify_repeat_step_not_matched(%d < %d + %d * 60)", event.LastEvalTime, fired.LastSentTime, cachedRule.NotifyRepeatStep) } } else { event.NotifyCurNumber = 1 event.FirstTriggerTime = event.TriggerTime message = fmt.Sprintf("fired, first_trigger_time: %d", event.FirstTriggerTime) p.HandleFireEventHook(event) p.pushEventToQueue(event) } } func (p *Processor) pushEventToQueue(e *models.AlertCurEvent) { if !e.IsRecovered { e.LastSentTime = e.LastEvalTime p.fires.Set(e.Hash, e) } dispatch.LogEvent(e, "push_queue") if !queue.EventQueue.PushFront(e) { logger.Warningf("alert_eval_%d datasource_%d event_push_queue: queue is full, event:%s", p.rule.Id, p.datasourceId, e.Hash) p.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", p.DatasourceId()), "push_event_queue", p.BusiGroupCache.GetNameByBusiGroupId(p.rule.GroupId), fmt.Sprintf("%v", p.rule.Id)).Inc() } } func (p *Processor) RecoverAlertCurEventFromDb() { p.pendings = NewAlertCurEventMap(nil) p.pendingsUseByRecover = NewAlertCurEventMap(nil) curEvents, err := models.AlertCurEventGetByRuleIdAndDsId(p.ctx, p.rule.Id, p.datasourceId) if err != nil { logger.Errorf("alert_eval_%d datasource_%d recover event from db failed, err:%s", p.rule.Id, p.datasourceId, err) p.Stats.CounterRuleEvalErrorTotal.WithLabelValues(fmt.Sprintf("%v", p.DatasourceId()), "get_recover_event", p.BusiGroupCache.GetNameByBusiGroupId(p.rule.GroupId), fmt.Sprintf("%v", p.rule.Id)).Inc() p.fires = NewAlertCurEventMap(nil) return } fireMap := make(map[string]*models.AlertCurEvent) pendingsUseByRecoverMap := make(map[string]*models.AlertCurEvent) for _, event := range curEvents { alertRule := p.alertRuleCache.Get(event.RuleId) if alertRule == nil { continue } 
event.NotifyRuleIds = alertRule.NotifyRuleIds if event.Cate == models.HOST { target, exists := p.TargetCache.Get(event.TargetIdent) if exists && target.EngineName != p.EngineName && !(p.ctx.IsCenter && target.EngineName == "") { // 如果是 host rule,且 target 的 engineName 不是当前的 engineName 或者是中心机房 target EngineName 为空,就跳过 continue } } event.DB2Mem() target, exists := p.TargetCache.Get(event.TargetIdent) if exists { target.GroupNames = p.BusiGroupCache.GetNamesByBusiGroupIds(target.GroupIds) event.Target = target } fireMap[event.Hash] = event e := *event pendingsUseByRecoverMap[event.Hash] = &e } p.fires = NewAlertCurEventMap(fireMap) // 修改告警规则,或者进程重启之后,需要重新加载 pendingsUseByRecover p.pendingsUseByRecover = NewAlertCurEventMap(pendingsUseByRecoverMap) } func (p *Processor) fillTags(anomalyPoint models.AnomalyPoint) { // handle series tags tagsMap := make(map[string]string) for label, value := range anomalyPoint.Labels { tagsMap[string(label)] = string(value) } var e = &models.AlertCurEvent{ TagsMap: tagsMap, } // handle rule tags tags := p.rule.AppendTagsJSON tags = append(tags, "rulename="+p.rule.Name) for _, tag := range tags { arr := strings.SplitN(tag, "=", 2) var defs = []string{ "{{$labels := .TagsMap}}", "{{$value := .TriggerValue}}", } tagValue := arr[1] text := strings.Join(append(defs, tagValue), "") t, err := template.New(fmt.Sprint(p.rule.Id)).Funcs(template.FuncMap(tplx.TemplateFuncMap)).Parse(text) if err != nil { tagValue = fmt.Sprintf("parse tag value failed, err:%s", err) tagsMap[arr[0]] = tagValue continue } var body bytes.Buffer err = t.Execute(&body, e) if err != nil { tagValue = fmt.Sprintf("parse tag value failed, err:%s", err) tagsMap[arr[0]] = tagValue continue } tagsMap[arr[0]] = body.String() } p.tagsMap = tagsMap // handle tagsArr p.tagsArr = labelMapToArr(tagsMap) } func (p *Processor) mayHandleIdent(event *models.AlertCurEvent) { // handle ident if ident, has := event.TagsMap["ident"]; has { if target, exists := p.TargetCache.Get(ident); exists 
// labelMapToArr flattens m into "name=value" strings sorted in ascending
// order. The result is always non-nil, even for a nil or empty map.
func labelMapToArr(m map[string]string) []string {
	pairs := make([]string, 0, len(m))
	for name, val := range m {
		pairs = append(pairs, fmt.Sprintf("%s=%s", name, val))
	}
	// Sorting zero or one element is a no-op, so no size check is needed.
	sort.Strings(pairs)
	return pairs
}
struct{} scheduler *cron.Cron rule *models.RecordingRule promClients *prom.PromClientMap stats *astats.Stats } func NewRecordRuleContext(rule *models.RecordingRule, datasourceId int64, promClients *prom.PromClientMap, writers *writer.WritersType, stats *astats.Stats) *RecordRuleContext { rrc := &RecordRuleContext{ datasourceId: datasourceId, quit: make(chan struct{}), rule: rule, promClients: promClients, stats: stats, } if rule.CronPattern == "" && rule.PromEvalInterval != 0 { rule.CronPattern = fmt.Sprintf("@every %ds", rule.PromEvalInterval) } rrc.scheduler = cron.New(cron.WithSeconds(), cron.WithChain(cron.SkipIfStillRunning(cron.DefaultLogger))) _, err := rrc.scheduler.AddFunc(rule.CronPattern, func() { rrc.Eval() }) if err != nil { logger.Errorf("add cron pattern error: %v", err) } return rrc } func (rrc *RecordRuleContext) Key() string { return fmt.Sprintf("record-%d-%d", rrc.datasourceId, rrc.rule.Id) } func (rrc *RecordRuleContext) Hash() string { return str.MD5(fmt.Sprintf("%d_%s_%s_%d_%s_%s", rrc.rule.Id, rrc.rule.CronPattern, rrc.rule.PromQl, rrc.datasourceId, rrc.rule.AppendTags, rrc.rule.Name, )) } func (rrc *RecordRuleContext) Prepare() {} func (rrc *RecordRuleContext) Start() { logger.Infof("eval:%s started", rrc.Key()) rrc.scheduler.Start() } func (rrc *RecordRuleContext) Eval() { rrc.stats.CounterRecordEval.WithLabelValues(fmt.Sprintf("%d", rrc.datasourceId)).Inc() promql := strings.TrimSpace(rrc.rule.PromQl) if promql == "" { logger.Errorf("eval:%s promql is blank", rrc.Key()) return } if rrc.promClients.IsNil(rrc.datasourceId) { logger.Errorf("eval:%s reader client is nil", rrc.Key()) rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(fmt.Sprintf("%d", rrc.datasourceId)).Inc() return } value, warnings, err := rrc.promClients.GetCli(rrc.datasourceId).Query(context.Background(), promql, time.Now()) if err != nil { logger.Errorf("eval:%s promql:%s, error:%v", rrc.Key(), promql, err) 
rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(fmt.Sprintf("%d", rrc.datasourceId)).Inc() return } if len(warnings) > 0 { logger.Errorf("eval:%s promql:%s, warnings:%v", rrc.Key(), promql, warnings) rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(fmt.Sprintf("%d", rrc.datasourceId)).Inc() return } ts := ConvertToTimeSeries(value, rrc.rule) if len(ts) != 0 { err := rrc.promClients.GetWriterCli(rrc.datasourceId).Write(ts) if err != nil { logger.Errorf("eval:%s promql:%s, error:%v", rrc.Key(), promql, err) rrc.stats.CounterRecordEvalErrorTotal.WithLabelValues(fmt.Sprintf("%d", rrc.datasourceId)).Inc() } } } func (rrc *RecordRuleContext) Stop() { logger.Infof("%s stopped", rrc.Key()) c := rrc.scheduler.Stop() <-c.Done() close(rrc.quit) } ================================================ FILE: alert/record/sample.go ================================================ package record import ( "math" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/prompb" ) const ( LabelName = "__name__" ) func ConvertToTimeSeries(value model.Value, rule *models.RecordingRule) (lst []prompb.TimeSeries) { switch value.Type() { case model.ValVector: items, ok := value.(model.Vector) if !ok { return } for _, item := range items { if math.IsNaN(float64(item.Value)) { continue } s := prompb.Sample{} s.Timestamp = time.Unix(item.Timestamp.Unix(), 0).UnixNano() / 1e6 s.Value = float64(item.Value) l := labelsToLabelsProto(item.Metric, rule) lst = append(lst, prompb.TimeSeries{ Labels: l, Samples: []prompb.Sample{s}, }) } case model.ValMatrix: items, ok := value.(model.Matrix) if !ok { return } for _, item := range items { if len(item.Values) == 0 { return } last := item.Values[len(item.Values)-1] if math.IsNaN(float64(last.Value)) { continue } l := labelsToLabelsProto(item.Metric, rule) var slst []prompb.Sample for _, v := range item.Values { if math.IsNaN(float64(v.Value)) { continue } slst = append(slst, 
prompb.Sample{ Timestamp: time.Unix(v.Timestamp.Unix(), 0).UnixNano() / 1e6, Value: float64(v.Value), }) } lst = append(lst, prompb.TimeSeries{ Labels: l, Samples: slst, }) } case model.ValScalar: item, ok := value.(*model.Scalar) if !ok { return } if math.IsNaN(float64(item.Value)) { return } lst = append(lst, prompb.TimeSeries{ Labels: nil, Samples: []prompb.Sample{{Value: float64(item.Value), Timestamp: time.Unix(item.Timestamp.Unix(), 0).UnixNano() / 1e6}}, }) default: return } return } func labelsToLabelsProto(labels model.Metric, rule *models.RecordingRule) (result []prompb.Label) { //name nameLs := prompb.Label{ Name: LabelName, Value: rule.Name, } result = append(result, nameLs) for k, v := range labels { if k == LabelName { continue } if model.LabelNameRE.MatchString(string(k)) { result = append(result, prompb.Label{ Name: string(k), Value: string(v), }) } } if len(rule.AppendTagsJSON) != 0 { for _, v := range rule.AppendTagsJSON { index := strings.Index(v, "=") if model.LabelNameRE.MatchString(v[:index]) { result = append(result, prompb.Label{ Name: v[:index], Value: v[index+1:], }) } } } return result } ================================================ FILE: alert/record/scheduler.go ================================================ package record import ( "context" "fmt" "strconv" "time" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/alert/naming" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/prom" "github.com/ccfos/nightingale/v6/pushgw/writer" ) type Scheduler struct { // key: hash recordRules map[string]*RecordRuleContext aconf aconf.Alert recordingRuleCache *memsto.RecordingRuleCacheType promClients *prom.PromClientMap writers *writer.WritersType stats *astats.Stats datasourceCache *memsto.DatasourceCacheType } func NewScheduler(aconf aconf.Alert, rrc *memsto.RecordingRuleCacheType, promClients *prom.PromClientMap, writers *writer.WritersType, 
stats *astats.Stats, datasourceCache *memsto.DatasourceCacheType) *Scheduler { scheduler := &Scheduler{ aconf: aconf, recordRules: make(map[string]*RecordRuleContext), recordingRuleCache: rrc, promClients: promClients, writers: writers, stats: stats, datasourceCache: datasourceCache, } go scheduler.LoopSyncRules(context.Background()) return scheduler } func (s *Scheduler) LoopSyncRules(ctx context.Context) { time.Sleep(time.Duration(s.aconf.EngineDelay) * time.Second) duration := 9000 * time.Millisecond for { select { case <-ctx.Done(): return case <-time.After(duration): s.syncRecordRules() } } } func (s *Scheduler) syncRecordRules() { ids := s.recordingRuleCache.GetRuleIds() recordRules := make(map[string]*RecordRuleContext) for _, id := range ids { rule := s.recordingRuleCache.Get(id) if rule == nil { continue } datasourceIds := s.datasourceCache.GetIDsByDsCateAndQueries("prometheus", rule.DatasourceQueries) for _, dsId := range datasourceIds { if !naming.DatasourceHashRing.IsHit(strconv.FormatInt(dsId, 10), fmt.Sprintf("%d", rule.Id), s.aconf.Heartbeat.Endpoint) { continue } recordRule := NewRecordRuleContext(rule, dsId, s.promClients, s.writers, s.stats) recordRules[recordRule.Hash()] = recordRule } } for hash, rule := range recordRules { if _, has := s.recordRules[hash]; !has { rule.Prepare() rule.Start() s.recordRules[hash] = rule } } for hash, rule := range s.recordRules { if _, has := recordRules[hash]; !has { rule.Stop() delete(s.recordRules, hash) } } } ================================================ FILE: alert/router/router.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/alert/process" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/httpx" "github.com/gin-gonic/gin" ) type Router struct { HTTP httpx.Config Alert 
aconf.Alert AlertMuteCache *memsto.AlertMuteCacheType TargetCache *memsto.TargetCacheType BusiGroupCache *memsto.BusiGroupCacheType AlertStats *astats.Stats Ctx *ctx.Context ExternalProcessors *process.ExternalProcessorsType LogDir string } func New(httpConfig httpx.Config, alert aconf.Alert, amc *memsto.AlertMuteCacheType, tc *memsto.TargetCacheType, bgc *memsto.BusiGroupCacheType, astats *astats.Stats, ctx *ctx.Context, externalProcessors *process.ExternalProcessorsType, logDir string) *Router { return &Router{ HTTP: httpConfig, Alert: alert, AlertMuteCache: amc, TargetCache: tc, BusiGroupCache: bgc, AlertStats: astats, Ctx: ctx, ExternalProcessors: externalProcessors, LogDir: logDir, } } func (rt *Router) Config(r *gin.Engine) { if !rt.HTTP.APIForService.Enable { return } service := r.Group("/v1/n9e") if len(rt.HTTP.APIForService.BasicAuth) > 0 { service.Use(gin.BasicAuth(rt.HTTP.APIForService.BasicAuth)) } service.POST("/event", rt.pushEventToQueue) service.POST("/event-persist", rt.eventPersist) service.POST("/make-event", rt.makeEvent) service.GET("/event-detail/:hash", rt.eventDetail) service.GET("/alert-eval-detail/:id", rt.alertEvalDetail) service.GET("/trace-logs/:traceid", rt.traceLogs) } func Render(c *gin.Context, data, msg interface{}) { if msg == nil { if data == nil { data = struct{}{} } c.JSON(http.StatusOK, gin.H{"data": data, "error": ""}) } else { c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": msg}}) } } func Dangerous(c *gin.Context, v interface{}, code ...int) { if v == nil { return } switch t := v.(type) { case string: if t != "" { c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": v}}) } case error: c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": t.Error()}}) } } ================================================ FILE: alert/router/router_alert_eval_detail.go ================================================ package router import ( "fmt" "github.com/ccfos/nightingale/v6/pkg/loggrep" "github.com/ccfos/nightingale/v6/pkg/ginx" 
"github.com/gin-gonic/gin" ) func (rt *Router) alertEvalDetail(c *gin.Context) { id := ginx.UrlParamStr(c, "id") if !loggrep.IsValidRuleID(id) { ginx.Bomb(200, "invalid rule id format") } instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port) keyword := fmt.Sprintf("alert_eval_%s", id) logs, err := loggrep.GrepLogDir(rt.LogDir, keyword) ginx.Dangerous(err) ginx.NewRender(c).Data(loggrep.EventDetailResp{ Logs: logs, Instance: instance, }, nil) } ================================================ FILE: alert/router/router_event.go ================================================ package router import ( "fmt" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/alert/dispatch" "github.com/ccfos/nightingale/v6/alert/mute" "github.com/ccfos/nightingale/v6/alert/naming" "github.com/ccfos/nightingale/v6/alert/process" "github.com/ccfos/nightingale/v6/alert/queue" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/poster" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) func (rt *Router) pushEventToQueue(c *gin.Context) { var event *models.AlertCurEvent ginx.BindJSON(c, &event) if event.RuleId == 0 { ginx.Bomb(200, "event is illegal") } event.FE2DB() event.TagsMap = make(map[string]string) for i := 0; i < len(event.TagsJSON); i++ { pair := strings.TrimSpace(event.TagsJSON[i]) if pair == "" { continue } arr := strings.SplitN(pair, "=", 2) if len(arr) != 2 { continue } event.TagsMap[arr[0]] = arr[1] } hit, _ := mute.EventMuteStrategy(event, rt.AlertMuteCache) if hit { logger.Infof("event_muted: rule_id=%d %s", event.RuleId, event.Hash) ginx.NewRender(c).Message(nil) return } if err := event.ParseRule("rule_name"); err != nil { event.RuleName = fmt.Sprintf("failed to parse rule name: %v", err) } if err := event.ParseRule("rule_note"); err != nil { event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err) } if err := event.ParseRule("annotations"); err != nil { 
event.RuleNote = fmt.Sprintf("failed to parse rule note: %v", err) } // 如果 rule_note 中有 ; 前缀,则使用 rule_note 替换 tags 中的内容 if strings.HasPrefix(event.RuleNote, ";") { event.RuleNote = strings.TrimPrefix(event.RuleNote, ";") event.Tags = strings.ReplaceAll(event.RuleNote, " ", ",,") event.TagsJSON = strings.Split(event.Tags, ",,") } else { event.Tags = strings.Join(event.TagsJSON, ",,") } event.Callbacks = strings.Join(event.CallbacksJSON, " ") event.NotifyChannels = strings.Join(event.NotifyChannelsJSON, " ") event.NotifyGroups = strings.Join(event.NotifyGroupsJSON, " ") dispatch.LogEvent(event, "http_push_queue") if !queue.EventQueue.PushFront(event) { msg := fmt.Sprintf("event:%s push_queue err: queue is full", event.Hash) ginx.Bomb(200, msg) logger.Warningf(msg) } ginx.NewRender(c).Message(nil) } func (rt *Router) eventPersist(c *gin.Context) { var event *models.AlertCurEvent ginx.BindJSON(c, &event) event.FE2DB() err := models.EventPersist(rt.Ctx, event) ginx.NewRender(c).Data(event.Id, err) } type eventForm struct { Alert bool `json:"alert"` AnomalyPoints []models.AnomalyPoint `json:"vectors"` RuleId int64 `json:"rule_id"` DatasourceId int64 `json:"datasource_id"` Inhibit bool `json:"inhibit"` } func (rt *Router) makeEvent(c *gin.Context) { var events []*eventForm ginx.BindJSON(c, &events) //now := time.Now().Unix() for i := 0; i < len(events); i++ { node, err := naming.DatasourceHashRing.GetNode(strconv.FormatInt(events[i].DatasourceId, 10), fmt.Sprintf("%d", events[i].RuleId)) if err != nil { logger.Warningf("event(rule_id=%d ds_id=%d) get node err:%v", events[i].RuleId, events[i].DatasourceId, err) ginx.Bomb(200, "event node not exists") } if node != rt.Alert.Heartbeat.Endpoint { err := forwardEvent(events[i], node) if err != nil { logger.Warningf("event(rule_id=%d ds_id=%d) forward err:%v", events[i].RuleId, events[i].DatasourceId, err) ginx.Bomb(200, "event forward error") } continue } ruleWorker, exists := 
rt.ExternalProcessors.GetExternalAlertRule(events[i].DatasourceId, events[i].RuleId) logger.Debugf("handle event(rule_id=%d ds_id=%d) exists:%v", events[i].RuleId, events[i].DatasourceId, exists) if !exists { ginx.Bomb(200, "rule not exists") } if events[i].Alert { go ruleWorker.Handle(events[i].AnomalyPoints, "http", events[i].Inhibit) } else { for _, vector := range events[i].AnomalyPoints { readableString := vector.ReadableValue() go ruleWorker.RecoverSingle(false, process.Hash(events[i].RuleId, events[i].DatasourceId, vector), vector.Timestamp, &readableString) } } } ginx.NewRender(c).Message(nil) } // event 不归本实例处理,转发给对应的实例 func forwardEvent(event *eventForm, instance string) error { ur := fmt.Sprintf("http://%s/v1/n9e/make-event", instance) res, code, err := poster.PostJSON(ur, time.Second*5, []*eventForm{event}, 3) if err != nil { return err } logger.Infof("forward event: result=succ url=%s code=%d rule_id=%d response=%s", ur, code, event.RuleId, string(res)) return nil } ================================================ FILE: alert/router/router_event_detail.go ================================================ package router import ( "fmt" "github.com/ccfos/nightingale/v6/pkg/loggrep" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) eventDetail(c *gin.Context) { hash := ginx.UrlParamStr(c, "hash") if !loggrep.IsValidHash(hash) { ginx.Bomb(200, "invalid hash format") } instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port) logs, err := loggrep.GrepLogDir(rt.LogDir, hash) ginx.Dangerous(err) ginx.NewRender(c).Data(loggrep.EventDetailResp{ Logs: logs, Instance: instance, }, nil) } ================================================ FILE: alert/router/router_trace_logs.go ================================================ package router import ( "fmt" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/ccfos/nightingale/v6/pkg/loggrep" "github.com/gin-gonic/gin" ) func (rt *Router) traceLogs(c *gin.Context) { 
traceId := ginx.UrlParamStr(c, "traceid") if !loggrep.IsValidTraceID(traceId) { ginx.Bomb(200, "invalid trace id format") } instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port) keyword := "trace_id=" + traceId logs, err := loggrep.GrepLatestLogFiles(rt.LogDir, keyword) ginx.Dangerous(err) ginx.NewRender(c).Data(loggrep.EventDetailResp{ Logs: logs, Instance: instance, }, nil) } ================================================ FILE: alert/sender/callback.go ================================================ package sender import ( "fmt" "html/template" "net/url" "strings" "time" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/poster" "github.com/toolkits/pkg/logger" ) type ( // CallBacker 进行回调的接口 CallBacker interface { CallBack(ctx CallBackContext) } // CallBackContext 回调时所需的上下文 CallBackContext struct { Ctx *ctx.Context CallBackURL string Users []*models.User Rule *models.AlertRule Events []*models.AlertCurEvent Stats *astats.Stats BatchSend bool } DefaultCallBacker struct{} ) func BuildCallBackContext(ctx *ctx.Context, callBackURL string, rule *models.AlertRule, events []*models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType, batchSend bool, stats *astats.Stats) CallBackContext { users := userCache.GetByUserIds(uids) newCallBackUrl, _ := events[0].ParseURL(callBackURL) return CallBackContext{ Ctx: ctx, CallBackURL: newCallBackUrl, Rule: rule, Events: events, Users: users, BatchSend: batchSend, Stats: stats, } } func ExtractAtsParams(rawURL string) []string { ans := make([]string, 0, 1) parsedURL, err := url.Parse(rawURL) if err != nil { logger.Errorf("ExtractAtsParams(url=%s), err: %v", rawURL, err) return ans } queryParams := parsedURL.Query() atParam := queryParams.Get("ats") if atParam == "" { return ans } // Split the atParam by comma and return the result as a slice return 
strings.Split(atParam, ",") } func NewCallBacker( key string, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType, taskTplCache *memsto.TaskTplCache, tpls map[string]*template.Template, ) CallBacker { switch key { case models.IbexDomain: // Distribute to Ibex return &IbexCallBacker{ targetCache: targetCache, userCache: userCache, taskTplCache: taskTplCache, } case models.DefaultDomain: // default callback return &DefaultCallBacker{} case models.DingtalkDomain: return &DingtalkSender{tpl: tpls[models.Dingtalk]} case models.WecomDomain: return &WecomSender{tpl: tpls[models.Wecom]} case models.FeishuDomain: return &FeishuSender{tpl: tpls[models.Feishu]} case models.FeishuCardDomain: return &FeishuCardSender{tpl: tpls[models.FeishuCard]} //case models.Mm: // return &MmSender{tpl: tpls[models.Mm]} case models.TelegramDomain: return &TelegramSender{tpl: tpls[models.Telegram]} case models.LarkDomain: return &LarkSender{tpl: tpls[models.Lark]} case models.LarkCardDomain: return &LarkCardSender{tpl: tpls[models.LarkCard]} } return nil } func (c *DefaultCallBacker) CallBack(ctx CallBackContext) { if len(ctx.CallBackURL) == 0 || len(ctx.Events) == 0 { return } event := ctx.Events[0] if ctx.BatchSend { webhookConf := &models.Webhook{ Type: models.RuleCallback, Enable: true, Url: ctx.CallBackURL, Timeout: 5, RetryCount: 3, RetryInterval: 10, Batch: 1000, } PushCallbackEvent(ctx.Ctx, webhookConf, event, ctx.Stats) return } doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, event, "callback", ctx.Stats, ctx.Events) } func doSendAndRecord(ctx *ctx.Context, url, token string, body interface{}, channel string, stats *astats.Stats, events []*models.AlertCurEvent) { res, err := doSend(url, body, channel, stats) NotifyRecord(ctx, events, 0, channel, token, res, err) } func NotifyRecord(ctx *ctx.Context, evts []*models.AlertCurEvent, notifyRuleID int64, channel, target, res string, err error) { // 一个通知可能对应多个 event,都需要记录 notis := 
make([]*models.NotificationRecord, 0, len(evts)) for _, evt := range evts { noti := models.NewNotificationRecord(evt, notifyRuleID, channel, target) if err != nil { noti.SetStatus(models.NotiStatusFailure) noti.SetDetails(err.Error()) } else if res != "" { noti.SetDetails(string(res)) } notis = append(notis, noti) } if !ctx.IsCenter { err := poster.PostByUrls(ctx, "/v1/n9e/notify-record", notis) if err != nil { logger.Errorf("add notis:%v failed, err: %v", notis, err) } return } PushNotifyRecords(notis) } func doSend(url string, body interface{}, channel string, stats *astats.Stats) (string, error) { stats.AlertNotifyTotal.WithLabelValues(channel).Inc() start := time.Now() res, code, err := poster.PostJSON(url, time.Second*5, body, 3) res = []byte(fmt.Sprintf("duration: %d ms status_code:%d, response:%s", time.Since(start).Milliseconds(), code, string(res))) if err != nil { logger.Errorf("%s_sender: result=fail url=%s code=%d error=%v req:%v response=%s", channel, url, code, err, body, string(res)) stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc() return string(res), err } logger.Infof("%s_sender: result=succ url=%s code=%d req:%v response=%s", channel, url, code, body, string(res)) return string(res), nil } type TaskCreateReply struct { Err string `json:"err"` Dat int64 `json:"dat"` // task.id } func PushCallbackEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) { CallbackEventQueueLock.RLock() queue := CallbackEventQueue[webhook.Url] CallbackEventQueueLock.RUnlock() if queue == nil { queue = &WebhookQueue{ eventQueue: NewSafeEventQueue(QueueMaxSize), closeCh: make(chan struct{}), } CallbackEventQueueLock.Lock() CallbackEventQueue[webhook.Url] = queue CallbackEventQueueLock.Unlock() StartConsumer(ctx, queue, webhook.Batch, webhook, stats) } succ := queue.eventQueue.Push(event) if !succ { logger.Warningf("Write channel(%s) full, current channel size: %d event:%s", webhook.Url, queue.eventQueue.Len(), 
event.Hash) } } ================================================ FILE: alert/sender/dingtalk.go ================================================ package sender import ( "html/template" "strings" "github.com/ccfos/nightingale/v6/models" ) type dingtalkMarkdown struct { Title string `json:"title"` Text string `json:"text"` } type dingtalkAt struct { AtMobiles []string `json:"atMobiles"` IsAtAll bool `json:"isAtAll"` } type dingtalk struct { Msgtype string `json:"msgtype"` Markdown dingtalkMarkdown `json:"markdown"` At dingtalkAt `json:"at"` } var ( _ CallBacker = (*DingtalkSender)(nil) ) type DingtalkSender struct { tpl *template.Template } func (ds *DingtalkSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } urls, ats, tokens := ds.extract(ctx.Users) if len(urls) == 0 { return } message := BuildTplMessage(models.Dingtalk, ds.tpl, ctx.Events) for i, url := range urls { var body dingtalk // NoAt in url if strings.Contains(url, "noat=1") { body = dingtalk{ Msgtype: "markdown", Markdown: dingtalkMarkdown{ Title: ctx.Events[0].RuleName, Text: message, }, } } else { body = dingtalk{ Msgtype: "markdown", Markdown: dingtalkMarkdown{ Title: ctx.Events[0].RuleName, Text: message + "\n" + strings.Join(ats, " "), }, At: dingtalkAt{ AtMobiles: ats, IsAtAll: false, }, } } doSendAndRecord(ctx.Ctx, url, tokens[i], body, models.Dingtalk, ctx.Stats, ctx.Events) } } func (ds *DingtalkSender) CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } body := dingtalk{ Msgtype: "markdown", Markdown: dingtalkMarkdown{ Title: ctx.Events[0].RuleName, }, } ats := ExtractAtsParams(ctx.CallBackURL) message := BuildTplMessage(models.Dingtalk, ds.tpl, ctx.Events) if len(ats) > 0 { body.Markdown.Text = message + "\n@" + strings.Join(ats, "@") body.At = dingtalkAt{ AtMobiles: ats, IsAtAll: false, } } else { // NoAt in url body.Markdown.Text = message } doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, 
"callback", ctx.Stats, ctx.Events) } // extract urls and ats from Users func (ds *DingtalkSender) extract(users []*models.User) ([]string, []string, []string) { urls := make([]string, 0, len(users)) ats := make([]string, 0, len(users)) tokens := make([]string, 0, len(users)) for _, user := range users { if user.Phone != "" { ats = append(ats, "@"+user.Phone) } if token, has := user.ExtractToken(models.Dingtalk); has { url := token if !strings.HasPrefix(token, "https://") && !strings.HasPrefix(token, "http://") { url = "https://oapi.dingtalk.com/robot/send?access_token=" + token } urls = append(urls, url) tokens = append(tokens, token) } } return urls, ats, tokens } ================================================ FILE: alert/sender/email.go ================================================ package sender import ( "crypto/tls" "errors" "html/template" "time" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/logger" "gopkg.in/gomail.v2" ) var mailch chan *EmailContext type EmailSender struct { subjectTpl *template.Template contentTpl *template.Template smtp aconf.SMTPConfig } type EmailContext struct { events []*models.AlertCurEvent mail *gomail.Message } func (es *EmailSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } tos := extract(ctx.Users) var subject string if es.subjectTpl != nil { subject = BuildTplMessage(models.Email, es.subjectTpl, []*models.AlertCurEvent{ctx.Events[0]}) } else { subject = ctx.Events[0].RuleName } content := BuildTplMessage(models.Email, es.contentTpl, ctx.Events) es.WriteEmail(subject, content, tos, ctx.Events) ctx.Stats.AlertNotifyTotal.WithLabelValues(models.Email).Add(float64(len(tos))) } func extract(users []*models.User) []string { tos := make([]string, 0, len(users)) for _, u := range users { if u.Email != "" { tos = append(tos, u.Email) } } return 
tos } func SendEmail(subject, content string, tos []string, stmp aconf.SMTPConfig) error { conf := stmp d := gomail.NewDialer(conf.Host, conf.Port, conf.User, conf.Pass) if conf.InsecureSkipVerify { d.TLSConfig = &tls.Config{InsecureSkipVerify: true} } m := gomail.NewMessage() m.SetHeader("From", stmp.From) m.SetHeader("To", tos...) m.SetHeader("Subject", subject) m.SetBody("text/html", content) err := d.DialAndSend(m) if err != nil { return errors.New("email_sender: failed to send: " + err.Error()) } return nil } func (es *EmailSender) WriteEmail(subject, content string, tos []string, events []*models.AlertCurEvent) { m := gomail.NewMessage() m.SetHeader("From", es.smtp.From) m.SetHeader("To", tos...) m.SetHeader("Subject", subject) m.SetBody("text/html", content) mailch <- &EmailContext{events, m} } func dialSmtp(d *gomail.Dialer) gomail.SendCloser { for { select { case <-mailQuit: // Note that Sendcloser is not obtained below, // and the outgoing signal (with configuration changes) exits the current dial return nil default: if s, err := d.Dial(); err != nil { logger.Errorf("email_sender: failed to dial smtp: %s", err) } else { return s } time.Sleep(time.Second) } } } var mailQuit = make(chan struct{}) func RestartEmailSender(ctx *ctx.Context, smtp aconf.SMTPConfig) { // Notify internal start exit mailQuit <- struct{}{} startEmailSender(ctx, smtp) } var smtpConfig aconf.SMTPConfig func InitEmailSender(ctx *ctx.Context, ncc *memsto.NotifyConfigCacheType) { mailch = make(chan *EmailContext, 100000) go updateSmtp(ctx, ncc) smtpConfig = ncc.GetSMTP() go startEmailSender(ctx, smtpConfig) } func updateSmtp(ctx *ctx.Context, ncc *memsto.NotifyConfigCacheType) { for { time.Sleep(1 * time.Minute) smtp := ncc.GetSMTP() if smtpConfig.Host != smtp.Host || smtpConfig.Batch != smtp.Batch || smtpConfig.From != smtp.From || smtpConfig.Pass != smtp.Pass || smtpConfig.User != smtp.User || smtpConfig.Port != smtp.Port || smtpConfig.InsecureSkipVerify != smtp.InsecureSkipVerify { 
//diff smtpConfig = smtp RestartEmailSender(ctx, smtp) } } } func startEmailSender(ctx *ctx.Context, smtp aconf.SMTPConfig) { conf := smtp if conf.Host == "" || conf.Port == 0 { logger.Debug("SMTP configurations invalid") <-mailQuit return } logger.Infof("start email sender... conf.Host:%+v,conf.Port:%+v", conf.Host, conf.Port) d := gomail.NewDialer(conf.Host, conf.Port, conf.User, conf.Pass) if conf.InsecureSkipVerify { d.TLSConfig = &tls.Config{InsecureSkipVerify: true} } var s gomail.SendCloser var open bool var size int for { select { case <-mailQuit: return case m, ok := <-mailch: if !ok { return } if !open { s = dialSmtp(d) if s == nil { // Indicates that the dialing failed and exited the current goroutine directly, // but put the Message back in the mailch mailch <- m return } open = true } var err error if err = gomail.Send(s, m.mail); err != nil { logger.Errorf("email_sender: failed to send: %s", err) // close and retry if err := s.Close(); err != nil { logger.Warningf("email_sender: failed to close smtp connection: %s", err) } s = dialSmtp(d) if s == nil { // Indicates that the dialing failed and exited the current goroutine directly, // but put the Message back in the mailch mailch <- m return } open = true if err = gomail.Send(s, m.mail); err != nil { logger.Errorf("email_sender: failed to retry send: %s", err) } } else { logger.Infof("email_sender: result=succ subject=%v to=%v", m.mail.GetHeader("Subject"), m.mail.GetHeader("To")) } for _, to := range m.mail.GetHeader("To") { msg := "" if err == nil { msg = "ok" } NotifyRecord(ctx, m.events, 0, models.Email, to, msg, err) } size++ if size >= conf.Batch { if err := s.Close(); err != nil { logger.Warningf("email_sender: failed to close smtp connection: %s", err) } open = false size = 0 } // Close the connection to the SMTP server if no email was sent in // the last 30 seconds. 
case <-time.After(30 * time.Second): if open { if err := s.Close(); err != nil { logger.Warningf("email_sender: failed to close smtp connection: %s", err) } open = false } } } } ================================================ FILE: alert/sender/feishu.go ================================================ package sender import ( "fmt" "html/template" "strings" "github.com/ccfos/nightingale/v6/models" ) type feishuContent struct { Text string `json:"text"` } type feishuAt struct { AtMobiles []string `json:"atMobiles"` IsAtAll bool `json:"isAtAll"` } type feishu struct { Msgtype string `json:"msg_type"` Content feishuContent `json:"content"` At feishuAt `json:"at"` } var ( _ CallBacker = (*FeishuSender)(nil) ) type FeishuSender struct { tpl *template.Template } func (fs *FeishuSender) CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } ats := ExtractAtsParams(ctx.CallBackURL) message := BuildTplMessage(models.Feishu, fs.tpl, ctx.Events) if len(ats) > 0 { atTags := "" for _, at := range ats { atTags += fmt.Sprintf(" ", at) } message = atTags + message } body := feishu{ Msgtype: "text", Content: feishuContent{ Text: message, }, } doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, "callback", ctx.Stats, ctx.Events) } func (fs *FeishuSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } urls, ats, tokens := fs.extract(ctx.Users) message := BuildTplMessage(models.Feishu, fs.tpl, ctx.Events) for i, url := range urls { body := feishu{ Msgtype: "text", Content: feishuContent{ Text: message, }, } if !strings.Contains(url, "noat=1") { body.At = feishuAt{ AtMobiles: ats, IsAtAll: false, } } doSendAndRecord(ctx.Ctx, url, tokens[i], body, models.Feishu, ctx.Stats, ctx.Events) } } func (fs *FeishuSender) extract(users []*models.User) ([]string, []string, []string) { urls := make([]string, 0, len(users)) ats := make([]string, 0, len(users)) tokens := make([]string, 0, len(users)) for _, 
user := range users { if user.Phone != "" { ats = append(ats, user.Phone) } if token, has := user.ExtractToken(models.Feishu); has { url := token if !strings.HasPrefix(token, "https://") && !strings.HasPrefix(token, "http://") { url = "https://open.feishu.cn/open-apis/bot/v2/hook/" + token } urls = append(urls, url) tokens = append(tokens, token) } } return urls, ats, tokens } ================================================ FILE: alert/sender/feishucard.go ================================================ package sender import ( "fmt" "html/template" "net/url" "strings" "github.com/ccfos/nightingale/v6/models" ) type Conf struct { WideScreenMode bool `json:"wide_screen_mode"` EnableForward bool `json:"enable_forward"` } type Te struct { Content string `json:"content"` Tag string `json:"tag"` } type Element struct { Tag string `json:"tag"` Text Te `json:"text"` Content string `json:"content"` Elements []Element `json:"elements"` } type Titles struct { Content string `json:"content"` Tag string `json:"tag"` } type Headers struct { Title Titles `json:"title"` Template string `json:"template"` } type Cards struct { Config Conf `json:"config"` Elements []Element `json:"elements"` Header Headers `json:"header"` } type feishuCard struct { feishu Card Cards `json:"card"` } type FeishuCardSender struct { tpl *template.Template } const ( Recovered = "recovered" Triggered = "triggered" ) func createFeishuCardBody() feishuCard { return feishuCard{ feishu: feishu{Msgtype: "interactive"}, Card: Cards{ Config: Conf{ WideScreenMode: true, EnableForward: true, }, Header: Headers{ Title: Titles{ Tag: "plain_text", }, }, Elements: []Element{ { Tag: "div", Text: Te{ Tag: "lark_md", }, }, { Tag: "hr", }, { Tag: "note", Elements: []Element{ { Tag: "lark_md", }, }, }, }, }, } } func (fs *FeishuCardSender) CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } ats := ExtractAtsParams(ctx.CallBackURL) message := BuildTplMessage(models.FeishuCard, 
fs.tpl, ctx.Events) if len(ats) > 0 { atTags := "" for _, at := range ats { if strings.Contains(at, "@") { atTags += fmt.Sprintf("", at) } else { atTags += fmt.Sprintf("", at) } } message = atTags + message } color := "red" lowerUnicode := strings.ToLower(message) if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 { color = "orange" } else if strings.Count(lowerUnicode, Recovered) > 0 { color = "green" } SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName) body := createFeishuCardBody() body.Card.Header.Title.Content = SendTitle body.Card.Header.Template = color body.Card.Elements[0].Text.Content = message body.Card.Elements[2].Elements[0].Content = SendTitle // This is to be compatible with the feishucard interface, if with query string parameters, the request will fail // Remove query parameters from the URL, parsedURL, err := url.Parse(ctx.CallBackURL) if err != nil { return } parsedURL.RawQuery = "" doSendAndRecord(ctx.Ctx, parsedURL.String(), parsedURL.String(), body, "callback", ctx.Stats, ctx.Events) } func (fs *FeishuCardSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } urls, tokens := fs.extract(ctx.Users) message := BuildTplMessage(models.FeishuCard, fs.tpl, ctx.Events) color := "red" lowerUnicode := strings.ToLower(message) if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 { color = "orange" } else if strings.Count(lowerUnicode, Recovered) > 0 { color = "green" } SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName) body := createFeishuCardBody() body.Card.Header.Title.Content = SendTitle body.Card.Header.Template = color body.Card.Elements[0].Text.Content = message body.Card.Elements[2].Elements[0].Content = SendTitle for i, url := range urls { doSendAndRecord(ctx.Ctx, url, tokens[i], body, models.FeishuCard, ctx.Stats, ctx.Events) } } func (fs *FeishuCardSender) extract(users []*models.User) ([]string, []string) { urls := 
// SendStaticGlobalWebhook posts a single alert event, JSON-encoded, to the
// statically configured global webhook URL.
//
// It is a no-op when staticGlobalWebhookClient is nil, which is the case when
// InitStaticGlobalWebhook returned early (webhook disabled or URL empty) or
// was never called. Every attempt that gets past that guard ends with exactly
// one NotifyRecord call describing the outcome.
func SendStaticGlobalWebhook(ctx *ctx.Context, event *models.AlertCurEvent, stats *astats.Stats) {
	if staticGlobalWebhookClient == nil {
		return
	}

	bs, err := json.Marshal(event)
	if err != nil {
		logger.Errorf("%s failed to marshal event err:%v", staticGlobalWebhookChannel, err)
		NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, "", err)
		return
	}

	req, err := http.NewRequest("POST", staticGlobalWebhookConf.Url, bytes.NewBuffer(bs))
	if err != nil {
		logger.Warningf("%s failed to new request event:%s err:%v", staticGlobalWebhookChannel, string(bs), err)
		NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, "", err)
		return
	}

	req.Header.Set("Content-Type", "application/json")
	// Basic auth is only applied when both user and password are configured.
	if staticGlobalWebhookConf.BasicAuthUser != "" && staticGlobalWebhookConf.BasicAuthPass != "" {
		req.SetBasicAuth(staticGlobalWebhookConf.BasicAuthUser, staticGlobalWebhookConf.BasicAuthPass)
	}
	// Headers is a flat [name, value, name, value, ...] list; an odd-length
	// list is ignored entirely.
	if len(staticGlobalWebhookConf.Headers) > 0 && len(staticGlobalWebhookConf.Headers)%2 == 0 {
		for i := 0; i < len(staticGlobalWebhookConf.Headers); i += 2 {
			// The Host header must be set on req.Host; net/http ignores it in
			// req.Header. Only the exact spellings "Host" and "host" are matched.
			if staticGlobalWebhookConf.Headers[i] == "Host" || staticGlobalWebhookConf.Headers[i] == "host" {
				req.Host = staticGlobalWebhookConf.Headers[i+1]
				continue
			}
			req.Header.Set(staticGlobalWebhookConf.Headers[i], staticGlobalWebhookConf.Headers[i+1])
		}
	}

	// The total counter is bumped once per request actually attempted.
	stats.AlertNotifyTotal.WithLabelValues(staticGlobalWebhookChannel).Inc()
	resp, err := staticGlobalWebhookClient.Do(req)
	if err != nil {
		stats.AlertNotifyErrorTotal.WithLabelValues(staticGlobalWebhookChannel).Inc()
		logger.Errorf("%s_fail url:%s event:%s error:%v", staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, event.Hash, err)
		NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, "", err)
		return
	}
	defer resp.Body.Close()

	// Only the first 4096 bytes of the response are read; the read error is
	// deliberately ignored because the body is used for diagnostics only.
	body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
	res := fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(body))
	// Any status code >= 400 is treated as a delivery failure.
	if resp.StatusCode >= 400 {
		stats.AlertNotifyErrorTotal.WithLabelValues(staticGlobalWebhookChannel).Inc()
		logger.Errorf("%s_fail url:%s status:%d body:%s event:%s", staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, resp.StatusCode, string(body), event.Hash)
		NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, res, fmt.Errorf("status code %d", resp.StatusCode))
		return
	}

	logger.Debugf("%s_succ url:%s status:%d body:%s event:%s", staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, resp.StatusCode, string(body), event.Hash)
	NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, staticGlobalWebhookChannel, staticGlobalWebhookConf.Url, res, nil)
}
SendStaticGlobalWebhook( ctxpkg.NewContext(context.Background(), nil, true), &models.AlertCurEvent{Id: 1, Hash: "event-1"}, newStaticWebhookTestStats(), ) if got := NotifyRecordQueue.Len(); got != 1 { t.Fatalf("expected 1 notify record, got %d", got) } record, ok := NotifyRecordQueue.PopBack().(*models.NotificationRecord) if !ok { t.Fatalf("expected *models.NotificationRecord in queue") } if record.Status != models.NotiStatusFailure { t.Fatalf("expected failure status, got %d", record.Status) } if record.Channel != staticGlobalWebhookChannel { t.Fatalf("expected channel %q, got %q", staticGlobalWebhookChannel, record.Channel) } } func TestSendStaticGlobalWebhookRecordsTransportFailure(t *testing.T) { prevClient := staticGlobalWebhookClient prevConf := staticGlobalWebhookConf defer func() { staticGlobalWebhookClient = prevClient staticGlobalWebhookConf = prevConf }() NotifyRecordQueue.RemoveAll() defer NotifyRecordQueue.RemoveAll() staticGlobalWebhookClient = &http.Client{ Transport: roundTripperFunc(func(req *http.Request) (*http.Response, error) { return nil, errors.New("transport boom") }), } staticGlobalWebhookConf = aconf.GlobalWebhook{Enable: true, Url: "http://example.com/webhook"} SendStaticGlobalWebhook( ctxpkg.NewContext(context.Background(), nil, true), &models.AlertCurEvent{Id: 2, Hash: "event-2"}, newStaticWebhookTestStats(), ) if got := NotifyRecordQueue.Len(); got != 1 { t.Fatalf("expected 1 notify record, got %d", got) } record, ok := NotifyRecordQueue.PopBack().(*models.NotificationRecord) if !ok { t.Fatalf("expected *models.NotificationRecord in queue") } if record.Status != models.NotiStatusFailure { t.Fatalf("expected failure status, got %d", record.Status) } if !strings.Contains(record.Details, "transport boom") { t.Fatalf("expected transport error details, got %q", record.Details) } } ================================================ FILE: alert/sender/ibex.go ================================================ // @Author: Ciusyan 6/5/24 package 
sender import ( "encoding/json" "fmt" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" imodels "github.com/flashcatcloud/ibex/src/models" "github.com/flashcatcloud/ibex/src/storage" "github.com/toolkits/pkg/logger" ) var ( _ CallBacker = (*IbexCallBacker)(nil) ) type IbexCallBacker struct { targetCache *memsto.TargetCacheType userCache *memsto.UserCacheType taskTplCache *memsto.TaskTplCache } func (c *IbexCallBacker) CallBack(ctx CallBackContext) { if len(ctx.CallBackURL) == 0 || len(ctx.Events) == 0 { logger.Warningf("event_callback_ibex: url or events is empty, url: %s", ctx.CallBackURL) return } event := ctx.Events[0] if event.IsRecovered { logger.Infof("event_callback_ibex: event is recovered, event: %s", event.Hash) return } c.handleIbex(ctx.Ctx, ctx.CallBackURL, event) } func (c *IbexCallBacker) handleIbex(ctx *ctx.Context, url string, event *models.AlertCurEvent) { logger.Infof("event_callback_ibex: url: %s, event: %s", url, event.Hash) if imodels.DB() == nil && ctx.IsCenter { logger.Warningf("event_callback_ibex: db is nil, event: %s", event.Hash) return } arr := strings.Split(url, "/") var idstr string var host string if len(arr) > 1 { idstr = arr[1] } if len(arr) > 2 { host = arr[2] } id, err := strconv.ParseInt(idstr, 10, 64) if err != nil { logger.Errorf("event_callback_ibex: failed to parse url: %s event: %s", url, event.Hash) return } if host == "" { // 用户在callback url中没有传入host,就从event中解析 host = event.TargetIdent if host == "" { if ident, has := event.TagsMap["ident"]; has { host = ident } } } if host == "" { logger.Errorf("event_callback_ibex: failed to get host, id: %d, event: %s", id, event.Hash) return } CallIbex(ctx, id, host, c.taskTplCache, c.targetCache, c.userCache, event, "") } func CallIbex(ctx *ctx.Context, id int64, host string, taskTplCache *memsto.TaskTplCache, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType, event 
*models.AlertCurEvent, args string) (int64, error) { logger.Infof("event_callback_ibex: id: %d, host: %s, args: %s, event: %s", id, host, args, event.Hash) tpl := taskTplCache.Get(id) if tpl == nil { err := fmt.Errorf("event_callback_ibex: no such tpl(%d), event: %s", id, event.Hash) logger.Errorf("%s", err) return 0, err } // check perm // tpl.GroupId - host - account 三元组校验权限 can, err := CanDoIbex(tpl.UpdateBy, tpl, host, targetCache, userCache) if err != nil { err = fmt.Errorf("event_callback_ibex: check perm fail: %v, event: %s", err, event.Hash) logger.Errorf("%s", err) return 0, err } if !can { err = fmt.Errorf("event_callback_ibex: user(%s) no permission, event: %s", tpl.UpdateBy, event.Hash) logger.Errorf("%s", err) return 0, err } tagsMap := make(map[string]string) for i := 0; i < len(event.TagsJSON); i++ { pair := strings.TrimSpace(event.TagsJSON[i]) if pair == "" { continue } arr := strings.SplitN(pair, "=", 2) if len(arr) != 2 { continue } tagsMap[arr[0]] = arr[1] } // 附加告警级别 告警触发值标签 tagsMap["alert_severity"] = strconv.Itoa(event.Severity) tagsMap["alert_trigger_value"] = event.TriggerValue tagsMap["is_recovered"] = strconv.FormatBool(event.IsRecovered) tags, err := json.Marshal(tagsMap) if err != nil { err = fmt.Errorf("event_callback_ibex: failed to marshal tags to json: %v, event: %s", tagsMap, event.Hash) logger.Errorf("%s", err) return 0, err } // call ibex taskArgs := tpl.Args if args != "" { taskArgs = args } in := models.TaskForm{ Title: tpl.Title + " FH: " + host, Account: tpl.Account, Batch: tpl.Batch, Tolerance: tpl.Tolerance, Timeout: tpl.Timeout, Pause: tpl.Pause, Script: tpl.Script, Args: taskArgs, Stdin: string(tags), Action: "start", Creator: tpl.UpdateBy, Hosts: []string{host}, AlertTriggered: true, } id, err = TaskAdd(in, tpl.UpdateBy, ctx.IsCenter) if err != nil { err = fmt.Errorf("event_callback_ibex: call ibex fail: %v, event: %s", err, event.Hash) logger.Errorf("%s", err) return 0, err } // write db record := models.TaskRecord{ Id: 
id, EventId: event.Id, GroupId: tpl.GroupId, Title: in.Title, Account: in.Account, Batch: in.Batch, Tolerance: in.Tolerance, Timeout: in.Timeout, Pause: in.Pause, Script: in.Script, Args: in.Args, CreateAt: time.Now().Unix(), CreateBy: in.Creator, } if err = record.Add(ctx); err != nil { err = fmt.Errorf("event_callback_ibex: persist task_record fail: %v, event: %s", err, event.Hash) logger.Errorf("%s", err) return id, err } return id, nil } func CanDoIbex(username string, tpl *models.TaskTpl, host string, targetCache *memsto.TargetCacheType, userCache *memsto.UserCacheType) (bool, error) { user := userCache.GetByUsername(username) if user != nil && user.IsAdmin() { return true, nil } target, has := targetCache.Get(host) if !has { return false, nil } return target.MatchGroupId(tpl.GroupId), nil } func TaskAdd(f models.TaskForm, authUser string, isCenter bool) (int64, error) { if storage.Cache == nil { logger.Warningf("event_callback_ibex: redis cache is nil, task: %+v", f) return 0, fmt.Errorf("redis cache is nil") } hosts := cleanHosts(f.Hosts) if len(hosts) == 0 { return 0, fmt.Errorf("arg(hosts) empty") } taskMeta := &imodels.TaskMeta{ Title: f.Title, Account: f.Account, Batch: f.Batch, Tolerance: f.Tolerance, Timeout: f.Timeout, Pause: f.Pause, Script: f.Script, Args: f.Args, Stdin: f.Stdin, Creator: f.Creator, } err := taskMeta.CleanFields() if err != nil { return 0, err } taskMeta.HandleFH(hosts[0]) // 任务类型分为"告警规则触发"和"n9e center用户下发"两种; // 边缘机房"告警规则触发"的任务不需要规划,并且它可能是失联的,无法使用db资源,所以放入redis缓存中,直接下发给agentd执行 if !isCenter && f.AlertTriggered { if err := taskMeta.Create(); err != nil { // 当网络不连通时,生成唯一的id,防止边缘机房中不同任务的id相同; // 方法是,redis自增id去防止同一个机房的不同n9e edge生成的id相同; // 但没法防止不同边缘机房生成同样的id,所以,生成id的数据不会上报存入数据库,只用于闭环执行。 taskMeta.Id, err = storage.IdGet() if err != nil { return 0, err } } taskHost := imodels.TaskHost{ Id: taskMeta.Id, Host: hosts[0], Status: "running", } if err = taskHost.Create(); err != nil { logger.Warningf("task_add_fail: authUser=%s title=%s 
// cleanHosts normalises a user-supplied host list: every entry is trimmed of
// surrounding whitespace, and entries that are empty or start with "#"
// (comment lines) after trimming are dropped. The result is never nil.
func cleanHosts(formHosts []string) []string {
	cleaned := make([]string, 0, len(formHosts))
	for _, raw := range formHosts {
		host := strings.TrimSpace(raw)
		if host == "" || strings.HasPrefix(host, "#") {
			continue
		}
		cleaned = append(cleaned, host)
	}
	return cleaned
}
"https://open.larksuite.com/open-apis/bot/v2/hook/" + token } urls = append(urls, url) tokens = append(tokens, token) } } return urls, tokens } ================================================ FILE: alert/sender/larkcard.go ================================================ package sender import ( "fmt" "html/template" "net/url" "strings" "github.com/ccfos/nightingale/v6/models" ) type LarkCardSender struct { tpl *template.Template } func (fs *LarkCardSender) CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } ats := ExtractAtsParams(ctx.CallBackURL) message := BuildTplMessage(models.LarkCard, fs.tpl, ctx.Events) if len(ats) > 0 { atTags := "" for _, at := range ats { if strings.Contains(at, "@") { atTags += fmt.Sprintf("", at) } else { atTags += fmt.Sprintf("", at) } } message = atTags + message } color := "red" lowerUnicode := strings.ToLower(message) if strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 { color = "orange" } else if strings.Count(lowerUnicode, Recovered) > 0 { color = "green" } SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName) body := createFeishuCardBody() body.Card.Header.Title.Content = SendTitle body.Card.Header.Template = color body.Card.Elements[0].Text.Content = message body.Card.Elements[2].Elements[0].Content = SendTitle // This is to be compatible with the Larkcard interface, if with query string parameters, the request will fail // Remove query parameters from the URL, parsedURL, err := url.Parse(ctx.CallBackURL) if err != nil { return } parsedURL.RawQuery = "" doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, "callback", ctx.Stats, ctx.Events) } func (fs *LarkCardSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } urls, tokens := fs.extract(ctx.Users) message := BuildTplMessage(models.LarkCard, fs.tpl, ctx.Events) color := "red" lowerUnicode := strings.ToLower(message) if 
strings.Count(lowerUnicode, Recovered) > 0 && strings.Count(lowerUnicode, Triggered) > 0 { color = "orange" } else if strings.Count(lowerUnicode, Recovered) > 0 { color = "green" } SendTitle := fmt.Sprintf("🔔 %s", ctx.Events[0].RuleName) body := createFeishuCardBody() body.Card.Header.Title.Content = SendTitle body.Card.Header.Template = color body.Card.Elements[0].Text.Content = message body.Card.Elements[2].Elements[0].Content = SendTitle for i, url := range urls { doSendAndRecord(ctx.Ctx, url, tokens[i], body, models.LarkCard, ctx.Stats, ctx.Events) } } func (fs *LarkCardSender) extract(users []*models.User) ([]string, []string) { urls := make([]string, 0, len(users)) tokens := make([]string, 0) for i := range users { if token, has := users[i].ExtractToken(models.Lark); has { url := token if !strings.HasPrefix(token, "https://") && !strings.HasPrefix(token, "http://") { url = "https://open.larksuite.com/open-apis/bot/v2/hook/" + strings.TrimSpace(token) } urls = append(urls, url) tokens = append(tokens, token) } } return urls, tokens } ================================================ FILE: alert/sender/mm.go ================================================ package sender import ( "html/template" "net/url" "strings" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/logger" ) type MatterMostMessage struct { Text string Tokens []string Stats *astats.Stats } type mm struct { Channel string `json:"channel"` Username string `json:"username"` Text string `json:"text"` } type MmSender struct { tpl *template.Template } func (ms *MmSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } urls := ms.extract(ctx.Users) if len(urls) == 0 { return } message := BuildTplMessage(models.Mm, ms.tpl, ctx.Events) SendMM(ctx.Ctx, MatterMostMessage{ Text: message, Tokens: urls, Stats: ctx.Stats, }, ctx.Events, models.Mm) } func (ms *MmSender) 
CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } message := BuildTplMessage(models.Mm, ms.tpl, ctx.Events) SendMM(ctx.Ctx, MatterMostMessage{ Text: message, Tokens: []string{ctx.CallBackURL}, Stats: ctx.Stats, }, ctx.Events, "callback") } func (ms *MmSender) extract(users []*models.User) []string { tokens := make([]string, 0, len(users)) for _, user := range users { if token, has := user.ExtractToken(models.Mm); has { tokens = append(tokens, token) } } return tokens } func SendMM(ctx *ctx.Context, message MatterMostMessage, events []*models.AlertCurEvent, channel string) { for i := 0; i < len(message.Tokens); i++ { u, err := url.Parse(message.Tokens[i]) if err != nil { logger.Errorf("mm_sender: failed to parse error=%v", err) NotifyRecord(ctx, events, 0, channel, message.Tokens[i], "", err) continue } v, err := url.ParseQuery(u.RawQuery) if err != nil { logger.Errorf("mm_sender: failed to parse query error=%v", err) } channels := v["channel"] // do not get txt := "" atuser := v["atuser"] if len(atuser) != 0 { txt = strings.Join(MapStrToStr(atuser, func(u string) string { return "@" + u }), ",") + "\n" } username := v.Get("username") if err != nil { logger.Errorf("mm_sender: failed to parse error=%v", err) } // simple concatenating ur := u.Scheme + "://" + u.Host + u.Path for _, channel := range channels { body := mm{ Channel: channel, Username: username, Text: txt + message.Text, } doSendAndRecord(ctx, ur, message.Tokens[i], body, channel, message.Stats, events) } } } func MapStrToStr(arr []string, fn func(s string) string) []string { var newArray = []string{} for _, it := range arr { newArray = append(newArray, fn(it)) } return newArray } ================================================ FILE: alert/sender/notify_record_queue.go ================================================ package sender import ( "errors" "time" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/models" 
"github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/container/list" "github.com/toolkits/pkg/logger" ) // 通知记录队列,最大长度 1000000 var NotifyRecordQueue = list.NewSafeListLimited(1000000) // 每秒上报通知记录队列大小 func ReportNotifyRecordQueueSize(stats *astats.Stats) { for { time.Sleep(time.Second) stats.GaugeNotifyRecordQueueSize.Set(float64(NotifyRecordQueue.Len())) } } // 推送通知记录到队列 // 若队列满 则返回 error func PushNotifyRecords(records []*models.NotificationRecord) error { for _, record := range records { if ok := NotifyRecordQueue.PushFront(record); !ok { logger.Warningf("notify record queue is full, record: %+v", record) return errors.New("notify record queue is full") } } return nil } type NotifyRecordConsumer struct { ctx *ctx.Context } func NewNotifyRecordConsumer(ctx *ctx.Context) *NotifyRecordConsumer { return &NotifyRecordConsumer{ ctx: ctx, } } // 消费通知记录队列 每 100ms 检测一次队列是否为空 func (c *NotifyRecordConsumer) LoopConsume() { duration := time.Duration(100) * time.Millisecond for { // 无论队列是否为空 都需要等待 time.Sleep(duration) inotis := NotifyRecordQueue.PopBackBy(100) if len(inotis) == 0 { continue } // 类型转换,不然 CreateInBatches 会报错 notis := make([]*models.NotificationRecord, 0, len(inotis)) for _, inoti := range inotis { notis = append(notis, inoti.(*models.NotificationRecord)) } c.consume(notis) } } func (c *NotifyRecordConsumer) consume(notis []*models.NotificationRecord) { if err := models.DB(c.ctx).CreateInBatches(notis, 100).Error; err != nil { logger.Errorf("add notis:%v failed, err: %v", notis, err) } } ================================================ FILE: alert/sender/plugin.go ================================================ package sender import ( "bytes" "fmt" "os" "os/exec" "time" "unicode/utf8" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/file" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/sys" ) func MayPluginNotify(ctx *ctx.Context, 
noticeBytes []byte, notifyScript models.NotifyScript, stats *astats.Stats, event *models.AlertCurEvent) { if len(noticeBytes) == 0 { return } alertingCallScript(ctx, noticeBytes, notifyScript, stats, event) } func alertingCallScript(ctx *ctx.Context, stdinBytes []byte, notifyScript models.NotifyScript, stats *astats.Stats, event *models.AlertCurEvent) { // not enable or no notify.py? do nothing config := notifyScript if !config.Enable || config.Content == "" { return } channel := "script" stats.AlertNotifyTotal.WithLabelValues(channel).Inc() fpath := ".notify_script" if config.Type == 1 { fpath = config.Content } else { rewrite := true if file.IsExist(fpath) { oldContent, err := file.ToString(fpath) if err != nil { logger.Errorf("event_script_notify_fail: read script file err: %v", err) stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc() return } if oldContent == config.Content { rewrite = false } } if rewrite { _, err := file.WriteString(fpath, config.Content) if err != nil { logger.Errorf("event_script_notify_fail: write script file err: %v", err) stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc() return } err = os.Chmod(fpath, 0777) if err != nil { logger.Errorf("event_script_notify_fail: chmod script file err: %v", err) stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc() return } } fpath = "./" + fpath } cmd := exec.Command(fpath) cmd.Stdin = bytes.NewReader(stdinBytes) // combine stdout and stderr var buf bytes.Buffer cmd.Stdout = &buf cmd.Stderr = &buf start := time.Now() err := startCmd(cmd) if err != nil { logger.Errorf("event_script_notify_fail: run cmd err: %v", err) return } err, isTimeout := sys.WrapTimeout(cmd, time.Duration(config.Timeout)*time.Second) res := buf.String() res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res) // 截断超出长度的输出 if len(res) > 512 { // 确保在有效的UTF-8字符边界处截断 validLen := 0 for i := 0; i < 512 && i < len(res); { _, size := 
utf8.DecodeRuneInString(res[i:]) if i+size > 512 { break } i += size validLen = i } res = res[:validLen] + "..." } NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, channel, cmd.String(), res, buildErr(err, isTimeout)) if isTimeout { if err == nil { logger.Errorf("event_script_notify_fail: timeout and killed process %s", fpath) } if err != nil { logger.Errorf("event_script_notify_fail: kill process %s occur error %v", fpath, err) stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc() } return } if err != nil { logger.Errorf("event_script_notify_fail: exec script %s occur error: %v, output: %s", fpath, err, res) stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc() return } logger.Infof("event_script_notify_ok: exec %s output: %s", fpath, res) } func buildErr(err error, isTimeout bool) error { if err == nil && !isTimeout { return nil } else { return fmt.Errorf("is_timeout: %v, err: %v", isTimeout, err) } } ================================================ FILE: alert/sender/plugin_cmd_unix.go ================================================ //go:build !windows // +build !windows package sender import ( "os/exec" "syscall" ) func startCmd(c *exec.Cmd) error { c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} return c.Start() } ================================================ FILE: alert/sender/plugin_cmd_windows.go ================================================ package sender import "os/exec" func startCmd(c *exec.Cmd) error { return c.Start() } ================================================ FILE: alert/sender/sender.go ================================================ package sender import ( "bytes" "html/template" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" ) type ( // Sender 发送消息通知的接口 Sender interface { Send(ctx MessageContext) } // MessageContext 一个event所生成的告警通知的上下文 
MessageContext struct { Users []*models.User Rule *models.AlertRule Events []*models.AlertCurEvent Stats *astats.Stats Ctx *ctx.Context } ) func NewSender(key string, tpls map[string]*template.Template, smtp ...aconf.SMTPConfig) Sender { switch key { case models.Dingtalk: return &DingtalkSender{tpl: tpls[models.Dingtalk]} case models.Wecom: return &WecomSender{tpl: tpls[models.Wecom]} case models.Feishu: return &FeishuSender{tpl: tpls[models.Feishu]} case models.FeishuCard: return &FeishuCardSender{tpl: tpls[models.FeishuCard]} case models.Email: return &EmailSender{subjectTpl: tpls[models.EmailSubject], contentTpl: tpls[models.Email], smtp: smtp[0]} case models.Mm: return &MmSender{tpl: tpls[models.Mm]} case models.Telegram: return &TelegramSender{tpl: tpls[models.Telegram]} case models.Lark: return &LarkSender{tpl: tpls[models.Lark]} case models.LarkCard: return &LarkCardSender{tpl: tpls[models.LarkCard]} } return nil } func BuildMessageContext(ctx *ctx.Context, rule *models.AlertRule, events []*models.AlertCurEvent, uids []int64, userCache *memsto.UserCacheType, stats *astats.Stats) MessageContext { users := userCache.GetByUserIds(uids) return MessageContext{ Rule: rule, Events: events, Users: users, Stats: stats, Ctx: ctx, } } type BuildTplMessageFunc func(channel string, tpl *template.Template, events []*models.AlertCurEvent) string var BuildTplMessage BuildTplMessageFunc = buildTplMessage func buildTplMessage(channel string, tpl *template.Template, events []*models.AlertCurEvent) string { if tpl == nil { return "tpl for current sender not found, please check configuration" } var content string for _, event := range events { var body bytes.Buffer if err := tpl.Execute(&body, event); err != nil { return err.Error() } content += body.String() + "\n\n" } return content } ================================================ FILE: alert/sender/telegram.go ================================================ package sender import ( "errors" "html/template" "strings" 
"github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/logger" ) type TelegramMessage struct { Text string Tokens []string Stats *astats.Stats } type telegram struct { ParseMode string `json:"parse_mode"` Text string `json:"text"` } var ( _ CallBacker = (*TelegramSender)(nil) ) type TelegramSender struct { tpl *template.Template } func (ts *TelegramSender) CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } message := BuildTplMessage(models.Telegram, ts.tpl, ctx.Events) SendTelegram(ctx.Ctx, TelegramMessage{ Text: message, Tokens: []string{ctx.CallBackURL}, Stats: ctx.Stats, }, ctx.Events, "callback") } func (ts *TelegramSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } tokens := ts.extract(ctx.Users) message := BuildTplMessage(models.Telegram, ts.tpl, ctx.Events) SendTelegram(ctx.Ctx, TelegramMessage{ Text: message, Tokens: tokens, Stats: ctx.Stats, }, ctx.Events, models.Telegram) } func (ts *TelegramSender) extract(users []*models.User) []string { tokens := make([]string, 0, len(users)) for _, user := range users { if token, has := user.ExtractToken(models.Telegram); has { tokens = append(tokens, token) } } return tokens } func SendTelegram(ctx *ctx.Context, message TelegramMessage, events []*models.AlertCurEvent, channel string) { for i := 0; i < len(message.Tokens); i++ { if !strings.Contains(message.Tokens[i], "/") && !strings.HasPrefix(message.Tokens[i], "https://") { logger.Errorf("telegram_sender: result=fail invalid token=%s", message.Tokens[i]) NotifyRecord(ctx, events, 0, channel, message.Tokens[i], "", errors.New("invalid token")) continue } var url string if strings.HasPrefix(message.Tokens[i], "https://") || strings.HasPrefix(message.Tokens[i], "http://") { url = message.Tokens[i] } else { array := strings.Split(message.Tokens[i], "/") if len(array) != 2 { 
// sendWebhook JSON-encodes event and POSTs it to webhook.Url using the cached
// client for that webhook configuration.
//
// It returns three values:
//   - needRetry: true when the caller should try again (request construction
//     failed, the HTTP call failed, or the server answered 429);
//   - res: "status_code:<code>, response:<body>" once a response was received,
//     otherwise "";
//   - err: the failure, or nil.
//
// A webhook that is disabled or has an empty URL yields (false, "", nil)
// without sending anything. The metric label is "callback" for webhooks of
// type models.RuleCallback and "webhook" otherwise.
//
// NOTE(review): only status 429 is reported as a failure; every other status
// code, including 4xx/5xx, is logged as success and returned with a nil
// error — confirm this is intended.
func sendWebhook(webhook *models.Webhook, event interface{}, stats *astats.Stats) (bool, string, error) {
	channel := "webhook"
	if webhook.Type == models.RuleCallback {
		channel = "callback"
	}

	conf := webhook
	if conf.Url == "" || !conf.Enable {
		return false, "", nil
	}
	bs, err := json.Marshal(event)
	if err != nil {
		logger.Errorf("%s alertingWebhook failed to marshal event err:%v", channel, err)
		return false, "", err
	}

	bf := bytes.NewBuffer(bs)

	req, err := http.NewRequest("POST", conf.Url, bf)
	if err != nil {
		logger.Warningf("%s alertingWebhook failed to new request event:%s err:%v", channel, string(bs), err)
		return true, "", err
	}

	req.Header.Set("Content-Type", "application/json")
	// Basic auth is only applied when both user and password are configured.
	if conf.BasicAuthUser != "" && conf.BasicAuthPass != "" {
		req.SetBasicAuth(conf.BasicAuthUser, conf.BasicAuthPass)
	}

	// Headers is a flat [name, value, name, value, ...] list; an odd-length
	// list is ignored entirely.
	if len(conf.Headers) > 0 && len(conf.Headers)%2 == 0 {
		for i := 0; i < len(conf.Headers); i += 2 {
			// The Host header has to be set on req.Host; net/http ignores it
			// in req.Header.
			if conf.Headers[i] == "host" || conf.Headers[i] == "Host" {
				req.Host = conf.Headers[i+1]
				continue
			}
			req.Header.Set(conf.Headers[i], conf.Headers[i+1])
		}
	}

	// Use the shared client cache so that a new Client (and its connections)
	// is not created, and leaked, on every request.
	client := getWebhookClient(conf)

	stats.AlertNotifyTotal.WithLabelValues(channel).Inc()
	var resp *http.Response
	var body []byte
	resp, err = client.Do(req)

	if err != nil {
		stats.AlertNotifyErrorTotal.WithLabelValues(channel).Inc()
		logger.Errorf("event_%s_fail, event:%s, url: [%s], error: [%s]", channel, string(bs), conf.Url, err)
		return true, "", err
	}

	if resp.Body != nil {
		defer resp.Body.Close()
		// The read error is ignored: the body is used for diagnostics only.
		body, _ = io.ReadAll(resp.Body)
	}

	// 429 (Too Many Requests) is the one status that asks the caller to retry.
	if resp.StatusCode == 429 {
		logger.Errorf("event_%s_fail, url: %s, response code: %d, body: %s event:%s", channel, conf.Url, resp.StatusCode, string(body), string(bs))
		return true, fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(body)), fmt.Errorf("status code is 429")
	}

	logger.Debugf("event_%s_succ, url: %s, response code: %d, body: %s event:%s", channel, conf.Url, resp.StatusCode, string(body), string(bs))
	return false, fmt.Sprintf("status_code:%d, response:%s", resp.StatusCode, string(body)), nil
}
for retryCount < 3 { start := time.Now() needRetry, res, err := sendWebhook(conf, event, stats) res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res) NotifyRecord(ctx, []*models.AlertCurEvent{event}, 0, "webhook", conf.Url, res, err) if !needRetry { break } retryCount++ time.Sleep(time.Minute * 1 * time.Duration(retryCount)) } } } func BatchSendWebhooks(ctx *ctx.Context, webhooks map[string]*models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) { for _, conf := range webhooks { logger.Infof("push event:%s to queue:%v", event.Hash, conf) PushEvent(ctx, conf, event, stats) } } var EventQueue = make(map[string]*WebhookQueue) var CallbackEventQueue = make(map[string]*WebhookQueue) var CallbackEventQueueLock sync.RWMutex var EventQueueLock sync.RWMutex const QueueMaxSize = 100000 type WebhookQueue struct { eventQueue *SafeEventQueue closeCh chan struct{} } func PushEvent(ctx *ctx.Context, webhook *models.Webhook, event *models.AlertCurEvent, stats *astats.Stats) { EventQueueLock.RLock() queue := EventQueue[webhook.Url] EventQueueLock.RUnlock() if queue == nil { queue = &WebhookQueue{ eventQueue: NewSafeEventQueue(QueueMaxSize), closeCh: make(chan struct{}), } EventQueueLock.Lock() EventQueue[webhook.Url] = queue EventQueueLock.Unlock() StartConsumer(ctx, queue, webhook.Batch, webhook, stats) } succ := queue.eventQueue.Push(event) if !succ { stats.AlertNotifyErrorTotal.WithLabelValues("push_event_queue").Inc() logger.Warningf("Write channel(%s) full, current channel size: %d event:%s", webhook.Url, queue.eventQueue.Len(), event.Hash) } } func StartConsumer(ctx *ctx.Context, queue *WebhookQueue, popSize int, webhook *models.Webhook, stats *astats.Stats) { for { select { case <-queue.closeCh: logger.Infof("event queue:%v closed", queue) return default: events := queue.eventQueue.PopN(popSize) if len(events) == 0 { time.Sleep(time.Millisecond * 400) continue } retryCount := 0 for 
retryCount < webhook.RetryCount { start := time.Now() needRetry, res, err := sendWebhook(webhook, events, stats) res = fmt.Sprintf("send_time: %s duration: %d ms %s", time.Now().Format("2006-01-02 15:04:05"), time.Since(start).Milliseconds(), res) go NotifyRecord(ctx, events, 0, "webhook", webhook.Url, res, err) if !needRetry { break } retryCount++ time.Sleep(time.Second * time.Duration(webhook.RetryInterval) * time.Duration(retryCount)) } } } } ================================================ FILE: alert/sender/webhook_event_queue.go ================================================ package sender import ( "container/list" "sync" "github.com/ccfos/nightingale/v6/models" ) type SafeEventQueue struct { lock sync.RWMutex maxSize int queueHigh *list.List queueMiddle *list.List queueLow *list.List } const ( High = 1 Middle = 2 Low = 3 ) func NewSafeEventQueue(maxSize int) *SafeEventQueue { return &SafeEventQueue{ maxSize: maxSize, lock: sync.RWMutex{}, queueHigh: list.New(), queueMiddle: list.New(), queueLow: list.New(), } } func (spq *SafeEventQueue) Len() int { spq.lock.RLock() defer spq.lock.RUnlock() return spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len() } // len 无锁读取长度,不要在本文件外调用 func (spq *SafeEventQueue) len() int { return spq.queueHigh.Len() + spq.queueMiddle.Len() + spq.queueLow.Len() } func (spq *SafeEventQueue) Push(event *models.AlertCurEvent) bool { spq.lock.Lock() defer spq.lock.Unlock() for spq.len() > spq.maxSize { return false } switch event.Severity { case High: spq.queueHigh.PushBack(event) case Middle: spq.queueMiddle.PushBack(event) case Low: spq.queueLow.PushBack(event) default: return false } return true } // pop 无锁弹出事件,不要在本文件外调用 func (spq *SafeEventQueue) pop() *models.AlertCurEvent { if spq.len() == 0 { return nil } var elem interface{} if spq.queueHigh.Len() > 0 { elem = spq.queueHigh.Remove(spq.queueHigh.Front()) } else if spq.queueMiddle.Len() > 0 { elem = spq.queueMiddle.Remove(spq.queueMiddle.Front()) } else { elem = 
spq.queueLow.Remove(spq.queueLow.Front()) } event, ok := elem.(*models.AlertCurEvent) if !ok { return nil } return event } func (spq *SafeEventQueue) Pop() *models.AlertCurEvent { spq.lock.Lock() defer spq.lock.Unlock() return spq.pop() } func (spq *SafeEventQueue) PopN(n int) []*models.AlertCurEvent { spq.lock.Lock() defer spq.lock.Unlock() events := make([]*models.AlertCurEvent, 0, n) count := 0 for count < n && spq.len() > 0 { event := spq.pop() if event != nil { events = append(events, event) } count++ } return events } ================================================ FILE: alert/sender/webhook_event_queue_test.go ================================================ package sender import ( "sync" "testing" "time" "github.com/ccfos/nightingale/v6/models" "github.com/stretchr/testify/assert" ) func TestSafePriorityQueue_ConcurrentPushPop(t *testing.T) { spq := NewSafeEventQueue(100000) var wg sync.WaitGroup numGoroutines := 100 numEvents := 1000 // 并发 Push wg.Add(numGoroutines) for i := 0; i < numGoroutines; i++ { go func(goroutineID int) { defer wg.Done() for j := 0; j < numEvents; j++ { event := &models.AlertCurEvent{ Severity: goroutineID%3 + 1, TriggerTime: time.Now().UnixNano(), } spq.Push(event) } }(i) } wg.Wait() // 检查队列长度是否正确 expectedLen := numGoroutines * numEvents assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes") // 并发 Pop wg.Add(numGoroutines) for i := 0; i < numGoroutines; i++ { go func() { defer wg.Done() for { event := spq.Pop() if event == nil { return } } }() } wg.Wait() // 最终队列应该为空 assert.Equal(t, 0, spq.Len(), "Queue should be empty after concurrent pops") } func TestSafePriorityQueue_ConcurrentPopMax(t *testing.T) { spq := NewSafeEventQueue(100000) // 添加初始数据 for i := 0; i < 1000; i++ { spq.Push(&models.AlertCurEvent{ Severity: i%3 + 1, TriggerTime: time.Now().UnixNano(), }) } var wg sync.WaitGroup numGoroutines := 10 popMax := 100 // 并发 PopN wg.Add(numGoroutines) for i := 0; i < numGoroutines; i++ { go func() 
{ defer wg.Done() events := spq.PopN(popMax) assert.LessOrEqual(t, len(events), popMax, "PopN exceeded maximum") }() } wg.Wait() // 检查队列长度是否正确 expectedRemaining := 1000 - (numGoroutines * popMax) if expectedRemaining < 0 { expectedRemaining = 0 } assert.Equal(t, expectedRemaining, spq.Len(), "Queue length mismatch after concurrent PopN") } func TestSafePriorityQueue_ConcurrentPushPopWithDifferentSeverities(t *testing.T) { spq := NewSafeEventQueue(100000) var wg sync.WaitGroup numGoroutines := 50 numEvents := 500 // 并发 Push 不同优先级的事件 wg.Add(numGoroutines) for i := 0; i < numGoroutines; i++ { go func(goroutineID int) { defer wg.Done() for j := 0; j < numEvents; j++ { event := &models.AlertCurEvent{ Severity: goroutineID%3 + 1, // 模拟不同的 Severity TriggerTime: time.Now().UnixNano(), } spq.Push(event) } }(i) } wg.Wait() // 检查队列长度是否正确 expectedLen := numGoroutines * numEvents assert.Equal(t, expectedLen, spq.Len(), "Queue length mismatch after concurrent pushes") // 检查事件的顺序是否按照优先级排列 var lastEvent *models.AlertCurEvent for spq.Len() > 0 { event := spq.Pop() if lastEvent != nil { assert.LessOrEqual(t, lastEvent.Severity, event.Severity, "Events are not in correct priority order") } lastEvent = event } } func TestSafePriorityQueue_ExceedMaxSize(t *testing.T) { spq := NewSafeEventQueue(5) // 插入超过最大容量的事件 for i := 0; i < 10; i++ { spq.Push(&models.AlertCurEvent{ Severity: i % 3, TriggerTime: int64(i), }) } // 验证队列的长度是否不超过 maxSize assert.LessOrEqual(t, spq.Len(), spq.maxSize) // 验证队列中剩余事件的内容 expectedEvents := 5 if spq.Len() < 5 { expectedEvents = spq.Len() } // 检查最后存入的事件是否是按优先级排序 for i := 0; i < expectedEvents; i++ { event := spq.Pop() if event != nil { assert.LessOrEqual(t, event.Severity, 2) } } } ================================================ FILE: alert/sender/webhook_queue.go ================================================ package sender import ( "container/list" "sync" "github.com/ccfos/nightingale/v6/models" ) type SafeList struct { sync.RWMutex L *list.List } func 
NewSafeList() *SafeList { return &SafeList{L: list.New()} } func (sl *SafeList) PushFront(v interface{}) *list.Element { sl.Lock() e := sl.L.PushFront(v) sl.Unlock() return e } func (sl *SafeList) PushFrontBatch(vs []interface{}) { sl.Lock() for _, item := range vs { sl.L.PushFront(item) } sl.Unlock() } func (sl *SafeList) PopBack(max int) []*models.AlertCurEvent { sl.Lock() count := sl.L.Len() if count == 0 { sl.Unlock() return []*models.AlertCurEvent{} } if count > max { count = max } items := make([]*models.AlertCurEvent, 0, count) for i := 0; i < count; i++ { item := sl.L.Remove(sl.L.Back()) sample, ok := item.(*models.AlertCurEvent) if ok { items = append(items, sample) } } sl.Unlock() return items } func (sl *SafeList) RemoveAll() { sl.Lock() sl.L.Init() sl.Unlock() } func (sl *SafeList) Len() int { sl.RLock() size := sl.L.Len() sl.RUnlock() return size } // SafeList with Limited Size type SafeListLimited struct { maxSize int SL *SafeList } func NewSafeListLimited(maxSize int) *SafeListLimited { return &SafeListLimited{SL: NewSafeList(), maxSize: maxSize} } func (sll *SafeListLimited) PopBack(max int) []*models.AlertCurEvent { return sll.SL.PopBack(max) } func (sll *SafeListLimited) PushFront(v interface{}) bool { if sll.SL.Len() >= sll.maxSize { return false } sll.SL.PushFront(v) return true } func (sll *SafeListLimited) PushFrontBatch(vs []interface{}) bool { if sll.SL.Len() >= sll.maxSize { return false } sll.SL.PushFrontBatch(vs) return true } func (sll *SafeListLimited) RemoveAll() { sll.SL.RemoveAll() } func (sll *SafeListLimited) Len() int { return sll.SL.Len() } ================================================ FILE: alert/sender/wecom.go ================================================ package sender import ( "html/template" "strings" "github.com/ccfos/nightingale/v6/models" ) type wecomMarkdown struct { Content string `json:"content"` } type wecom struct { Msgtype string `json:"msgtype"` Markdown wecomMarkdown `json:"markdown"` } var ( _ CallBacker = 
(*WecomSender)(nil) ) type WecomSender struct { tpl *template.Template } func (ws *WecomSender) CallBack(ctx CallBackContext) { if len(ctx.Events) == 0 || len(ctx.CallBackURL) == 0 { return } message := BuildTplMessage(models.Wecom, ws.tpl, ctx.Events) body := wecom{ Msgtype: "markdown", Markdown: wecomMarkdown{ Content: message, }, } doSendAndRecord(ctx.Ctx, ctx.CallBackURL, ctx.CallBackURL, body, "callback", ctx.Stats, ctx.Events) } func (ws *WecomSender) Send(ctx MessageContext) { if len(ctx.Users) == 0 || len(ctx.Events) == 0 { return } urls, tokens := ws.extract(ctx.Users) message := BuildTplMessage(models.Wecom, ws.tpl, ctx.Events) for i, url := range urls { body := wecom{ Msgtype: "markdown", Markdown: wecomMarkdown{ Content: message, }, } doSendAndRecord(ctx.Ctx, url, tokens[i], body, models.Wecom, ctx.Stats, ctx.Events) } } func (ws *WecomSender) extract(users []*models.User) ([]string, []string) { urls := make([]string, 0, len(users)) tokens := make([]string, 0, len(users)) for _, user := range users { if token, has := user.ExtractToken(models.Wecom); has { url := token if !strings.HasPrefix(token, "https://") && !strings.HasPrefix(token, "http://") { url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=" + token } urls = append(urls, url) tokens = append(tokens, token) } } return urls, tokens } ================================================ FILE: center/cconf/conf.go ================================================ package cconf import ( "time" "github.com/ccfos/nightingale/v6/pkg/httpx" ) type Center struct { Plugins []Plugin MetricsYamlFile string OpsYamlFile string BuiltinIntegrationsDir string I18NHeaderKey string MetricDesc MetricDescType AnonymousAccess AnonymousAccess UseFileAssets bool FlashDuty FlashDuty EventHistoryGroupView bool CleanNotifyRecordDay int CleanPipelineExecutionDay int MigrateBusiGroupLabel bool RSA httpx.RSAConfig } type Plugin struct { Id int64 `json:"id"` Category string `json:"category"` Type string 
`json:"plugin_type"` TypeName string `json:"plugin_type_name"` } type FlashDuty struct { Api string Headers map[string]string Timeout time.Duration } type AnonymousAccess struct { PromQuerier bool AlertDetail bool } func (c *Center) PreCheck() { if len(c.Plugins) == 0 { c.Plugins = Plugins } } ================================================ FILE: center/cconf/event_example.go ================================================ package cconf const EVENT_EXAMPLE = ` { "id": 1000000, "cate": "prometheus", "datasource_id": 1, "group_id": 1, "group_name": "Default Busi Group", "hash": "2cb966f9ba1cdc7af94c3796e855955a", "rule_id": 23, "rule_name": "测试告警", "rule_note": "测试告警", "rule_prod": "metric", "rule_config": { "queries": [ { "key": "all_hosts", "op": "==", "values": [] } ], "triggers": [ { "duration": 3, "percent": 10, "severity": 3, "type": "pct_target_miss" } ] }, "prom_for_duration": 60, "prom_eval_interval": 30, "callbacks": ["https://n9e.github.io"], "notify_recovered": 1, "notify_channels": ["dingtalk"], "notify_groups": [], "notify_groups_obj": null, "target_ident": "host01", "target_note": "机器备注", "trigger_time": 1677229517, "trigger_value": "2273533952", "tags": [ "__name__=disk_free", "dc=qcloud-dev", "device=vda1", "fstype=ext4", "ident=tt-fc-dev00.nj" ], "is_recovered": false, "notify_users_obj": null, "last_eval_time": 1677229517, "last_sent_time": 1677229517, "notify_cur_number": 1, "first_trigger_time": 1677229517, "annotations": { "summary": "测试告警" } } ` ================================================ FILE: center/cconf/metric.go ================================================ package cconf import ( "path" "github.com/toolkits/pkg/file" ) // metricDesc , As load map happens before read map, there is no necessary to use concurrent map for metric desc store type MetricDescType struct { CommonDesc map[string]string `yaml:",inline" json:"common"` Zh map[string]string `yaml:"zh" json:"zh"` En map[string]string `yaml:"en" json:"en"` } var MetricDesc 
MetricDescType // GetMetricDesc , if metric is not registered, empty string will be returned func GetMetricDesc(lang, metric string) string { var m map[string]string switch lang { case "en": m = MetricDesc.En default: m = MetricDesc.Zh } if m != nil { if desc, ok := m[metric]; ok { return desc } } if MetricDesc.CommonDesc != nil { if desc, ok := MetricDesc.CommonDesc[metric]; ok { return desc } } return "" } func LoadMetricsYaml(configDir, metricsYamlFile string) error { fp := metricsYamlFile if fp == "" { fp = path.Join(configDir, "metrics.yaml") } if !file.IsExist(fp) { return nil } return file.ReadYaml(fp, &MetricDesc) } ================================================ FILE: center/cconf/ops.go ================================================ package cconf import ( "fmt" "path" "github.com/toolkits/pkg/file" "gopkg.in/yaml.v2" ) var Operations = Operation{} type Operation struct { Ops []Ops `yaml:"ops"` } type Ops struct { Name string `yaml:"name" json:"name"` Cname string `yaml:"cname" json:"cname"` Ops []SingleOp `yaml:"ops" json:"ops"` } // SingleOp Name 为 op 名称;Cname 为展示名称,默认英文 type SingleOp struct { Name string `yaml:"name" json:"name"` Cname string `yaml:"cname" json:"cname"` } func TransformNames(name []string, nameToName map[string]string) []string { var ret []string for _, n := range name { if v, has := nameToName[n]; has { ret = append(ret, v) } } return ret } func LoadOpsYaml(configDir string, opsYamlFile string) error { fp := opsYamlFile if fp == "" { fp = path.Join(configDir, "ops.yaml") } if !file.IsExist(fp) { return nil } hash, _ := file.MD5(fp) if hash == "2f91a9ed265cf2024e266dc1d538ee77" { // ops.yaml 是老的默认文件,删除 file.Remove(fp) return nil } return file.ReadYaml(fp, &Operations) } func GetAllOps(ops []Ops) []SingleOp { var ret []SingleOp for _, op := range ops { ret = append(ret, op.Ops...) 
} return ret } func MergeOperationConf() error { var opsBuiltIn Operation err := yaml.Unmarshal([]byte(builtInOps), &opsBuiltIn) if err != nil { return fmt.Errorf("cannot parse builtInOps: %s", err.Error()) } configOpsMap := make(map[string]struct{}) for _, op := range Operations.Ops { configOpsMap[op.Name] = struct{}{} } //If the opBu.Name is not a constant in the target (Operations.Ops), add Ops from the built-in options for _, opBu := range opsBuiltIn.Ops { if _, has := configOpsMap[opBu.Name]; !has { Operations.Ops = append(Operations.Ops, opBu) } } return nil } const ( builtInOps = ` ops: - name: Infrastructure cname: Infrastructure ops: - name: /targets cname: Host - View - name: /targets/put cname: Host - Modify - name: /targets/del cname: Host - Delete - name: /targets/bind cname: Host - Bind Uncategorized - name: Explorer cname: Explorer ops: - name: /metric/explorer cname: Metrics Explorer - name: /object/explorer cname: Quick View - name: /metrics-built-in cname: Built-in Metric - View - name: /builtin-metrics/add cname: Built-in Metric - Add - name: /builtin-metrics/put cname: Built-in Metric - Modify - name: /builtin-metrics/del cname: Built-in Metric - Delete - name: /recording-rules cname: Recording Rule - View - name: /recording-rules/add cname: Recording Rule - Add - name: /recording-rules/put cname: Recording Rule - Modify - name: /recording-rules/del cname: Recording Rule - Delete - name: /log/explorer cname: Logs Explorer - name: /log/index-patterns # 前端有个管理索引模式的页面,所以需要一个权限点来控制,后面应该改成侧拉板 cname: Index Pattern - View - name: /log/index-patterns/add cname: Index Pattern - Add - name: /log/index-patterns/put cname: Index Pattern - Modify - name: /log/index-patterns/del cname: Index Pattern - Delete - name: /dashboards cname: Dashboard - View - name: /dashboards/add cname: Dashboard - Add - name: /dashboards/put cname: Dashboard - Modify - name: /dashboards/del cname: Dashboard - Delete - name: /public-dashboards cname: Dashboard - View Public - 
name: alerting cname: Alerting ops: - name: /alert-rules cname: Alerting Rule - View - name: /alert-rules/add cname: Alerting Rule - Add - name: /alert-rules/put cname: Alerting Rule - Modify - name: /alert-rules/del cname: Alerting Rule - Delete - name: /alert-mutes cname: Mutting Rule - View - name: /alert-mutes/add cname: Mutting Rule - Add - name: /alert-mutes/put cname: Mutting Rule - Modify - name: /alert-mutes/del cname: Mutting Rule - Delete - name: /alert-subscribes cname: Subscribing Rule - View - name: /alert-subscribes/add cname: Subscribing Rule - Add - name: /alert-subscribes/put cname: Subscribing Rule - Modify - name: /alert-subscribes/del cname: Subscribing Rule - Delete - name: /job-tpls cname: Self-healing-Script - View - name: /job-tpls/add cname: Self-healing-Script - Add - name: /job-tpls/put cname: Self-healing-Script - Modify - name: /job-tpls/del cname: Self-healing-Script - Delete - name: /job-tasks cname: Self-healing-Job - View - name: /job-tasks/add cname: Self-healing-Job - Add - name: /job-tasks/put cname: Self-healing-Job - Modify - name: /alert-cur-events cname: Active Event - View - name: /alert-cur-events/del cname: Active Event - Delete - name: /alert-his-events cname: Historical Event - View - name: Notification cname: Notification ops: - name: /notification-rules cname: Notification Rule - View - name: /notification-rules/add cname: Notification Rule - Add - name: /notification-rules/put cname: Notification Rule - Modify - name: /notification-rules/del cname: Notification Rule - Delete - name: /notification-channels cname: Media Type - View - name: /notification-channels/add cname: Media Type - Add - name: /notification-channels/put cname: Media Type - Modify - name: /notification-channels/del cname: Media Type - Delete - name: /notification-templates cname: Message Template - View - name: /notification-templates/add cname: Message Template - Add - name: /notification-templates/put cname: Message Template - Modify - name: 
/notification-templates/del cname: Message Template - Delete - name: /event-pipelines cname: Event Pipeline - View - name: /event-pipelines/add cname: Event Pipeline - Add - name: /event-pipelines/put cname: Event Pipeline - Modify - name: /event-pipelines/del cname: Event Pipeline - Delete - name: /help/notification-settings # 用于控制老版本的通知设置菜单是否展示 cname: Notification Settings - View - name: /help/notification-tpls # 用于控制老版本的通知模板菜单是否展示 cname: Notification Templates - View - name: Integrations cname: Integrations ops: - name: /datasources # 用于控制能否看到数据源列表页面的菜单。只有 Admin 才能修改、删除数据源 cname: Data Source - View - name: /components cname: Component - View - name: /components/add cname: Component - Add - name: /components/put cname: Component - Modify - name: /components/del cname: Component - Delete - name: /embedded-products cname: Embedded Product - View - name: /embedded-product/add cname: Embedded Product - Add - name: /embedded-product/put cname: Embedded Product - Modify - name: /embedded-product/delete cname: Embedded Product - Delete - name: Organization cname: Organization ops: - name: /users cname: User - View - name: /users/add cname: User - Add - name: /users/put cname: User - Modify - name: /users/del cname: User - Delete - name: /user-groups cname: Team - View - name: /user-groups/add cname: Team - Add - name: /user-groups/put cname: Team - Modify - name: /user-groups/del cname: Team - Delete - name: /busi-groups cname: Business Group - View - name: /busi-groups/add cname: Business Group - Add - name: /busi-groups/put cname: Business Group - Modify - name: /busi-groups/del cname: Business Group - Delete - name: /roles cname: Role - View - name: /roles/add cname: Role - Add - name: /roles/put cname: Role - Modify - name: /roles/del cname: Role - Delete - name: System Settings cname: System Settings ops: - name: /system/site-settings # 仅用于控制能否展示菜单,只有 Admin 才能修改、删除 cname: View Site Settings - name: /system/variable-settings cname: View Variable Settings - name: 
/system/sso-settings cname: View SSO Settings - name: /system/alerting-engines cname: View Alerting Engines - name: /system/version cname: View Product Version `
)

================================================
FILE: center/cconf/plugin.go
================================================
package cconf

// Plugins is the built-in list of data source plugins.
var Plugins = []Plugin{
	{
		Id:       1,
		Category: "timeseries",
		Type:     "prometheus",
		TypeName: "Prometheus Like",
	},
	{
		Id:       2,
		Category: "logging",
		Type:     "elasticsearch",
		TypeName: "Elasticsearch",
	},
	{
		Id:       3,
		Category: "loki",
		Type:     "loki",
		TypeName: "Loki",
	},
	{
		Id:       4,
		Category: "timeseries",
		Type:     "tdengine",
		TypeName: "TDengine",
	},
	{
		Id:       5,
		Category: "logging",
		Type:     "ck",
		TypeName: "ClickHouse",
	},
	{
		Id:       6,
		Category: "timeseries",
		Type:     "mysql",
		TypeName: "MySQL",
	},
	{
		Id:       7,
		Category: "timeseries",
		Type:     "pgsql",
		TypeName: "PostgreSQL",
	},
	{
		Id:       8,
		Category: "logging",
		Type:     "doris",
		TypeName: "Doris",
	},
	{
		Id:       9,
		Category: "logging",
		Type:     "opensearch",
		TypeName: "OpenSearch",
	},
	{
		Id:       10,
		Category: "logging",
		Type:     "victorialogs",
		TypeName: "VictoriaLogs",
	},
}

================================================
FILE: center/cconf/rsa/rsa_conf.go
================================================
package rsa

import (
	"os"

	"github.com/ccfos/nightingale/v6/models"
	"github.com/ccfos/nightingale/v6/pkg/ctx"
	"github.com/ccfos/nightingale/v6/pkg/httpx"
	"github.com/ccfos/nightingale/v6/pkg/secu"

	"github.com/pkg/errors"
	"github.com/toolkits/pkg/file"
	"github.com/toolkits/pkg/logger"
)

// InitRSAConfig fills rsaConfig with an RSA password and key pair, trying
// three sources in order:
//  1. the values stored in the configs table, used only when the password,
//     private key and public key are all non-empty;
//  2. the key files at RSAPrivateKeyPath and RSAPublicKeyPath, when both
//     exist;
//  3. a freshly generated key pair, which is also saved to the configs table.
func InitRSAConfig(ctx *ctx.Context, rsaConfig *httpx.RSAConfig) error {

	// 1. Load RSA keys from the database.
	rsaPassWord, err := models.ConfigsGet(ctx, models.RSA_PASSWORD)
	if err != nil {
		return errors.WithMessagef(err, "cannot query config(%s)", models.RSA_PASSWORD)
	}
	privateKeyVal, err := models.ConfigsGet(ctx, models.RSA_PRIVATE_KEY)
	if err != nil {
		return errors.WithMessagef(err, "cannot query config(%s)", models.RSA_PRIVATE_KEY)
	}
	publicKeyVal, err := models.ConfigsGet(ctx, models.RSA_PUBLIC_KEY)
	if err != nil {
		return errors.WithMessagef(err, "cannot query config(%s)", models.RSA_PUBLIC_KEY)
	}
	if rsaPassWord != "" && privateKeyVal != "" && publicKeyVal != "" {
		rsaConfig.RSAPassWord = rsaPassWord
		rsaConfig.RSAPrivateKey = []byte(privateKeyVal)
		rsaConfig.RSAPublicKey = []byte(publicKeyVal)
		return nil
	}

	// 2. Read the RSA configuration from files if they exist.
	if file.IsExist(rsaConfig.RSAPrivateKeyPath) && file.IsExist(rsaConfig.RSAPublicKeyPath) {
		// The password was already read from the config, so only the keys
		// are loaded here.
		rsaConfig.RSAPrivateKey, rsaConfig.RSAPublicKey, err = readConfigFile(rsaConfig)
		if err != nil {
			return errors.WithMessage(err, "failed to read rsa config from file")
		}
		return nil
	}
	// 3. Generate RSA keys if none exist.
	rsaConfig.RSAPassWord, rsaConfig.RSAPrivateKey, rsaConfig.RSAPublicKey, err = initRSAKeyPairs(ctx, rsaConfig.RSAPassWord)
	if err != nil {
		return errors.WithMessage(err, "failed to generate rsa key pair")
	}
	return nil
}

// initRSAKeyPairs generates a key pair protected by a password and saves the
// keys to the configs table. A non-empty rsaPassWord is reused and saved;
// otherwise a password is created through models.InitRSAPassWord.
//
// The results are named and returned with bare returns, so on error the
// values assigned so far are returned together with err.
func initRSAKeyPairs(ctx *ctx.Context, rsaPassWord string) (password string, privateByte, publicByte []byte, err error) {

	// Generate RSA keys

	// Pick the RSA password.
	if rsaPassWord != "" {
		logger.Debug("Using existing RSA password")
		password = rsaPassWord
		err = models.ConfigsSet(ctx, models.RSA_PASSWORD, password)
		if err != nil {
			err = errors.WithMessagef(err, "failed to set config(%s)", models.RSA_PASSWORD)
			return
		}
	} else {
		password, err = models.InitRSAPassWord(ctx)
		if err != nil {
			err = errors.WithMessage(err, "failed to generate rsa password")
			return
		}
	}
	privateByte, publicByte, err = secu.GenerateRsaKeyPair(password)
	if err != nil {
		err = errors.WithMessage(err, "failed to generate rsa key pair")
		return
	}
	// Save the generated RSA keys.
	err = models.ConfigsSet(ctx, models.RSA_PRIVATE_KEY, string(privateByte))
	if err != nil {
		err = errors.WithMessagef(err, "failed to set config(%s)", models.RSA_PRIVATE_KEY)
		return
	}
	err = models.ConfigsSet(ctx, models.RSA_PUBLIC_KEY, string(publicByte))
	if err != nil {
		err = errors.WithMessagef(err, "failed to set config(%s)", models.RSA_PUBLIC_KEY)
		return
	}
	return
}

// readConfigFile reads the public and then the private key file named in
// rsaConfig. If the private key cannot be read, the already-read public key
// is still returned alongside the error.
func readConfigFile(rsaConfig *httpx.RSAConfig) (privateBuf, publicBuf []byte, err error) {
	publicBuf, err = os.ReadFile(rsaConfig.RSAPublicKeyPath)
	if err != nil {
		err = errors.WithMessagef(err, "could not read RSAPublicKeyPath %q", rsaConfig.RSAPublicKeyPath)
		return
	}
	privateBuf, err = os.ReadFile(rsaConfig.RSAPrivateKeyPath)
	if err != nil {
		err = errors.WithMessagef(err, "could not read RSAPrivateKeyPath %q", rsaConfig.RSAPrivateKeyPath)
	}
	return
}

================================================
FILE: center/cconf/sql_tpl.go
================================================
package cconf

// TDengineSQLTpl maps a metric key to a TDengine SQL template. The templates
// contain $-prefixed placeholders ($database, $server, $from, $to, $interval,
// $netif, $mountpoint); the substitution itself is not part of this file.
var TDengineSQLTpl = map[string]string{
	"load5":                "SELECT _wstart as ts, last(load5) FROM $database.system WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) fill(null)",
	"process_total":        "SELECT _wstart as ts, last(total) FROM $database.processes WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) fill(null)",
	"thread_total":         "SELECT _wstart as ts, last(total) FROM $database.threads WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) fill(null)",
	"cpu_idle":             "SELECT _wstart as ts, last(usage_idle) * -1 + 100 FROM $database.cpu WHERE (host = '$server' and cpu = 'cpu-total') and _ts >= $from and _ts <= $to interval($interval) fill(null)",
	"mem_used_percent":     "SELECT _wstart as ts, last(used_percent) FROM $database.mem WHERE (host = '$server') and _ts >= $from and _ts <= $to interval($interval) fill(null)",
	"disk_used_percent":    "SELECT _wstart as ts, last(used_percent) FROM $database.disk WHERE (host = '$server' and path = '/') and _ts >= $from and _ts <= $to interval($interval) fill(null)",
	"cpu_context_switches": "select ts, derivative(context_switches, 1s, 0) as context FROM (SELECT _wstart as ts, avg(context_switches) as context_switches FROM $database.kernel WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval) )",
	"tcp":                  "SELECT _wstart as ts, avg(tcp_close) as CLOSED, avg(tcp_close_wait) as CLOSE_WAIT, avg(tcp_closing) as CLOSING, avg(tcp_established) as ESTABLISHED, avg(tcp_fin_wait1) as FIN_WAIT1, avg(tcp_fin_wait2) as FIN_WAIT2, avg(tcp_last_ack) as LAST_ACK, avg(tcp_syn_recv) as SYN_RECV, avg(tcp_syn_sent) as SYN_SENT, avg(tcp_time_wait) as TIME_WAIT FROM $database.netstat WHERE host = '$server' and _ts >= $from and _ts <= $to interval($interval)",
	"net_bytes_recv":       "SELECT _wstart as ts, derivative(bytes_recv,1s, 1) as bytes_in FROM $database.net WHERE host = '$server' and interface = '$netif' and _ts >= $from and _ts <= $to group by tbname",
	"net_bytes_sent":       "SELECT _wstart as ts, derivative(bytes_sent,1s, 1) as bytes_out FROM $database.net WHERE host = '$server' and interface = '$netif' and _ts >= $from and _ts <= $to group by tbname",
	"disk_total":           "SELECT _wstart as ts, avg(total) AS total, avg(used) as used FROM $database.disk WHERE path = '$mountpoint' and _ts >= $from and _ts <= $to interval($interval) group by host",
}

================================================
FILE: center/center.go
================================================
package center

import (
	"context"
	"encoding/json"
	"fmt"

	"github.com/ccfos/nightingale/v6/dscache"
	"github.com/toolkits/pkg/logger"

	"github.com/ccfos/nightingale/v6/alert"
	"github.com/ccfos/nightingale/v6/alert/astats"
	"github.com/ccfos/nightingale/v6/alert/dispatch"
	"github.com/ccfos/nightingale/v6/alert/process"
	alertrt "github.com/ccfos/nightingale/v6/alert/router"
	"github.com/ccfos/nightingale/v6/center/cconf"
	"github.com/ccfos/nightingale/v6/center/cconf/rsa"
	"github.com/ccfos/nightingale/v6/center/integration"
	"github.com/ccfos/nightingale/v6/center/metas"
	centerrt "github.com/ccfos/nightingale/v6/center/router"
	"github.com/ccfos/nightingale/v6/center/sso"
	"github.com/ccfos/nightingale/v6/conf"
	"github.com/ccfos/nightingale/v6/cron"
	"github.com/ccfos/nightingale/v6/dumper"
	"github.com/ccfos/nightingale/v6/memsto"
	"github.com/ccfos/nightingale/v6/models"
	"github.com/ccfos/nightingale/v6/models/migrate"
	"github.com/ccfos/nightingale/v6/pkg/ctx"
	"github.com/ccfos/nightingale/v6/pkg/flashduty"
	"github.com/ccfos/nightingale/v6/pkg/httpx"
	"github.com/ccfos/nightingale/v6/pkg/i18nx"
	"github.com/ccfos/nightingale/v6/pkg/logx"
	"github.com/ccfos/nightingale/v6/pkg/macros"
	"github.com/ccfos/nightingale/v6/pkg/version"
	"github.com/ccfos/nightingale/v6/prom"
	"github.com/ccfos/nightingale/v6/pushgw/idents"
	pushgwrt "github.com/ccfos/nightingale/v6/pushgw/router"
	"github.com/ccfos/nightingale/v6/pushgw/writer"
	"github.com/ccfos/nightingale/v6/storage"

	"github.com/flashcatcloud/ibex/src/cmd/ibex"
)

// Initialize loads the configuration from configDir and starts the center
// process: logging, database and Redis connections, in-memory caches, the
// alert engine and the HTTP routers. It returns a cleanup function that
// stops logging and the HTTP server.
//
// NOTE(review): the errors returned by LoadMetricsYaml, LoadOpsYaml and
// MergeOperationConf are discarded below, so a malformed metrics.yaml or
// ops.yaml is silently ignored; confirm that this is intended.
func Initialize(configDir string, cryptoKey string) (func(), error) {
	config, err := conf.InitConfig(configDir, cryptoKey)
	if err != nil {
		return nil, fmt.Errorf("failed to init config: %v", err)
	}

	cconf.LoadMetricsYaml(configDir, config.Center.MetricsYamlFile)
	cconf.LoadOpsYaml(configDir, config.Center.OpsYamlFile)
	cconf.MergeOperationConf()

	// Default the alert engine name when none is configured.
	if config.Alert.Heartbeat.EngineName == "" {
		config.Alert.Heartbeat.EngineName = "default"
	}

	logxClean, err := logx.Init(config.Log)
	if err != nil {
		return nil, err
	}

	i18nx.Init(configDir)
	flashduty.Init(config.Center.FlashDuty)

	db, err := storage.New(config.DB)
	if err != nil {
		return nil, err
	}
	ctx := ctx.NewContext(context.Background(), db, true)
	migrate.Migrate(db)
	isRootInit := models.InitRoot(ctx)

	config.HTTP.JWTAuth.SigningKey = models.InitJWTSigningKey(ctx)

	err = rsa.InitRSAConfig(ctx, &config.HTTP.RSA)
	if err != nil {
		return nil, err
	}

	go integration.Init(ctx, config.Center.BuiltinIntegrationsDir)
	var redis storage.Redis
	redis, err = storage.NewRedis(config.Redis)
	if err != nil {
		return nil, err
	}

	metas := metas.New(redis)
	idents := idents.New(ctx, redis, config.Pushgw)

	syncStats := memsto.NewSyncStats()
	alertStats := astats.NewSyncStats()

	if config.Center.MigrateBusiGroupLabel || models.CanMigrateBg(ctx) {
		models.MigrateBg(ctx,
config.Pushgw.BusiGroupLabelKey)
	}
	if models.CanMigrateEP(ctx) {
		models.MigrateEP(ctx)
	}

	// Initialize siteUrl, setting a default value when it is empty.
	InitSiteUrl(ctx, config.Alert.Heartbeat.IP, config.HTTP.Port)

	configCache := memsto.NewConfigCache(ctx, syncStats, config.HTTP.RSA.RSAPrivateKey, config.HTTP.RSA.RSAPassWord)
	busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
	targetCache := memsto.NewTargetCache(ctx, syncStats, redis)
	dsCache := memsto.NewDatasourceCache(ctx, syncStats)
	alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
	alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
	notifyConfigCache := memsto.NewNotifyConfigCache(ctx, configCache)
	userCache := memsto.NewUserCache(ctx, syncStats)
	userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
	taskTplCache := memsto.NewTaskTplCache(ctx)
	configCvalCache := memsto.NewCvalCache(ctx, syncStats)
	notifyRuleCache := memsto.NewNotifyRuleCache(ctx, syncStats)
	notifyChannelCache := memsto.NewNotifyChannelCache(ctx, syncStats)
	messageTemplateCache := memsto.NewMessageTemplateCache(ctx, syncStats)
	userTokenCache := memsto.NewUserTokenCache(ctx, syncStats)

	sso := sso.Init(config.Center, ctx, configCache)
	promClients := prom.NewPromClient(ctx)

	dispatch.InitRegisterQueryFunc(promClients)

	externalProcessors := process.NewExternalProcessors()

	macros.RegisterMacro(macros.MacroInVain)
	dscache.Init(ctx, false)
	alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, taskTplCache, dsCache, ctx, promClients, userCache, userGroupCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, configCvalCache)

	writers := writer.NewWriters(config.Pushgw)

	// Background jobs started with the process.
	go version.GetGithubVersion()

	go cron.CleanNotifyRecord(ctx, config.Center.CleanNotifyRecordDay)
	go cron.CleanPipelineExecution(ctx, config.Center.CleanPipelineExecutionDay)

	alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors, config.Log.Dir)
	centerRouter := centerrt.New(config.HTTP, config.Center, config.Alert, config.Ibex,
		cconf.Operations, dsCache, notifyConfigCache, promClients,
		redis, sso, ctx, metas, idents, targetCache, userCache, userGroupCache, userTokenCache, config.Log.Dir)
	pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, config.Alert, targetCache, busiGroupCache, idents, metas, writers, ctx)

	// All three routers are mounted on the same gin engine.
	r := httpx.GinEngine(config.Global.RunMode, config.HTTP, configCvalCache.PrintBodyPaths, configCvalCache.PrintAccessLog)

	centerRouter.Config(r)
	alertrtRouter.Config(r)
	pushgwRouter.Config(r)
	dumper.ConfigRouter(r)

	if config.Ibex.Enable {
		migrate.MigrateIbexTables(db)
		ibex.ServerStart(true, db, redis, config.HTTP.APIForService.BasicAuth, config.Alert.Heartbeat, &config.CenterApi, r, centerRouter, config.Ibex, config.HTTP.Port)
	}

	httpClean := httpx.Init(config.HTTP, r)

	fmt.Printf("please view n9e at http://%v:%v\n", config.Alert.Heartbeat.IP, config.HTTP.Port)
	// Print the default credentials only when InitRoot reported that it
	// initialized the root account.
	if isRootInit {
		fmt.Println("username/password: root/root.2020")
	}

	return func() {
		logxClean()
		httpClean()
	}, nil
}

// InitSiteUrl initializes site_url inside the site_info config. When
// site_info does not exist, or exists with an empty site_url, site_url is set
// to "http://<serverIP>:<serverPort>"; an existing non-empty site_url is left
// untouched. Failures are logged and not returned.
//
// NOTE(review): an existing site_info is decoded into memsto.SiteInfo and
// written back from that struct; confirm that memsto.SiteInfo declares every
// field stored in site_info, otherwise undeclared fields would be lost.
func InitSiteUrl(ctx *ctx.Context, serverIP string, serverPort int) {
	// Build the default SiteUrl.
	defaultSiteUrl := fmt.Sprintf("http://%s:%d", serverIP, serverPort)

	// Fetch the existing site_info config.
	siteInfoStr, err := models.ConfigsGet(ctx, "site_info")
	if err != nil {
		logger.Errorf("failed to get site_info config: %v", err)
		return
	}

	// site_info does not exist: create a new one.
	if siteInfoStr == "" {
		newSiteInfo := memsto.SiteInfo{
			SiteUrl: defaultSiteUrl,
		}
		siteInfoBytes, err := json.Marshal(newSiteInfo)
		if err != nil {
			logger.Errorf("failed to marshal site_info: %v", err)
			return
		}

		err = models.ConfigsSet(ctx, "site_info", string(siteInfoBytes))
		if err != nil {
			logger.Errorf("failed to set site_info: %v", err)
			return
		}

		logger.Infof("initialized site_url with default value: %s", defaultSiteUrl)
		return
	}

	// Check the site_url field of the existing site_info.
	var existingSiteInfo memsto.SiteInfo
	err = json.Unmarshal([]byte(siteInfoStr), &existingSiteInfo)
	if err != nil {
		logger.Errorf("failed to unmarshal site_info: %v", err)
		return
	}

	// site_url already has a value: nothing to initialize.
	if existingSiteInfo.SiteUrl != "" {
		return
	}

	// Set site_url.
	existingSiteInfo.SiteUrl = defaultSiteUrl

	siteInfoBytes, err := json.Marshal(existingSiteInfo)
	if err != nil {
		logger.Errorf("failed to marshal updated site_info: %v", err)
		return
	}

	err = models.ConfigsSet(ctx, "site_info", string(siteInfoBytes))
	if err != nil {
		logger.Errorf("failed to update site_info: %v", err)
		return
	}

	logger.Infof("initialized site_url with default value: %s", defaultSiteUrl)
}

================================================
FILE: center/cstats/stats.go
================================================
package cstats

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Namespace and subsystem shared by every metric declared in this file.
const (
	namespace = "n9e"
	subsystem = "center"
)

var (
	// uptime is a counter for the HTTP service uptime.
	uptime = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "uptime",
			Help:      "HTTP service uptime.",
		},
	)

	// RequestDuration is a histogram of HTTP request latencies in seconds,
	// labelled by status code, path and method.
	RequestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Buckets:   prometheus.DefBuckets,
			Name:      "http_request_duration_seconds",
			Help:      "HTTP request latencies in seconds.",
		}, []string{"code", "path", "method"},
	)

	// RedisOperationLatency is a histogram of Redis operation latencies in
	// seconds, labelled by operation and status.
	RedisOperationLatency = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "redis_operation_latency_seconds",
			Help:      "Histogram of latencies for Redis operations",
			Buckets:   []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5},
		},
		[]string{"operation", "status"},
	)
)

func init() {
	// Register the uptime counter and both histograms with Prometheus's
	// default registry, then start the uptime recorder.
	prometheus.MustRegister(
		uptime,
		RequestDuration,
		RedisOperationLatency,
	)

	go recordUptime()
}

// recordUptime increases service uptime per second.
func recordUptime() { for range time.Tick(time.Second) { uptime.Inc() } } ================================================ FILE: center/integration/init.go ================================================ package integration import ( "encoding/json" "path" "sort" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/pkg/errors" "github.com/toolkits/pkg/container/set" "github.com/toolkits/pkg/file" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/runner" ) const SYSTEM = "system" var BuiltinPayloadInFile *BuiltinPayloadInFileType type BuiltinPayloadInFileType struct { Data map[uint64]map[string]map[string][]*models.BuiltinPayload // map[component_id]map[type]map[cate][]*models.BuiltinPayload IndexData map[int64]*models.BuiltinPayload // map[uuid]payload BuiltinMetrics map[string]*models.BuiltinMetric } func Init(ctx *ctx.Context, builtinIntegrationsDir string) { BuiltinPayloadInFile = NewBuiltinPayloadInFileType() err := models.InitBuiltinPayloads(ctx) if err != nil { logger.Warning("init old builtinPayloads fail ", err) return } if res, err := models.ConfigsSelectByCkey(ctx, "disable_integration_init"); err != nil { logger.Error("fail to get value 'disable_integration_init' from configs", err) return } else if len(res) != 0 { logger.Info("disable_integration_init is set, skip integration init") return } fp := builtinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } // var fileList []string dirList, err := file.DirsUnder(fp) if err != nil { logger.Warning("read builtin component dir fail ", err) return } for _, dir := range dirList { // components icon componentDir := fp + "/" + dir component := models.BuiltinComponent{ Ident: dir, } // get logo name // /api/n9e/integrations/icon/AliYun/aliyun.png files, err := file.FilesUnder(componentDir + "/icon") if err == nil && len(files) > 0 { component.Logo = "/api/n9e/integrations/icon/" + component.Ident + "/" + files[0] } else if 
err != nil { logger.Warningf("read builtin component icon dir fail %s %v", component.Ident, err) } // get description files, err = file.FilesUnder(componentDir + "/markdown") if err == nil && len(files) > 0 { var readmeFile string for _, file := range files { if strings.HasSuffix(strings.ToLower(file), "md") { readmeFile = componentDir + "/markdown/" + file break } } if readmeFile != "" { component.Readme, _ = file.ReadString(readmeFile) } } else if err != nil { logger.Warningf("read builtin component markdown dir fail %s %v", component.Ident, err) } exists, _ := models.BuiltinComponentExists(ctx, &component) if !exists { err = component.Add(ctx, SYSTEM) if err != nil { logger.Warning("add builtin component fail ", component, err) continue } } else { old, err := models.BuiltinComponentGet(ctx, "ident = ?", component.Ident) if err != nil { logger.Warning("get builtin component fail ", component, err) continue } if old == nil { logger.Warning("get builtin component nil ", component) continue } if old.UpdatedBy == SYSTEM { now := time.Now().Unix() old.CreatedAt = now old.UpdatedAt = now old.Readme = component.Readme old.UpdatedBy = SYSTEM err = models.DB(ctx).Model(old).Select("*").Updates(old).Error if err != nil { logger.Warning("update builtin component fail ", old, err) } } component.ID = old.ID } // delete uuid is empty err = models.DB(ctx).Exec("delete from builtin_payloads where uuid = 0 and type != 'collect' and (updated_by = 'system' or updated_by = '')").Error if err != nil { logger.Warning("delete builtin payloads fail ", err) } // delete builtin metrics uuid is empty err = models.DB(ctx).Exec("delete from builtin_metrics where uuid = 0 and (updated_by = 'system' or updated_by = '')").Error if err != nil { logger.Warning("delete builtin metrics fail ", err) } // 删除 uuid%1000 不为 0 uuid > 1000000000000000000 且 type 为 dashboard 的记录 err = models.DB(ctx).Exec("delete from builtin_payloads where uuid%1000 != 0 and uuid > 1000000000000000000 and type = 'dashboard' 
and updated_by = 'system'").Error if err != nil { logger.Warning("delete builtin payloads fail ", err) } // alerts files, err = file.FilesUnder(componentDir + "/alerts") if err == nil && len(files) > 0 { for _, f := range files { fp := componentDir + "/alerts/" + f bs, err := file.ReadBytes(fp) if err != nil { logger.Warning("read builtin component alerts file fail ", f, err) continue } alerts := []models.AlertRule{} err = json.Unmarshal(bs, &alerts) if err != nil { logger.Warning("parse builtin component alerts file fail ", f, err) continue } newAlerts := []models.AlertRule{} for _, alert := range alerts { if alert.UUID == 0 { time.Sleep(time.Microsecond) alert.UUID = time.Now().UnixMicro() } newAlerts = append(newAlerts, alert) content, err := json.Marshal(alert) if err != nil { logger.Warning("marshal builtin alert fail ", alert, err) continue } cate := strings.Replace(f, ".json", "", -1) builtinAlert := models.BuiltinPayload{ ComponentID: component.ID, Type: "alert", Cate: cate, Name: alert.Name, Tags: alert.AppendTags, Content: string(content), UUID: alert.UUID, ID: alert.UUID, CreatedBy: SYSTEM, UpdatedBy: SYSTEM, } BuiltinPayloadInFile.AddBuiltinPayload(&builtinAlert) } } } // dashboards files, err = file.FilesUnder(componentDir + "/dashboards") if err == nil && len(files) > 0 { for _, f := range files { fp := componentDir + "/dashboards/" + f bs, err := file.ReadBytes(fp) if err != nil { logger.Warning("read builtin component dashboards file fail ", f, err) continue } dashboard := BuiltinBoard{} err = json.Unmarshal(bs, &dashboard) if err != nil { logger.Warning("parse builtin component dashboards file fail ", f, err) continue } if dashboard.UUID == 0 { time.Sleep(time.Microsecond) dashboard.UUID = time.Now().UnixMicro() // 补全文件中的 uuid bs, err = json.MarshalIndent(dashboard, "", " ") if err != nil { logger.Warning("marshal builtin dashboard fail ", dashboard, err) continue } _, err = file.WriteBytes(fp, bs) if err != nil { logger.Warning("write builtin 
dashboard file fail ", f, err) } } content, err := json.Marshal(dashboard) if err != nil { logger.Warning("marshal builtin dashboard fail ", dashboard, err) continue } builtinDashboard := models.BuiltinPayload{ ComponentID: component.ID, Type: "dashboard", Cate: "", Name: dashboard.Name, Tags: dashboard.Tags, Note: dashboard.Note, Content: string(content), UUID: dashboard.UUID, ID: dashboard.UUID, CreatedBy: SYSTEM, UpdatedBy: SYSTEM, } BuiltinPayloadInFile.AddBuiltinPayload(&builtinDashboard) } } else if err != nil { logger.Warningf("read builtin component dash dir fail %s %v", component.Ident, err) } // metrics files, err = file.FilesUnder(componentDir + "/metrics") if err == nil && len(files) > 0 { for _, f := range files { fp := componentDir + "/metrics/" + f bs, err := file.ReadBytes(fp) if err != nil { logger.Warning("read builtin component metrics file fail", f, err) continue } metrics := []models.BuiltinMetric{} err = json.Unmarshal(bs, &metrics) if err != nil { logger.Warning("parse builtin component metrics file fail", f, err) continue } for _, metric := range metrics { time.Sleep(time.Microsecond) metric.UUID = time.Now().UnixMicro() metric.ID = metric.UUID metric.CreatedBy = SYSTEM metric.UpdatedBy = SYSTEM BuiltinPayloadInFile.BuiltinMetrics[metric.Expression] = &metric } } } else if err != nil { logger.Warningf("read builtin component metrics dir fail %s %v", component.Ident, err) } } } type BuiltinBoard struct { Id int64 `json:"id" gorm:"primaryKey"` GroupId int64 `json:"group_id"` Name string `json:"name"` Ident string `json:"ident"` Tags string `json:"tags"` Note string `json:"note"` CreateAt int64 `json:"create_at"` CreateBy string `json:"create_by"` UpdateAt int64 `json:"update_at"` UpdateBy string `json:"update_by"` Configs interface{} `json:"configs" gorm:"-"` Public int `json:"public"` // 0: false, 1: true PublicCate int `json:"public_cate"` // 0: anonymous, 1: login, 2: busi Bgids []int64 `json:"bgids" gorm:"-"` BuiltIn int `json:"built_in"` 
// 0: false, 1: true Hide int `json:"hide"` // 0: false, 1: true UUID int64 `json:"uuid"` } func NewBuiltinPayloadInFileType() *BuiltinPayloadInFileType { return &BuiltinPayloadInFileType{ Data: make(map[uint64]map[string]map[string][]*models.BuiltinPayload), IndexData: make(map[int64]*models.BuiltinPayload), BuiltinMetrics: make(map[string]*models.BuiltinMetric), } } func (b *BuiltinPayloadInFileType) AddBuiltinPayload(bp *models.BuiltinPayload) { if _, exists := b.Data[bp.ComponentID]; !exists { b.Data[bp.ComponentID] = make(map[string]map[string][]*models.BuiltinPayload) } bpInType := b.Data[bp.ComponentID] if _, exists := bpInType[bp.Type]; !exists { bpInType[bp.Type] = make(map[string][]*models.BuiltinPayload) } bpInCate := bpInType[bp.Type] if _, exists := bpInCate[bp.Cate]; !exists { bpInCate[bp.Cate] = make([]*models.BuiltinPayload, 0) } bpInCate[bp.Cate] = append(bpInCate[bp.Cate], bp) b.IndexData[bp.UUID] = bp } func (b *BuiltinPayloadInFileType) GetComponentIdentByCate(typ, cate string) string { for _, source := range b.Data { if source == nil { continue } typeMap, exists := source[typ] if !exists { continue } payloads, exists := typeMap[cate] if !exists { continue } if len(payloads) > 0 { return payloads[0].Component } } return "" } func (b *BuiltinPayloadInFileType) GetBuiltinPayload(typ, cate, query string, componentId uint64) ([]*models.BuiltinPayload, error) { var result []*models.BuiltinPayload source := b.Data[componentId] if source == nil { return nil, nil } typeMap, exists := source[typ] if !exists { return nil, nil } if cate != "" { payloads, exists := typeMap[cate] if !exists { return nil, nil } result = append(result, filterByQuery(payloads, query)...) } else { for _, payloads := range typeMap { result = append(result, filterByQuery(payloads, query)...) 
} } if len(result) > 0 { sort.Slice(result, func(i, j int) bool { return result[i].Name < result[j].Name }) } return result, nil } func (b *BuiltinPayloadInFileType) GetBuiltinPayloadCates(typ string, componentId uint64) ([]string, error) { var result []string source := b.Data[componentId] if source == nil { return result, nil } typeData := source[typ] if typeData == nil { return result, nil } for cate := range typeData { result = append(result, cate) } sort.Strings(result) return result, nil } func filterByQuery(payloads []*models.BuiltinPayload, query string) []*models.BuiltinPayload { if query == "" { return payloads } queryLower := strings.ToLower(query) var filtered []*models.BuiltinPayload for _, p := range payloads { if strings.Contains(strings.ToLower(p.Name), queryLower) || strings.Contains(strings.ToLower(p.Tags), queryLower) { filtered = append(filtered, p) } } return filtered } func (b *BuiltinPayloadInFileType) BuiltinMetricGets(metricsInDB []*models.BuiltinMetric, lang, collector, typ, query, unit string, limit, offset int) ([]*models.BuiltinMetric, int, error) { var filteredMetrics []*models.BuiltinMetric expressionSet := set.NewStringSet() builtinMetricsByDB := convertBuiltinMetricByDB(metricsInDB) builtinMetricsMap := make(map[string]*models.BuiltinMetric) for expression, metric := range builtinMetricsByDB { builtinMetricsMap[expression] = metric } for expression, metric := range b.BuiltinMetrics { builtinMetricsMap[expression] = metric } for _, metric := range builtinMetricsMap { if !applyFilter(metric, collector, typ, query, unit) { continue } // Skip if expression is already in db cache // NOTE: 忽略重复的expression,特别的,在旧版本中,用户可能已经创建了重复的metrics,需要覆盖掉ByFile中相同的Metrics // NOTE: Ignore duplicate expressions, especially in the old version, users may have created duplicate metrics, if expressionSet.Exists(metric.Expression) { continue } // Add db expression in set. 
expressionSet.Add(metric.Expression) // Apply language trans, err := getTranslationWithLanguage(metric, lang) if err != nil { logger.Errorf("Error getting translation for metric %s: %v", metric.Name, err) continue // Skip if translation not found } metric.Name = trans.Name metric.Note = trans.Note filteredMetrics = append(filteredMetrics, metric) } // Sort metrics sort.Slice(filteredMetrics, func(i, j int) bool { if filteredMetrics[i].Collector != filteredMetrics[j].Collector { return filteredMetrics[i].Collector < filteredMetrics[j].Collector } if filteredMetrics[i].Typ != filteredMetrics[j].Typ { return filteredMetrics[i].Typ < filteredMetrics[j].Typ } return filteredMetrics[i].Expression < filteredMetrics[j].Expression }) totalCount := len(filteredMetrics) // Validate parameters if offset < 0 { offset = 0 } if limit < 0 { limit = 0 } // Handle edge cases if offset >= totalCount || limit == 0 { return []*models.BuiltinMetric{}, totalCount, nil } // Apply pagination end := offset + limit if end > totalCount { end = totalCount } return filteredMetrics[offset:end], totalCount, nil } func (b *BuiltinPayloadInFileType) BuiltinMetricTypes(lang, collector, query string) []string { typeSet := set.NewStringSet() for _, metric := range b.BuiltinMetrics { if !applyFilter(metric, collector, "", query, "") { continue } typeSet.Add(metric.Typ) } return typeSet.ToSlice() } func (b *BuiltinPayloadInFileType) BuiltinMetricCollectors(lang, typ, query string) []string { collectorSet := set.NewStringSet() for _, metric := range b.BuiltinMetrics { if !applyFilter(metric, "", typ, query, "") { continue } collectorSet.Add(metric.Collector) } return collectorSet.ToSlice() } func applyFilter(metric *models.BuiltinMetric, collector, typ, query, unit string) bool { if collector != "" && collector != metric.Collector { return false } if typ != "" && typ != metric.Typ { return false } if unit != "" && !containsUnit(unit, metric.Unit) { return false } if query != "" && 
!applyQueryFilter(metric, query) { return false } return true } func containsUnit(unit, metricUnit string) bool { us := strings.Split(unit, ",") for _, u := range us { if u == metricUnit { return true } } return false } func applyQueryFilter(metric *models.BuiltinMetric, query string) bool { qs := strings.Split(query, " ") for _, q := range qs { if strings.HasPrefix(q, "-") { q = strings.TrimPrefix(q, "-") if strings.Contains(metric.Name, q) || strings.Contains(metric.Note, q) || strings.Contains(metric.Expression, q) { return false } } else { if !strings.Contains(metric.Name, q) && !strings.Contains(metric.Note, q) && !strings.Contains(metric.Expression, q) { return false } } } return true } func getTranslationWithLanguage(bm *models.BuiltinMetric, lang string) (*models.Translation, error) { var defaultTranslation *models.Translation for _, t := range bm.Translation { if t.Lang == lang { return &t, nil } if t.Lang == "en_US" { defaultTranslation = &t } } if defaultTranslation != nil { return defaultTranslation, nil } return nil, errors.Errorf("translation not found for metric %s", bm.Name) } func convertBuiltinMetricByDB(metricsInDB []*models.BuiltinMetric) map[string]*models.BuiltinMetric { builtinMetricsByDB := make(map[string]*models.BuiltinMetric) builtinMetricsByDBList := make(map[string][]*models.BuiltinMetric) for _, metric := range metricsInDB { builtinMetrics, ok := builtinMetricsByDBList[metric.Expression] if !ok { builtinMetrics = []*models.BuiltinMetric{} } builtinMetrics = append(builtinMetrics, metric) builtinMetricsByDBList[metric.Expression] = builtinMetrics } for expression, builtinMetrics := range builtinMetricsByDBList { if len(builtinMetrics) == 0 { continue } // NOTE: 为兼容旧版本用户已经创建的 metrics,同时将修改 metrics 收敛到同一个记录上, // 我们选择使用 expression 相同但是 id 最小的 metric 记录作为主要的 Metric。 sort.Slice(builtinMetrics, func(i, j int) bool { return builtinMetrics[i].ID < builtinMetrics[j].ID }) currentBuiltinMetric := builtinMetrics[0] // User has no customized 
translation, so we can merge it if len(currentBuiltinMetric.Translation) == 0 { translationMap := make(map[string]models.Translation) for _, bm := range builtinMetrics { for _, t := range getDefaultTranslation(bm) { translationMap[t.Lang] = t } } currentBuiltinMetric.Translation = make([]models.Translation, 0, len(translationMap)) for _, t := range translationMap { currentBuiltinMetric.Translation = append(currentBuiltinMetric.Translation, t) } } builtinMetricsByDB[expression] = currentBuiltinMetric } return builtinMetricsByDB } func getDefaultTranslation(bm *models.BuiltinMetric) []models.Translation { if len(bm.Translation) != 0 { return bm.Translation } return []models.Translation{{ Lang: bm.Lang, Name: bm.Name, Note: bm.Note, }} } ================================================ FILE: center/metas/metas.go ================================================ package metas import ( "context" "encoding/json" "sync" "time" "github.com/ccfos/nightingale/v6/center/cstats" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/storage" "github.com/toolkits/pkg/logger" ) type Set struct { sync.RWMutex items map[string]models.HostMeta redis storage.Redis } func New(redis storage.Redis) *Set { set := &Set{ items: make(map[string]models.HostMeta), redis: redis, } set.Init() return set } func (s *Set) Init() { go s.LoopPersist() } func (s *Set) MSet(items map[string]models.HostMeta) { s.Lock() defer s.Unlock() for ident, meta := range items { s.items[ident] = meta } } func (s *Set) Set(ident string, meta models.HostMeta) { s.Lock() defer s.Unlock() s.items[ident] = meta } func (s *Set) LoopPersist() { for { time.Sleep(time.Second) s.persist() } } func (s *Set) persist() { var items map[string]models.HostMeta s.Lock() if len(s.items) == 0 { s.Unlock() return } items = s.items s.items = make(map[string]models.HostMeta) s.Unlock() s.updateMeta(items) } func (s *Set) updateMeta(items map[string]models.HostMeta) { m := make(map[string]models.HostMeta, 100) num 
:= 0 for _, meta := range items { m[meta.Hostname] = meta num++ if num == 100 { if err := s.updateTargets(m); err != nil { logger.Errorf("failed to update targets: %v", err) } m = make(map[string]models.HostMeta, 100) num = 0 } } if err := s.updateTargets(m); err != nil { logger.Errorf("failed to update targets: %v", err) } } func (s *Set) updateTargets(m map[string]models.HostMeta) error { if s.redis == nil { logger.Warningf("redis is nil") return nil } count := int64(len(m)) if count == 0 { return nil } newMap := make(map[string]interface{}, count) extendMap := make(map[string]interface{}) for ident, meta := range m { if meta.ExtendInfo != nil { extendMeta := meta.ExtendInfo meta.ExtendInfo = make(map[string]interface{}) extendMetaStr, err := json.Marshal(extendMeta) if err != nil { return err } extendMap[models.WrapExtendIdent(ident)] = extendMetaStr } newMap[models.WrapIdent(ident)] = meta } start := time.Now() err := storage.MSet(context.Background(), s.redis, newMap, 7*24*time.Hour) if err != nil { cstats.RedisOperationLatency.WithLabelValues("mset_target_meta", "fail").Observe(time.Since(start).Seconds()) return err } else { cstats.RedisOperationLatency.WithLabelValues("mset_target_meta", "success").Observe(time.Since(start).Seconds()) } if len(extendMap) > 0 { err = storage.MSet(context.Background(), s.redis, extendMap, 7*24*time.Hour) if err != nil { cstats.RedisOperationLatency.WithLabelValues("mset_target_extend", "fail").Observe(time.Since(start).Seconds()) return err } else { cstats.RedisOperationLatency.WithLabelValues("mset_target_extend", "success").Observe(time.Since(start).Seconds()) } } return err } ================================================ FILE: center/router/router.go ================================================ package router import ( "fmt" "net/http" "path" "runtime" "strings" "time" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/center/cstats" 
"github.com/ccfos/nightingale/v6/center/metas"
	"github.com/ccfos/nightingale/v6/center/sso"
	"github.com/ccfos/nightingale/v6/conf"
	_ "github.com/ccfos/nightingale/v6/front/statik"
	"github.com/ccfos/nightingale/v6/memsto"
	"github.com/ccfos/nightingale/v6/models"
	"github.com/ccfos/nightingale/v6/pkg/aop"
	"github.com/ccfos/nightingale/v6/pkg/ctx"
	"github.com/ccfos/nightingale/v6/pkg/httpx"
	"github.com/ccfos/nightingale/v6/pkg/version"
	"github.com/ccfos/nightingale/v6/prom"
	"github.com/ccfos/nightingale/v6/pushgw/idents"
	"github.com/ccfos/nightingale/v6/storage"

	"github.com/ccfos/nightingale/v6/pkg/ginx"
	"gorm.io/gorm"

	"github.com/gin-gonic/gin"
	"github.com/rakyll/statik/fs"
	"github.com/toolkits/pkg/logger"
	"github.com/toolkits/pkg/runner"
)

// Router bundles the configuration, caches, clients and hooks that the center
// HTTP handlers work with.
type Router struct {
	HTTP              httpx.Config
	Center            cconf.Center
	Ibex              conf.Ibex
	Alert             aconf.Alert
	Operations        cconf.Operation
	DatasourceCache   *memsto.DatasourceCacheType
	NotifyConfigCache *memsto.NotifyConfigCacheType
	PromClients       *prom.PromClientMap
	Redis             storage.Redis
	MetaSet           *metas.Set
	IdentSet          *idents.Set
	TargetCache       *memsto.TargetCacheType
	Sso               *sso.SsoClient
	UserCache         *memsto.UserCacheType
	UserGroupCache    *memsto.UserGroupCacheType
	UserTokenCache    *memsto.UserTokenCacheType
	Ctx               *ctx.Context
	LogDir            string

	// Hooks; New installs no-op implementations for all three.
	HeartbeatHook       HeartbeatHookFunc
	TargetDeleteHook    models.TargetDeleteHookFunc
	AlertRuleModifyHook AlertRuleModifyHookFunc
}

// New assembles a Router from its dependencies. The three hooks are set to
// no-ops (HeartbeatHook returns nil, TargetDeleteHook returns nil,
// AlertRuleModifyHook does nothing).
func New(httpConfig httpx.Config, center cconf.Center, alert aconf.Alert, ibex conf.Ibex,
	operations cconf.Operation, ds *memsto.DatasourceCacheType, ncc *memsto.NotifyConfigCacheType,
	pc *prom.PromClientMap, redis storage.Redis,
	sso *sso.SsoClient, ctx *ctx.Context, metaSet *metas.Set, idents *idents.Set,
	tc *memsto.TargetCacheType, uc *memsto.UserCacheType, ugc *memsto.UserGroupCacheType, utc *memsto.UserTokenCacheType, logDir string) *Router {
	return &Router{
		HTTP:                httpConfig,
		Center:              center,
		Alert:               alert,
		Ibex:                ibex,
		Operations:          operations,
		DatasourceCache:     ds,
		NotifyConfigCache:   ncc,
		PromClients:         pc,
		Redis:               redis,
		MetaSet:             metaSet,
		IdentSet:            idents,
		TargetCache:         tc,
		Sso:                 sso,
		UserCache:           uc,
		UserGroupCache:      ugc,
		UserTokenCache:      utc,
		Ctx:                 ctx,
		LogDir:              logDir,
		HeartbeatHook:       func(ident string) map[string]interface{} { return nil },
		TargetDeleteHook:    func(tx *gorm.DB, idents []string) error { return nil },
		AlertRuleModifyHook: func(ar *models.AlertRule) {},
	}
}

// stat returns a middleware that times each request and records the duration
// in cstats.RequestDuration, labelled with status code, route path and method.
func stat() gin.HandlerFunc {
	return func(c *gin.Context) {
		begin := time.Now()
		c.Next()

		// Labels are read after the handlers ran so the final status is used.
		cstats.RequestDuration.
			WithLabelValues(fmt.Sprintf("%d", c.Writer.Status()), c.FullPath(), c.Request.Method).
			Observe(time.Since(begin).Seconds())
	}
}

// languageDetector returns a middleware that derives the X-Language request
// header from the header named i18NHeaderKey. Values starting with "zh_HK"
// map to zh_HK, other "zh" values to zh_CN, "en" values to en, anything else
// is passed through, and a missing value defaults to zh_CN. When
// i18NHeaderKey is empty the request is left untouched.
func languageDetector(i18NHeaderKey string) gin.HandlerFunc {
	return func(c *gin.Context) {
		if i18NHeaderKey == "" {
			c.Next()
			return
		}

		requested := c.GetHeader(i18NHeaderKey)

		var resolved string
		switch {
		case requested == "":
			resolved = "zh_CN"
		case strings.HasPrefix(requested, "zh_HK"):
			// Must be tested before the generic "zh" prefix.
			resolved = "zh_HK"
		case strings.HasPrefix(requested, "zh"):
			resolved = "zh_CN"
		case strings.HasPrefix(requested, "en"):
			resolved = "en"
		default:
			resolved = requested
		}

		c.Request.Header.Set("X-Language", resolved)
		c.Next()
	}
}

// configNoRoute installs the fallback handler for unmatched routes: requests
// whose path ends in a known static-asset suffix are served as files, either
// from the embedded file system or, when UseFileAssets is set, from the "pub"
// directory under the working directory.
func (rt *Router) configNoRoute(r *gin.Engine, fs *http.FileSystem) {
	r.NoRoute(func(c *gin.Context) {
		arr := strings.Split(c.Request.URL.Path, ".")
		suffix := arr[len(arr)-1]

		switch suffix {
		case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "zip", "map", "ttf", "md":
			if !rt.Center.UseFileAssets {
				c.FileFromFS(c.Request.URL.Path, *fs)
			} else {
				cwdarr := []string{"/"}
				if runtime.GOOS == "windows" {
					cwdarr[0] = ""
				}
				cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...)
				cwdarr = append(cwdarr, "pub")
				cwdarr = append(cwdarr, strings.Split(c.Request.URL.Path, "/")...)
c.File(path.Join(cwdarr...)) } default: if !rt.Center.UseFileAssets { c.FileFromFS("/", *fs) } else { cwdarr := []string{"/"} if runtime.GOOS == "windows" { cwdarr[0] = "" } cwdarr = append(cwdarr, strings.Split(runner.Cwd, "/")...) cwdarr = append(cwdarr, "pub") cwdarr = append(cwdarr, "index.html") c.File(path.Join(cwdarr...)) } } }) } func (rt *Router) Config(r *gin.Engine) { r.Use(stat()) r.Use(languageDetector(rt.Center.I18NHeaderKey)) r.Use(aop.Recovery()) statikFS, err := fs.New() if err != nil { logger.Errorf("cannot create statik fs: %v", err) } if !rt.Center.UseFileAssets { r.StaticFS("/pub", statikFS) } pagesPrefix := "/api/n9e" pages := r.Group(pagesPrefix) { pages.DELETE("/datasource/series", rt.auth(), rt.admin(), rt.deleteDatasourceSeries) if rt.Center.AnonymousAccess.PromQuerier { pages.Any("/proxy/:id/*url", rt.dsProxy) pages.POST("/query-range-batch", rt.promBatchQueryRange) pages.POST("/query-instant-batch", rt.promBatchQueryInstant) pages.GET("/datasource/brief", rt.datasourceBriefs) pages.POST("/datasource/query", rt.datasourceQuery) pages.POST("/ds-query", rt.QueryData) pages.POST("/logs-query", rt.QueryLogV2) pages.POST("/tdengine-databases", rt.tdengineDatabases) pages.POST("/tdengine-tables", rt.tdengineTables) pages.POST("/tdengine-columns", rt.tdengineColumns) pages.POST("/log-query-batch", rt.QueryLogBatch) // 数据库元数据接口 pages.POST("/db-databases", rt.ShowDatabases) pages.POST("/db-tables", rt.ShowTables) pages.POST("/db-desc-table", rt.DescribeTable) // es 专用接口 pages.POST("/indices", rt.auth(), rt.user(), rt.QueryIndices) pages.POST("/es-variable", rt.auth(), rt.user(), rt.QueryESVariable) pages.POST("/fields", rt.auth(), rt.user(), rt.QueryFields) pages.POST("/log-query", rt.auth(), rt.user(), rt.QueryLog) } else { pages.Any("/proxy/:id/*url", rt.auth(), rt.dsProxy) pages.POST("/query-range-batch", rt.auth(), rt.promBatchQueryRange) pages.POST("/query-instant-batch", rt.auth(), rt.promBatchQueryInstant) pages.GET("/datasource/brief", 
rt.auth(), rt.user(), rt.datasourceBriefs) pages.POST("/datasource/query", rt.auth(), rt.user(), rt.datasourceQuery) pages.POST("/ds-query", rt.auth(), rt.user(), rt.QueryData) pages.POST("/logs-query", rt.auth(), rt.user(), rt.QueryLogV2) pages.POST("/tdengine-databases", rt.auth(), rt.tdengineDatabases) pages.POST("/tdengine-tables", rt.auth(), rt.tdengineTables) pages.POST("/tdengine-columns", rt.auth(), rt.tdengineColumns) pages.POST("/log-query-batch", rt.auth(), rt.user(), rt.QueryLogBatch) // 数据库元数据接口 pages.POST("/db-databases", rt.auth(), rt.user(), rt.ShowDatabases) pages.POST("/db-tables", rt.auth(), rt.user(), rt.ShowTables) pages.POST("/db-desc-table", rt.auth(), rt.user(), rt.DescribeTable) // es 专用接口 pages.POST("/indices", rt.auth(), rt.user(), rt.QueryIndices) pages.POST("/es-variable", rt.QueryESVariable) pages.POST("/fields", rt.QueryFields) pages.POST("/log-query", rt.QueryLog) } // OpenSearch 专用接口 pages.POST("/os-indices", rt.QueryOSIndices) pages.POST("/os-variable", rt.QueryOSVariable) pages.POST("/os-fields", rt.QueryOSFields) pages.GET("/sql-template", rt.QuerySqlTemplate) pages.POST("/auth/login", rt.jwtMock(), rt.loginPost) pages.POST("/auth/logout", rt.jwtMock(), rt.auth(), rt.user(), rt.logoutPost) pages.POST("/auth/refresh", rt.jwtMock(), rt.refreshPost) pages.POST("/auth/captcha", rt.jwtMock(), rt.generateCaptcha) pages.POST("/auth/captcha-verify", rt.jwtMock(), rt.captchaVerify) pages.GET("/auth/ifshowcaptcha", rt.ifShowCaptcha) pages.GET("/auth/sso-config", rt.ssoConfigNameGet) pages.GET("/auth/rsa-config", rt.rsaConfigGet) pages.GET("/auth/redirect", rt.loginRedirect) pages.GET("/auth/redirect/cas", rt.loginRedirectCas) pages.GET("/auth/redirect/oauth", rt.loginRedirectOAuth) pages.GET("/auth/redirect/dingtalk", rt.loginRedirectDingTalk) pages.GET("/auth/redirect/feishu", rt.loginRedirectFeiShu) pages.GET("/auth/callback", rt.loginCallback) pages.GET("/auth/callback/cas", rt.loginCallbackCas) pages.GET("/auth/callback/oauth", 
rt.loginCallbackOAuth) pages.GET("/auth/callback/dingtalk", rt.loginCallbackDingTalk) pages.GET("/auth/callback/feishu", rt.loginCallbackFeiShu) pages.GET("/auth/perms", rt.allPerms) pages.GET("/metrics/desc", rt.metricsDescGetFile) pages.POST("/metrics/desc", rt.metricsDescGetMap) pages.GET("/notify-channels", rt.notifyChannelsGets) pages.GET("/contact-keys", rt.contactKeysGets) pages.GET("/install-date", rt.installDateGet) pages.GET("/self/perms", rt.auth(), rt.user(), rt.permsGets) pages.GET("/self/profile", rt.auth(), rt.user(), rt.selfProfileGet) pages.PUT("/self/profile", rt.auth(), rt.user(), rt.selfProfilePut) pages.PUT("/self/password", rt.auth(), rt.user(), rt.selfPasswordPut) pages.GET("/self/token", rt.auth(), rt.user(), rt.getToken) pages.POST("/self/token", rt.auth(), rt.user(), rt.addToken) pages.DELETE("/self/token/:id", rt.auth(), rt.user(), rt.deleteToken) pages.GET("/users", rt.auth(), rt.user(), rt.perm("/users"), rt.userGets) pages.POST("/users", rt.auth(), rt.user(), rt.perm("/users/add"), rt.userAddPost) pages.GET("/user/:id/profile", rt.auth(), rt.userProfileGet) pages.PUT("/user/:id/profile", rt.auth(), rt.user(), rt.perm("/users/put"), rt.userProfilePut) pages.PUT("/user/:id/password", rt.auth(), rt.user(), rt.perm("/users/put"), rt.userPasswordPut) pages.DELETE("/user/:id", rt.auth(), rt.user(), rt.perm("/users/del"), rt.userDel) pages.GET("/metric-views", rt.auth(), rt.metricViewGets) pages.DELETE("/metric-views", rt.auth(), rt.user(), rt.metricViewDel) pages.POST("/metric-views", rt.auth(), rt.user(), rt.metricViewAdd) pages.PUT("/metric-views", rt.auth(), rt.user(), rt.metricViewPut) pages.GET("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterGets) pages.DELETE("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterDel) pages.POST("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterAdd) pages.PUT("/builtin-metric-filters", rt.auth(), rt.user(), rt.metricFilterPut) pages.POST("/builtin-metric-promql", 
rt.auth(), rt.user(), rt.getMetricPromql) pages.POST("/builtin-metrics", rt.auth(), rt.user(), rt.perm("/builtin-metrics/add"), rt.builtinMetricsAdd) pages.PUT("/builtin-metrics", rt.auth(), rt.user(), rt.perm("/builtin-metrics/put"), rt.builtinMetricsPut) pages.DELETE("/builtin-metrics", rt.auth(), rt.user(), rt.perm("/builtin-metrics/del"), rt.builtinMetricsDel) pages.GET("/builtin-metrics", rt.auth(), rt.user(), rt.builtinMetricsGets) pages.GET("/builtin-metrics/types", rt.auth(), rt.user(), rt.builtinMetricsTypes) pages.GET("/builtin-metrics/types/default", rt.auth(), rt.user(), rt.builtinMetricsDefaultTypes) pages.GET("/builtin-metrics/collectors", rt.auth(), rt.user(), rt.builtinMetricsCollectors) pages.GET("/user-groups", rt.auth(), rt.user(), rt.userGroupGets) pages.POST("/user-groups", rt.auth(), rt.user(), rt.perm("/user-groups/add"), rt.userGroupAdd) pages.GET("/user-group/:id", rt.auth(), rt.user(), rt.userGroupGet) pages.PUT("/user-group/:id", rt.auth(), rt.user(), rt.perm("/user-groups/put"), rt.userGroupWrite(), rt.userGroupPut) pages.DELETE("/user-group/:id", rt.auth(), rt.user(), rt.perm("/user-groups/del"), rt.userGroupWrite(), rt.userGroupDel) pages.POST("/user-group/:id/members", rt.auth(), rt.user(), rt.perm("/user-groups/put"), rt.userGroupWrite(), rt.userGroupMemberAdd) pages.DELETE("/user-group/:id/members", rt.auth(), rt.user(), rt.perm("/user-groups/put"), rt.userGroupWrite(), rt.userGroupMemberDel) pages.GET("/busi-groups", rt.auth(), rt.user(), rt.busiGroupGets) pages.POST("/busi-groups", rt.auth(), rt.user(), rt.perm("/busi-groups/add"), rt.busiGroupAdd) pages.GET("/busi-groups/alertings", rt.auth(), rt.busiGroupAlertingsGets) pages.GET("/busi-group/:id", rt.auth(), rt.user(), rt.bgro(), rt.busiGroupGet) pages.PUT("/busi-group/:id", rt.auth(), rt.user(), rt.perm("/busi-groups/put"), rt.bgrw(), rt.busiGroupPut) pages.POST("/busi-group/:id/members", rt.auth(), rt.user(), rt.perm("/busi-groups/put"), rt.bgrw(), rt.busiGroupMemberAdd) 
pages.DELETE("/busi-group/:id/members", rt.auth(), rt.user(), rt.perm("/busi-groups/put"), rt.bgrw(), rt.busiGroupMemberDel) pages.DELETE("/busi-group/:id", rt.auth(), rt.user(), rt.perm("/busi-groups/del"), rt.bgrw(), rt.busiGroupDel) pages.GET("/busi-group/:id/perm/:perm", rt.auth(), rt.user(), rt.checkBusiGroupPerm) pages.GET("/busi-groups/tags", rt.auth(), rt.user(), rt.busiGroupsGetTags) pages.GET("/targets", rt.auth(), rt.user(), rt.targetGets) pages.GET("/targets/stats", rt.auth(), rt.user(), rt.targetStats) pages.POST("/target-update", rt.auth(), rt.targetUpdate) pages.GET("/target/extra-meta", rt.auth(), rt.user(), rt.targetExtendInfoByIdent) pages.POST("/target/list", rt.auth(), rt.user(), rt.targetGetsByHostFilter) pages.DELETE("/targets", rt.auth(), rt.user(), rt.perm("/targets/del"), rt.targetDel) pages.GET("/targets/tags", rt.auth(), rt.user(), rt.targetGetTags) pages.POST("/targets/tags", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetBindTagsByFE) pages.DELETE("/targets/tags", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetUnbindTagsByFE) pages.PUT("/targets/note", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetUpdateNote) pages.PUT("/targets/bgids", rt.auth(), rt.user(), rt.perm("/targets/put"), rt.targetBindBgids) pages.POST("/builtin-cate-favorite", rt.auth(), rt.user(), rt.builtinCateFavoriteAdd) pages.DELETE("/builtin-cate-favorite/:name", rt.auth(), rt.user(), rt.builtinCateFavoriteDel) pages.GET("/integrations/icon/:cate/:name", rt.builtinIcon) // pages.GET("/builtin-boards", rt.builtinBoardGets) // pages.GET("/builtin-board/:name", rt.builtinBoardGet) // pages.GET("/dashboards/builtin/list", rt.builtinBoardGets) // pages.GET("/builtin-boards-cates", rt.auth(), rt.user(), rt.builtinBoardCateGets) // pages.POST("/builtin-boards-detail", rt.auth(), rt.user(), rt.builtinBoardDetailGets) // pages.GET("/integrations/makedown/:cate", rt.builtinMarkdown) pages.GET("/busi-groups/public-boards", rt.auth(), rt.user(), 
rt.perm("/dashboards"), rt.publicBoardGets) pages.GET("/busi-groups/boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.boardGetsByGids) pages.GET("/busi-group/:id/boards", rt.auth(), rt.user(), rt.perm("/dashboards"), rt.bgro(), rt.boardGets) pages.POST("/busi-group/:id/boards", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.bgrw(), rt.boardAdd) pages.POST("/busi-group/:id/board/:bid/clone", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.bgrw(), rt.boardClone) pages.POST("/busi-groups/boards/clones", rt.auth(), rt.user(), rt.perm("/dashboards/add"), rt.boardBatchClone) pages.GET("/boards", rt.auth(), rt.user(), rt.boardGetsByBids) pages.GET("/board/:bid", rt.boardGet) pages.GET("/board/:bid/pure", rt.boardPureGet) pages.PUT("/board/:bid", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.boardPut) pages.PUT("/board/:bid/configs", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.boardPutConfigs) pages.PUT("/board/:bid/public", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.boardPutPublic) pages.DELETE("/boards", rt.auth(), rt.user(), rt.perm("/dashboards/del"), rt.boardDel) pages.GET("/share-charts", rt.chartShareGets) pages.POST("/share-charts", rt.auth(), rt.chartShareAdd) pages.POST("/dashboard-annotations", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.dashAnnotationAdd) pages.GET("/dashboard-annotations", rt.dashAnnotationGets) pages.PUT("/dashboard-annotation/:id", rt.auth(), rt.user(), rt.perm("/dashboards/put"), rt.dashAnnotationPut) pages.DELETE("/dashboard-annotation/:id", rt.auth(), rt.user(), rt.perm("/dashboards/del"), rt.dashAnnotationDel) // pages.GET("/alert-rules/builtin/alerts-cates", rt.auth(), rt.user(), rt.builtinAlertCateGets) // pages.GET("/alert-rules/builtin/list", rt.auth(), rt.user(), rt.builtinAlertRules) pages.GET("/alert-rules/callbacks", rt.auth(), rt.user(), rt.alertRuleCallbacks) pages.GET("/timezones", rt.auth(), rt.user(), rt.timezonesGet) pages.GET("/busi-groups/alert-rules", rt.auth(), 
rt.user(), rt.perm("/alert-rules"), rt.alertRuleGetsByGids) pages.GET("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGets) pages.POST("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByFE) pages.POST("/busi-group/:id/alert-rules/import", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByImport) pages.POST("/busi-group/:id/alert-rules/import-prom-rule", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.alertRuleAddByImportPromRule) pages.DELETE("/busi-group/:id/alert-rules", rt.auth(), rt.user(), rt.perm("/alert-rules/del"), rt.bgrw(), rt.alertRuleDel) pages.PUT("/busi-group/:id/alert-rules/fields", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.bgrw(), rt.alertRulePutFields) pages.PUT("/busi-group/:id/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRulePutByFE) pages.GET("/alert-rule/:arid", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRuleGet) pages.GET("/alert-rule/:arid/pure", rt.auth(), rt.user(), rt.perm("/alert-rules"), rt.alertRulePureGet) pages.PUT("/busi-group/alert-rule/validate", rt.auth(), rt.user(), rt.perm("/alert-rules/put"), rt.alertRuleValidation) pages.POST("/relabel-test", rt.auth(), rt.user(), rt.relabelTest) pages.POST("/busi-group/:id/alert-rules/clone", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.bgrw(), rt.cloneToMachine) pages.POST("/busi-groups/alert-rules/clones", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.batchAlertRuleClone) pages.POST("/busi-group/alert-rules/notify-tryrun", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.alertRuleNotifyTryRun) pages.POST("/busi-group/alert-rules/enable-tryrun", rt.auth(), rt.user(), rt.perm("/alert-rules/add"), rt.alertRuleEnableTryRun) pages.GET("/busi-groups/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGetsByGids) 
pages.GET("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGets) pages.POST("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/add"), rt.bgrw(), rt.recordingRuleAddByFE) pages.DELETE("/busi-group/:id/recording-rules", rt.auth(), rt.user(), rt.perm("/recording-rules/del"), rt.bgrw(), rt.recordingRuleDel) pages.GET("/recording-rule/:rrid", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRuleGet) pages.PUT("/recording-rule/:rrid", rt.auth(), rt.user(), rt.perm("/recording-rules"), rt.recordingRulePutByFE) pages.PUT("/busi-group/:id/recording-rules/fields", rt.auth(), rt.user(), rt.perm("/recording-rules/put"), rt.recordingRulePutFields) pages.GET("/busi-groups/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes"), rt.alertMuteGetsByGids) pages.GET("/busi-group/:id/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes"), rt.bgro(), rt.alertMuteGetsByBG) pages.POST("/busi-group/:id/alert-mutes/preview", rt.auth(), rt.user(), rt.perm("/alert-mutes/add"), rt.bgrw(), rt.alertMutePreview) pages.POST("/busi-group/:id/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes/add"), rt.bgrw(), rt.alertMuteAdd) pages.DELETE("/busi-group/:id/alert-mutes", rt.auth(), rt.user(), rt.perm("/alert-mutes/del"), rt.bgrw(), rt.alertMuteDel) pages.PUT("/busi-group/:id/alert-mute/:amid", rt.auth(), rt.user(), rt.perm("/alert-mutes/put"), rt.alertMutePutByFE) pages.GET("/busi-group/:id/alert-mute/:amid", rt.auth(), rt.user(), rt.perm("/alert-mutes"), rt.alertMuteGet) pages.PUT("/busi-group/:id/alert-mutes/fields", rt.auth(), rt.user(), rt.perm("/alert-mutes/put"), rt.bgrw(), rt.alertMutePutFields) pages.POST("/alert-mute-tryrun", rt.auth(), rt.user(), rt.perm("/alert-mutes/add"), rt.alertMuteTryRun) pages.GET("/busi-groups/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes"), rt.alertSubscribeGetsByGids) pages.GET("/busi-group/:id/alert-subscribes", rt.auth(), 
rt.user(), rt.perm("/alert-subscribes"), rt.bgro(), rt.alertSubscribeGets) pages.GET("/alert-subscribe/:sid", rt.auth(), rt.user(), rt.perm("/alert-subscribes"), rt.alertSubscribeGet) pages.POST("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes/add"), rt.bgrw(), rt.alertSubscribeAdd) pages.PUT("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes/put"), rt.bgrw(), rt.alertSubscribePut) pages.DELETE("/busi-group/:id/alert-subscribes", rt.auth(), rt.user(), rt.perm("/alert-subscribes/del"), rt.bgrw(), rt.alertSubscribeDel) pages.POST("/alert-subscribe/alert-subscribes-tryrun", rt.auth(), rt.user(), rt.perm("/alert-subscribes/add"), rt.alertSubscribeTryRun) pages.GET("/alert-cur-event/:eid", rt.alertCurEventGet) pages.GET("/alert-his-event/:eid", rt.alertHisEventGet) pages.GET("/event-notify-records/:eid", rt.notificationRecordList) pages.GET("/event-detail/:hash", rt.eventDetailPage) pages.GET("/alert-eval-detail/:id", rt.alertEvalDetailPage) pages.GET("/trace-logs/:traceid", rt.traceLogsPage) // card logic pages.GET("/alert-cur-events/list", rt.auth(), rt.user(), rt.alertCurEventsList) pages.GET("/alert-cur-events/card", rt.auth(), rt.user(), rt.alertCurEventsCard) pages.POST("/alert-cur-events/card/details", rt.auth(), rt.alertCurEventsCardDetails) pages.GET("/alert-his-events/list", rt.auth(), rt.user(), rt.alertHisEventsList) pages.DELETE("/alert-his-events", rt.auth(), rt.admin(), rt.alertHisEventsDelete) pages.DELETE("/alert-cur-events", rt.auth(), rt.user(), rt.perm("/alert-cur-events/del"), rt.alertCurEventDel) pages.GET("/alert-cur-events/stats", rt.auth(), rt.alertCurEventsStatistics) pages.GET("/alert-aggr-views", rt.auth(), rt.alertAggrViewGets) pages.DELETE("/alert-aggr-views", rt.auth(), rt.user(), rt.alertAggrViewDel) pages.POST("/alert-aggr-views", rt.auth(), rt.user(), rt.alertAggrViewAdd) pages.PUT("/alert-aggr-views", rt.auth(), rt.user(), rt.alertAggrViewPut) 
pages.GET("/busi-groups/task-tpls", rt.auth(), rt.user(), rt.perm("/job-tpls"), rt.taskTplGetsByGids) pages.GET("/busi-group/:id/task-tpls", rt.auth(), rt.user(), rt.perm("/job-tpls"), rt.bgro(), rt.taskTplGets) pages.POST("/busi-group/:id/task-tpls", rt.auth(), rt.user(), rt.perm("/job-tpls/add"), rt.bgrw(), rt.taskTplAdd) pages.DELETE("/busi-group/:id/task-tpl/:tid", rt.auth(), rt.user(), rt.perm("/job-tpls/del"), rt.bgrw(), rt.taskTplDel) pages.POST("/busi-group/:id/task-tpls/tags", rt.auth(), rt.user(), rt.perm("/job-tpls/put"), rt.bgrw(), rt.taskTplBindTags) pages.DELETE("/busi-group/:id/task-tpls/tags", rt.auth(), rt.user(), rt.perm("/job-tpls/put"), rt.bgrw(), rt.taskTplUnbindTags) pages.GET("/busi-group/:id/task-tpl/:tid", rt.auth(), rt.user(), rt.perm("/job-tpls"), rt.bgro(), rt.taskTplGet) pages.PUT("/busi-group/:id/task-tpl/:tid", rt.auth(), rt.user(), rt.perm("/job-tpls/put"), rt.bgrw(), rt.taskTplPut) pages.GET("/busi-groups/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks"), rt.taskGetsByGids) pages.GET("/busi-group/:id/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks"), rt.bgro(), rt.taskGets) pages.POST("/busi-group/:id/tasks", rt.auth(), rt.user(), rt.perm("/job-tasks/add"), rt.bgrw(), rt.taskAdd) pages.GET("/servers", rt.auth(), rt.user(), rt.serversGet) pages.GET("/server-clusters", rt.auth(), rt.user(), rt.serverClustersGet) pages.POST("/datasource/list", rt.auth(), rt.user(), rt.datasourceList) pages.POST("/datasource/plugin/list", rt.auth(), rt.pluginList) pages.POST("/datasource/upsert", rt.auth(), rt.admin(), rt.datasourceUpsert) pages.POST("/datasource/desc", rt.auth(), rt.admin(), rt.datasourceGet) pages.POST("/datasource/status/update", rt.auth(), rt.admin(), rt.datasourceUpdataStatus) pages.DELETE("/datasource/", rt.auth(), rt.admin(), rt.datasourceDel) pages.GET("/roles", rt.auth(), rt.user(), rt.roleGets) pages.POST("/roles", rt.auth(), rt.user(), rt.perm("/roles/add"), rt.roleAdd) pages.PUT("/roles", rt.auth(), rt.user(), 
rt.perm("/roles/put"), rt.rolePut) pages.DELETE("/role/:id", rt.auth(), rt.user(), rt.perm("/roles/del"), rt.roleDel) pages.GET("/role/:id/ops", rt.auth(), rt.user(), rt.perm("/roles"), rt.operationOfRole) pages.PUT("/role/:id/ops", rt.auth(), rt.user(), rt.perm("/roles/put"), rt.roleBindOperation) pages.GET("/operation", rt.operations) pages.GET("/notify-tpls", rt.auth(), rt.user(), rt.notifyTplGets) pages.PUT("/notify-tpl/content", rt.auth(), rt.user(), rt.notifyTplUpdateContent) pages.PUT("/notify-tpl", rt.auth(), rt.user(), rt.notifyTplUpdate) pages.POST("/notify-tpl", rt.auth(), rt.user(), rt.notifyTplAdd) pages.DELETE("/notify-tpl/:id", rt.auth(), rt.user(), rt.notifyTplDel) pages.POST("/notify-tpl/preview", rt.auth(), rt.user(), rt.notifyTplPreview) pages.GET("/sso-configs", rt.auth(), rt.admin(), rt.ssoConfigGets) pages.PUT("/sso-config", rt.auth(), rt.admin(), rt.ssoConfigUpdate) pages.GET("/webhooks", rt.auth(), rt.user(), rt.webhookGets) pages.PUT("/webhooks", rt.auth(), rt.admin(), rt.webhookPuts) pages.GET("/notify-script", rt.auth(), rt.user(), rt.perm("/help/notification-settings"), rt.notifyScriptGet) pages.PUT("/notify-script", rt.auth(), rt.admin(), rt.notifyScriptPut) pages.GET("/notify-channel", rt.auth(), rt.user(), rt.perm("/help/notification-settings"), rt.notifyChannelGets) pages.PUT("/notify-channel", rt.auth(), rt.admin(), rt.notifyChannelPuts) pages.GET("/notify-contact", rt.auth(), rt.user(), rt.notifyContactGets) pages.PUT("/notify-contact", rt.auth(), rt.admin(), rt.notifyContactPuts) pages.GET("/notify-config", rt.auth(), rt.user(), rt.perm("/help/notification-settings"), rt.notifyConfigGet) pages.PUT("/notify-config", rt.auth(), rt.admin(), rt.notifyConfigPut) pages.PUT("/smtp-config-test", rt.auth(), rt.admin(), rt.attemptSendEmail) pages.GET("/es-index-pattern", rt.auth(), rt.esIndexPatternGet) pages.GET("/es-index-pattern-list", rt.auth(), rt.esIndexPatternGetList) pages.POST("/es-index-pattern", rt.auth(), rt.user(), 
rt.perm("/log/index-patterns/add"), rt.esIndexPatternAdd) pages.PUT("/es-index-pattern", rt.auth(), rt.user(), rt.perm("/log/index-patterns/put"), rt.esIndexPatternPut) pages.DELETE("/es-index-pattern", rt.auth(), rt.user(), rt.perm("/log/index-patterns/del"), rt.esIndexPatternDel) pages.GET("/embedded-dashboards", rt.auth(), rt.user(), rt.perm("/embedded-dashboards"), rt.embeddedDashboardsGet) pages.PUT("/embedded-dashboards", rt.auth(), rt.user(), rt.perm("/embedded-dashboards/put"), rt.embeddedDashboardsPut) // 获取 embedded-product 列表 pages.GET("/embedded-product", rt.auth(), rt.user(), rt.embeddedProductGets) pages.GET("/embedded-product/:id", rt.auth(), rt.user(), rt.embeddedProductGet) pages.POST("/embedded-product", rt.auth(), rt.user(), rt.perm("/embedded-product/add"), rt.embeddedProductAdd) pages.PUT("/embedded-product/:id", rt.auth(), rt.user(), rt.perm("/embedded-product/put"), rt.embeddedProductPut) pages.DELETE("/embedded-product/:id", rt.auth(), rt.user(), rt.perm("/embedded-product/delete"), rt.embeddedProductDelete) pages.GET("/user-variable-configs", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigGets) pages.POST("/user-variable-config", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigAdd) pages.PUT("/user-variable-config/:id", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigPut) pages.DELETE("/user-variable-config/:id", rt.auth(), rt.user(), rt.perm("/help/variable-configs"), rt.userVariableConfigDel) pages.GET("/config", rt.auth(), rt.admin(), rt.configGetByKey) pages.PUT("/config", rt.auth(), rt.admin(), rt.configPutByKey) pages.GET("/site-info", rt.siteInfo) // source token 相关路由 pages.POST("/source-token", rt.auth(), rt.user(), rt.sourceTokenAdd) // for admin api pages.GET("/user/busi-groups", rt.auth(), rt.admin(), rt.userBusiGroupsGets) pages.GET("/builtin-components", rt.auth(), rt.user(), rt.builtinComponentsGets) pages.POST("/builtin-components", 
rt.auth(), rt.user(), rt.perm("/components/add"), rt.builtinComponentsAdd) pages.PUT("/builtin-components", rt.auth(), rt.user(), rt.perm("/components/put"), rt.builtinComponentsPut) pages.DELETE("/builtin-components", rt.auth(), rt.user(), rt.perm("/components/del"), rt.builtinComponentsDel) pages.GET("/builtin-payloads", rt.auth(), rt.user(), rt.builtinPayloadsGets) pages.GET("/builtin-payloads/cates", rt.auth(), rt.user(), rt.builtinPayloadcatesGet) pages.POST("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/components/add"), rt.builtinPayloadsAdd) pages.PUT("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/components/put"), rt.builtinPayloadsPut) pages.DELETE("/builtin-payloads", rt.auth(), rt.user(), rt.perm("/components/del"), rt.builtinPayloadsDel) pages.GET("/builtin-payload", rt.auth(), rt.user(), rt.builtinPayloadsGetByUUID) pages.POST("/message-templates", rt.auth(), rt.user(), rt.perm("/notification-templates/add"), rt.messageTemplatesAdd) pages.DELETE("/message-templates", rt.auth(), rt.user(), rt.perm("/notification-templates/del"), rt.messageTemplatesDel) pages.PUT("/message-template/:id", rt.auth(), rt.user(), rt.perm("/notification-templates/put"), rt.messageTemplatePut) pages.GET("/message-template/:id", rt.auth(), rt.user(), rt.perm("/notification-templates"), rt.messageTemplateGet) pages.GET("/message-templates", rt.auth(), rt.user(), rt.messageTemplatesGet) pages.POST("/events-message", rt.auth(), rt.user(), rt.eventsMessage) pages.POST("/notify-rules", rt.auth(), rt.user(), rt.perm("/notification-rules/add"), rt.notifyRulesAdd) pages.DELETE("/notify-rules", rt.auth(), rt.user(), rt.perm("/notification-rules/del"), rt.notifyRulesDel) pages.PUT("/notify-rule/:id", rt.auth(), rt.user(), rt.perm("/notification-rules/put"), rt.notifyRulePut) pages.GET("/notify-rule/:id", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyRuleGet) pages.GET("/notify-rules", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyRulesGet) 
pages.POST("/notify-rule/test", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyTest) pages.GET("/notify-rule/custom-params", rt.auth(), rt.user(), rt.perm("/notification-rules"), rt.notifyRuleCustomParamsGet) pages.POST("/notify-rule/event-pipelines-tryrun", rt.auth(), rt.user(), rt.perm("/notification-rules/add"), rt.tryRunEventProcessorByNotifyRule) pages.GET("/event-tagkeys", rt.auth(), rt.user(), rt.eventTagKeys) pages.GET("/event-tagvalues", rt.auth(), rt.user(), rt.eventTagValues) // 事件Pipeline相关路由 pages.GET("/event-pipelines", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.eventPipelinesList) pages.POST("/event-pipeline", rt.auth(), rt.user(), rt.perm("/event-pipelines/add"), rt.addEventPipeline) pages.PUT("/event-pipeline", rt.auth(), rt.user(), rt.perm("/event-pipelines/put"), rt.updateEventPipeline) pages.GET("/event-pipeline/:id", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipeline) pages.DELETE("/event-pipelines", rt.auth(), rt.user(), rt.perm("/event-pipelines/del"), rt.deleteEventPipelines) pages.POST("/event-pipeline-tryrun", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.tryRunEventPipeline) pages.POST("/event-processor-tryrun", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.tryRunEventProcessor) // API 触发工作流 pages.POST("/event-pipeline/:id/trigger", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.triggerEventPipelineByAPI) // SSE 流式执行工作流 pages.POST("/event-pipeline/:id/stream", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.streamEventPipeline) // 事件Pipeline执行记录路由 pages.GET("/event-pipeline-executions", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.listAllEventPipelineExecutions) pages.GET("/event-pipeline/:id/executions", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.listEventPipelineExecutions) pages.GET("/event-pipeline/:id/execution/:exec_id", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipelineExecution) 
pages.GET("/event-pipeline-execution/:exec_id", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipelineExecution) pages.GET("/event-pipeline/:id/execution-stats", rt.auth(), rt.user(), rt.perm("/event-pipelines"), rt.getEventPipelineExecutionStats) pages.POST("/event-pipeline-executions/clean", rt.auth(), rt.user(), rt.admin(), rt.cleanEventPipelineExecutions) pages.POST("/notify-channel-configs", rt.auth(), rt.user(), rt.perm("/notification-channels/add"), rt.notifyChannelsAdd) pages.DELETE("/notify-channel-configs", rt.auth(), rt.user(), rt.perm("/notification-channels/del"), rt.notifyChannelsDel) pages.PUT("/notify-channel-config/:id", rt.auth(), rt.user(), rt.perm("/notification-channels/put"), rt.notifyChannelPut) pages.GET("/notify-channel-config/:id", rt.auth(), rt.user(), rt.perm("/notification-channels"), rt.notifyChannelGet) pages.GET("/notify-channel-configs", rt.auth(), rt.user(), rt.perm("/notification-channels"), rt.notifyChannelsGet) pages.GET("/simplified-notify-channel-configs", rt.notifyChannelsGetForNormalUser) pages.GET("/flashduty-channel-list/:id", rt.auth(), rt.user(), rt.flashDutyNotifyChannelsGet) pages.GET("/pagerduty-integration-key/:id/:service_id/:integration_id", rt.auth(), rt.user(), rt.pagerDutyIntegrationKeyGet) pages.GET("/pagerduty-service-list/:id", rt.auth(), rt.user(), rt.pagerDutyNotifyServicesGet) pages.GET("/notify-channel-config", rt.auth(), rt.user(), rt.notifyChannelGetBy) pages.GET("/notify-channel-config/idents", rt.notifyChannelIdentsGet) // saved view 查询条件保存相关路由 pages.GET("/saved-views", rt.auth(), rt.user(), rt.savedViewGets) pages.POST("/saved-views", rt.auth(), rt.user(), rt.savedViewAdd) pages.PUT("/saved-view/:id", rt.auth(), rt.user(), rt.savedViewPut) pages.DELETE("/saved-view/:id", rt.auth(), rt.user(), rt.savedViewDel) pages.POST("/saved-view/:id/favorite", rt.auth(), rt.user(), rt.savedViewFavoriteAdd) pages.DELETE("/saved-view/:id/favorite", rt.auth(), rt.user(), rt.savedViewFavoriteDel) } 
r.GET("/api/n9e/versions", func(c *gin.Context) { v := version.Version lastIndex := strings.LastIndex(version.Version, "-") if lastIndex != -1 { v = version.Version[:lastIndex] } gv := version.GithubVersion.Load() if gv != nil { ginx.NewRender(c).Data(gin.H{"version": v, "github_verison": gv.(string)}, nil) } else { ginx.NewRender(c).Data(gin.H{"version": v, "github_verison": ""}, nil) } }) if rt.HTTP.APIForService.Enable { service := r.Group("/v1/n9e") if len(rt.HTTP.APIForService.BasicAuth) > 0 { service.Use(gin.BasicAuth(rt.HTTP.APIForService.BasicAuth)) } { service.Any("/prometheus/*url", rt.dsProxy) service.POST("/users", rt.userAddPost) service.PUT("/user/:id", rt.userProfilePutByService) service.DELETE("/user/:id", rt.userDel) service.GET("/users", rt.userFindAll) service.GET("/user-groups", rt.userGroupGetsByService) service.GET("/user-group-members", rt.userGroupMemberGetsByService) service.GET("/targets", rt.targetGetsByService) service.GET("/target/extra-meta", rt.targetExtendInfoByIdent) service.POST("/target/list", rt.targetGetsByHostFilter) service.DELETE("/targets", rt.targetDelByService) service.GET("/targets/tags", rt.targetGetTags) service.POST("/targets/tags", rt.targetBindTagsByService) service.DELETE("/targets/tags", rt.targetUnbindTagsByService) service.PUT("/targets/note", rt.targetUpdateNoteByService) service.PUT("/targets/bgid", rt.targetUpdateBgidByService) service.POST("/targets-of-host-query", rt.targetsOfHostQuery) service.POST("/alert-rules", rt.alertRuleAddByService) service.POST("/alert-rule-add", rt.alertRuleAddOneByService) service.DELETE("/alert-rules", rt.alertRuleDelByService) service.PUT("/alert-rule/:arid", rt.alertRulePutByService) service.GET("/alert-rule/:arid", rt.alertRuleGet) service.GET("/alert-rules", rt.alertRulesGetByService) service.GET("/alert-subscribes", rt.alertSubscribeGetsByService) service.GET("/busi-groups", rt.busiGroupGetsByService) service.GET("/datasources", rt.datasourceGetsByService) 
service.GET("/datasource-rsa-config", rt.datasourceRsaConfigGet) service.GET("/datasource-ids", rt.getDatasourceIds) service.POST("/server-heartbeat", rt.serverHeartbeat) service.GET("/servers-active", rt.serversActive) service.GET("/recording-rules", rt.recordingRuleGetsByService) service.GET("/alert-mutes", rt.alertMuteGets) service.GET("/active-alert-mutes", rt.activeAlertMuteGets) service.POST("/alert-mutes", rt.alertMuteAddByService) service.DELETE("/alert-mutes", rt.alertMuteDel) service.GET("/alert-cur-events", rt.alertCurEventsList) service.GET("/alert-cur-events-get-by-rid", rt.alertCurEventsGetByRid) service.GET("/alert-his-events", rt.alertHisEventsList) service.GET("/alert-his-event/:eid", rt.alertHisEventGet) service.GET("/task-tpl/:tid", rt.taskTplGetByService) service.GET("/task-tpls", rt.taskTplGetsByService) service.GET("/task-tpl/statistics", rt.taskTplStatistics) service.GET("/config/:id", rt.configGet) service.GET("/configs", rt.configsGet) service.GET("/config", rt.configGetByKey) service.GET("/all-configs", rt.configGetAll) service.PUT("/configs", rt.configsPut) service.POST("/configs", rt.configsPost) service.DELETE("/configs", rt.configsDel) service.POST("/conf-prop/encrypt", rt.confPropEncrypt) service.POST("/conf-prop/decrypt", rt.confPropDecrypt) service.GET("/statistic", rt.statistic) service.GET("/notify-tpls", rt.notifyTplGets) service.POST("/task-record-add", rt.taskRecordAdd) service.GET("/user-variable/decrypt", rt.userVariableGetDecryptByService) service.GET("/targets-of-alert-rule", rt.targetsOfAlertRule) service.POST("/notify-record", rt.notificationRecordAdd) service.GET("/alert-cur-events-del-by-hash", rt.alertCurEventDelByHash) service.POST("/center/heartbeat", rt.heartbeat) service.GET("/es-index-pattern-list", rt.esIndexPatternGetList) service.GET("/notify-rules", rt.notifyRulesGetByService) service.GET("/notify-channels", rt.notifyChannelConfigGets) service.GET("/message-templates", rt.messageTemplateGets) 
service.GET("/event-pipelines", rt.eventPipelinesListByService) service.POST("/event-pipeline/:id/trigger", rt.triggerEventPipelineByService) service.POST("/event-pipeline/:id/stream", rt.streamEventPipelineByService) service.POST("/event-pipeline-execution", rt.eventPipelineExecutionAdd) // 手机号加密存储配置接口 service.POST("/users/phone/encrypt", rt.usersPhoneEncrypt) service.POST("/users/phone/decrypt", rt.usersPhoneDecrypt) service.POST("/users/phone/refresh-encryption-config", rt.usersPhoneDecryptRefresh) service.GET("/builtin-components", rt.builtinComponentsGets) service.GET("/builtin-payloads", rt.builtinPayloadsGets) } } if rt.HTTP.APIForAgent.Enable { heartbeat := r.Group("/v1/n9e") { if len(rt.HTTP.APIForAgent.BasicAuth) > 0 { heartbeat.Use(gin.BasicAuth(rt.HTTP.APIForAgent.BasicAuth)) } heartbeat.POST("/heartbeat", rt.heartbeat) } } rt.configNoRoute(r, &statikFS) } func Render(c *gin.Context, data, msg interface{}) { if msg == nil { if data == nil { data = struct{}{} } c.JSON(http.StatusOK, gin.H{"data": data, "error": ""}) } else { c.JSON(http.StatusOK, gin.H{"error": gin.H{"message": msg}}) } } func Dangerous(c *gin.Context, v interface{}, code ...int) { if v == nil { return } switch t := v.(type) { case string: if t != "" { c.JSON(http.StatusOK, gin.H{"error": v}) } case error: c.JSON(http.StatusOK, gin.H{"error": t.Error()}) } } ================================================ FILE: center/router/router_alert_aggr_view.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) // no param func (rt *Router) alertAggrViewGets(c *gin.Context) { lst, err := models.AlertAggrViewGets(rt.Ctx, c.MustGet("userid")) ginx.NewRender(c).Data(lst, err) } // body: name, rule, cate func (rt *Router) alertAggrViewAdd(c *gin.Context) { var f models.AlertAggrView ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) if !me.IsAdmin() 
{ // 管理员可以选择当前这个视图是公开呢,还是私有,普通用户的话就只能是私有的 f.Cate = 1 } f.Id = 0 f.CreateBy = me.Id ginx.Dangerous(f.Add(rt.Ctx)) ginx.NewRender(c).Data(f, nil) } // body: ids func (rt *Router) alertAggrViewDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() me := c.MustGet("user").(*models.User) if me.IsAdmin() { ginx.NewRender(c).Message(models.AlertAggrViewDel(rt.Ctx, f.Ids)) } else { ginx.NewRender(c).Message(models.AlertAggrViewDel(rt.Ctx, f.Ids, me.Id)) } } // body: id, name, rule, cate func (rt *Router) alertAggrViewPut(c *gin.Context) { var f models.AlertAggrView ginx.BindJSON(c, &f) view, err := models.AlertAggrViewGet(rt.Ctx, "id = ?", f.Id) ginx.Dangerous(err) if view == nil { ginx.NewRender(c).Message("no such item(id: %d)", f.Id) return } me := c.MustGet("user").(*models.User) if !me.IsAdmin() { f.Cate = 1 if view.CreateBy != me.Id { ginx.NewRender(c, http.StatusForbidden).Message("forbidden") return } } view.Name = f.Name view.Rule = f.Rule view.Cate = f.Cate if view.CreateBy == 0 { view.CreateBy = me.Id } ginx.NewRender(c).Message(view.Update(rt.Ctx)) } ================================================ FILE: center/router/router_alert_cur_event.go ================================================ package router import ( "fmt" "net/http" "sort" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) func getUserGroupIds(ctx *gin.Context, rt *Router, myGroups bool) ([]int64, error) { if !myGroups { return nil, nil } me := ctx.MustGet("user").(*models.User) return models.MyGroupIds(rt.Ctx, me.Id) } func (rt *Router) alertCurEventsCard(c *gin.Context) { stime, etime := getTimeRange(c) severity := strx.IdsInt64ForAPI(ginx.QueryStr(c, "severity", ""), ",") query := ginx.QueryStr(c, "query", "") myGroups := ginx.QueryBool(c, "my_groups", false) // 是否只看自己组,默认false var gids 
[]int64 var err error if myGroups { gids, err = getUserGroupIds(c, rt, myGroups) ginx.Dangerous(err) if len(gids) == 0 { gids = append(gids, -1) } } viewId := ginx.QueryInt64(c, "view_id") alertView, err := models.GetAlertAggrViewByViewID(rt.Ctx, viewId) ginx.Dangerous(err) if alertView == nil { ginx.Bomb(http.StatusNotFound, "alert aggr view not found") } dsIds := queryDatasourceIds(c) prod := ginx.QueryStr(c, "prods", "") if prod == "" { prod = ginx.QueryStr(c, "rule_prods", "") } prods := []string{} if prod != "" { prods = strings.Split(prod, ",") } cate := ginx.QueryStr(c, "cate", "$all") cates := []string{} if cate != "$all" { cates = strings.Split(cate, ",") } bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, myGroups) ginx.Dangerous(err) // 最多获取50000个,获取太多也没啥意义 list, err := models.AlertCurEventsGet(rt.Ctx, prods, bgids, stime, etime, severity, dsIds, cates, 0, query, 50000, 0, []int64{}) ginx.Dangerous(err) cardmap := make(map[string]*AlertCard) for _, event := range list { title, err := event.GenCardTitle(alertView.Rule) ginx.Dangerous(err) if _, has := cardmap[title]; has { cardmap[title].Total++ cardmap[title].EventIds = append(cardmap[title].EventIds, event.Id) if event.Severity < cardmap[title].Severity { cardmap[title].Severity = event.Severity } } else { cardmap[title] = &AlertCard{ Total: 1, EventIds: []int64{event.Id}, Title: title, Severity: event.Severity, } } if cardmap[title].Severity < 1 { cardmap[title].Severity = 3 } } titles := make([]string, 0, len(cardmap)) for title := range cardmap { titles = append(titles, title) } sort.Strings(titles) cards := make([]*AlertCard, len(titles)) for i := 0; i < len(titles); i++ { cards[i] = cardmap[titles[i]] } sort.SliceStable(cards, func(i, j int) bool { if cards[i].Severity != cards[j].Severity { return cards[i].Severity < cards[j].Severity } return cards[i].Total > cards[j].Total }) ginx.NewRender(c).Data(cards, nil) } type AlertCard struct { Title string `json:"title"` 
Total int `json:"total"` EventIds []int64 `json:"event_ids"` Severity int `json:"severity"` } func (rt *Router) alertCurEventsCardDetails(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) list, err := models.AlertCurEventGetByIds(rt.Ctx, f.Ids) if err == nil { cache := make(map[int64]*models.UserGroup) for i := 0; i < len(list); i++ { list[i].FillNotifyGroups(rt.Ctx, cache) } } ginx.NewRender(c).Data(list, err) } // alertCurEventsGetByRid func (rt *Router) alertCurEventsGetByRid(c *gin.Context) { rid := ginx.QueryInt64(c, "rid") dsId := ginx.QueryInt64(c, "dsid") ginx.NewRender(c).Data(models.AlertCurEventGetByRuleIdAndDsId(rt.Ctx, rid, dsId)) } // 列表方式,拉取活跃告警 func (rt *Router) alertCurEventsList(c *gin.Context) { stime, etime := getTimeRange(c) severity := strx.IdsInt64ForAPI(ginx.QueryStr(c, "severity", ""), ",") query := ginx.QueryStr(c, "query", "") limit := ginx.QueryInt(c, "limit", 20) myGroups := ginx.QueryBool(c, "my_groups", false) // 是否只看自己组,默认false dsIds := queryDatasourceIds(c) eventIds := strx.IdsInt64ForAPI(ginx.QueryStr(c, "event_ids", ""), ",") prod := ginx.QueryStr(c, "prods", "") if prod == "" { prod = ginx.QueryStr(c, "rule_prods", "") } prods := []string{} if prod != "" { prods = strings.Split(prod, ",") } cate := ginx.QueryStr(c, "cate", "$all") cates := []string{} if cate != "$all" { cates = strings.Split(cate, ",") } ruleId := ginx.QueryInt64(c, "rid", 0) bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, myGroups) ginx.Dangerous(err) total, err := models.AlertCurEventTotal(rt.Ctx, prods, bgids, stime, etime, severity, dsIds, cates, ruleId, query, eventIds) ginx.Dangerous(err) list, err := models.AlertCurEventsGet(rt.Ctx, prods, bgids, stime, etime, severity, dsIds, cates, ruleId, query, limit, ginx.Offset(c, limit), eventIds) ginx.Dangerous(err) cache := make(map[int64]*models.UserGroup) for i := 0; i < len(list); i++ { list[i].FillNotifyGroups(rt.Ctx, cache) } ginx.NewRender(c).Data(gin.H{ "list": list, 
"total": total, }, nil) } func (rt *Router) alertCurEventDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() rt.checkCurEventBusiGroupRWPermission(c, f.Ids) ginx.NewRender(c).Message(models.AlertCurEventDel(rt.Ctx, f.Ids)) } func (rt *Router) checkCurEventBusiGroupRWPermission(c *gin.Context, ids []int64) { set := make(map[int64]struct{}) // event group id is 0, ignore perm check set[0] = struct{}{} for i := 0; i < len(ids); i++ { event, err := models.AlertCurEventGetById(rt.Ctx, ids[i]) ginx.Dangerous(err) if event == nil { continue } if _, has := set[event.GroupId]; !has { rt.bgrwCheck(c, event.GroupId) set[event.GroupId] = struct{}{} } } } func (rt *Router) alertCurEventGet(c *gin.Context) { eid := ginx.UrlParamInt64(c, "eid") event, err := GetCurEventDetail(rt.Ctx, eid) hasPermission := HasPermission(rt.Ctx, c, "event", fmt.Sprintf("%d", eid), rt.Center.AnonymousAccess.AlertDetail) if !hasPermission { rt.auth()(c) rt.user()(c) rt.bgroCheck(c, event.GroupId) } ginx.NewRender(c).Data(event, err) } func GetCurEventDetail(ctx *ctx.Context, eid int64) (*models.AlertCurEvent, error) { event, err := models.AlertCurEventGetById(ctx, eid) if err != nil { return nil, err } if event == nil { return nil, fmt.Errorf("no such active event") } ruleConfig, needReset := models.FillRuleConfigTplName(ctx, event.RuleConfig) if needReset { event.RuleConfigJson = ruleConfig } event.LastEvalTime = event.TriggerTime event.NotifyVersion, err = GetEventNotifyVersion(ctx, event.RuleId, event.NotifyRuleIds) ginx.Dangerous(err) event.NotifyRules, err = GetEventNotifyRuleNames(ctx, event.NotifyRuleIds) return event, err } func GetEventNotifyRuleNames(ctx *ctx.Context, notifyRuleIds []int64) ([]*models.EventNotifyRule, error) { notifyRuleNames := make([]*models.EventNotifyRule, 0) notifyRules, err := models.NotifyRulesGet(ctx, "id in ?", notifyRuleIds) if err != nil { return nil, err } for _, notifyRule := range notifyRules { notifyRuleNames = append(notifyRuleNames, 
&models.EventNotifyRule{ Id: notifyRule.ID, Name: notifyRule.Name, }) } return notifyRuleNames, nil } func GetEventNotifyVersion(ctx *ctx.Context, ruleId int64, notifyRuleIds []int64) (int, error) { if len(notifyRuleIds) != 0 { // 如果存在 notify_rule_ids,则认为使用新的告警通知方式 return 1, nil } rule, err := models.AlertRuleGetById(ctx, ruleId) if err != nil { return 0, err } return rule.NotifyVersion, nil } func (rt *Router) alertCurEventsStatistics(c *gin.Context) { ginx.NewRender(c).Data(models.AlertCurEventStatistics(rt.Ctx, time.Now()), nil) } func (rt *Router) alertCurEventDelByHash(c *gin.Context) { hash := ginx.QueryStr(c, "hash") ginx.NewRender(c).Message(models.AlertCurEventDelByHash(rt.Ctx, hash)) } func (rt *Router) eventTagKeys(c *gin.Context) { // 获取最近1天的活跃告警事件 now := time.Now().Unix() stime := now - 24*3600 etime := now // 获取用户可见的业务组ID列表 bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, false) if err != nil { logger.Warningf("failed to get business group ids: %v", err) ginx.NewRender(c).Data([]string{"ident", "app", "service", "instance"}, nil) return } // 查询活跃告警事件,限制数量以提高性能 events, err := models.AlertCurEventsGet(rt.Ctx, []string{}, bgids, stime, etime, []int64{}, []int64{}, []string{}, 0, "", 200, 0, []int64{}) if err != nil { logger.Warningf("failed to get current alert events: %v", err) ginx.NewRender(c).Data([]string{"ident", "app", "service", "instance"}, nil) return } // 如果没有查到事件,返回默认标签 if len(events) == 0 { ginx.NewRender(c).Data([]string{"ident", "app", "service", "instance"}, nil) return } // 收集所有标签键并去重 tagKeys := make(map[string]struct{}) for _, event := range events { for key := range event.TagsMap { tagKeys[key] = struct{}{} } } // 转换为字符串切片 var result []string for key := range tagKeys { result = append(result, key) } // 如果没有收集到任何标签键,返回默认值 if len(result) == 0 { result = []string{"ident", "app", "service", "instance"} } ginx.NewRender(c).Data(result, nil) } func (rt *Router) eventTagValues(c *gin.Context) { // 获取标签key tagKey 
:= ginx.QueryStr(c, "key") // 获取最近1天的活跃告警事件 now := time.Now().Unix() stime := now - 24*3600 etime := now // 获取用户可见的业务组ID列表 bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, false) if err != nil { logger.Warningf("failed to get business group ids: %v", err) ginx.NewRender(c).Data([]string{}, nil) return } // 查询活跃告警事件,获取更多数据以保证统计准确性 events, err := models.AlertCurEventsGet(rt.Ctx, []string{}, bgids, stime, etime, []int64{}, []int64{}, []string{}, 0, "", 1000, 0, []int64{}) if err != nil { logger.Warningf("failed to get current alert events: %v", err) ginx.NewRender(c).Data([]string{}, nil) return } // 如果没有查到事件,返回空数组 if len(events) == 0 { ginx.NewRender(c).Data([]string{}, nil) return } // 统计标签值出现次数 valueCount := make(map[string]int) for _, event := range events { // TagsMap已经在AlertCurEventsGet中处理,直接使用 if value, exists := event.TagsMap[tagKey]; exists && value != "" { valueCount[value]++ } } // 转换为切片并按出现次数降序排序 type tagValue struct { value string count int } tagValues := make([]tagValue, 0, len(valueCount)) for value, count := range valueCount { tagValues = append(tagValues, tagValue{value, count}) } // 按出现次数降序排序 sort.Slice(tagValues, func(i, j int) bool { return tagValues[i].count > tagValues[j].count }) // 只取Top20并转换为字符串数组 limit := 20 if len(tagValues) < limit { limit = len(tagValues) } result := make([]string, 0, limit) for i := 0; i < limit; i++ { result = append(result, tagValues[i].value) } ginx.NewRender(c).Data(result, nil) } ================================================ FILE: center/router/router_alert_eval_detail.go ================================================ package router import ( "encoding/json" "fmt" "io" "net/http" "sort" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/loggrep" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) // alertEvalDetailPage renders an HTML log viewer page for alert rule evaluation logs. 
func (rt *Router) alertEvalDetailPage(c *gin.Context) { id := ginx.UrlParamStr(c, "id") if !loggrep.IsValidRuleID(id) { c.String(http.StatusBadRequest, "invalid rule id format") return } logs, instance, err := rt.getAlertEvalLogs(id) if err != nil { c.String(http.StatusInternalServerError, "Error: %v", err) return } c.Header("Content-Type", "text/html; charset=utf-8") err = loggrep.RenderAlertEvalHTML(c.Writer, loggrep.AlertEvalPageData{ RuleID: id, Instance: instance, Logs: logs, Total: len(logs), }) if err != nil { c.String(http.StatusInternalServerError, "render error: %v", err) } } // alertEvalDetailJSON returns JSON for alert rule evaluation logs. func (rt *Router) alertEvalDetailJSON(c *gin.Context) { id := ginx.UrlParamStr(c, "id") if !loggrep.IsValidRuleID(id) { ginx.Bomb(200, "invalid rule id format") } logs, instance, err := rt.getAlertEvalLogs(id) ginx.Dangerous(err) ginx.NewRender(c).Data(loggrep.EventDetailResp{ Logs: logs, Instance: instance, }, nil) } // getAlertEvalLogs resolves the target instance(s) and retrieves alert eval logs. func (rt *Router) getAlertEvalLogs(id string) ([]string, string, error) { ruleId, _ := strconv.ParseInt(id, 10, 64) rule, err := models.AlertRuleGetById(rt.Ctx, ruleId) if err != nil { return nil, "", err } if rule == nil { return nil, "", fmt.Errorf("no such alert rule") } instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port) keyword := fmt.Sprintf("alert_eval_%s", id) // Get datasource IDs for this rule dsIds := rt.DatasourceCache.GetIDsByDsCateAndQueries(rule.Cate, rule.DatasourceQueries) if len(dsIds) == 0 { // No datasources found (e.g. 
host rule), try local grep logs, err := loggrep.GrepLogDir(rt.LogDir, keyword) return logs, instance, err } // Find unique target nodes via hash ring, with DB fallback nodeSet := make(map[string]struct{}) for _, dsId := range dsIds { node, err := rt.getNodeForDatasource(dsId, id) if err != nil { continue } nodeSet[node] = struct{}{} } if len(nodeSet) == 0 { // Hash ring not ready, grep locally logs, err := loggrep.GrepLogDir(rt.LogDir, keyword) return logs, instance, err } // Collect logs from all target nodes var allLogs []string var instances []string for node := range nodeSet { if node == instance { logs, err := loggrep.GrepLogDir(rt.LogDir, keyword) if err == nil { allLogs = append(allLogs, logs...) instances = append(instances, node) } } else { logs, nodeAddr, err := rt.forwardAlertEvalDetail(node, id) if err == nil { allLogs = append(allLogs, logs...) instances = append(instances, nodeAddr) } } } // Sort logs by timestamp descending sort.Slice(allLogs, func(i, j int) bool { return allLogs[i] > allLogs[j] }) if len(allLogs) > loggrep.MaxLogLines { allLogs = allLogs[:loggrep.MaxLogLines] } return allLogs, strings.Join(instances, ", "), nil } func (rt *Router) forwardAlertEvalDetail(node, id string) ([]string, string, error) { url := fmt.Sprintf("http://%s/v1/n9e/alert-eval-detail/%s", node, id) req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, node, err } for user, pass := range rt.HTTP.APIForService.BasicAuth { req.SetBasicAuth(user, pass) break } client := &http.Client{Timeout: 15 * time.Second} resp, err := client.Do(req) if err != nil { return nil, node, fmt.Errorf("forward to %s failed: %v", node, err) } defer resp.Body.Close() body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit if err != nil { return nil, node, err } var result struct { Dat loggrep.EventDetailResp `json:"dat"` Err string `json:"err"` } if err := json.Unmarshal(body, &result); err != nil { return nil, node, err } if result.Err != "" { 
return nil, node, fmt.Errorf("%s", result.Err) } return result.Dat.Logs, result.Dat.Instance, nil } ================================================ FILE: center/router/router_alert_his_event.go ================================================ package router import ( "fmt" "net/http" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" "golang.org/x/exp/slices" ) func getTimeRange(c *gin.Context) (stime, etime int64) { stime = ginx.QueryInt64(c, "stime", 0) etime = ginx.QueryInt64(c, "etime", 0) hours := ginx.QueryInt64(c, "hours", 0) now := time.Now().Unix() if hours != 0 { stime = now - 3600*hours etime = now + 3600*24 } if stime != 0 && etime == 0 { etime = now + 3600*24 } return } func (rt *Router) alertHisEventsList(c *gin.Context) { stime, etime := getTimeRange(c) severity := ginx.QueryInt(c, "severity", -1) recovered := ginx.QueryInt(c, "is_recovered", -1) query := ginx.QueryStr(c, "query", "") limit := ginx.QueryInt(c, "limit", 20) dsIds := queryDatasourceIds(c) prod := ginx.QueryStr(c, "prods", "") if prod == "" { prod = ginx.QueryStr(c, "rule_prods", "") } prods := []string{} if prod != "" { prods = strings.Split(prod, ",") } cate := ginx.QueryStr(c, "cate", "$all") cates := []string{} if cate != "$all" { cates = strings.Split(cate, ",") } ruleId := ginx.QueryInt64(c, "rid", 0) bgids, err := GetBusinessGroupIds(c, rt.Ctx, rt.Center.EventHistoryGroupView, false) ginx.Dangerous(err) total, err := models.AlertHisEventTotal(rt.Ctx, prods, bgids, stime, etime, severity, recovered, dsIds, cates, ruleId, query, []int64{}) ginx.Dangerous(err) list, err := models.AlertHisEventGets(rt.Ctx, prods, bgids, stime, etime, severity, recovered, dsIds, cates, ruleId, query, limit, ginx.Offset(c, limit), []int64{}) ginx.Dangerous(err) cache := make(map[int64]*models.UserGroup) for i := 0; i < len(list); i++ { 
list[i].FillNotifyGroups(rt.Ctx, cache) } ginx.NewRender(c).Data(gin.H{ "list": list, "total": total, }, nil) } type alertHisEventsDeleteForm struct { Severities []int `json:"severities"` Timestamp int64 `json:"timestamp" binding:"required"` } func (rt *Router) alertHisEventsDelete(c *gin.Context) { var f alertHisEventsDeleteForm ginx.BindJSON(c, &f) // 校验 if f.Timestamp == 0 { ginx.Bomb(http.StatusBadRequest, "timestamp parameter is required") return } user := c.MustGet("user").(*models.User) // 启动后台清理任务 go func() { limit := 100 for { n, err := models.AlertHisEventBatchDelete(rt.Ctx, f.Timestamp, f.Severities, limit) if err != nil { logger.Errorf("Failed to delete alert history events: operator=%s, timestamp=%d, severities=%v, error=%v", user.Username, f.Timestamp, f.Severities, err) break } logger.Debugf("Successfully deleted alert history events: operator=%s, timestamp=%d, severities=%v, deleted=%d", user.Username, f.Timestamp, f.Severities, n) if n < int64(limit) { break // 已经删完 } time.Sleep(100 * time.Millisecond) // 防止锁表 } }() ginx.NewRender(c).Data("Alert history events deletion started", nil) } var TransferEventToCur func(*ctx.Context, *models.AlertHisEvent) *models.AlertCurEvent func init() { TransferEventToCur = transferEventToCur } func transferEventToCur(ctx *ctx.Context, event *models.AlertHisEvent) *models.AlertCurEvent { cur := event.ToCur() return cur } func (rt *Router) alertHisEventGet(c *gin.Context) { eid := ginx.UrlParamInt64(c, "eid") event, err := models.AlertHisEventGetById(rt.Ctx, eid) ginx.Dangerous(err) if event == nil { ginx.Bomb(404, "No such alert event") } hasPermission := HasPermission(rt.Ctx, c, "event", fmt.Sprintf("%d", eid), rt.Center.AnonymousAccess.AlertDetail) if !hasPermission { rt.auth()(c) rt.user()(c) rt.bgroCheck(c, event.GroupId) } ruleConfig, needReset := models.FillRuleConfigTplName(rt.Ctx, event.RuleConfig) if needReset { event.RuleConfigJson = ruleConfig } event.NotifyVersion, err = GetEventNotifyVersion(rt.Ctx, 
event.RuleId, event.NotifyRuleIds) ginx.Dangerous(err) event.NotifyRules, err = GetEventNotifyRuleNames(rt.Ctx, event.NotifyRuleIds) ginx.NewRender(c).Data(TransferEventToCur(rt.Ctx, event), err) } func GetBusinessGroupIds(c *gin.Context, ctx *ctx.Context, onlySelfGroupView bool, myGroups bool) ([]int64, error) { bgid := ginx.QueryInt64(c, "bgid", 0) var bgids []int64 if strings.HasPrefix(c.Request.URL.Path, "/v1") { // 如果请求路径以 /v1 开头,不查询用户信息 if bgid > 0 { return []int64{bgid}, nil } return bgids, nil } user := c.MustGet("user").(*models.User) if myGroups || (onlySelfGroupView && !user.IsAdmin()) { // 1. 页面上勾选了我的业务组,需要查询用户所属的业务组 // 2. 如果 onlySelfGroupView 为 true,表示只允许查询用户所属的业务组 bussGroupIds, err := models.MyBusiGroupIds(ctx, user.Id) if err != nil { return nil, err } if len(bussGroupIds) == 0 { // 如果没查到用户属于任何业务组,需要返回一个0,否则会导致查询到全部告警历史 return []int64{0}, nil } if bgid > 0 { if !slices.Contains(bussGroupIds, bgid) && !user.IsAdmin() { return nil, fmt.Errorf("business group ID not allowed") } return []int64{bgid}, nil } return bussGroupIds, nil } if bgid > 0 { return []int64{bgid}, nil } return bgids, nil } ================================================ FILE: center/router/router_alert_rule.go ================================================ package router import ( "encoding/json" "fmt" "net/http" "regexp" "strconv" "strings" "time" "gopkg.in/yaml.v2" "github.com/ccfos/nightingale/v6/alert/mute" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pushgw/pconf" "github.com/ccfos/nightingale/v6/pushgw/writer" "github.com/gin-gonic/gin" "github.com/jinzhu/copier" "github.com/pkg/errors" "github.com/prometheus/prometheus/prompb" "github.com/toolkits/pkg/i18n" ) type AlertRuleModifyHookFunc func(ar *models.AlertRule) // Return all, front-end search and paging func (rt *Router) alertRuleGets(c *gin.Context) { busiGroupId := ginx.UrlParamInt64(c, "id") ars, err := 
models.AlertRuleGets(rt.Ctx, busiGroupId) if err == nil { cache := make(map[int64]*models.UserGroup) for i := 0; i < len(ars); i++ { ars[i].FillNotifyGroups(rt.Ctx, cache) } models.FillUpdateByNicknames(rt.Ctx, ars) } ginx.NewRender(c).Data(ars, err) } func GetAlertCueEventTimeRange(c *gin.Context) (stime, etime int64) { stime = ginx.QueryInt64(c, "stime", 0) etime = ginx.QueryInt64(c, "etime", 0) if etime == 0 { etime = time.Now().Unix() } if stime == 0 || stime >= etime { stime = etime - 30*24*int64(time.Hour.Seconds()) } return } func (rt *Router) alertRuleGetsByGids(c *gin.Context) { gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") if len(gids) > 0 { for _, gid := range gids { rt.bgroCheck(c, gid) } } else { me := c.MustGet("user").(*models.User) if !me.IsAdmin() { var err error gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(gids) == 0 { ginx.NewRender(c).Data([]int{}, nil) return } } } ars, err := models.AlertRuleGetsByBGIds(rt.Ctx, gids) if err == nil { cache := make(map[int64]*models.UserGroup) rids := make([]int64, 0, len(ars)) for i := 0; i < len(ars); i++ { ars[i].FillNotifyGroups(rt.Ctx, cache) if len(ars[i].DatasourceQueries) != 0 { ars[i].DatasourceIdsJson = rt.DatasourceCache.GetIDsByDsCateAndQueries(ars[i].Cate, ars[i].DatasourceQueries) } rids = append(rids, ars[i].Id) } stime, etime := GetAlertCueEventTimeRange(c) cnt := models.AlertCurEventCountByRuleId(rt.Ctx, rids, stime, etime) if cnt != nil { for i := 0; i < len(ars); i++ { ars[i].CurEventCount = cnt[ars[i].Id] } } models.FillUpdateByNicknames(rt.Ctx, ars) } ginx.NewRender(c).Data(ars, err) } func (rt *Router) alertRulesGetByService(c *gin.Context) { prods := []string{} prodStr := ginx.QueryStr(c, "prods", "") if prodStr != "" { prods = strings.Split(ginx.QueryStr(c, "prods", ""), ",") } query := ginx.QueryStr(c, "query", "") algorithm := ginx.QueryStr(c, "algorithm", "") cluster := ginx.QueryStr(c, "cluster", "") cate := ginx.QueryStr(c, "cate", 
"$all") cates := []string{} if cate != "$all" { cates = strings.Split(cate, ",") } disabled := ginx.QueryInt(c, "disabled", -1) ars, err := models.AlertRulesGetsBy(rt.Ctx, prods, query, algorithm, cluster, cates, disabled) if err == nil { cache := make(map[int64]*models.UserGroup) for i := 0; i < len(ars); i++ { ars[i].FillNotifyGroups(rt.Ctx, cache) if len(ars[i].DatasourceQueries) != 0 { ars[i].DatasourceIdsJson = rt.DatasourceCache.GetIDsByDsCateAndQueries(ars[i].Cate, ars[i].DatasourceQueries) } } models.FillUpdateByNicknames(rt.Ctx, ars) } ginx.NewRender(c).Data(ars, err) } // single or import func (rt *Router) alertRuleAddByFE(c *gin.Context) { username := c.MustGet("username").(string) var lst []models.AlertRule ginx.BindJSON(c, &lst) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } bgid := ginx.UrlParamInt64(c, "id") reterr := rt.alertRuleAdd(lst, username, bgid, c.GetHeader("X-Language")) ginx.NewRender(c).Data(reterr, nil) } type AlertRuleTryRunForm struct { EventId int64 `json:"event_id" binding:"required"` AlertRuleConfig models.AlertRule `json:"config" binding:"required"` } func (rt *Router) alertRuleNotifyTryRun(c *gin.Context) { // check notify channels of old version var f AlertRuleTryRunForm ginx.BindJSON(c, &f) hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId) ginx.Dangerous(err) if hisEvent == nil { ginx.Bomb(http.StatusNotFound, "event not found") } curEvent := *hisEvent.ToCur() curEvent.SetTagsMap() if f.AlertRuleConfig.NotifyVersion == 1 { for _, id := range f.AlertRuleConfig.NotifyRuleIds { notifyRule, err := models.GetNotifyRule(rt.Ctx, id) ginx.Dangerous(err) for _, notifyConfig := range notifyRule.NotifyConfigs { _, err = SendNotifyChannelMessage(rt.Ctx, rt.UserCache, rt.UserGroupCache, notifyConfig, []*models.AlertCurEvent{&curEvent}) ginx.Dangerous(err) } } ginx.NewRender(c).Data("notification test ok", nil) return } if len(f.AlertRuleConfig.NotifyChannelsJSON) == 0 { 
ginx.Bomb(http.StatusOK, "no notify channels selected") } if len(f.AlertRuleConfig.NotifyGroupsJSON) == 0 { ginx.Bomb(http.StatusOK, "no notify groups selected") } ancs := make([]string, 0, len(curEvent.NotifyChannelsJSON)) ugids := f.AlertRuleConfig.NotifyGroupsJSON ngids := make([]int64, 0) for i := 0; i < len(ugids); i++ { if gid, err := strconv.ParseInt(ugids[i], 10, 64); err == nil { ngids = append(ngids, gid) } } userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids) uids := make([]int64, 0) for i := range userGroups { uids = append(uids, userGroups[i].UserIds...) } users := rt.UserCache.GetByUserIds(uids) for _, NotifyChannels := range curEvent.NotifyChannelsJSON { flag := true // ignore non-default channels switch NotifyChannels { case models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram, models.Email, models.FeishuCard: // do nothing default: continue } // default channels for ui := range users { if _, b := users[ui].ExtractToken(NotifyChannels); b { flag = false break } } if flag { ancs = append(ancs, NotifyChannels) } } if len(ancs) > 0 { ginx.Dangerous(errors.New(fmt.Sprintf("All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). 
%v", ancs))) } ginx.NewRender(c).Data("notification test ok", nil) } func (rt *Router) alertRuleEnableTryRun(c *gin.Context) { // check notify channels of old version var f AlertRuleTryRunForm ginx.BindJSON(c, &f) hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId) ginx.Dangerous(err) if hisEvent == nil { ginx.Bomb(http.StatusNotFound, "event not found") } curEvent := *hisEvent.ToCur() curEvent.SetTagsMap() if f.AlertRuleConfig.Disabled == 1 { ginx.Bomb(http.StatusOK, "rule is disabled") } if mute.TimeSpanMuteStrategy(&f.AlertRuleConfig, &curEvent) { ginx.Bomb(http.StatusOK, "event is not match for period of time") } if mute.BgNotMatchMuteStrategy(&f.AlertRuleConfig, &curEvent, rt.TargetCache) { ginx.Bomb(http.StatusOK, "event target busi group not match rule busi group") } ginx.NewRender(c).Data("event is effective", nil) } func (rt *Router) alertRuleAddByImport(c *gin.Context) { username := c.MustGet("username").(string) var lst []models.AlertRule ginx.BindJSON(c, &lst) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } for i := range lst { if len(lst[i].DatasourceQueries) == 0 { lst[i].DatasourceQueries = []models.DatasourceQuery{ models.DataSourceQueryAll, } } // 将导入的规则统一转为新版本的通知规则配置 lst[i].NotifyVersion = 1 lst[i].NotifyChannelsJSON = []string{} lst[i].NotifyGroupsJSON = []string{} lst[i].NotifyChannels = "" lst[i].NotifyGroups = "" lst[i].Callbacks = "" lst[i].CallbacksJSON = []string{} } bgid := ginx.UrlParamInt64(c, "id") reterr := rt.alertRuleAdd(lst, username, bgid, c.GetHeader("X-Language")) ginx.NewRender(c).Data(reterr, nil) } type promRuleForm struct { Payload string `json:"payload" binding:"required"` DatasourceQueries []models.DatasourceQuery `json:"datasource_queries" binding:"required"` Disabled int `json:"disabled" binding:"gte=0,lte=1"` } func (rt *Router) alertRuleAddByImportPromRule(c *gin.Context) { var f promRuleForm ginx.Dangerous(c.BindJSON(&f)) // 首先尝试解析带 groups 的格式 var pr struct { 
Groups []models.PromRuleGroup `yaml:"groups"` } err := yaml.Unmarshal([]byte(f.Payload), &pr) var groups []models.PromRuleGroup if err != nil || len(pr.Groups) == 0 { // 如果解析失败或没有 groups,尝试解析规则数组格式 var rules []models.PromRule err = yaml.Unmarshal([]byte(f.Payload), &rules) if err != nil { // 最后尝试解析单个规则格式 var singleRule models.PromRule err = yaml.Unmarshal([]byte(f.Payload), &singleRule) if err != nil { ginx.Bomb(http.StatusBadRequest, "invalid yaml format. err: %v", err) } // 验证单个规则是否有效 if singleRule.Alert == "" && singleRule.Record == "" { ginx.Bomb(http.StatusBadRequest, "input yaml is empty or invalid") } rules = []models.PromRule{singleRule} } // 验证规则数组是否为空 if len(rules) == 0 { ginx.Bomb(http.StatusBadRequest, "input yaml contains no rules") } // 将规则数组包装成 group groups = []models.PromRuleGroup{ { Name: "imported_rules", Rules: rules, }, } } else { // 使用已解析的 groups groups = pr.Groups } lst := models.DealPromGroup(groups, f.DatasourceQueries, f.Disabled) username := c.MustGet("username").(string) bgid := ginx.UrlParamInt64(c, "id") ginx.NewRender(c).Data(rt.alertRuleAdd(lst, username, bgid, c.GetHeader("X-Language")), nil) } func (rt *Router) alertRuleAddByService(c *gin.Context) { var lst []models.AlertRule ginx.BindJSON(c, &lst) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } reterr := rt.alertRuleAddForService(lst, "") ginx.NewRender(c).Data(reterr, nil) } func (rt *Router) alertRuleAddOneByService(c *gin.Context) { var f models.AlertRule ginx.BindJSON(c, &f) err := f.FE2DB() ginx.Dangerous(err) err = f.Add(rt.Ctx) ginx.NewRender(c).Data(f.Id, err) } func (rt *Router) alertRuleAddForService(lst []models.AlertRule, username string) map[string]string { count := len(lst) // alert rule name -> error string reterr := make(map[string]string) for i := 0; i < count; i++ { lst[i].Id = 0 if username != "" { lst[i].CreateBy = username lst[i].UpdateBy = username } if err := lst[i].FE2DB(); err != nil { reterr[lst[i].Name] = 
err.Error() continue } if err := lst[i].Add(rt.Ctx); err != nil { reterr[lst[i].Name] = err.Error() } else { reterr[lst[i].Name] = "" } } return reterr } func (rt *Router) alertRuleAdd(lst []models.AlertRule, username string, bgid int64, lang string) map[string]string { count := len(lst) // alert rule name -> error string reterr := make(map[string]string) for i := 0; i < count; i++ { lst[i].Id = 0 lst[i].GroupId = bgid if username != "" { lst[i].CreateBy = username lst[i].UpdateBy = username } if err := lst[i].FE2DB(); err != nil { reterr[lst[i].Name] = i18n.Sprintf(lang, err.Error()) continue } if err := lst[i].Add(rt.Ctx); err != nil { reterr[lst[i].Name] = i18n.Sprintf(lang, err.Error()) } else { reterr[lst[i].Name] = "" } } return reterr } func (rt *Router) alertRuleDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() // param(busiGroupId) for protect ginx.NewRender(c).Message(models.AlertRuleDels(rt.Ctx, f.Ids, ginx.UrlParamInt64(c, "id"))) } func (rt *Router) alertRuleDelByService(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() ginx.NewRender(c).Message(models.AlertRuleDels(rt.Ctx, f.Ids)) } func (rt *Router) alertRulePutByFE(c *gin.Context) { var f models.AlertRule ginx.BindJSON(c, &f) arid := ginx.UrlParamInt64(c, "arid") ar, err := models.AlertRuleGetById(rt.Ctx, arid) ginx.Dangerous(err) if ar == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule") return } rt.bgrwCheck(c, ar.GroupId) f.UpdateBy = c.MustGet("username").(string) ginx.NewRender(c).Message(ar.Update(rt.Ctx, f)) } func (rt *Router) alertRulePutByService(c *gin.Context) { var f models.AlertRule ginx.BindJSON(c, &f) arid := ginx.UrlParamInt64(c, "arid") ar, err := models.AlertRuleGetById(rt.Ctx, arid) ginx.Dangerous(err) if ar == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule") return } ginx.NewRender(c).Message(ar.Update(rt.Ctx, f)) } type alertRuleFieldForm struct { Ids []int64 `json:"ids"` Fields 
map[string]interface{} `json:"fields"` Action string `json:"action"` } // update one field: cluster note severity disabled prom_eval_interval prom_for_duration notify_channels notify_groups notify_recovered notify_repeat_step callbacks runbook_url append_tags func (rt *Router) alertRulePutFields(c *gin.Context) { var f alertRuleFieldForm ginx.BindJSON(c, &f) if len(f.Fields) == 0 { ginx.Bomb(http.StatusBadRequest, "fields empty") } updateBy := c.MustGet("username").(string) updateAt := time.Now().Unix() for i := 0; i < len(f.Ids); i++ { ar, err := models.AlertRuleGetById(rt.Ctx, f.Ids[i]) ginx.Dangerous(err) if ar == nil { continue } if f.Action == "update_triggers" { if triggers, has := f.Fields["triggers"]; has { originRule := ar.RuleConfigJson.(map[string]interface{}) originRule["triggers"] = triggers b, err := json.Marshal(originRule) ginx.Dangerous(err) ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"rule_config": string(b)})) } } if f.Action == "annotations_add" { if annotations, has := f.Fields["annotations"]; has { annotationsMap := annotations.(map[string]interface{}) for k, v := range annotationsMap { ar.AnnotationsJSON[k] = v.(string) } b, err := json.Marshal(ar.AnnotationsJSON) ginx.Dangerous(err) ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"annotations": string(b)})) } } if f.Action == "annotations_del" { if annotations, has := f.Fields["annotations"]; has { annotationsKeys := annotations.(map[string]interface{}) for key := range annotationsKeys { delete(ar.AnnotationsJSON, key) } b, err := json.Marshal(ar.AnnotationsJSON) ginx.Dangerous(err) ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"annotations": string(b)})) } } if f.Action == "callback_add" { // 增加一个 callback 地址 if callbacks, has := f.Fields["callbacks"]; has { callback := callbacks.(string) if !strings.Contains(ar.Callbacks, callback) { ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"callbacks": ar.Callbacks + " " + 
callback})) } } } if f.Action == "callback_del" { // 删除一个 callback 地址 if callbacks, has := f.Fields["callbacks"]; has { callback := callbacks.(string) ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"callbacks": strings.ReplaceAll(ar.Callbacks, callback, "")})) } } if f.Action == "datasource_change" { // 修改数据源 if datasourceQueries, has := f.Fields["datasource_queries"]; has { bytes, err := json.Marshal(datasourceQueries) ginx.Dangerous(err) ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{"datasource_queries": bytes})) } } for k, v := range f.Fields { // 检查 v 是否为各种切片类型 switch v.(type) { case []interface{}, []int64, []int, []string: // 将切片转换为 JSON 字符串 bytes, err := json.Marshal(v) ginx.Dangerous(err) ginx.Dangerous(ar.UpdateColumn(rt.Ctx, k, string(bytes))) default: ginx.Dangerous(ar.UpdateColumn(rt.Ctx, k, v)) } } // 统一更新更新时间和更新人,只有更新时间变了,告警规则才会被引擎拉取 ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, map[string]interface{}{ "update_by": updateBy, "update_at": updateAt, })) } ginx.NewRender(c).Message(nil) } func (rt *Router) alertRuleGet(c *gin.Context) { arid := ginx.UrlParamInt64(c, "arid") ar, err := models.AlertRuleGetById(rt.Ctx, arid) ginx.Dangerous(err) if ar == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule") return } if len(ar.DatasourceQueries) != 0 { ar.DatasourceIdsJson = rt.DatasourceCache.GetIDsByDsCateAndQueries(ar.Cate, ar.DatasourceQueries) } err = ar.FillNotifyGroups(rt.Ctx, make(map[int64]*models.UserGroup)) ginx.Dangerous(err) rt.AlertRuleModifyHook(ar) ginx.NewRender(c).Data(ar, err) } func (rt *Router) alertRulePureGet(c *gin.Context) { arid := ginx.UrlParamInt64(c, "arid") ar, err := models.AlertRuleGetById(rt.Ctx, arid) ginx.Dangerous(err) if ar == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such AlertRule") return } ginx.NewRender(c).Data(ar, err) } // pre validation before save rule func (rt *Router) alertRuleValidation(c *gin.Context) { var f models.AlertRule //new 
ginx.BindJSON(c, &f) if len(f.NotifyChannelsJSON) > 0 && len(f.NotifyGroupsJSON) > 0 { //Validation NotifyChannels ngids := make([]int64, 0, len(f.NotifyChannelsJSON)) for i := range f.NotifyGroupsJSON { id, _ := strconv.ParseInt(f.NotifyGroupsJSON[i], 10, 64) ngids = append(ngids, id) } userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids) uids := make([]int64, 0) for i := range userGroups { uids = append(uids, userGroups[i].UserIds...) } users := rt.UserCache.GetByUserIds(uids) //If any users have a certain notify channel's token, it will be okay. Otherwise, this notify channel is absent of tokens. ancs := make([]string, 0, len(f.NotifyChannelsJSON)) //absent Notify Channels for i := range f.NotifyChannelsJSON { flag := true //ignore non-default channels switch f.NotifyChannelsJSON[i] { case models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram, models.Email, models.FeishuCard: // do nothing default: continue } //default channels for ui := range users { if _, b := users[ui].ExtractToken(f.NotifyChannelsJSON[i]); b { flag = false break } } if flag { ancs = append(ancs, f.NotifyChannelsJSON[i]) } } if len(ancs) > 0 { ginx.NewRender(c).Message("All users are missing notify channel configurations. Please check for missing tokens (each channel should be configured with at least one user). 
%s", ancs) return } } ginx.NewRender(c).Message("") } func (rt *Router) alertRuleCallbacks(c *gin.Context) { user := c.MustGet("user").(*models.User) bussGroupIds, err := models.MyBusiGroupIds(rt.Ctx, user.Id) ginx.Dangerous(err) ars, err := models.AlertRuleGetsByBGIds(rt.Ctx, bussGroupIds) ginx.Dangerous(err) var callbacks []string callbackFilter := make(map[string]struct{}) for i := range ars { for _, callback := range ars[i].CallbacksJSON { if _, ok := callbackFilter[callback]; !ok { callbackFilter[callback] = struct{}{} callbacks = append(callbacks, callback) } } } ginx.NewRender(c).Data(callbacks, nil) } type alertRuleTestForm struct { Configs []*pconf.RelabelConfig `json:"configs"` Tags []string `json:"tags"` } func (rt *Router) relabelTest(c *gin.Context) { var f alertRuleTestForm ginx.BindJSON(c, &f) if len(f.Tags) == 0 || len(f.Configs) == 0 { ginx.Bomb(http.StatusBadRequest, "relabel config is empty") } labels := make([]prompb.Label, len(f.Tags)) for i, tag := range f.Tags { label := strings.SplitN(tag, "=", 2) if len(label) != 2 { ginx.Bomb(http.StatusBadRequest, "tag:%s format error", tag) } labels[i] = prompb.Label{Name: label[0], Value: label[1]} } for i := 0; i < len(f.Configs); i++ { if f.Configs[i].Replacement == "" { f.Configs[i].Replacement = "$1" } if f.Configs[i].Separator == "" { f.Configs[i].Separator = ";" } if f.Configs[i].Regex == "" { f.Configs[i].Regex = "(.*)" } } relabels := writer.Process(labels, f.Configs...) 
// identOperatorRe matches an `ident` label matcher that uses an operator other
// than plain equality (`!=`, `!~` or `=~`), with optional whitespace between
// the label name and the operator. It is compiled once at package scope
// instead of on every call.
var identOperatorRe = regexp.MustCompile(`ident\s*(!=|!~|=~)`)

// containsIdentOperator reports whether s contains an `ident` matcher written
// with `!=`, `!~` or `=~`. A plain `ident=` matcher does not count.
func containsIdentOperator(s string) bool {
	return identOperatorRe.MatchString(s)
}
newRule.Name) continue } newRules = append(newRules, newRule) } if len(errMsg) > 0 { reterr[alertRules[i].Name] = errMsg } } ginx.NewRender(c).Data(reterr, models.InsertAlertRule(rt.Ctx, newRules)) } type alertBatchCloneForm struct { RuleIds []int64 `json:"rule_ids"` Bgids []int64 `json:"bgids"` } // 批量克隆告警规则 func (rt *Router) batchAlertRuleClone(c *gin.Context) { me := c.MustGet("user").(*models.User) var f alertBatchCloneForm ginx.BindJSON(c, &f) // 校验 bgids 操作权限 for _, bgid := range f.Bgids { rt.bgrwCheck(c, bgid) } reterr := make(map[string]string, len(f.RuleIds)) lang := c.GetHeader("X-Language") for _, arid := range f.RuleIds { ar, err := models.AlertRuleGetById(rt.Ctx, arid) for _, bgid := range f.Bgids { // 为了让 bgid 和 arid 对应,将上面的 err 放到这里处理 if err != nil { reterr[fmt.Sprintf("%d-%d", arid, bgid)] = i18n.Sprintf(lang, err.Error()) continue } if ar == nil { reterr[fmt.Sprintf("%d-%d", arid, bgid)] = i18n.Sprintf(lang, "alert rule not found") continue } newAr := ar.Clone(me.Username, bgid) err = newAr.Add(rt.Ctx) if err != nil { reterr[fmt.Sprintf("%d-%d", arid, bgid)] = i18n.Sprintf(lang, err.Error()) continue } } } ginx.NewRender(c).Data(reterr, nil) } func (rt *Router) timezonesGet(c *gin.Context) { // 返回常用时区列表(按时差去重,每个时差只保留一个代表性时区) timezones := []string{ "Local", "UTC", "Asia/Shanghai", // UTC+8 (代表 Asia/Hong_Kong, Asia/Singapore 等) "Asia/Tokyo", // UTC+9 (代表 Asia/Seoul 等) "Asia/Dubai", // UTC+4 "Asia/Kolkata", // UTC+5:30 "Asia/Bangkok", // UTC+7 (代表 Asia/Jakarta 等) "Europe/London", // UTC+0 (代表 UTC) "Europe/Paris", // UTC+1 (代表 Europe/Berlin, Europe/Rome, Europe/Madrid 等) "Europe/Moscow", // UTC+3 "America/New_York", // UTC-5 (代表 America/Toronto 等) "America/Chicago", // UTC-6 (代表 America/Mexico_City 等) "America/Denver", // UTC-7 "America/Los_Angeles", // UTC-8 "America/Sao_Paulo", // UTC-3 "Australia/Sydney", // UTC+10 (代表 Australia/Melbourne 等) "Pacific/Auckland", // UTC+12 } ginx.NewRender(c).Data(timezones, nil) } 
================================================ FILE: center/router/router_alert_subscribe.go ================================================ package router import ( "net/http" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/alert/common" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) // Return all, front-end search and paging func (rt *Router) alertSubscribeGets(c *gin.Context) { bgid := ginx.UrlParamInt64(c, "id") lst, err := models.AlertSubscribeGets(rt.Ctx, bgid) ginx.Dangerous(err) ugcache := make(map[int64]*models.UserGroup) rulecache := make(map[int64]string) for i := 0; i < len(lst); i++ { ginx.Dangerous(lst[i].FillUserGroups(rt.Ctx, ugcache)) ginx.Dangerous(lst[i].FillRuleNames(rt.Ctx, rulecache)) ginx.Dangerous(lst[i].FillDatasourceIds(rt.Ctx)) ginx.Dangerous(lst[i].DB2FE()) } models.FillUpdateByNicknames(rt.Ctx, lst) ginx.NewRender(c).Data(lst, err) } func (rt *Router) alertSubscribeGetsByGids(c *gin.Context) { gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") if len(gids) > 0 { for _, gid := range gids { rt.bgroCheck(c, gid) } } else { me := c.MustGet("user").(*models.User) if !me.IsAdmin() { var err error gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(gids) == 0 { ginx.NewRender(c).Data([]int{}, nil) return } } } lst, err := models.AlertSubscribeGetsByBGIds(rt.Ctx, gids) ginx.Dangerous(err) ugcache := make(map[int64]*models.UserGroup) rulecache := make(map[int64]string) for i := 0; i < len(lst); i++ { ginx.Dangerous(lst[i].FillUserGroups(rt.Ctx, ugcache)) ginx.Dangerous(lst[i].FillRuleNames(rt.Ctx, rulecache)) ginx.Dangerous(lst[i].FillDatasourceIds(rt.Ctx)) ginx.Dangerous(lst[i].DB2FE()) } models.FillUpdateByNicknames(rt.Ctx, lst) ginx.NewRender(c).Data(lst, err) } func (rt *Router) alertSubscribeGet(c *gin.Context) { subid := ginx.UrlParamInt64(c, "sid") 
// alertSubscribeTryRun dry-runs a subscription config against a historical
// alert event identified by event_id.
//
// It first checks that the event matches the subscription (datasource, rule
// ids, tags, business group name, severities). Then, depending on
// NotifyVersion, it either sends the event through every notify config of the
// selected notify rules (NotifyVersion == 1), or, for the legacy path, checks
// that notify channels and groups are set and that for each default channel at
// least one user in the subscribed user groups yields a token.
//
// NOTE(review): ginx.Bomb is presumably non-returning (values are dereferenced
// right after nil checks that call it) - confirm against the ginx package.
func (rt *Router) alertSubscribeTryRun(c *gin.Context) {
	var f SubscribeTryRunForm
	ginx.BindJSON(c, &f)
	ginx.Dangerous(f.SubscribeConfig.Verify())

	hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId)
	ginx.Dangerous(err)

	if hisEvent == nil {
		ginx.Bomb(http.StatusNotFound, "event not found")
	}

	curEvent := *hisEvent.ToCur()
	curEvent.SetTagsMap()

	lang := c.GetHeader("X-Language")

	// First check the match conditions.
	if !f.SubscribeConfig.MatchCluster(curEvent.DatasourceId) {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event datasource not match"))
	}

	// An empty RuleIds list skips the rule check entirely.
	if len(f.SubscribeConfig.RuleIds) != 0 {
		match := false
		for _, rid := range f.SubscribeConfig.RuleIds {
			if rid == curEvent.RuleId {
				match = true
				break
			}
		}
		if !match {
			ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event rule id not match"))
		}
	}

	// Match tags.
	f.SubscribeConfig.Parse()
	if !common.MatchTags(curEvent.TagsMap, f.SubscribeConfig.ITags) {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event tags not match"))
	}

	// Match business group name.
	if !common.MatchGroupsName(curEvent.GroupName, f.SubscribeConfig.IBusiGroups) {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event group name not match"))
	}

	// Check that the severity matches; a configured severity of 0 matches any event.
	if len(f.SubscribeConfig.SeveritiesJson) != 0 {
		match := false
		for _, s := range f.SubscribeConfig.SeveritiesJson {
			if s == curEvent.Severity || s == 0 {
				match = true
				break
			}
		}
		if !match {
			ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "event severity not match"))
		}
	}

	// New-style notification rules.
	if f.SubscribeConfig.NotifyVersion == 1 {
		if len(f.SubscribeConfig.NotifyRuleIds) == 0 {
			ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "no notify rules selected"))
		}

		for _, id := range f.SubscribeConfig.NotifyRuleIds {
			notifyRule, err := models.GetNotifyRule(rt.Ctx, id)
			if err != nil {
				ginx.Bomb(http.StatusNotFound, i18n.Sprintf(lang, "subscribe notify rule not found: %v", err))
			}

			// Send the event through every notify config of the rule; the first failure aborts.
			for _, notifyConfig := range notifyRule.NotifyConfigs {
				_, err = SendNotifyChannelMessage(rt.Ctx, rt.UserCache, rt.UserGroupCache, notifyConfig, []*models.AlertCurEvent{&curEvent})
				if err != nil {
					ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "notify rule send error: %v", err))
				}
			}
		}

		ginx.NewRender(c).Data(i18n.Sprintf(lang, "event match subscribe and notification test ok"), nil)
		return
	}

	// Legacy notification path.
	f.SubscribeConfig.ModifyEvent(&curEvent)
	if len(curEvent.NotifyChannelsJSON) == 0 {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "no notify channels selected"))
	}

	// NOTE(review): this is the only check here that bombs with StatusOK - confirm it is intended.
	if len(curEvent.NotifyGroupsJSON) == 0 {
		ginx.Bomb(http.StatusOK, i18n.Sprintf(lang, "no notify groups selected"))
	}

	// ancs collects the default channels for which no user yields a token.
	ancs := make([]string, 0, len(curEvent.NotifyChannelsJSON))
	// UserGroupIds is split on whitespace; entries that fail to parse as int64 are skipped.
	ugids := strings.Fields(f.SubscribeConfig.UserGroupIds)
	ngids := make([]int64, 0)
	for i := 0; i < len(ugids); i++ {
		if gid, err := strconv.ParseInt(ugids[i], 10, 64); err == nil {
			ngids = append(ngids, gid)
		}
	}

	userGroups := rt.UserGroupCache.GetByUserGroupIds(ngids)
	uids := make([]int64, 0)
	for i := range userGroups {
		uids = append(uids, userGroups[i].UserIds...)
	}

	users := rt.UserCache.GetByUserIds(uids)
	for _, NotifyChannels := range curEvent.NotifyChannelsJSON {
		// flag stays true while no user has yielded a token for this channel.
		flag := true
		// ignore non-default channels
		switch NotifyChannels {
		case models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram, models.Email, models.FeishuCard:
			// do nothing
		default:
			continue
		}
		// default channels
		for ui := range users {
			if _, b := users[ui].ExtractToken(NotifyChannels); b {
				flag = false
				break
			}
		}
		if flag {
			ancs = append(ancs, NotifyChannels)
		}
	}

	if len(ancs) > 0 {
		ginx.Bomb(http.StatusBadRequest, i18n.Sprintf(lang, "all users missing notify channel configurations: %v", ancs))
	}

	ginx.NewRender(c).Data(i18n.Sprintf(lang, "event match subscribe and notify settings ok"), nil)
}
fs[i].RuleId = 0 ginx.Dangerous(fs[i].Update( rt.Ctx, "name", "disabled", "prod", "cate", "datasource_ids", "cluster", "rule_id", "rule_ids", "tags", "redefine_severity", "new_severity", "redefine_channels", "new_channels", "user_group_ids", "update_at", "update_by", "webhooks", "for_duration", "redefine_webhooks", "severities", "extra_config", "busi_groups", "note", "notify_rule_ids", "notify_version", )) } ginx.NewRender(c).Message(nil) } func (rt *Router) alertSubscribeDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() ginx.NewRender(c).Message(models.AlertSubscribeDel(rt.Ctx, f.Ids)) } func (rt *Router) alertSubscribeGetsByService(c *gin.Context) { lst, err := models.AlertSubscribeGetsByService(rt.Ctx) ginx.NewRender(c).Data(lst, err) } ================================================ FILE: center/router/router_board.go ================================================ package router import ( "fmt" "net/http" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) type boardForm struct { Name string `json:"name"` Ident string `json:"ident"` Tags string `json:"tags"` Note string `json:"note"` Configs string `json:"configs"` Public int `json:"public"` PublicCate int `json:"public_cate"` Bgids []int64 `json:"bgids"` } func (rt *Router) boardAdd(c *gin.Context) { var f boardForm ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) board := &models.Board{ GroupId: ginx.UrlParamInt64(c, "id"), Name: f.Name, Ident: f.Ident, Tags: f.Tags, Note: f.Note, Configs: f.Configs, CreateBy: me.Username, UpdateBy: me.Username, } err := board.Add(rt.Ctx) ginx.Dangerous(err) if f.Configs != "" { ginx.Dangerous(models.BoardPayloadSave(rt.Ctx, board.Id, f.Configs)) } ginx.NewRender(c).Data(board, nil) } func (rt *Router) boardGet(c *gin.Context) { bid := ginx.UrlParamStr(c, "bid") board, err := models.BoardGet(rt.Ctx, "ident 
= ?", bid) ginx.Dangerous(err) if board == nil { board, err = models.BoardGet(rt.Ctx, "id = ?", bid) ginx.Dangerous(err) } if board == nil { ginx.Bomb(http.StatusNotFound, "No such dashboard") } if board.Public == 0 { rt.auth()(c) rt.user()(c) me := c.MustGet("user").(*models.User) if !me.IsAdmin() { // check permission rt.bgroCheck(c, board.GroupId) } } if board.PublicCate == models.PublicLogin { rt.auth()(c) } else if board.PublicCate == models.PublicBusi { rt.auth()(c) rt.user()(c) me := c.MustGet("user").(*models.User) if !me.IsAdmin() { bgids, err := models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(bgids) == 0 { ginx.Bomb(http.StatusForbidden, "forbidden") } ok, err := models.BoardBusigroupCheck(rt.Ctx, board.Id, bgids) ginx.Dangerous(err) if !ok { ginx.Bomb(http.StatusForbidden, "forbidden") } } } ginx.NewRender(c).Data(board, nil) } // 根据 bids 参数,获取多个 board func (rt *Router) boardGetsByBids(c *gin.Context) { bids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "bids", ""), ",") boards, err := models.BoardGetsByBids(rt.Ctx, bids) ginx.Dangerous(err) ginx.NewRender(c).Data(boards, err) } func (rt *Router) boardPureGet(c *gin.Context) { board, err := models.BoardGetByID(rt.Ctx, ginx.UrlParamInt64(c, "bid")) ginx.Dangerous(err) if board == nil { ginx.Bomb(http.StatusNotFound, "No such dashboard") } // 清除创建者和更新者信息 board.CreateBy = "" board.UpdateBy = "" ginx.NewRender(c).Data(board, nil) } // bgrwCheck func (rt *Router) boardDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() for i := 0; i < len(f.Ids); i++ { bid := f.Ids[i] board, err := models.BoardGet(rt.Ctx, "id = ?", bid) ginx.Dangerous(err) if board == nil { continue } me := c.MustGet("user").(*models.User) if !me.IsAdmin() { // check permission rt.bgrwCheck(c, board.GroupId) } ginx.Dangerous(board.Del(rt.Ctx)) } ginx.NewRender(c).Message(nil) } func (rt *Router) Board(id int64) *models.Board { obj, err := models.BoardGet(rt.Ctx, "id=?", id) ginx.Dangerous(err) if obj == nil { 
ginx.Bomb(http.StatusNotFound, "No such dashboard") } return obj } // bgrwCheck func (rt *Router) boardPut(c *gin.Context) { var f boardForm ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) bo := rt.Board(ginx.UrlParamInt64(c, "bid")) if !me.IsAdmin() { // check permission rt.bgrwCheck(c, bo.GroupId) } can, err := bo.CanRenameIdent(rt.Ctx, f.Ident) ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusOK, "Ident duplicate") } bo.Name = f.Name bo.Ident = f.Ident bo.Tags = f.Tags bo.Note = f.Note bo.UpdateBy = me.Username bo.UpdateAt = time.Now().Unix() err = bo.Update(rt.Ctx, "name", "ident", "tags", "note", "update_by", "update_at") ginx.NewRender(c).Data(bo, err) } // bgrwCheck func (rt *Router) boardPutConfigs(c *gin.Context) { var f boardForm ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) bid := ginx.UrlParamStr(c, "bid") bo, err := models.BoardGet(rt.Ctx, "id = ? or ident = ?", bid, bid) ginx.Dangerous(err) if bo == nil { ginx.Bomb(http.StatusNotFound, "No such dashboard") } // check permission if !me.IsAdmin() { rt.bgrwCheck(c, bo.GroupId) } bo.UpdateBy = me.Username bo.UpdateAt = time.Now().Unix() ginx.Dangerous(bo.Update(rt.Ctx, "update_by", "update_at")) bo.Configs = f.Configs ginx.Dangerous(models.BoardPayloadSave(rt.Ctx, bo.Id, f.Configs)) ginx.NewRender(c).Data(bo, nil) } // bgrwCheck func (rt *Router) boardPutPublic(c *gin.Context) { var f boardForm ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) bo := rt.Board(ginx.UrlParamInt64(c, "bid")) // check permission if !me.IsAdmin() { rt.bgrwCheck(c, bo.GroupId) } bo.Public = f.Public bo.PublicCate = f.PublicCate if bo.PublicCate == models.PublicBusi { err := models.BoardBusigroupUpdate(rt.Ctx, bo.Id, f.Bgids) ginx.Dangerous(err) } else { err := models.BoardBusigroupDelByBoardId(rt.Ctx, bo.Id) ginx.Dangerous(err) } bo.UpdateBy = me.Username bo.UpdateAt = time.Now().Unix() err := bo.Update(rt.Ctx, "public", "public_cate", "update_by", "update_at") ginx.NewRender(c).Data(bo, 
err) } func (rt *Router) boardGets(c *gin.Context) { bgid := ginx.UrlParamInt64(c, "id") query := ginx.QueryStr(c, "query", "") boards, err := models.BoardGetsByGroupId(rt.Ctx, bgid, query) if err == nil { models.FillUpdateByNicknames(rt.Ctx, boards) } ginx.NewRender(c).Data(boards, err) } func (rt *Router) publicBoardGets(c *gin.Context) { me := c.MustGet("user").(*models.User) bgids, err := models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) boardIds, err := models.BoardIdsByBusiGroupIds(rt.Ctx, bgids) ginx.Dangerous(err) boards, err := models.BoardGets(rt.Ctx, "", "public=1 and (public_cate in (?) or id in (?))", []int64{0, 1}, boardIds) if err == nil { models.FillUpdateByNicknames(rt.Ctx, boards) } ginx.NewRender(c).Data(boards, err) } func (rt *Router) boardGetsByGids(c *gin.Context) { gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") query := ginx.QueryStr(c, "query", "") if len(gids) > 0 { for _, gid := range gids { rt.bgroCheck(c, gid) } } else { me := c.MustGet("user").(*models.User) if !me.IsAdmin() { var err error gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(gids) == 0 { ginx.NewRender(c).Data([]int{}, nil) return } } } boardBusigroups, err := models.BoardBusigroupGets(rt.Ctx) ginx.Dangerous(err) m := make(map[int64][]int64) for _, boardBusigroup := range boardBusigroups { m[boardBusigroup.BoardId] = append(m[boardBusigroup.BoardId], boardBusigroup.BusiGroupId) } boards, err := models.BoardGetsByBGIds(rt.Ctx, gids, query) ginx.Dangerous(err) for i := 0; i < len(boards); i++ { if ids, ok := m[boards[i].Id]; ok { boards[i].Bgids = ids } } models.FillUpdateByNicknames(rt.Ctx, boards) ginx.NewRender(c).Data(boards, err) } func (rt *Router) boardClone(c *gin.Context) { me := c.MustGet("user").(*models.User) bo := rt.Board(ginx.UrlParamInt64(c, "bid")) newBoard := bo.Clone(me.Username, bo.GroupId, " Cloned") ginx.Dangerous(newBoard.Add(rt.Ctx)) // clone payload payload, err := models.BoardPayloadGet(rt.Ctx, 
bo.Id) ginx.Dangerous(err) if payload != "" { ginx.Dangerous(models.BoardPayloadSave(rt.Ctx, newBoard.Id, payload)) } ginx.NewRender(c).Message(nil) } type boardsForm struct { BoardIds []int64 `json:"board_ids"` Bgids []int64 `json:"bgids"` } func (rt *Router) boardBatchClone(c *gin.Context) { me := c.MustGet("user").(*models.User) var f boardsForm ginx.BindJSON(c, &f) for _, bgid := range f.Bgids { rt.bgrwCheck(c, bgid) } reterr := make(map[string]string, len(f.BoardIds)) lang := c.GetHeader("X-Language") for _, bgid := range f.Bgids { for _, bid := range f.BoardIds { bo := rt.Board(bid) newBoard := bo.Clone(me.Username, bgid, "") payload, err := models.BoardPayloadGet(rt.Ctx, bo.Id) if err != nil { reterr[fmt.Sprintf("%s-%d", newBoard.Name, bgid)] = i18n.Sprintf(lang, err.Error()) continue } if err = newBoard.AtomicAdd(rt.Ctx, payload); err != nil { reterr[fmt.Sprintf("%s-%d", newBoard.Name, bgid)] = i18n.Sprintf(lang, err.Error()) } } } ginx.NewRender(c).Data(reterr, nil) } ================================================ FILE: center/router/router_builtin.go ================================================ package router import ( "encoding/json" "fmt" "net/http" "path" "strings" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/file" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/runner" ) // 创建 builtin_cate func (rt *Router) builtinCateFavoriteAdd(c *gin.Context) { var f models.BuiltinCate ginx.BindJSON(c, &f) if f.Name == "" { ginx.Bomb(http.StatusBadRequest, "name is empty") } me := c.MustGet("user").(*models.User) f.UserId = me.Id ginx.NewRender(c).Message(f.Create(rt.Ctx)) } // 删除 builtin_cate func (rt *Router) builtinCateFavoriteDel(c *gin.Context) { name := ginx.UrlParamStr(c, "name") me := c.MustGet("user").(*models.User) ginx.NewRender(c).Message(models.BuiltinCateDelete(rt.Ctx, name, me.Id)) } type Payload struct { Cate string `json:"cate"` Fname string 
`json:"fname"` Name string `json:"name"` Configs interface{} `json:"configs"` Tags string `json:"tags"` } type BoardCate struct { Name string `json:"name"` IconUrl string `json:"icon_url"` Boards []Payload `json:"boards"` Favorite bool `json:"favorite"` } func (rt *Router) builtinBoardDetailGets(c *gin.Context) { var payload Payload ginx.BindJSON(c, &payload) fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } fn := fp + "/" + payload.Cate + "/dashboards/" + payload.Fname content, err := file.ReadBytes(fn) ginx.Dangerous(err) err = json.Unmarshal(content, &payload) ginx.NewRender(c).Data(payload, err) } func (rt *Router) builtinBoardCateGets(c *gin.Context) { fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } me := c.MustGet("user").(*models.User) builtinFavoritesMap, err := models.BuiltinCateGetByUserId(rt.Ctx, me.Id) if err != nil { logger.Warningf("get builtin favorites fail: %v", err) } var boardCates []BoardCate dirList, err := file.DirsUnder(fp) ginx.Dangerous(err) for _, dir := range dirList { var boardCate BoardCate boardCate.Name = dir files, err := file.FilesUnder(fp + "/" + dir + "/dashboards") ginx.Dangerous(err) if len(files) == 0 { continue } var boards []Payload for _, f := range files { fn := fp + "/" + dir + "/dashboards/" + f content, err := file.ReadBytes(fn) if err != nil { logger.Warningf("add board fail: %v", err) continue } var payload Payload err = json.Unmarshal(content, &payload) if err != nil { logger.Warningf("add board:%s fail: %v", fn, err) continue } payload.Cate = dir payload.Fname = f payload.Configs = "" boards = append(boards, payload) } boardCate.Boards = boards if _, ok := builtinFavoritesMap[dir]; ok { boardCate.Favorite = true } iconFiles, _ := file.FilesUnder(fp + "/" + dir + "/icon") if len(iconFiles) > 0 { boardCate.IconUrl = fmt.Sprintf("/api/n9e/integrations/icon/%s/%s", dir, iconFiles[0]) } boardCates = append(boardCates, 
boardCate) } ginx.NewRender(c).Data(boardCates, nil) } func (rt *Router) builtinBoardGets(c *gin.Context) { fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } var fileList []string dirList, err := file.DirsUnder(fp) ginx.Dangerous(err) for _, dir := range dirList { files, err := file.FilesUnder(fp + "/" + dir + "/dashboards") ginx.Dangerous(err) fileList = append(fileList, files...) } names := make([]string, 0, len(fileList)) for _, f := range fileList { if !strings.HasSuffix(f, ".json") { continue } name := strings.TrimSuffix(f, ".json") names = append(names, name) } ginx.NewRender(c).Data(names, nil) } type AlertCate struct { Name string `json:"name"` IconUrl string `json:"icon_url"` AlertRules []models.AlertRule `json:"alert_rules"` Favorite bool `json:"favorite"` } func (rt *Router) builtinAlertCateGets(c *gin.Context) { fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } me := c.MustGet("user").(*models.User) builtinFavoritesMap, err := models.BuiltinCateGetByUserId(rt.Ctx, me.Id) if err != nil { logger.Warningf("get builtin favorites fail: %v", err) } var alertCates []AlertCate dirList, err := file.DirsUnder(fp) ginx.Dangerous(err) for _, dir := range dirList { var alertCate AlertCate alertCate.Name = dir files, err := file.FilesUnder(fp + "/" + dir + "/alerts") ginx.Dangerous(err) var alertRules []models.AlertRule for _, f := range files { fn := fp + "/" + dir + "/alerts/" + f content, err := file.ReadBytes(fn) if err != nil { logger.Warningf("add board fail: %v", err) continue } var ars []models.AlertRule err = json.Unmarshal(content, &ars) if err != nil { logger.Warningf("add board:%s fail: %v", fn, err) continue } alertRules = append(alertRules, ars...) 
} alertCate.AlertRules = alertRules iconFiles, _ := file.FilesUnder(fp + "/" + dir + "/icon") if len(iconFiles) > 0 { alertCate.IconUrl = fmt.Sprintf("/api/n9e/integrations/icon/%s/%s", dir, iconFiles[0]) } if _, ok := builtinFavoritesMap[dir]; ok { alertCate.Favorite = true } alertCates = append(alertCates, alertCate) } ginx.NewRender(c).Data(alertCates, nil) } type builtinAlertRulesList struct { Name string `json:"name"` IconUrl string `json:"icon_url"` AlertRules map[string][]models.AlertRule `json:"alert_rules"` Favorite bool `json:"favorite"` } func (rt *Router) builtinAlertRules(c *gin.Context) { fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } me := c.MustGet("user").(*models.User) builtinFavoritesMap, err := models.BuiltinCateGetByUserId(rt.Ctx, me.Id) if err != nil { logger.Warningf("get builtin favorites fail: %v", err) } var alertCates []builtinAlertRulesList dirList, err := file.DirsUnder(fp) ginx.Dangerous(err) for _, dir := range dirList { var alertCate builtinAlertRulesList alertCate.Name = dir files, err := file.FilesUnder(fp + "/" + dir + "/alerts") ginx.Dangerous(err) if len(files) == 0 { continue } alertRules := make(map[string][]models.AlertRule) for _, f := range files { fn := fp + "/" + dir + "/alerts/" + f content, err := file.ReadBytes(fn) if err != nil { logger.Warningf("add board fail: %v", err) continue } var ars []models.AlertRule err = json.Unmarshal(content, &ars) if err != nil { logger.Warningf("add board:%s fail: %v", fn, err) continue } alertRules[strings.TrimSuffix(f, ".json")] = ars } alertCate.AlertRules = alertRules iconFiles, _ := file.FilesUnder(fp + "/" + dir + "/icon") if len(iconFiles) > 0 { alertCate.IconUrl = fmt.Sprintf("/api/n9e/integrations/icon/%s/%s", dir, iconFiles[0]) } if _, ok := builtinFavoritesMap[dir]; ok { alertCate.Favorite = true } alertCates = append(alertCates, alertCate) } ginx.NewRender(c).Data(alertCates, nil) } // read the json file content func (rt 
*Router) builtinBoardGet(c *gin.Context) { name := ginx.UrlParamStr(c, "name") dirpath := rt.Center.BuiltinIntegrationsDir if dirpath == "" { dirpath = path.Join(runner.Cwd, "integrations") } dirList, err := file.DirsUnder(dirpath) ginx.Dangerous(err) for _, dir := range dirList { jsonFile := dirpath + "/" + dir + "/dashboards/" + name + ".json" if file.IsExist(jsonFile) { body, err := file.ReadString(jsonFile) ginx.NewRender(c).Data(body, err) return } } ginx.Bomb(http.StatusBadRequest, "%s not found", name) } func (rt *Router) builtinIcon(c *gin.Context) { fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } cate := ginx.UrlParamStr(c, "cate") iconPath := fp + "/" + cate + "/icon/" + ginx.UrlParamStr(c, "name") c.File(path.Join(iconPath)) } func (rt *Router) builtinMarkdown(c *gin.Context) { fp := rt.Center.BuiltinIntegrationsDir if fp == "" { fp = path.Join(runner.Cwd, "integrations") } cate := ginx.UrlParamStr(c, "cate") var markdown []byte markdownDir := fp + "/" + cate + "/markdown" markdownFiles, err := file.FilesUnder(markdownDir) if err != nil { logger.Warningf("get markdown fail: %v", err) } else if len(markdownFiles) > 0 { f := markdownFiles[0] fn := markdownDir + "/" + f markdown, err = file.ReadBytes(fn) if err != nil { logger.Warningf("get collect fail: %v", err) } } ginx.NewRender(c).Data(string(markdown), nil) } ================================================ FILE: center/router/router_builtin_component.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "gorm.io/gorm" ) const SYSTEM = "system" func (rt *Router) builtinComponentsAdd(c *gin.Context) { var lst []models.BuiltinComponent ginx.BindJSON(c, &lst) username := Username(c) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } 
reterr := make(map[string]string) for i := 0; i < count; i++ { if err := lst[i].Add(rt.Ctx, username); err != nil { reterr[lst[i].Ident] = err.Error() } } ginx.NewRender(c).Data(reterr, nil) } func (rt *Router) builtinComponentsGets(c *gin.Context) { query := ginx.QueryStr(c, "query", "") disabled := ginx.QueryInt(c, "disabled", -1) bc, err := models.BuiltinComponentGets(rt.Ctx, query, disabled) ginx.Dangerous(err) ginx.NewRender(c).Data(bc, nil) } func (rt *Router) builtinComponentsPut(c *gin.Context) { var req models.BuiltinComponent ginx.BindJSON(c, &req) bc, err := models.BuiltinComponentGet(rt.Ctx, "id = ?", req.ID) ginx.Dangerous(err) if bc == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such builtin component") return } if bc.CreatedBy == SYSTEM { req.Ident = bc.Ident } username := Username(c) req.UpdatedBy = username err = models.DB(rt.Ctx).Transaction(func(tx *gorm.DB) error { tCtx := &ctx.Context{ DB: tx, } txErr := models.BuiltinMetricBatchUpdateColumn(tCtx, "typ", bc.Ident, req.Ident, req.UpdatedBy) if txErr != nil { return txErr } txErr = bc.Update(tCtx, req) if txErr != nil { return txErr } return nil }) ginx.NewRender(c).Message(err) } func (rt *Router) builtinComponentsDel(c *gin.Context) { var req idsForm ginx.BindJSON(c, &req) req.Verify() ginx.NewRender(c).Message(models.BuiltinComponentDels(rt.Ctx, req.Ids)) } ================================================ FILE: center/router/router_builtin_metric_filter.go ================================================ package router import ( "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/prom" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) metricFilterGets(c *gin.Context) { lst, err := models.MetricFilterGets(rt.Ctx, "") ginx.Dangerous(err) me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) arr := make([]models.MetricFilter, 0) for _, f := range lst { if me.Username == 
f.CreateBy { arr = append(arr, f) continue } if HasPerm(gids, f.GroupsPerm, false) { arr = append(arr, f) } } models.FillUpdateByNicknames(rt.Ctx, arr) ginx.NewRender(c).Data(arr, err) } func (rt *Router) metricFilterAdd(c *gin.Context) { var f models.MetricFilter ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) f.CreateBy = me.Username f.UpdateBy = me.Username ginx.Dangerous(f.Add(rt.Ctx)) ginx.NewRender(c).Data(f, nil) } func (rt *Router) metricFilterDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() me := c.MustGet("user").(*models.User) for _, id := range f.Ids { old, err := models.MetricFilterGet(rt.Ctx, id) ginx.Dangerous(err) if me.Username != old.CreateBy { gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if !HasPerm(gids, old.GroupsPerm, true) { ginx.NewRender(c).Message("forbidden") return } } } ginx.NewRender(c).Message(models.MetricFilterDel(rt.Ctx, f.Ids)) } func (rt *Router) metricFilterPut(c *gin.Context) { var f models.MetricFilter ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) old, err := models.MetricFilterGet(rt.Ctx, f.ID) ginx.Dangerous(err) if me.Username != old.CreateBy { gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if !HasPerm(gids, old.GroupsPerm, true) { ginx.NewRender(c).Message("forbidden") return } } f.UpdateBy = me.Username ginx.NewRender(c).Message(f.Update(rt.Ctx)) } type metricPromqlReq struct { LabelFilter string `json:"label_filter"` Promql string `json:"promql"` } func (rt *Router) getMetricPromql(c *gin.Context) { var req metricPromqlReq ginx.BindJSON(c, &req) promql := prom.AddLabelToPromQL(req.LabelFilter, req.Promql) ginx.NewRender(c).Data(promql, nil) } func HasPerm(gids []int64, gps []models.GroupPerm, checkWrite bool) bool { gmap := make(map[int64]struct{}) for _, gp := range gps { if checkWrite && !gp.Write { continue } gmap[gp.Gid] = struct{}{} } for _, gid := range gids { if _, ok := gmap[gid]; ok { return true } } return false } 
================================================ FILE: center/router/router_builtin_metrics.go ================================================ package router import ( "net/http" "sort" "time" "github.com/ccfos/nightingale/v6/center/integration" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) // single or import func (rt *Router) builtinMetricsAdd(c *gin.Context) { var lst []models.BuiltinMetric ginx.BindJSON(c, &lst) username := Username(c) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } lang := c.GetHeader("X-Language") if lang == "" { lang = "zh_CN" } reterr := make(map[string]string) for i := 0; i < count; i++ { lst[i].Lang = lang lst[i].UUID = time.Now().UnixMicro() if err := lst[i].Add(rt.Ctx, username); err != nil { reterr[lst[i].Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) } } ginx.NewRender(c).Data(reterr, nil) } func (rt *Router) builtinMetricsGets(c *gin.Context) { collector := ginx.QueryStr(c, "collector", "") typ := ginx.QueryStr(c, "typ", "") query := ginx.QueryStr(c, "query", "") limit := ginx.QueryInt(c, "limit", 20) lang := c.GetHeader("X-Language") unit := ginx.QueryStr(c, "unit", "") if lang == "" { lang = "zh_CN" } bmInDB, err := models.BuiltinMetricGets(rt.Ctx, "", collector, typ, query, unit) ginx.Dangerous(err) bm, total, err := integration.BuiltinPayloadInFile.BuiltinMetricGets(bmInDB, lang, collector, typ, query, unit, limit, ginx.Offset(c, limit)) ginx.Dangerous(err) ginx.NewRender(c).Data(gin.H{ "list": bm, "total": total, }, nil) } func (rt *Router) builtinMetricsPut(c *gin.Context) { var req models.BuiltinMetric ginx.BindJSON(c, &req) bm, err := models.BuiltinMetricGet(rt.Ctx, "id = ?", req.ID) ginx.Dangerous(err) if bm == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such builtin metric") return } username := Username(c) req.UpdatedBy = username 
ginx.NewRender(c).Message(bm.Update(rt.Ctx, req)) } func (rt *Router) builtinMetricsDel(c *gin.Context) { var req idsForm ginx.BindJSON(c, &req) req.Verify() ginx.NewRender(c).Message(models.BuiltinMetricDels(rt.Ctx, req.Ids)) } func (rt *Router) builtinMetricsDefaultTypes(c *gin.Context) { lst := []string{ "Linux", "Procstat", "cAdvisor", "Ping", "MySQL", "ClickHouse", } ginx.NewRender(c).Data(lst, nil) } func (rt *Router) builtinMetricsTypes(c *gin.Context) { collector := ginx.QueryStr(c, "collector", "") query := ginx.QueryStr(c, "query", "") lang := c.GetHeader("X-Language") metricTypeListInDB, err := models.BuiltinMetricTypes(rt.Ctx, lang, collector, query) ginx.Dangerous(err) metricTypeListInFile := integration.BuiltinPayloadInFile.BuiltinMetricTypes(lang, collector, query) typeMap := make(map[string]struct{}) for _, metricType := range metricTypeListInDB { typeMap[metricType] = struct{}{} } for _, metricType := range metricTypeListInFile { typeMap[metricType] = struct{}{} } metricTypeList := make([]string, 0, len(typeMap)) for metricType := range typeMap { metricTypeList = append(metricTypeList, metricType) } sort.Strings(metricTypeList) ginx.NewRender(c).Data(metricTypeList, nil) } func (rt *Router) builtinMetricsCollectors(c *gin.Context) { typ := ginx.QueryStr(c, "typ", "") query := ginx.QueryStr(c, "query", "") lang := c.GetHeader("X-Language") collectorListInDB, err := models.BuiltinMetricCollectors(rt.Ctx, lang, typ, query) ginx.Dangerous(err) collectorListInFile := integration.BuiltinPayloadInFile.BuiltinMetricCollectors(lang, typ, query) collectorMap := make(map[string]struct{}) for _, collector := range collectorListInDB { collectorMap[collector] = struct{}{} } for _, collector := range collectorListInFile { collectorMap[collector] = struct{}{} } collectorList := make([]string, 0, len(collectorMap)) for collector := range collectorMap { collectorList = append(collectorList, collector) } sort.Strings(collectorList) 
ginx.NewRender(c).Data(collectorList, nil) } ================================================ FILE: center/router/router_builtin_payload.go ================================================ package router import ( "encoding/json" "net/http" "strings" "time" "github.com/BurntSushi/toml" "github.com/ccfos/nightingale/v6/center/integration" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) type Board struct { Name string `json:"name"` Tags string `json:"tags"` Configs interface{} `json:"configs"` UUID int64 `json:"uuid"` Note string `json:"note"` } func (rt *Router) builtinPayloadsAdd(c *gin.Context) { var lst []models.BuiltinPayload ginx.BindJSON(c, &lst) username := Username(c) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } reterr := make(map[string]string) for i := 0; i < count; i++ { if lst[i].Type == "alert" { if strings.HasPrefix(strings.TrimSpace(lst[i].Content), "[") { // 处理多个告警规则模板的情况 alertRules := []models.AlertRule{} if err := json.Unmarshal([]byte(lst[i].Content), &alertRules); err != nil { reterr[lst[i].Name] = err.Error() } for _, rule := range alertRules { if rule.UUID == 0 { rule.UUID = time.Now().UnixMicro() } contentBytes, err := json.Marshal(rule) if err != nil { reterr[rule.Name] = err.Error() continue } bp := models.BuiltinPayload{ Type: lst[i].Type, ComponentID: lst[i].ComponentID, Cate: lst[i].Cate, Name: rule.Name, Tags: rule.AppendTags, UUID: rule.UUID, Content: string(contentBytes), CreatedBy: username, UpdatedBy: username, } if err := bp.Add(rt.Ctx, username); err != nil { reterr[bp.Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) } } continue } alertRule := models.AlertRule{} if err := json.Unmarshal([]byte(lst[i].Content), &alertRule); err != nil { reterr[lst[i].Name] = err.Error() continue } if alertRule.UUID == 0 { alertRule.UUID = time.Now().UnixMicro() } contentBytes, err := 
json.Marshal(alertRule) if err != nil { reterr[alertRule.Name] = err.Error() continue } bp := models.BuiltinPayload{ Type: lst[i].Type, ComponentID: lst[i].ComponentID, Cate: lst[i].Cate, Name: alertRule.Name, Tags: alertRule.AppendTags, UUID: alertRule.UUID, Content: string(contentBytes), CreatedBy: username, UpdatedBy: username, } if err := bp.Add(rt.Ctx, username); err != nil { reterr[bp.Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) } } else if lst[i].Type == "dashboard" { if strings.HasPrefix(strings.TrimSpace(lst[i].Content), "[") { // 处理多个告警规则模板的情况 dashboards := []Board{} if err := json.Unmarshal([]byte(lst[i].Content), &dashboards); err != nil { reterr[lst[i].Name] = err.Error() } for _, dashboard := range dashboards { if dashboard.UUID == 0 { dashboard.UUID = time.Now().UnixMicro() } contentBytes, err := json.Marshal(dashboard) if err != nil { reterr[dashboard.Name] = err.Error() continue } bp := models.BuiltinPayload{ Type: lst[i].Type, ComponentID: lst[i].ComponentID, Cate: lst[i].Cate, Name: dashboard.Name, Tags: dashboard.Tags, UUID: dashboard.UUID, Note: dashboard.Note, Content: string(contentBytes), CreatedBy: username, UpdatedBy: username, } if err := bp.Add(rt.Ctx, username); err != nil { reterr[bp.Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) } } continue } dashboard := Board{} if err := json.Unmarshal([]byte(lst[i].Content), &dashboard); err != nil { reterr[lst[i].Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) continue } if dashboard.UUID == 0 { dashboard.UUID = time.Now().UnixMicro() } contentBytes, err := json.Marshal(dashboard) if err != nil { reterr[dashboard.Name] = err.Error() continue } bp := models.BuiltinPayload{ Type: lst[i].Type, ComponentID: lst[i].ComponentID, Cate: lst[i].Cate, Name: dashboard.Name, Tags: dashboard.Tags, UUID: dashboard.UUID, Note: dashboard.Note, Content: string(contentBytes), CreatedBy: username, UpdatedBy: username, } if err := bp.Add(rt.Ctx, username); err != nil { 
reterr[bp.Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) } } else { if lst[i].Type == "collect" { c := make(map[string]interface{}) if _, err := toml.Decode(lst[i].Content, &c); err != nil { reterr[lst[i].Name] = err.Error() continue } } if err := lst[i].Add(rt.Ctx, username); err != nil { reterr[lst[i].Name] = i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) } } } ginx.NewRender(c).Data(reterr, nil) } func (rt *Router) builtinPayloadsGets(c *gin.Context) { typ := ginx.QueryStr(c, "type", "") if typ == "" { ginx.Bomb(http.StatusBadRequest, "type is required") return } ComponentID := ginx.QueryInt64(c, "component_id", 0) cate := ginx.QueryStr(c, "cate", "") query := ginx.QueryStr(c, "query", "") lst, err := models.BuiltinPayloadGets(rt.Ctx, uint64(ComponentID), typ, cate, query) ginx.Dangerous(err) lstInFile, err := integration.BuiltinPayloadInFile.GetBuiltinPayload(typ, cate, query, uint64(ComponentID)) ginx.Dangerous(err) if len(lstInFile) > 0 { lst = append(lst, lstInFile...) 
} ginx.NewRender(c).Data(lst, nil) } func (rt *Router) builtinPayloadcatesGet(c *gin.Context) { typ := ginx.QueryStr(c, "type", "") ComponentID := ginx.QueryInt64(c, "component_id", 0) cates, err := models.BuiltinPayloadCates(rt.Ctx, typ, uint64(ComponentID)) ginx.Dangerous(err) catesInFile, err := integration.BuiltinPayloadInFile.GetBuiltinPayloadCates(typ, uint64(ComponentID)) ginx.Dangerous(err) // 使用 map 进行去重 cateMap := make(map[string]bool) // 添加数据库中的分类 for _, cate := range cates { cateMap[cate] = true } // 添加文件中的分类 for _, cate := range catesInFile { cateMap[cate] = true } // 将去重后的结果转换回切片 result := make([]string, 0, len(cateMap)) for cate := range cateMap { result = append(result, cate) } ginx.NewRender(c).Data(result, nil) } func (rt *Router) builtinPayloadsPut(c *gin.Context) { var req models.BuiltinPayload ginx.BindJSON(c, &req) bp, err := models.BuiltinPayloadGet(rt.Ctx, "id = ?", req.ID) ginx.Dangerous(err) if bp == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such builtin payload") return } if req.Type == "alert" { alertRule := models.AlertRule{} if err := json.Unmarshal([]byte(req.Content), &alertRule); err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } req.Name = alertRule.Name req.Tags = alertRule.AppendTags } else if req.Type == "dashboard" { dashboard := Board{} if err := json.Unmarshal([]byte(req.Content), &dashboard); err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } req.Name = dashboard.Name req.Tags = dashboard.Tags req.Note = dashboard.Note } else if req.Type == "collect" { c := make(map[string]interface{}) if _, err := toml.Decode(req.Content, &c); err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } } username := Username(c) req.UpdatedBy = username ginx.NewRender(c).Message(bp.Update(rt.Ctx, req)) } func (rt *Router) builtinPayloadsDel(c *gin.Context) { var req idsForm ginx.BindJSON(c, &req) req.Verify() ginx.NewRender(c).Message(models.BuiltinPayloadDels(rt.Ctx, req.Ids)) } func (rt *Router) 
builtinPayloadsGetByUUID(c *gin.Context) { uuid := ginx.QueryInt64(c, "uuid") bp, err := models.BuiltinPayloadGet(rt.Ctx, "uuid = ?", uuid) ginx.Dangerous(err) if bp != nil { ginx.NewRender(c).Data(bp, nil) } else { ginx.NewRender(c).Data(integration.BuiltinPayloadInFile.IndexData[uuid], nil) } } ================================================ FILE: center/router/router_busi_group.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) type busiGroupForm struct { Name string `json:"name" binding:"required"` LabelEnable int `json:"label_enable"` LabelValue string `json:"label_value"` Members []models.BusiGroupMember `json:"members"` } func (rt *Router) busiGroupAdd(c *gin.Context) { var f busiGroupForm ginx.BindJSON(c, &f) if len(f.Members) == 0 { ginx.Bomb(http.StatusBadRequest, "members empty") } rwhas := false for i := 0; i < len(f.Members); i++ { if f.Members[i].PermFlag == "rw" { rwhas = true break } } if !rwhas { ginx.Bomb(http.StatusBadRequest, "At least one team have rw permission") } username := c.MustGet("username").(string) ginx.Dangerous(models.BusiGroupAdd(rt.Ctx, f.Name, f.LabelEnable, f.LabelValue, f.Members, username)) // 如果创建成功,拿着name去查,应该可以查到 newbg, err := models.BusiGroupGet(rt.Ctx, "name=?", f.Name) ginx.Dangerous(err) if newbg == nil { ginx.NewRender(c).Message("Failed to create BusiGroup(%s)", f.Name) return } ginx.NewRender(c).Data(newbg.Id, nil) } func (rt *Router) busiGroupPut(c *gin.Context) { var f busiGroupForm ginx.BindJSON(c, &f) username := c.MustGet("username").(string) targetbg := c.MustGet("busi_group").(*models.BusiGroup) ginx.NewRender(c).Message(targetbg.Update(rt.Ctx, f.Name, f.LabelEnable, f.LabelValue, username)) } func (rt *Router) busiGroupMemberAdd(c *gin.Context) { var members []models.BusiGroupMember 
ginx.BindJSON(c, &members) username := c.MustGet("username").(string) targetbg := c.MustGet("busi_group").(*models.BusiGroup) for i := 0; i < len(members); i++ { if members[i].BusiGroupId != targetbg.Id { ginx.Bomb(http.StatusBadRequest, "business group id invalid") } } ginx.NewRender(c).Message(targetbg.AddMembers(rt.Ctx, members, username)) } func (rt *Router) busiGroupMemberDel(c *gin.Context) { var members []models.BusiGroupMember ginx.BindJSON(c, &members) username := c.MustGet("username").(string) targetbg := c.MustGet("busi_group").(*models.BusiGroup) for i := 0; i < len(members); i++ { if members[i].BusiGroupId != targetbg.Id { ginx.Bomb(http.StatusBadRequest, "business group id invalid") } } ginx.NewRender(c).Message(targetbg.DelMembers(rt.Ctx, members, username)) } func (rt *Router) busiGroupDel(c *gin.Context) { username := c.MustGet("username").(string) targetbg := c.MustGet("busi_group").(*models.BusiGroup) err := targetbg.Del(rt.Ctx) if err != nil { logger.Infof("busi_group_delete fail: operator=%s, group_name=%s error=%v", username, targetbg.Name, err) } else { logger.Infof("busi_group_delete succ: operator=%s, group_name=%s", username, targetbg.Name) } ginx.NewRender(c).Message(err) } // 我是超管、或者我是业务组成员 func (rt *Router) busiGroupGets(c *gin.Context) { limit := ginx.QueryInt(c, "limit", defaultLimit) query := ginx.QueryStr(c, "query", "") all := ginx.QueryBool(c, "all", false) me := c.MustGet("user").(*models.User) lst, err := me.BusiGroups(rt.Ctx, limit, query, all) if len(lst) == 0 { lst = []models.BusiGroup{} } if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } func (rt *Router) busiGroupGetsByService(c *gin.Context) { lst, err := models.BusiGroupGetAll(rt.Ctx) ginx.NewRender(c).Data(lst, err) } // 这个接口只有在活跃告警页面才调用,获取各个BG的活跃告警数量 func (rt *Router) busiGroupAlertingsGets(c *gin.Context) { ids := ginx.QueryStr(c, "ids", "") ret, err := models.AlertNumbers(rt.Ctx, strx.IdsInt64ForAPI(ids)) 
ginx.NewRender(c).Data(ret, err) } func (rt *Router) busiGroupGet(c *gin.Context) { bg := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id")) ginx.Dangerous(bg.FillUserGroups(rt.Ctx)) ginx.NewRender(c).Data(bg, nil) } func (rt *Router) busiGroupsGetTags(c *gin.Context) { bgids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") targetIdents, err := models.TargetIndentsGetByBgids(rt.Ctx, bgids) ginx.Dangerous(err) tags, err := models.TargetGetTags(rt.Ctx, targetIdents, true, "busigroup") ginx.Dangerous(err) ginx.NewRender(c).Data(tags, nil) } ================================================ FILE: center/router/router_captcha.go ================================================ package router import ( "context" "time" "github.com/ccfos/nightingale/v6/storage" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" captcha "github.com/mojocn/base64Captcha" "github.com/toolkits/pkg/logger" ) type CaptchaRedisStore struct { redis storage.Redis } func (s *CaptchaRedisStore) Set(id string, value string) error { ctx := context.Background() err := s.redis.Set(ctx, id, value, time.Duration(300*time.Second)).Err() if err != nil { logger.Errorf("captcha id set to redis error : %s", err.Error()) return err } return nil } func (s *CaptchaRedisStore) Get(id string, clear bool) string { ctx := context.Background() val, err := s.redis.Get(ctx, id).Result() if err != nil { logger.Errorf("captcha id get from redis error : %s", err.Error()) return "" } if clear { s.redis.Del(ctx, id) } return val } func (s *CaptchaRedisStore) Verify(id, answer string, clear bool) bool { old := s.Get(id, clear) return old == answer } func (rt *Router) newCaptchaRedisStore() *CaptchaRedisStore { if captchaStore == nil { captchaStore = &CaptchaRedisStore{redis: rt.Redis} } return captchaStore } var captchaStore *CaptchaRedisStore type CaptchaReqBody struct { Id string VerifyValue string } // 生成图形验证码 func (rt *Router) generateCaptcha(c *gin.Context) { var driver = captcha.NewDriverMath(60, 
200, 0, captcha.OptionShowHollowLine, nil, nil, []string{"wqy-microhei.ttc"}) cc := captcha.NewCaptcha(driver, rt.newCaptchaRedisStore()) //data:image/png;base64 id, b64s, _, err := cc.Generate() if err != nil { ginx.NewRender(c).Message(err) return } ginx.NewRender(c).Data(gin.H{ "imgdata": b64s, "captchaid": id, }, nil) } // 验证 func (rt *Router) captchaVerify(c *gin.Context) { var param CaptchaReqBody ginx.BindJSON(c, ¶m) //verify the captcha if captchaStore.Verify(param.Id, param.VerifyValue, true) { ginx.NewRender(c).Message("") return } ginx.NewRender(c).Message("incorrect verification code") } // 验证码开关 func (rt *Router) ifShowCaptcha(c *gin.Context) { if rt.HTTP.ShowCaptcha.Enable { ginx.NewRender(c).Data(gin.H{ "show": true, }, nil) return } ginx.NewRender(c).Data(gin.H{ "show": false, }, nil) } // 验证 func CaptchaVerify(id string, value string) bool { //verify the captcha return captchaStore.Verify(id, value, true) } ================================================ FILE: center/router/router_chart_share.go ================================================ package router import ( "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) chartShareGets(c *gin.Context) { ids := ginx.QueryStr(c, "ids", "") lst, err := models.ChartShareGetsByIds(rt.Ctx, strx.IdsInt64ForAPI(ids, ",")) ginx.NewRender(c).Data(lst, err) } type chartShareForm struct { DatasourceId int64 `json:"datasource_id"` Configs string `json:"configs"` } func (rt *Router) chartShareAdd(c *gin.Context) { username := c.MustGet("username").(string) var forms []chartShareForm ginx.BindJSON(c, &forms) ids := []int64{} now := time.Now().Unix() for _, f := range forms { chart := models.ChartShare{ DatasourceId: f.DatasourceId, Configs: f.Configs, CreateBy: username, CreateAt: now, } ginx.Dangerous(chart.Add(rt.Ctx)) ids = append(ids, chart.Id) } ginx.NewRender(c).Data(ids, nil) } 
================================================ FILE: center/router/router_config.go ================================================ package router import ( "encoding/json" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) notifyChannelsGets(c *gin.Context) { var labelAndKeys []models.LabelAndKey cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCHANNEL) ginx.Dangerous(err) if cval == "" { ginx.NewRender(c).Data(labelAndKeys, nil) return } var notifyChannels []models.NotifyChannel err = json.Unmarshal([]byte(cval), ¬ifyChannels) ginx.Dangerous(err) for _, v := range notifyChannels { if v.Hide { continue } var labelAndKey models.LabelAndKey labelAndKey.Label = v.Name labelAndKey.Key = v.Ident labelAndKeys = append(labelAndKeys, labelAndKey) } ginx.NewRender(c).Data(labelAndKeys, nil) } func (rt *Router) contactKeysGets(c *gin.Context) { var labelAndKeys []models.LabelAndKey cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCONTACT) ginx.Dangerous(err) if cval == "" { ginx.NewRender(c).Data(labelAndKeys, nil) return } var notifyContacts []models.NotifyContact err = json.Unmarshal([]byte(cval), ¬ifyContacts) ginx.Dangerous(err) for _, v := range notifyContacts { if v.Hide { continue } var labelAndKey models.LabelAndKey labelAndKey.Label = v.Name labelAndKey.Key = v.Ident labelAndKeys = append(labelAndKeys, labelAndKey) } ginx.NewRender(c).Data(labelAndKeys, nil) } func (rt *Router) siteInfo(c *gin.Context) { config, err := models.ConfigsGet(rt.Ctx, "site_info") ginx.NewRender(c).Data(config, err) } ================================================ FILE: center/router/router_configs.go ================================================ package router import ( "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) const EMBEDDEDDASHBOARD = "embedded-dashboards" func (rt *Router) configsGet(c *gin.Context) { prefix := 
ginx.QueryStr(c, "prefix", "") limit := ginx.QueryInt(c, "limit", 10) configs, err := models.ConfigsGets(rt.Ctx, prefix, limit, ginx.Offset(c, limit)) if err == nil { models.FillUpdateByNicknames(rt.Ctx, configs) } ginx.NewRender(c).Data(configs, err) } func (rt *Router) configGet(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") configs, err := models.ConfigGet(rt.Ctx, id) ginx.NewRender(c).Data(configs, err) } func (rt *Router) configGetAll(c *gin.Context) { config, err := models.ConfigsGetAll(rt.Ctx) ginx.NewRender(c).Data(config, err) } func (rt *Router) configGetByKey(c *gin.Context) { config, err := models.ConfigsGet(rt.Ctx, ginx.QueryStr(c, "key")) ginx.NewRender(c).Data(config, err) } func (rt *Router) configPutByKey(c *gin.Context) { var f models.Configs ginx.BindJSON(c, &f) username := c.MustGet("username").(string) ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, f.Ckey, f.Cval, username)) } func (rt *Router) embeddedDashboardsGet(c *gin.Context) { config, err := models.ConfigsGet(rt.Ctx, EMBEDDEDDASHBOARD) ginx.NewRender(c).Data(config, err) } func (rt *Router) embeddedDashboardsPut(c *gin.Context) { var f models.Configs ginx.BindJSON(c, &f) username := c.MustGet("username").(string) ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, EMBEDDEDDASHBOARD, f.Cval, username)) } func (rt *Router) configsDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) ginx.NewRender(c).Message(models.ConfigsDel(rt.Ctx, f.Ids)) } func (rt *Router) configsPut(c *gin.Context) { //for APIForService var arr []models.Configs ginx.BindJSON(c, &arr) username := c.GetString("user") if username == "" { username = "default" } now := time.Now().Unix() for i := 0; i < len(arr); i++ { arr[i].UpdateBy = username arr[i].UpdateAt = now ginx.Dangerous(arr[i].Update(rt.Ctx)) } ginx.NewRender(c).Message(nil) } func (rt *Router) configsPost(c *gin.Context) { //for APIForService var arr []models.Configs ginx.BindJSON(c, &arr) username := c.GetString("user") if 
username == "" { username = "default" } now := time.Now().Unix() for i := 0; i < len(arr); i++ { arr[i].CreateBy = username arr[i].UpdateBy = username arr[i].CreateAt = now arr[i].UpdateAt = now ginx.Dangerous(arr[i].Add(rt.Ctx)) } ginx.NewRender(c).Message(nil) } ================================================ FILE: center/router/router_crypto.go ================================================ package router import ( "github.com/ccfos/nightingale/v6/pkg/secu" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) type confPropCrypto struct { Data string `json:"data" binding:"required"` Key string `json:"key" binding:"required"` } func (rt *Router) confPropEncrypt(c *gin.Context) { var f confPropCrypto ginx.BindJSON(c, &f) k := len(f.Key) switch k { default: c.String(400, "The key length should be 16, 24 or 32") return case 16, 24, 32: break } s, err := secu.DealWithEncrypt(f.Data, f.Key) if err != nil { c.String(500, err.Error()) } c.JSON(200, gin.H{ "src": f.Data, "key": f.Key, "encrypt": s, }) } func (rt *Router) confPropDecrypt(c *gin.Context) { var f confPropCrypto ginx.BindJSON(c, &f) k := len(f.Key) switch k { default: c.String(400, "The key length should be 16, 24 or 32") return case 16, 24, 32: break } s, err := secu.DealWithDecrypt(f.Data, f.Key) if err != nil { c.String(500, err.Error()) } c.JSON(200, gin.H{ "src": f.Data, "key": f.Key, "decrypt": s, }) } ================================================ FILE: center/router/router_dash_annotation.go ================================================ package router import ( "fmt" "net/http" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func checkAnnotationPermission(c *gin.Context, ctx *ctx.Context, dashboardId int64) { dashboard, err := models.BoardGetByID(ctx, dashboardId) if err != nil { ginx.Bomb(http.StatusInternalServerError, "failed to get dashboard: %v", err) } if 
dashboard == nil { ginx.Bomb(http.StatusNotFound, "dashboard not found") } bg := BusiGroup(ctx, dashboard.GroupId) me := c.MustGet("user").(*models.User) can, err := me.CanDoBusiGroup(ctx, bg, "rw") ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } } func (rt *Router) dashAnnotationAdd(c *gin.Context) { var f models.DashAnnotation ginx.BindJSON(c, &f) username := c.MustGet("username").(string) now := time.Now().Unix() checkAnnotationPermission(c, rt.Ctx, f.DashboardId) f.CreateBy = username f.CreateAt = now f.UpdateBy = username f.UpdateAt = now ginx.NewRender(c).Data(f.Id, f.Add(rt.Ctx)) } func (rt *Router) dashAnnotationGets(c *gin.Context) { dashboardId := ginx.QueryInt64(c, "dashboard_id") from := ginx.QueryInt64(c, "from") to := ginx.QueryInt64(c, "to") limit := ginx.QueryInt(c, "limit", 100) lst, err := models.DashAnnotationGets(rt.Ctx, dashboardId, from, to, limit) ginx.NewRender(c).Data(lst, err) } func (rt *Router) dashAnnotationPut(c *gin.Context) { var f models.DashAnnotation ginx.BindJSON(c, &f) id := ginx.UrlParamInt64(c, "id") annotation, err := getAnnotationById(rt.Ctx, id) ginx.Dangerous(err) checkAnnotationPermission(c, rt.Ctx, annotation.DashboardId) f.Id = id f.UpdateAt = time.Now().Unix() f.UpdateBy = c.MustGet("username").(string) ginx.NewRender(c).Message(f.Update(rt.Ctx)) } func (rt *Router) dashAnnotationDel(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") annotation, err := getAnnotationById(rt.Ctx, id) ginx.Dangerous(err) checkAnnotationPermission(c, rt.Ctx, annotation.DashboardId) ginx.NewRender(c).Message(models.DashAnnotationDel(rt.Ctx, id)) } // 可以提取获取注释的通用方法 func getAnnotationById(ctx *ctx.Context, id int64) (*models.DashAnnotation, error) { annotation, err := models.DashAnnotationGet(ctx, "id=?", id) if err != nil { return nil, err } if annotation == nil { return nil, fmt.Errorf("annotation not found") } return annotation, nil } ================================================ FILE: 
center/router/router_dashboard.go ================================================ package router type ChartPure struct { Configs string `json:"configs"` Weight int `json:"weight"` } type ChartGroupPure struct { Name string `json:"name"` Weight int `json:"weight"` Charts []ChartPure `json:"charts"` } type DashboardPure struct { Name string `json:"name"` Tags string `json:"tags"` Configs string `json:"configs"` ChartGroups []ChartGroupPure `json:"chart_groups"` } ================================================ FILE: center/router/router_datasource.go ================================================ package router import ( "context" "crypto/tls" "encoding/base64" "encoding/json" "fmt" "io" "net/http" "net/url" "strings" "time" "github.com/ccfos/nightingale/v6/datasource/opensearch" "github.com/ccfos/nightingale/v6/dskit/clickhouse" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" "github.com/toolkits/pkg/logger" ) func (rt *Router) pluginList(c *gin.Context) { Render(c, rt.Center.Plugins, nil) } type listReq struct { Name string `json:"name"` Type string `json:"plugin_type"` Category string `json:"category"` } func (rt *Router) datasourceList(c *gin.Context) { if rt.DatasourceCache.DatasourceCheckHook(c) { Render(c, []int{}, nil) return } var req listReq ginx.BindJSON(c, &req) typ := req.Type category := req.Category name := req.Name user := c.MustGet("user").(*models.User) list, err := models.GetDatasourcesGetsBy(rt.Ctx, typ, category, name, "") Render(c, rt.DatasourceCache.DatasourceFilter(list, user), err) } func (rt *Router) datasourceGetsByService(c *gin.Context) { typ := ginx.QueryStr(c, "typ", "") lst, err := models.GetDatasourcesGetsBy(rt.Ctx, typ, "", "", "") openRsa := rt.Center.RSA.OpenRSA for _, item := range lst { if err := item.Encrypt(openRsa, rt.HTTP.RSA.RSAPublicKey); err != nil { logger.Errorf("datasource %+v encrypt failed: %v", item, err) continue } } 
ginx.NewRender(c).Data(lst, err) } func (rt *Router) datasourceRsaConfigGet(c *gin.Context) { if rt.Center.RSA.OpenRSA { publicKey := "" privateKey := "" if len(rt.HTTP.RSA.RSAPublicKey) > 0 { publicKey = base64.StdEncoding.EncodeToString(rt.HTTP.RSA.RSAPublicKey) } if len(rt.HTTP.RSA.RSAPrivateKey) > 0 { privateKey = base64.StdEncoding.EncodeToString(rt.HTTP.RSA.RSAPrivateKey) } logger.Debugf("OpenRSA=%v", rt.Center.RSA.OpenRSA) ginx.NewRender(c).Data(models.RsaConfig{ OpenRSA: rt.Center.RSA.OpenRSA, RSAPublicKey: publicKey, RSAPrivateKey: privateKey, RSAPassWord: rt.HTTP.RSA.RSAPassWord, }, nil) } else { ginx.NewRender(c).Data(models.RsaConfig{ OpenRSA: rt.Center.RSA.OpenRSA, }, nil) } } func (rt *Router) datasourceBriefs(c *gin.Context) { var dss []*models.Datasource list, err := models.GetDatasourcesGetsBy(rt.Ctx, "", "", "", "") ginx.Dangerous(err) for _, item := range list { item.AuthJson.BasicAuthPassword = "" if item.PluginType == models.PROMETHEUS { for k, v := range item.SettingsJson { if strings.HasPrefix(k, "prometheus.") { item.SettingsJson[strings.TrimPrefix(k, "prometheus.")] = v delete(item.SettingsJson, k) } } } else if item.PluginType == "cloudwatch" { for k := range item.SettingsJson { if !strings.Contains(k, "region") { delete(item.SettingsJson, k) } } } else { item.SettingsJson = nil } dss = append(dss, item) } if !rt.Center.AnonymousAccess.PromQuerier { user := c.MustGet("user").(*models.User) dss = rt.DatasourceCache.DatasourceFilter(dss, user) } ginx.NewRender(c).Data(dss, err) } func (rt *Router) datasourceUpsert(c *gin.Context) { if rt.DatasourceCache.DatasourceCheckHook(c) { Render(c, []int{}, nil) return } var req models.Datasource ginx.BindJSON(c, &req) username := Username(c) req.UpdatedBy = username var err error var count int64 if !req.ForceSave { if req.PluginType == models.PROMETHEUS || req.PluginType == models.LOKI || req.PluginType == models.TDENGINE { err = DatasourceCheck(c, req) if err != nil { Dangerous(c, err) return } } } 
for k, v := range req.SettingsJson { if strings.Contains(k, "cluster_name") { req.ClusterName = v.(string) break } } if req.PluginType == models.OPENSEARCH { b, err := json.Marshal(req.SettingsJson) if err != nil { logger.Warningf("marshal settings fail: %v", err) return } var os opensearch.OpenSearch err = json.Unmarshal(b, &os) if err != nil { logger.Warningf("unmarshal settings fail: %v", err) return } if len(os.Nodes) == 0 { logger.Warningf("nodes empty, %+v", req) return } req.HTTPJson = models.HTTP{ Timeout: os.Timeout, Url: os.Nodes[0], Headers: os.Headers, TLS: models.TLS{ SkipTlsVerify: os.TLS.SkipTlsVerify, }, } req.AuthJson = models.Auth{ BasicAuth: os.Basic.Enable, BasicAuthUser: os.Basic.Username, BasicAuthPassword: os.Basic.Password, } } if req.PluginType == models.CLICKHOUSE { b, err := json.Marshal(req.SettingsJson) if err != nil { logger.Warningf("marshal clickhouse settings failed: %v", err) Dangerous(c, err) return } var ckConfig clickhouse.Clickhouse err = json.Unmarshal(b, &ckConfig) if err != nil { logger.Warningf("unmarshal clickhouse settings failed: %v", err) Dangerous(c, err) return } // 检查ckconfig的nodes不应该以http://或https://开头 for _, addr := range ckConfig.Nodes { if strings.HasPrefix(addr, "http://") || strings.HasPrefix(addr, "https://") { err = fmt.Errorf("clickhouse node address should not start with http:// or https:// : %s", addr) logger.Warningf("clickhouse node address invalid: %v", err) Dangerous(c, err) return } } // InitCli 会自动检测并选择 HTTP 或 Native 协议 err = ckConfig.InitCli() if err != nil { logger.Warningf("clickhouse connection failed: %v", err) Dangerous(c, err) return } // 执行 SHOW DATABASES 测试连通性 _, err = ckConfig.ShowDatabases(context.Background()) if err != nil { logger.Warningf("clickhouse test query failed: %v", err) Dangerous(c, err) return } } if req.PluginType == models.ELASTICSEARCH { skipAuto := false // 若用户输入了version(version字符串存在且不为空),则不自动获取 if req.SettingsJson != nil { if v, ok := req.SettingsJson["version"]; ok { 
switch vv := v.(type) { case string: if strings.TrimSpace(vv) != "" { skipAuto = true } default: if strings.TrimSpace(fmt.Sprint(vv)) != "" { skipAuto = true } } } } if !skipAuto { version, err := getElasticsearchVersion(req, 10*time.Second) if err != nil { logger.Warningf("failed to get elasticsearch version: %v", err) } else { if req.SettingsJson == nil { req.SettingsJson = make(map[string]interface{}) } req.SettingsJson["version"] = version } } } if req.Id == 0 { req.CreatedBy = username req.Status = "enabled" count, err = models.GetDatasourcesCountBy(rt.Ctx, "", "", req.Name) if err != nil { Render(c, nil, err) return } if count > 0 { Render(c, nil, "name already exists") return } err = req.Add(rt.Ctx) } else { err = req.Update(rt.Ctx, "name", "identifier", "description", "cluster_name", "settings", "http", "auth", "updated_by", "updated_at", "is_default", "weight") } Render(c, nil, err) } func DatasourceCheck(c *gin.Context, ds models.Datasource) error { if ds.PluginType == models.PROMETHEUS || ds.PluginType == models.LOKI || ds.PluginType == models.TDENGINE { if ds.HTTPJson.Url == "" { return fmt.Errorf("url is empty") } if !strings.HasPrefix(ds.HTTPJson.Url, "http") { return fmt.Errorf("url must start with http or https") } } // 使用 TLS 配置(支持 mTLS) tlsConfig, err := ds.HTTPJson.TLS.TLSConfig() if err != nil { return fmt.Errorf("failed to create TLS config: %v", err) } client := &http.Client{ Transport: &http.Transport{ TLSClientConfig: tlsConfig, }, } ds.HTTPJson.Url = strings.TrimRight(ds.HTTPJson.Url, "/") var fullURL string req, err := ds.HTTPJson.NewReq(&fullURL) if err != nil { logger.Errorf("Error creating request: %v", err) return fmt.Errorf("request urls:%v failed: %v", ds.HTTPJson.GetUrls(), err) } if ds.PluginType == models.PROMETHEUS { subPath := "/api/v1/query" query := url.Values{} if ds.HTTPJson.IsLoki() { subPath = "/api/v1/labels" } else { query.Add("query", "1+1") } fullURL = fmt.Sprintf("%s%s?%s", ds.HTTPJson.Url, subPath, query.Encode()) 
req, err = http.NewRequest("GET", fullURL, nil) if err != nil { logger.Errorf("Error creating request: %v", err) return fmt.Errorf("request url:%s failed: %v", fullURL, err) } } else if ds.PluginType == models.TDENGINE { fullURL = fmt.Sprintf("%s/rest/sql", ds.HTTPJson.Url) req, err = http.NewRequest("POST", fullURL, strings.NewReader("show databases")) if err != nil { logger.Errorf("Error creating request: %v", err) return fmt.Errorf("request url:%s failed: %v", fullURL, err) } } if ds.PluginType == models.LOKI { subPath := "/api/v1/labels" fullURL = fmt.Sprintf("%s%s", ds.HTTPJson.Url, subPath) req, err = http.NewRequest("GET", fullURL, nil) if err != nil { logger.Errorf("Error creating request: %v", err) if !strings.Contains(ds.HTTPJson.Url, "/loki") { lang := c.GetHeader("X-Language") return fmt.Errorf(i18n.Sprintf(lang, "/loki suffix is miss, please add /loki to the url: %s", ds.HTTPJson.Url+"/loki")) } return fmt.Errorf("request url:%s failed: %v", fullURL, err) } } if ds.AuthJson.BasicAuthUser != "" { req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword) } for k, v := range ds.HTTPJson.Headers { req.Header.Set(k, v) } resp, err := client.Do(req) if err != nil { logger.Errorf("Error making request: %v\n", err) return fmt.Errorf("request url:%s failed: %v", fullURL, err) } defer resp.Body.Close() if resp.StatusCode != 200 { logger.Errorf("Error making request: %v\n", resp.StatusCode) if resp.StatusCode == 404 && ds.PluginType == models.LOKI && !strings.Contains(ds.HTTPJson.Url, "/loki") { lang := c.GetHeader("X-Language") return fmt.Errorf(i18n.Sprintf(lang, "/loki suffix is miss, please add /loki to the url: %s", ds.HTTPJson.Url+"/loki")) } body, _ := io.ReadAll(resp.Body) return fmt.Errorf("request url:%s failed code:%d body:%s", fullURL, resp.StatusCode, string(body)) } return nil } func (rt *Router) datasourceGet(c *gin.Context) { if rt.DatasourceCache.DatasourceCheckHook(c) { Render(c, []int{}, nil) return } var req models.Datasource 
ginx.BindJSON(c, &req) err := req.Get(rt.Ctx) Render(c, req, err) } func (rt *Router) datasourceUpdataStatus(c *gin.Context) { if rt.DatasourceCache.DatasourceCheckHook(c) { Render(c, []int{}, nil) return } var req models.Datasource ginx.BindJSON(c, &req) username := Username(c) req.UpdatedBy = username err := req.Update(rt.Ctx, "status", "updated_by", "updated_at") Render(c, req, err) } func (rt *Router) datasourceDel(c *gin.Context) { if rt.DatasourceCache.DatasourceCheckHook(c) { Render(c, []int{}, nil) return } var ids []int64 ginx.BindJSON(c, &ids) err := models.DatasourceDel(rt.Ctx, ids) Render(c, nil, err) } func (rt *Router) getDatasourceIds(c *gin.Context) { name := ginx.QueryStr(c, "name") datasourceIds, err := models.GetDatasourceIdsByEngineName(rt.Ctx, name) ginx.NewRender(c).Data(datasourceIds, err) } type datasourceQueryForm struct { Cate string `json:"datasource_cate"` DatasourceQueries []models.DatasourceQuery `json:"datasource_queries"` } type datasourceQueryResp struct { ID int64 `json:"id"` Name string `json:"name"` } func (rt *Router) datasourceQuery(c *gin.Context) { var dsf datasourceQueryForm ginx.BindJSON(c, &dsf) datasources, err := models.GetDatasourcesGetsByTypes(rt.Ctx, []string{dsf.Cate}) ginx.Dangerous(err) nameToID := make(map[string]int64) IDToName := make(map[int64]string) for _, ds := range datasources { nameToID[ds.Name] = ds.Id IDToName[ds.Id] = ds.Name } ids := models.GetDatasourceIDsByDatasourceQueries(dsf.DatasourceQueries, IDToName, nameToID) var req []datasourceQueryResp for _, id := range ids { req = append(req, datasourceQueryResp{ ID: id, Name: IDToName[id], }) } ginx.NewRender(c).Data(req, err) } // getElasticsearchVersion 该函数尝试从提供的Elasticsearch数据源中获取版本号,遍历所有URL, // 直到成功获取版本号或所有URL均尝试失败为止。 func getElasticsearchVersion(ds models.Datasource, timeout time.Duration) (string, error) { client := &http.Client{ Timeout: timeout, Transport: &http.Transport{ TLSClientConfig: &tls.Config{ InsecureSkipVerify: 
ds.HTTPJson.TLS.SkipTlsVerify, }, }, } urls := make([]string, 0) if len(ds.HTTPJson.Urls) > 0 { urls = append(urls, ds.HTTPJson.Urls...) } if ds.HTTPJson.Url != "" { urls = append(urls, ds.HTTPJson.Url) } if len(urls) == 0 { return "", fmt.Errorf("no url provided") } var lastErr error for _, raw := range urls { baseURL := strings.TrimRight(raw, "/") + "/" req, err := http.NewRequest("GET", baseURL, nil) if err != nil { lastErr = err continue } if ds.AuthJson.BasicAuthUser != "" { req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword) } for k, v := range ds.HTTPJson.Headers { req.Header.Set(k, v) } resp, err := client.Do(req) if err != nil { lastErr = err continue } body, err := io.ReadAll(resp.Body) resp.Body.Close() if err != nil { lastErr = err continue } if resp.StatusCode != 200 { lastErr = fmt.Errorf("request to %s failed with status: %d body:%s", baseURL, resp.StatusCode, string(body)) continue } var result map[string]interface{} if err := json.Unmarshal(body, &result); err != nil { lastErr = err continue } if version, ok := result["version"].(map[string]interface{}); ok { if number, ok := version["number"].(string); ok && number != "" { return number, nil } } lastErr = fmt.Errorf("version not found in response from %s", baseURL) } if lastErr != nil { return "", lastErr } return "", fmt.Errorf("failed to get elasticsearch version") } ================================================ FILE: center/router/router_datasource_db.go ================================================ package router import ( "context" "github.com/ccfos/nightingale/v6/dscache" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/gin-gonic/gin" ) func (rt *Router) ShowDatabases(c *gin.Context) { var f models.QueryParam ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { 
logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } var databases []string var err error type DatabaseShower interface { ShowDatabases(context.Context) ([]string, error) } switch plug.(type) { case DatabaseShower: databases, err = plug.(DatabaseShower).ShowDatabases(c.Request.Context()) ginx.Dangerous(err) default: ginx.Bomb(200, "datasource not exists") } if len(databases) == 0 { databases = make([]string, 0) } ginx.NewRender(c).Data(databases, nil) } func (rt *Router) ShowTables(c *gin.Context) { var f models.QueryParam ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } // 只接受一个入参 tables := make([]string, 0) var err error type TableShower interface { ShowTables(ctx context.Context, database string) ([]string, error) } switch plug.(type) { case TableShower: if len(f.Queries) > 0 { database, ok := f.Queries[0].(string) if ok { tables, err = plug.(TableShower).ShowTables(c.Request.Context(), database) } } default: ginx.Bomb(200, "datasource not exists") } ginx.NewRender(c).Data(tables, err) } func (rt *Router) DescribeTable(c *gin.Context) { var f models.QueryParam ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } // 只接受一个入参 columns := make([]*types.ColumnProperty, 0) var err error type TableDescriber interface { DescribeTable(context.Context, interface{}) ([]*types.ColumnProperty, error) } switch plug.(type) { case TableDescriber: client := plug.(TableDescriber) if len(f.Queries) > 0 { columns, err = client.DescribeTable(c.Request.Context(), f.Queries[0]) } default: ginx.Bomb(200, "datasource not exists") } ginx.NewRender(c).Data(columns, err) } 
================================================ FILE: center/router/router_embedded.go ================================================ package router import ( "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) embeddedProductGets(c *gin.Context) { products, err := models.EmbeddedProductGets(rt.Ctx) ginx.Dangerous(err) models.FillUpdateByNicknames(rt.Ctx, products) // 获取当前用户可访问的Group ID 列表 me := c.MustGet("user").(*models.User) if me.IsAdmin() { ginx.NewRender(c).Data(products, err) return } gids, err := models.MyGroupIds(rt.Ctx, me.Id) bgSet := make(map[int64]struct{}, len(gids)) for _, id := range gids { bgSet[id] = struct{}{} } // 过滤出公开或有权限访问的私有 product link var result []*models.EmbeddedProduct for _, product := range products { if !product.IsPrivate { result = append(result, product) continue } for _, tid := range product.TeamIDs { if _, ok := bgSet[tid]; ok { result = append(result, product) break } } } ginx.NewRender(c).Data(result, err) } func (rt *Router) embeddedProductGet(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") if id <= 0 { ginx.Bomb(400, "invalid id") } data, err := models.GetEmbeddedProductByID(rt.Ctx, id) ginx.Dangerous(err) me := c.MustGet("user").(*models.User) hashPermission, err := hasEmbeddedProductAccess(rt.Ctx, me, data) ginx.Dangerous(err) if !hashPermission { ginx.Bomb(403, "forbidden") } ginx.NewRender(c).Data(data, nil) } func (rt *Router) embeddedProductAdd(c *gin.Context) { var eps []models.EmbeddedProduct ginx.BindJSON(c, &eps) me := c.MustGet("user").(*models.User) for i := range eps { eps[i].CreateBy = me.Nickname eps[i].UpdateBy = me.Nickname } err := models.AddEmbeddedProduct(rt.Ctx, eps) ginx.NewRender(c).Message(err) } func (rt *Router) embeddedProductPut(c *gin.Context) { var ep models.EmbeddedProduct id := ginx.UrlParamInt64(c, "id") ginx.BindJSON(c, &ep) if id <= 0 { ginx.Bomb(400, "invalid 
id") } oldProduct, err := models.GetEmbeddedProductByID(rt.Ctx, id) ginx.Dangerous(err) me := c.MustGet("user").(*models.User) now := time.Now().Unix() oldProduct.Name = ep.Name oldProduct.URL = ep.URL oldProduct.IsPrivate = ep.IsPrivate oldProduct.TeamIDs = ep.TeamIDs oldProduct.UpdateBy = me.Username oldProduct.UpdateAt = now err = models.UpdateEmbeddedProduct(rt.Ctx, oldProduct) ginx.NewRender(c).Message(err) } func (rt *Router) embeddedProductDelete(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") if id <= 0 { ginx.Bomb(400, "invalid id") } err := models.DeleteEmbeddedProduct(rt.Ctx, id) ginx.NewRender(c).Message(err) } func hasEmbeddedProductAccess(ctx *ctx.Context, user *models.User, ep *models.EmbeddedProduct) (bool, error) { if user.IsAdmin() || !ep.IsPrivate { return true, nil } gids, err := models.MyGroupIds(ctx, user.Id) if err != nil { return false, err } groupSet := make(map[int64]struct{}, len(gids)) for _, gid := range gids { groupSet[gid] = struct{}{} } for _, tid := range ep.TeamIDs { if _, ok := groupSet[tid]; ok { return true, nil } } return false, nil } ================================================ FILE: center/router/router_es.go ================================================ package router import ( "github.com/ccfos/nightingale/v6/datasource/es" "github.com/ccfos/nightingale/v6/dscache" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/gin-gonic/gin" ) type IndexReq struct { Cate string `json:"cate"` DatasourceId int64 `json:"datasource_id"` Index string `json:"index"` } type FieldValueReq struct { Cate string `json:"cate"` DatasourceId int64 `json:"datasource_id"` Index string `json:"index"` Query FieldObj `json:"query"` } type FieldObj struct { Find string `json:"find"` Field string `json:"field"` Query string `json:"query"` } func (rt *Router) QueryIndices(c *gin.Context) { var f IndexReq ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { 
logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } indices, err := plug.(*es.Elasticsearch).QueryIndices() ginx.Dangerous(err) ginx.NewRender(c).Data(indices, nil) } func (rt *Router) QueryFields(c *gin.Context) { var f IndexReq ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } fields, err := plug.(*es.Elasticsearch).QueryFields([]string{f.Index}) ginx.Dangerous(err) ginx.NewRender(c).Data(fields, nil) } func (rt *Router) QueryESVariable(c *gin.Context) { var f FieldValueReq ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logx.Warningf(c.Request.Context(), "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } fields, err := plug.(*es.Elasticsearch).QueryFieldValue([]string{f.Index}, f.Query.Field, f.Query.Query) ginx.Dangerous(err) ginx.NewRender(c).Data(fields, nil) } ================================================ FILE: center/router/router_es_index_pattern.go ================================================ package router import ( "net/http" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) // 创建 ES Index Pattern func (rt *Router) esIndexPatternAdd(c *gin.Context) { var f models.EsIndexPattern ginx.BindJSON(c, &f) username := c.MustGet("username").(string) now := time.Now().Unix() f.CreateAt = now f.CreateBy = username f.UpdateAt = now f.UpdateBy = username err := f.Add(rt.Ctx) ginx.NewRender(c).Message(err) } // 更新 ES Index Pattern func (rt *Router) esIndexPatternPut(c *gin.Context) { var f models.EsIndexPattern ginx.BindJSON(c, &f) id := ginx.QueryInt64(c, "id") esIndexPattern, err := models.EsIndexPatternGetById(rt.Ctx, id) ginx.Dangerous(err) if esIndexPattern == nil { ginx.NewRender(c, 
http.StatusNotFound).Message("No such EsIndexPattern") return } f.UpdateBy = c.MustGet("username").(string) ginx.NewRender(c).Message(esIndexPattern.Update(rt.Ctx, f)) } // 删除 ES Index Pattern func (rt *Router) esIndexPatternDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) if len(f.Ids) == 0 { ginx.Bomb(http.StatusBadRequest, "ids empty") } ginx.NewRender(c).Message(models.EsIndexPatternDel(rt.Ctx, f.Ids)) } // ES Index Pattern列表 func (rt *Router) esIndexPatternGetList(c *gin.Context) { datasourceId := ginx.QueryInt64(c, "datasource_id", 0) var lst []*models.EsIndexPattern var err error if datasourceId != 0 { lst, err = models.EsIndexPatternGets(rt.Ctx, "datasource_id = ?", datasourceId) } else { lst, err = models.EsIndexPatternGets(rt.Ctx, "") } if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } // ES Index Pattern 单个数据 func (rt *Router) esIndexPatternGet(c *gin.Context) { id := ginx.QueryInt64(c, "id") item, err := models.EsIndexPatternGet(rt.Ctx, "id=?", id) ginx.NewRender(c).Data(item, err) } ================================================ FILE: center/router/router_event_detail.go ================================================ package router import ( "encoding/json" "fmt" "io" "net/http" "strconv" "time" "github.com/ccfos/nightingale/v6/alert/naming" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/loggrep" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) // eventDetailPage renders an HTML log viewer page (for pages group). 
func (rt *Router) eventDetailPage(c *gin.Context) { hash := ginx.UrlParamStr(c, "hash") if !loggrep.IsValidHash(hash) { c.String(http.StatusBadRequest, "invalid hash format") return } logs, instance, err := rt.getEventLogs(hash) if err != nil { c.String(http.StatusInternalServerError, "Error: %v", err) return } c.Header("Content-Type", "text/html; charset=utf-8") err = loggrep.RenderHTML(c.Writer, loggrep.PageData{ Hash: hash, Instance: instance, Logs: logs, Total: len(logs), }) if err != nil { c.String(http.StatusInternalServerError, "render error: %v", err) } } // eventDetailJSON returns JSON (for service group). func (rt *Router) eventDetailJSON(c *gin.Context) { hash := ginx.UrlParamStr(c, "hash") if !loggrep.IsValidHash(hash) { ginx.Bomb(200, "invalid hash format") } logs, instance, err := rt.getEventLogs(hash) ginx.Dangerous(err) ginx.NewRender(c).Data(loggrep.EventDetailResp{ Logs: logs, Instance: instance, }, nil) } // getNodeForDatasource returns the alert engine instance responsible for the given // datasource and primary key. It first checks the local hashring, and falls back // to querying the database for active instances if the hashring is empty // (e.g. when the datasource belongs to another engine cluster). func (rt *Router) getNodeForDatasource(datasourceId int64, pk string) (string, error) { dsIdStr := strconv.FormatInt(datasourceId, 10) node, err := naming.DatasourceHashRing.GetNode(dsIdStr, pk) if err == nil { return node, nil } // Hashring is empty for this datasource (likely belongs to another engine cluster). // Query the DB for active instances. servers, dbErr := models.AlertingEngineGetsInstances(rt.Ctx, "datasource_id = ? 
and clock > ?", datasourceId, time.Now().Unix()-30) if dbErr != nil { return "", dbErr } if len(servers) == 0 { return "", fmt.Errorf("no active instances for datasource %d", datasourceId) } ring := naming.NewConsistentHashRing(int32(naming.NodeReplicas), servers) return ring.Get(pk) } // getEventLogs resolves the target instance and retrieves logs. func (rt *Router) getEventLogs(hash string) ([]string, string, error) { event, err := models.AlertHisEventGetByHash(rt.Ctx, hash) if err != nil { return nil, "", err } if event == nil { return nil, "", fmt.Errorf("no such alert event") } ruleId := strconv.FormatInt(event.RuleId, 10) instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port) node, err := rt.getNodeForDatasource(event.DatasourceId, ruleId) if err != nil || node == instance { // hashring not ready or target is self, handle locally logs, err := loggrep.GrepLogDir(rt.LogDir, hash) return logs, instance, err } // forward to the target alert instance return rt.forwardEventDetail(node, hash) } func (rt *Router) forwardEventDetail(node, hash string) ([]string, string, error) { url := fmt.Sprintf("http://%s/v1/n9e/event-detail/%s", node, hash) req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, node, err } for user, pass := range rt.HTTP.APIForService.BasicAuth { req.SetBasicAuth(user, pass) break } client := &http.Client{Timeout: 15 * time.Second} resp, err := client.Do(req) if err != nil { return nil, node, fmt.Errorf("forward to %s failed: %v", node, err) } defer resp.Body.Close() body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) // 10MB limit if err != nil { return nil, node, err } var result struct { Dat loggrep.EventDetailResp `json:"dat"` Err string `json:"err"` } if err := json.Unmarshal(body, &result); err != nil { return nil, node, err } if result.Err != "" { return nil, node, fmt.Errorf("%s", result.Err) } return result.Dat.Logs, result.Dat.Instance, nil } ================================================ 
FILE: center/router/router_event_pipeline.go ================================================ package router import ( "encoding/json" "fmt" "net/http" "time" "github.com/ccfos/nightingale/v6/alert/pipeline/engine" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/google/uuid" "github.com/toolkits/pkg/i18n" "github.com/toolkits/pkg/logger" ) // 获取事件Pipeline列表 func (rt *Router) eventPipelinesList(c *gin.Context) { me := c.MustGet("user").(*models.User) pipelines, err := models.ListEventPipelines(rt.Ctx) ginx.Dangerous(err) allTids := make([]int64, 0) for _, pipeline := range pipelines { allTids = append(allTids, pipeline.TeamIds...) } ugMap, err := models.UserGroupIdAndNameMap(rt.Ctx, allTids) ginx.Dangerous(err) for _, pipeline := range pipelines { for _, tid := range pipeline.TeamIds { pipeline.TeamNames = append(pipeline.TeamNames, ugMap[tid]) } // 兼容处理:自动填充工作流字段 pipeline.FillWorkflowFields() } models.FillUpdateByNicknames(rt.Ctx, pipelines) gids, err := models.MyGroupIdsMap(rt.Ctx, me.Id) ginx.Dangerous(err) if me.IsAdmin() { for _, pipeline := range pipelines { if pipeline.TriggerMode == "" { pipeline.TriggerMode = models.TriggerModeEvent } if pipeline.UseCase == "" { pipeline.UseCase = models.UseCaseEventPipeline } } ginx.NewRender(c).Data(pipelines, nil) return } res := make([]*models.EventPipeline, 0) for _, pipeline := range pipelines { if pipeline.TriggerMode == "" { pipeline.TriggerMode = models.TriggerModeEvent } if pipeline.UseCase == "" { pipeline.UseCase = models.UseCaseEventPipeline } for _, tid := range pipeline.TeamIds { if _, ok := gids[tid]; ok { res = append(res, pipeline) break } } } ginx.NewRender(c).Data(res, nil) } // 获取单个事件Pipeline详情 func (rt *Router) getEventPipeline(c *gin.Context) { me := c.MustGet("user").(*models.User) id := ginx.UrlParamInt64(c, "id") pipeline, err := models.GetEventPipeline(rt.Ctx, id) ginx.Dangerous(err) 
ginx.Dangerous(me.CheckGroupPermission(rt.Ctx, pipeline.TeamIds)) err = pipeline.FillTeamNames(rt.Ctx) ginx.Dangerous(err) // 兼容处理:自动填充工作流字段 pipeline.FillWorkflowFields() if pipeline.TriggerMode == "" { pipeline.TriggerMode = models.TriggerModeEvent } if pipeline.UseCase == "" { pipeline.UseCase = models.UseCaseEventPipeline } ginx.NewRender(c).Data(pipeline, nil) } // 创建事件Pipeline func (rt *Router) addEventPipeline(c *gin.Context) { var pipeline models.EventPipeline ginx.BindJSON(c, &pipeline) user := c.MustGet("user").(*models.User) now := time.Now().Unix() pipeline.CreateBy = user.Username pipeline.CreateAt = now pipeline.UpdateAt = now pipeline.UpdateBy = user.Username err := pipeline.Verify() if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.Dangerous(user.CheckGroupPermission(rt.Ctx, pipeline.TeamIds)) err = models.CreateEventPipeline(rt.Ctx, &pipeline) ginx.NewRender(c).Message(err) } // 更新事件Pipeline func (rt *Router) updateEventPipeline(c *gin.Context) { var f models.EventPipeline ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) f.UpdateBy = me.Username f.UpdateAt = time.Now().Unix() pipeline, err := models.GetEventPipeline(rt.Ctx, f.ID) if err != nil { ginx.Bomb(http.StatusNotFound, "No such event pipeline") } ginx.Dangerous(me.CheckGroupPermission(rt.Ctx, pipeline.TeamIds)) ginx.NewRender(c).Message(pipeline.Update(rt.Ctx, &f)) } // 删除事件Pipeline func (rt *Router) deleteEventPipelines(c *gin.Context) { var f struct { Ids []int64 `json:"ids"` } ginx.BindJSON(c, &f) if len(f.Ids) == 0 { ginx.Bomb(http.StatusBadRequest, "ids required") } me := c.MustGet("user").(*models.User) for _, id := range f.Ids { pipeline, err := models.GetEventPipeline(rt.Ctx, id) ginx.Dangerous(err) ginx.Dangerous(me.CheckGroupPermission(rt.Ctx, pipeline.TeamIds)) } err := models.DeleteEventPipelines(rt.Ctx, f.Ids) ginx.NewRender(c).Message(err) } // 测试事件Pipeline func (rt *Router) tryRunEventPipeline(c *gin.Context) { var f struct { EventId int64 
`json:"event_id"` PipelineConfig models.EventPipeline `json:"pipeline_config"` InputVariables map[string]string `json:"input_variables,omitempty"` } ginx.BindJSON(c, &f) hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId) if err != nil || hisEvent == nil { ginx.Bomb(http.StatusBadRequest, "event not found") } event := hisEvent.ToCur() lang := c.GetHeader("X-Language") me := c.MustGet("user").(*models.User) // 统一使用工作流引擎执行(兼容线性模式和工作流模式) workflowEngine := engine.NewWorkflowEngine(rt.Ctx) triggerCtx := &models.WorkflowTriggerContext{ Mode: models.TriggerModeAPI, TriggerBy: me.Username, InputsOverrides: f.InputVariables, } resultEvent, result, err := workflowEngine.Execute(&f.PipelineConfig, event, triggerCtx) if err != nil { ginx.Bomb(http.StatusBadRequest, "pipeline execute error: %v", err) } m := map[string]interface{}{ "event": resultEvent, "result": i18n.Sprintf(lang, result.Message), "status": result.Status, "node_results": result.NodeResults, } if resultEvent == nil { m["result"] = i18n.Sprintf(lang, "event is dropped") } ginx.NewRender(c).Data(m, nil) } // 测试事件处理器 func (rt *Router) tryRunEventProcessor(c *gin.Context) { var f struct { EventId int64 `json:"event_id"` ProcessorConfig models.ProcessorConfig `json:"processor_config"` } ginx.BindJSON(c, &f) hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId) if err != nil || hisEvent == nil { ginx.Bomb(http.StatusBadRequest, "event not found") } event := hisEvent.ToCur() processor, err := models.GetProcessorByType(f.ProcessorConfig.Typ, f.ProcessorConfig.Config) if err != nil { ginx.Bomb(200, "get processor err: %+v", err) } wfCtx := &models.WorkflowContext{ Event: event, Vars: make(map[string]interface{}), } wfCtx, res, err := processor.Process(rt.Ctx, wfCtx) if err != nil { ginx.Bomb(200, "processor err: %+v", err) } lang := c.GetHeader("X-Language") ginx.NewRender(c).Data(map[string]interface{}{ "event": wfCtx.Event, "result": i18n.Sprintf(lang, res), }, nil) } func (rt *Router) 
tryRunEventProcessorByNotifyRule(c *gin.Context) { var f struct { EventId int64 `json:"event_id"` PipelineConfigs []models.PipelineConfig `json:"pipeline_configs"` } ginx.BindJSON(c, &f) hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId) if err != nil || hisEvent == nil { ginx.Bomb(http.StatusBadRequest, "event not found") } event := hisEvent.ToCur() pids := make([]int64, 0) for _, pc := range f.PipelineConfigs { if pc.Enable { pids = append(pids, pc.PipelineId) } } pipelines, err := models.GetEventPipelinesByIds(rt.Ctx, pids) if err != nil { ginx.Bomb(http.StatusBadRequest, "processors not found") } wfCtx := &models.WorkflowContext{ Event: event, Vars: make(map[string]interface{}), } for _, pl := range pipelines { for _, p := range pl.ProcessorConfigs { processor, err := models.GetProcessorByType(p.Typ, p.Config) if err != nil { ginx.Bomb(http.StatusBadRequest, "get processor: %+v err: %+v", p, err) } wfCtx, _, err = processor.Process(rt.Ctx, wfCtx) if err != nil { ginx.Bomb(http.StatusBadRequest, "processor: %+v err: %+v", p, err) } if wfCtx == nil || wfCtx.Event == nil { lang := c.GetHeader("X-Language") ginx.NewRender(c).Data(map[string]interface{}{ "event": nil, "result": i18n.Sprintf(lang, "event is dropped"), }, nil) return } } } ginx.NewRender(c).Data(wfCtx.Event, nil) } func (rt *Router) eventPipelinesListByService(c *gin.Context) { pipelines, err := models.ListEventPipelines(rt.Ctx) ginx.NewRender(c).Data(pipelines, err) } type EventPipelineRequest struct { // 事件数据(可选,如果不传则使用空事件) Event *models.AlertCurEvent `json:"event,omitempty"` // 输入参数覆盖 InputsOverrides map[string]string `json:"inputs_overrides,omitempty"` Username string `json:"username,omitempty"` } // executePipelineTrigger 执行 Pipeline 触发的公共逻辑 func (rt *Router) executePipelineTrigger(pipeline *models.EventPipeline, req *EventPipelineRequest, triggerBy string) (string, error) { // 准备事件数据 var event *models.AlertCurEvent if req.Event != nil { event = req.Event } else { // 创建空事件 event = 
&models.AlertCurEvent{ TriggerTime: time.Now().Unix(), } } // 生成执行ID executionID := uuid.New().String() // 创建触发上下文 triggerCtx := &models.WorkflowTriggerContext{ Mode: models.TriggerModeAPI, TriggerBy: triggerBy, InputsOverrides: req.InputsOverrides, RequestID: executionID, } // 异步执行工作流 go func() { workflowEngine := engine.NewWorkflowEngine(rt.Ctx) _, _, err := workflowEngine.Execute(pipeline, event, triggerCtx) if err != nil { logger.Errorf("async workflow execute error: pipeline_id=%d execution_id=%s err=%v", pipeline.ID, executionID, err) } }() return executionID, nil } // triggerEventPipelineByService Service 调用触发工作流执行 func (rt *Router) triggerEventPipelineByService(c *gin.Context) { pipelineID := ginx.UrlParamInt64(c, "id") var f EventPipelineRequest ginx.BindJSON(c, &f) // 获取 Pipeline pipeline, err := models.GetEventPipeline(rt.Ctx, pipelineID) if err != nil { ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err) } executionID, err := rt.executePipelineTrigger(pipeline, &f, f.Username) if err != nil { ginx.Bomb(http.StatusBadRequest, "%v", err) } ginx.NewRender(c).Data(gin.H{ "execution_id": executionID, "message": "workflow execution started", }, nil) } // triggerEventPipelineByAPI API 触发工作流执行 func (rt *Router) triggerEventPipelineByAPI(c *gin.Context) { pipelineID := ginx.UrlParamInt64(c, "id") var f EventPipelineRequest ginx.BindJSON(c, &f) // 获取 Pipeline pipeline, err := models.GetEventPipeline(rt.Ctx, pipelineID) if err != nil { ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err) } // 检查权限 me := c.MustGet("user").(*models.User) ginx.Dangerous(me.CheckGroupPermission(rt.Ctx, pipeline.TeamIds)) executionID, err := rt.executePipelineTrigger(pipeline, &f, me.Username) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(gin.H{ "execution_id": executionID, "message": "workflow execution started", }, nil) } func (rt *Router) listAllEventPipelineExecutions(c *gin.Context) { pipelineId := ginx.QueryInt64(c, 
"pipeline_id", 0) pipelineName := ginx.QueryStr(c, "pipeline_name", "") mode := ginx.QueryStr(c, "mode", "") status := ginx.QueryStr(c, "status", "") limit := ginx.QueryInt(c, "limit", 20) offset := ginx.QueryInt(c, "p", 1) if limit <= 0 || limit > 1000 { limit = 20 } if offset <= 0 { offset = 1 } executions, total, err := models.ListAllEventPipelineExecutions(rt.Ctx, pipelineId, pipelineName, mode, status, limit, (offset-1)*limit) ginx.Dangerous(err) ginx.NewRender(c).Data(gin.H{ "list": executions, "total": total, }, nil) } func (rt *Router) listEventPipelineExecutions(c *gin.Context) { pipelineID := ginx.UrlParamInt64(c, "id") mode := ginx.QueryStr(c, "mode", "") status := ginx.QueryStr(c, "status", "") limit := ginx.QueryInt(c, "limit", 20) offset := ginx.QueryInt(c, "p", 1) if limit <= 0 || limit > 1000 { limit = 20 } if offset <= 0 { offset = 1 } executions, total, err := models.ListEventPipelineExecutions(rt.Ctx, pipelineID, mode, status, limit, (offset-1)*limit) ginx.Dangerous(err) ginx.NewRender(c).Data(gin.H{ "list": executions, "total": total, }, nil) } func (rt *Router) getEventPipelineExecution(c *gin.Context) { execID := ginx.UrlParamStr(c, "exec_id") detail, err := models.GetEventPipelineExecutionDetail(rt.Ctx, execID) if err != nil { ginx.Bomb(http.StatusNotFound, "execution not found: %v", err) } ginx.NewRender(c).Data(detail, nil) } func (rt *Router) getEventPipelineExecutionStats(c *gin.Context) { pipelineID := ginx.UrlParamInt64(c, "id") stats, err := models.GetEventPipelineExecutionStatistics(rt.Ctx, pipelineID) ginx.Dangerous(err) ginx.NewRender(c).Data(stats, nil) } func (rt *Router) cleanEventPipelineExecutions(c *gin.Context) { var f struct { BeforeDays int `json:"before_days"` } ginx.BindJSON(c, &f) if f.BeforeDays <= 0 { f.BeforeDays = 30 } beforeTime := time.Now().AddDate(0, 0, -f.BeforeDays).Unix() affected, err := models.DeleteEventPipelineExecutions(rt.Ctx, beforeTime) ginx.Dangerous(err) ginx.NewRender(c).Data(gin.H{ "deleted": 
affected, }, nil) } func (rt *Router) streamEventPipeline(c *gin.Context) { pipelineID := ginx.UrlParamInt64(c, "id") var f EventPipelineRequest ginx.BindJSON(c, &f) pipeline, err := models.GetEventPipeline(rt.Ctx, pipelineID) if err != nil { ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err) } me := c.MustGet("user").(*models.User) ginx.Dangerous(me.CheckGroupPermission(rt.Ctx, pipeline.TeamIds)) var event *models.AlertCurEvent if f.Event != nil { event = f.Event } else { event = &models.AlertCurEvent{ TriggerTime: time.Now().Unix(), } } triggerCtx := &models.WorkflowTriggerContext{ Mode: models.TriggerModeAPI, TriggerBy: me.Username, InputsOverrides: f.InputsOverrides, RequestID: uuid.New().String(), Stream: true, // 流式端点强制启用流式输出 } workflowEngine := engine.NewWorkflowEngine(rt.Ctx) _, result, err := workflowEngine.Execute(pipeline, event, triggerCtx) if err != nil { ginx.Bomb(http.StatusInternalServerError, "execute failed: %v", err) } if result.Stream && result.StreamChan != nil { rt.handleStreamResponse(c, result, triggerCtx.RequestID) return } ginx.NewRender(c).Data(result, nil) } func (rt *Router) handleStreamResponse(c *gin.Context, result *models.WorkflowResult, requestID string) { // 设置 SSE 响应头 c.Header("Content-Type", "text/event-stream") c.Header("Cache-Control", "no-cache") c.Header("Connection", "keep-alive") c.Header("X-Accel-Buffering", "no") // 禁用 nginx 缓冲 c.Header("X-Request-ID", requestID) flusher, ok := c.Writer.(http.Flusher) if !ok { ginx.Bomb(http.StatusInternalServerError, "streaming not supported") return } // 发送初始连接成功消息 initData := fmt.Sprintf(`{"type":"connected","request_id":"%s","timestamp":%d}`, requestID, time.Now().UnixMilli()) fmt.Fprintf(c.Writer, "data: %s\n\n", initData) flusher.Flush() // 从 channel 读取并发送 SSE timeout := time.After(30 * time.Minute) // 最长流式输出时间 for { select { case chunk, ok := <-result.StreamChan: if !ok { // channel 关闭,发送结束标记 return } data, err := json.Marshal(chunk) if err != nil { 
logger.Errorf("stream: failed to marshal chunk: %v", err) continue } fmt.Fprintf(c.Writer, "data: %s\n\n", data) flusher.Flush() if chunk.Done { return } case <-c.Request.Context().Done(): // 客户端断开连接 logger.Infof("stream: client disconnected, request_id=%s", requestID) return case <-timeout: logger.Errorf("stream: timeout, request_id=%s", requestID) return } } } func (rt *Router) streamEventPipelineByService(c *gin.Context) { pipelineID := ginx.UrlParamInt64(c, "id") var f EventPipelineRequest ginx.BindJSON(c, &f) pipeline, err := models.GetEventPipeline(rt.Ctx, pipelineID) if err != nil { ginx.Bomb(http.StatusNotFound, "pipeline not found: %v", err) } var event *models.AlertCurEvent if f.Event != nil { event = f.Event } else { event = &models.AlertCurEvent{ TriggerTime: time.Now().Unix(), } } triggerCtx := &models.WorkflowTriggerContext{ Mode: models.TriggerModeAPI, TriggerBy: f.Username, InputsOverrides: f.InputsOverrides, RequestID: uuid.New().String(), Stream: true, // 流式端点强制启用流式输出 } workflowEngine := engine.NewWorkflowEngine(rt.Ctx) _, result, err := workflowEngine.Execute(pipeline, event, triggerCtx) if err != nil { ginx.Bomb(http.StatusInternalServerError, "execute failed: %v", err) } // 检查是否是流式输出 if result.Stream && result.StreamChan != nil { rt.handleStreamResponse(c, result, triggerCtx.RequestID) return } ginx.NewRender(c).Data(result, nil) } // eventPipelineExecutionAdd 接收 edge 节点同步的 Pipeline 执行记录 func (rt *Router) eventPipelineExecutionAdd(c *gin.Context) { var execution models.EventPipelineExecution ginx.BindJSON(c, &execution) if execution.ID == "" { ginx.Bomb(http.StatusBadRequest, "id is required") } if execution.PipelineID <= 0 { ginx.Bomb(http.StatusBadRequest, "pipeline_id is required") } ginx.NewRender(c).Message(models.DB(rt.Ctx).Create(&execution).Error) } ================================================ FILE: center/router/router_funcs.go ================================================ package router import ( "net/http" "strconv" "strings" 
"github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) const defaultLimit = 300 func (rt *Router) statistic(c *gin.Context) { name := ginx.QueryStr(c, "name") var model interface{} var err error var statistics *models.Statistics switch name { case "alert_mute": model = models.AlertMute{} case "alert_rule": model = models.AlertRule{} case "alert_subscribe": model = models.AlertSubscribe{} case "busi_group": model = models.BusiGroup{} case "recording_rule": model = models.RecordingRule{} case "target": model = models.Target{} case "user": model = models.User{} case "user_group": model = models.UserGroup{} case "notify_rule": model = models.NotifyRule{} case "notify_channel": model = models.NotifyChannel{} case "event_pipeline": statistics, err = models.EventPipelineStatistics(rt.Ctx) ginx.NewRender(c).Data(statistics, err) return case "datasource": // datasource update_at is different from others statistics, err = models.DatasourceStatistics(rt.Ctx) ginx.NewRender(c).Data(statistics, err) return case "user_variable": statistics, err = models.ConfigsUserVariableStatistics(rt.Ctx) ginx.NewRender(c).Data(statistics, err) return case "cval": statistics, err = models.ConfigCvalStatistics(rt.Ctx) ginx.NewRender(c).Data(statistics, err) return case "message_template": statistics, err = models.MessageTemplateStatistics(rt.Ctx) ginx.NewRender(c).Data(statistics, err) return default: ginx.Bomb(http.StatusBadRequest, "invalid name") } statistics, err = models.StatisticsGet(rt.Ctx, model) ginx.NewRender(c).Data(statistics, err) } func queryDatasourceIds(c *gin.Context) []int64 { datasourceIds := ginx.QueryStr(c, "datasource_ids", "") datasourceIds = strings.ReplaceAll(datasourceIds, ",", " ") idsStr := strings.Fields(datasourceIds) ids := make([]int64, len(idsStr)) for i, idStr := range idsStr { id, _ := strconv.ParseInt(idStr, 10, 64) ids[i] = id } return ids } func 
queryStrListField(c *gin.Context, fieldName string, sep ...string) []string { str := ginx.QueryStr(c, fieldName, "") if str == "" { return nil } lst := []string{str} for _, s := range sep { var newLst []string for _, str := range lst { newLst = append(newLst, strings.Split(str, s)...) } lst = newLst } return lst } type idsForm struct { Ids []int64 `json:"ids"` IsSyncToFlashDuty bool `json:"is_sync_to_flashduty"` } func (f idsForm) Verify() { if len(f.Ids) == 0 { ginx.Bomb(http.StatusBadRequest, "ids empty") } } func User(ctx *ctx.Context, id int64) *models.User { obj, err := models.UserGetById(ctx, id) ginx.Dangerous(err) if obj == nil { ginx.Bomb(http.StatusNotFound, "No such user") } return obj } func UserGroup(ctx *ctx.Context, id int64) *models.UserGroup { obj, err := models.UserGroupGetById(ctx, id) ginx.Dangerous(err) if obj == nil { ginx.Bomb(http.StatusNotFound, "No such UserGroup") } bgids, err := models.BusiGroupIds(ctx, []int64{id}) ginx.Dangerous(err) obj.BusiGroups, err = models.BusiGroupGetByIds(ctx, bgids) ginx.Dangerous(err) return obj } func BusiGroup(ctx *ctx.Context, id int64) *models.BusiGroup { obj, err := models.BusiGroupGetById(ctx, id) ginx.Dangerous(err) if obj == nil { ginx.Bomb(http.StatusNotFound, "No such BusiGroup") } return obj } func Dashboard(ctx *ctx.Context, id int64) *models.Dashboard { obj, err := models.DashboardGet(ctx, "id=?", id) ginx.Dangerous(err) if obj == nil { ginx.Bomb(http.StatusNotFound, "No such dashboard") } return obj } type DoneIdsReply struct { Err string `json:"err"` Dat struct { List []int64 `json:"list"` } `json:"dat"` } type TaskCreateReply struct { Err string `json:"err"` Dat int64 `json:"dat"` // task.id } func Username(c *gin.Context) string { username := c.GetString(gin.AuthUserKey) if username == "" { user := c.MustGet("user").(*models.User) username = user.Username } return username } func HasPermission(ctx *ctx.Context, c *gin.Context, sourceType, sourceId string, isAnonymousAccess bool) bool { if 
sourceType == "event" && isAnonymousAccess { return true } // 尝试从请求中获取 __token 参数 token := ginx.QueryStr(c, "__token", "") // 如果有 __token 参数,验证其合法性 if token != "" { return ValidateSourceToken(ctx, sourceType, sourceId, token) } return false } func ValidateSourceToken(ctx *ctx.Context, sourceType, sourceId, token string) bool { if token == "" { return false } // 根据源类型、源ID和令牌获取源令牌记录 sourceToken, err := models.GetSourceTokenBySource(ctx, sourceType, sourceId, token) if err != nil { return false } // 检查令牌是否过期 if sourceToken.IsExpired() { return false } return true } ================================================ FILE: center/router/router_heartbeat.go ================================================ package router import ( "compress/gzip" "encoding/json" "errors" "io/ioutil" "sort" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/center/metas" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pushgw/idents" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) type HeartbeatHookFunc func(ident string) map[string]interface{} func (rt *Router) heartbeat(c *gin.Context) { req, err := HandleHeartbeat(c, rt.Ctx, rt.Alert.Heartbeat.EngineName, rt.MetaSet, rt.IdentSet, rt.TargetCache) ginx.Dangerous(err) m := rt.HeartbeatHook(req.Hostname) ginx.NewRender(c).Data(m, err) } func HandleHeartbeat(c *gin.Context, ctx *ctx.Context, engineName string, metaSet *metas.Set, identSet *idents.Set, targetCache *memsto.TargetCacheType) (models.HostMeta, error) { var bs []byte var err error var r *gzip.Reader var req models.HostMeta if c.GetHeader("Content-Encoding") == "gzip" { r, err = gzip.NewReader(c.Request.Body) if err != nil { c.String(400, err.Error()) return req, err } defer r.Close() bs, err = ioutil.ReadAll(r) ginx.Dangerous(err) } else { defer c.Request.Body.Close() bs, err = ioutil.ReadAll(c.Request.Body) if 
err != nil { return req, err } } err = json.Unmarshal(bs, &req) if err != nil { return req, err } if req.Hostname == "" { return req, errors.New("hostname is required") } // maybe from pushgw if req.Offset == 0 { req.Offset = (time.Now().UnixMilli() - req.UnixTime) } if req.RemoteAddr == "" { req.RemoteAddr = c.ClientIP() } if req.EngineName == "" { req.EngineName = engineName } metaSet.Set(req.Hostname, req) var items = make(map[string]struct{}) items[req.Hostname] = struct{}{} identSet.MSet(items) if target, has := targetCache.Get(req.Hostname); has && target != nil { gidsStr := ginx.QueryStr(c, "gid", "") overwriteGids := ginx.QueryBool(c, "overwrite_gids", false) hostIp := strings.TrimSpace(req.HostIp) gids := strings.Split(gidsStr, ",") if overwriteGids { groupIds := make([]int64, 0) for i := range gids { if gids[i] == "" { continue } groupId, err := strconv.ParseInt(gids[i], 10, 64) if err != nil { logger.Warningf("update target:%s group ids failed, err: %v", req.Hostname, err) continue } groupIds = append(groupIds, groupId) } err := models.TargetOverrideBgids(ctx, []string{target.Ident}, groupIds, nil) if err != nil { logger.Warningf("update target:%s group ids failed, err: %v", target.Ident, err) } } else if gidsStr != "" { for i := range gids { groupId, err := strconv.ParseInt(gids[i], 10, 64) if err != nil { logger.Warningf("update target:%s group ids failed, err: %v", req.Hostname, err) continue } if !target.MatchGroupId(groupId) { err := models.TargetBindBgids(ctx, []string{target.Ident}, []int64{groupId}, nil) if err != nil { logger.Warningf("update target:%s group ids failed, err: %v", target.Ident, err) } } } } newTarget := models.Target{} targetNeedUpdate := false if hostIp != "" && hostIp != target.HostIp { newTarget.HostIp = hostIp targetNeedUpdate = true } hostTagsMap := target.GetHostTagsMap() hostTagNeedUpdate := false if len(hostTagsMap) != len(req.GlobalLabels) { hostTagNeedUpdate = true } else { for k, v := range req.GlobalLabels { if v == 
"" { continue } if tagv, ok := hostTagsMap[k]; !ok || tagv != v { hostTagNeedUpdate = true break } } } if hostTagNeedUpdate { lst := []string{} for k, v := range req.GlobalLabels { lst = append(lst, k+"="+v) } sort.Strings(lst) newTarget.HostTags = lst targetNeedUpdate = true } userTagsMap := target.GetTagsMap() userTagNeedUpdate := false userTags := []string{} for k, v := range userTagsMap { if v == "" { continue } if _, ok := req.GlobalLabels[k]; !ok { userTags = append(userTags, k+"="+v) } else { // 该key在hostTags中已经存在 userTagNeedUpdate = true } } if userTagNeedUpdate { newTarget.Tags = strings.Join(userTags, " ") + " " targetNeedUpdate = true } if req.EngineName != "" && req.EngineName != target.EngineName { newTarget.EngineName = req.EngineName targetNeedUpdate = true } if req.AgentVersion != "" && req.AgentVersion != target.AgentVersion { newTarget.AgentVersion = req.AgentVersion targetNeedUpdate = true } if req.OS != "" && req.OS != target.OS { newTarget.OS = req.OS targetNeedUpdate = true } if targetNeedUpdate { newTarget.UpdateAt = time.Now().Unix() err := models.DB(ctx).Model(&target).Updates(newTarget).Error if err != nil { logger.Errorf("update target fields failed, err: %v", err) } } logger.Debugf("heartbeat field:%+v target: %v", newTarget, *target) } return req, nil } ================================================ FILE: center/router/router_login.go ================================================ package router import ( "encoding/base64" "encoding/json" "fmt" "net/http" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/cas" "github.com/ccfos/nightingale/v6/pkg/dingtalk" "github.com/ccfos/nightingale/v6/pkg/feishu" "github.com/ccfos/nightingale/v6/pkg/ldapx" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/ccfos/nightingale/v6/pkg/oauth2x" "github.com/ccfos/nightingale/v6/pkg/oidcx" "github.com/ccfos/nightingale/v6/pkg/secu" "github.com/ccfos/nightingale/v6/pkg/ginx" 
"github.com/dgrijalva/jwt-go" "github.com/gin-gonic/gin" "github.com/pelletier/go-toml/v2" "github.com/pkg/errors" "gorm.io/gorm" ) type loginForm struct { Username string `json:"username" binding:"required"` Password string `json:"password" binding:"required"` Captchaid string `json:"captchaid"` Verifyvalue string `json:"verifyvalue"` } func (rt *Router) loginPost(c *gin.Context) { var f loginForm ginx.BindJSON(c, &f) rctx := c.Request.Context() logx.Infof(rctx, "username:%s login from:%s", f.Username, c.ClientIP()) if rt.HTTP.ShowCaptcha.Enable { if !CaptchaVerify(f.Captchaid, f.Verifyvalue) { ginx.NewRender(c).Message("incorrect verification code") return } } authPassWord := f.Password // need decode if rt.HTTP.RSA.OpenRSA { decPassWord, err := secu.Decrypt(f.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord) if err != nil { logx.Errorf(rctx, "RSA Decrypt failed: %v username: %s", err, f.Username) ginx.NewRender(c).Message(err) return } authPassWord = decPassWord } reqCtx := rt.Ctx.WithContext(rctx) var user *models.User var err error lc := rt.Sso.LDAP.Copy() if lc.Enable { user, err = ldapx.LdapLogin(reqCtx, f.Username, authPassWord, lc.DefaultRoles, lc.DefaultTeams, lc) if err != nil { logx.Debugf(rctx, "ldap login failed: %v username: %s", err, f.Username) var errLoginInN9e error // to use n9e as the minimum guarantee for login if user, errLoginInN9e = models.PassLogin(reqCtx, rt.Redis, f.Username, authPassWord); errLoginInN9e != nil { ginx.NewRender(c).Message("ldap login failed: %v; n9e login failed: %v", err, errLoginInN9e) return } } else { user.RolesLst = strings.Fields(user.Roles) } } else { user, err = models.PassLogin(reqCtx, rt.Redis, f.Username, authPassWord) ginx.Dangerous(err) } if user == nil { // Theoretically impossible ginx.NewRender(c).Message("Username or password invalid") return } userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username) ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity) ginx.Dangerous(err) 
ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts))

	ginx.NewRender(c).Data(gin.H{
		"user":          user,
		"access_token":  ts.AccessToken,
		"refresh_token": ts.RefreshToken,
	}, nil)
}

// logoutPost deletes the caller's tokens and stored id_token, then returns
// the SSO logout address matching the user's origin (oidc, cas or oauth2), or
// an empty string for any other origin.
func (rt *Router) logoutPost(c *gin.Context) {
	rctx := c.Request.Context()
	logx.Infof(rctx, "username:%s logout from:%s", c.GetString("username"), c.ClientIP())
	metadata, err := rt.extractTokenMetadata(c.Request)
	if err != nil {
		ginx.NewRender(c, http.StatusBadRequest).Message("failed to parse jwt token")
		return
	}

	delErr := rt.deleteTokens(c.Request.Context(), metadata)
	if delErr != nil {
		ginx.NewRender(c).Message(http.StatusText(http.StatusInternalServerError))
		return
	}

	var logoutAddr string
	user := c.MustGet("user").(*models.User)

	// Fetch the user's id_token.
	idToken, err := rt.fetchIdToken(c.Request.Context(), user.Id)
	if err != nil {
		logx.Debugf(rctx, "fetch id_token failed: %v, user_id: %d", err, user.Id)
		idToken = "" // fall back to an empty string when the fetch fails
	}

	// Delete the id_token.
	rt.deleteIdToken(c.Request.Context(), user.Id)

	switch user.Belong {
	case "oidc":
		logoutAddr = rt.Sso.OIDC.GetSsoLogoutAddr(idToken)
	case "cas":
		logoutAddr = rt.Sso.CAS.GetSsoLogoutAddr()
	case "oauth2":
		logoutAddr = rt.Sso.OAuth2.GetSsoLogoutAddr()
	}

	ginx.NewRender(c).Data(logoutAddr, nil)
}

// refreshForm is the request body of the token refresh endpoint.
type refreshForm struct {
	RefreshToken string `json:"refresh_token" binding:"required"`
}

// refreshPost exchanges a valid refresh token for a new access/refresh token
// pair. The token must be HMAC-signed with the configured key, still be known
// to the auth store, and belong to an existing user. The old token pair is
// deleted, and the TTL of any stored id_token is renewed.
func (rt *Router) refreshPost(c *gin.Context) {
	var f refreshForm
	ginx.BindJSON(c, &f)

	// verify the token
	token, err := jwt.Parse(f.RefreshToken, func(token *jwt.Token) (interface{}, error) {
		if _, ok := token.Method.(*jwt.SigningMethodHMAC); !ok {
			return nil, fmt.Errorf("unexpected jwt signing method: %v", token.Header["alg"])
		}
		return []byte(rt.HTTP.JWTAuth.SigningKey), nil
	})

	// if there is an error, the token must have expired
	if err != nil {
		// redirect to login page
		ginx.NewRender(c, http.StatusUnauthorized).Message("refresh token expired")
		return
	}

	// Since token is valid, get the uuid:
	claims, ok := token.Claims.(jwt.MapClaims) // the token claims should conform to MapClaims
	if ok && token.Valid {
		refreshUuid, ok := claims["refresh_uuid"].(string) // convert the interface to string
		if !ok {
			// Theoretically impossible
			ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse refresh_uuid from jwt")
			return
		}

		// Check that this token is still present in redis.
		val, err := rt.fetchAuth(c.Request.Context(), refreshUuid)
		if err != nil || val == "" {
			ginx.NewRender(c, http.StatusUnauthorized).Message("refresh token expired")
			return
		}

		userIdentity, ok := claims["user_identity"].(string)
		if !ok {
			// Theoretically impossible
			ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse user_identity from jwt")
			return
		}

		// user_identity has the form "<id>-<username>"; the ID is the part
		// before the first dash.
		userid, err := strconv.ParseInt(strings.Split(userIdentity, "-")[0], 10, 64)
		if err != nil {
			ginx.NewRender(c, http.StatusUnauthorized).Message("failed to parse user_identity from jwt")
			return
		}

		u, err := models.UserGetById(rt.Ctx, userid)
		if err != nil {
			ginx.NewRender(c, http.StatusInternalServerError).Message("failed to query user by id")
			return
		}

		if u == nil {
			// user already deleted
			ginx.NewRender(c, http.StatusUnauthorized).Message("user already deleted")
			return
		}

		// Delete the previous Refresh Token
		err = rt.deleteAuth(c.Request.Context(), refreshUuid)
		if err != nil {
			// NOTE(review): this responds 401 with the text "Internal Server
			// Error" — confirm whether the status mismatch is intentional.
			ginx.NewRender(c, http.StatusUnauthorized).Message(http.StatusText(http.StatusInternalServerError))
			return
		}

		// Delete previous Access Token
		rt.deleteAuth(c.Request.Context(), strings.Split(refreshUuid, "++")[0])

		// Create new pairs of refresh and access tokens
		ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
		ginx.Dangerous(err)
		ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts))

		// Extend the id_token's expiry so it matches the lifetime of the new
		// refresh token. Note: no new id_token is obtained here; saving the
		// existing one again only renews its TTL in Redis.
		if idToken, err := rt.fetchIdToken(c.Request.Context(), userid); err == nil && idToken != "" {
			if err := rt.saveIdToken(c.Request.Context(), userid, idToken); err != nil {
				logx.Debugf(c.Request.Context(), "refresh id_token ttl failed: %v, user_id: %d", err, userid)
			}
		}

		ginx.NewRender(c).Data(gin.H{
			"access_token":  ts.AccessToken,
			"refresh_token": ts.RefreshToken,
		}, nil)
	} else {
		// redirect to login page
		ginx.NewRender(c, http.StatusUnauthorized).Message("refresh token expired")
	}
}

// loginRedirect returns the URL the client should go to for OIDC login: the
// requested "redirect" when the caller is already logged in, an empty string
// when OIDC is disabled, otherwise the OIDC authorize URL.
func (rt *Router) loginRedirect(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	v, exists := c.Get("userid")
	if exists {
		// NOTE(review): unchecked assertion — panics if "userid" is ever
		// stored as something other than int64.
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	if !rt.Sso.OIDC.Enable {
		ginx.NewRender(c).Data("", nil)
		return
	}

	redirect, err := rt.Sso.OIDC.Authorize(rt.Redis, redirect)
	ginx.Dangerous(err)

	ginx.NewRender(c).Data(redirect, err)
}

// CallbackOutput is the payload returned by the SSO callback endpoints.
type CallbackOutput struct {
	Redirect     string       `json:"redirect"`
	User         *models.User `json:"user"`
	AccessToken  string       `json:"access_token"`
	RefreshToken string       `json:"refresh_token"`
}

// loginCallback completes an OIDC login. It exchanges code/state for the
// user's profile, then either updates the existing user (when CoverAttributes
// is set) or creates the user and adds it to the default teams. It issues
// tokens and stores the id_token for use at logout.
func (rt *Router) loginCallback(c *gin.Context) {
	rctx := c.Request.Context()
	code := ginx.QueryStr(c, "code", "")
	state := ginx.QueryStr(c, "state", "")

	ret, err := rt.Sso.OIDC.Callback(rt.Redis, rctx, code, state)
	if err != nil {
		logx.Errorf(rctx, "sso_callback fail. code:%s, state:%s, get ret: %+v. error: %v", code, state, ret, err)
		ginx.NewRender(c).Data(CallbackOutput{}, err)
		return
	}

	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	ginx.Dangerous(err)

	if user != nil {
		if rt.Sso.OIDC.CoverAttributes {
			updatedFields := user.UpdateSsoFields("oidc", ret.Nickname, ret.Phone, ret.Email)
			ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
		}
	} else {
		user = new(models.User)
		user.FullSsoFields("oidc", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.OIDC.DefaultRoles)
		// create user from oidc
		ginx.Dangerous(user.Add(rt.Ctx))

		if len(rt.Sso.OIDC.DefaultTeams) > 0 {
			for _, gid := range rt.Sso.OIDC.DefaultTeams {
				// A failed membership add is logged; login still proceeds.
				err = models.UserGroupMemberAdd(rt.Ctx, gid, user.Id)
				if err != nil {
					logx.Errorf(rctx, "user:%v UserGroupMemberAdd: %s", user, err)
				}
			}
		}
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	// Save the id_token to Redis for use at logout.
	if ret.IdToken != "" {
		if err := rt.saveIdToken(rctx, user.Id, ret.IdToken); err != nil {
			logx.Errorf(rctx, "save id_token failed: %v, user_id: %d", err, user.Id)
		}
	}

	// Never send the user back to the login page after a successful login.
	redirect := "/"
	if ret.Redirect != "/login" {
		redirect = ret.Redirect
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}

// RedirectOutput is the payload returned by the CAS redirect endpoint.
type RedirectOutput struct {
	Redirect string `json:"redirect"`
	State    string `json:"state"`
}

// loginRedirectCas returns the CAS authorize URL and state: the requested
// "redirect" when the caller is already logged in, or an empty string when
// CAS is disabled.
func (rt *Router) loginRedirectCas(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	v, exists := c.Get("userid")
	if exists {
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	if !rt.Sso.CAS.Enable {
		logx.Errorf(c.Request.Context(), "cas is not enable")
ginx.NewRender(c).Data("", nil)
		return
	}

	redirect, state, err := rt.Sso.CAS.Authorize(rt.Redis, redirect)

	ginx.Dangerous(err)
	ginx.NewRender(c).Data(RedirectOutput{
		Redirect: redirect,
		State:    state,
	}, err)
}

// loginCallbackCas completes a CAS login. It validates the service ticket,
// then either updates the existing user (when CoverAttributes is set) or
// creates the user with the default roles, and issues a token pair.
func (rt *Router) loginCallbackCas(c *gin.Context) {
	rctx := c.Request.Context()
	ticket := ginx.QueryStr(c, "ticket", "")
	state := ginx.QueryStr(c, "state", "")
	ret, err := rt.Sso.CAS.ValidateServiceTicket(rctx, ticket, state, rt.Redis)
	if err != nil {
		logx.Errorf(rctx, "ValidateServiceTicket: %s", err)
		ginx.NewRender(c).Data("", err)
		return
	}
	user, err := models.UserGet(rt.Ctx, "username=?", ret.Username)
	if err != nil {
		logx.Errorf(rctx, "UserGet: %s", err)
	}
	ginx.Dangerous(err)
	if user != nil {
		if rt.Sso.CAS.CoverAttributes {
			updatedFields := user.UpdateSsoFields("cas", ret.Nickname, ret.Phone, ret.Email)
			ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
		}
	} else {
		user = new(models.User)
		user.FullSsoFields("cas", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.CAS.DefaultRoles)
		// create user from cas
		ginx.Dangerous(user.Add(rt.Ctx))
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	if err != nil {
		logx.Errorf(rctx, "createTokens: %s", err)
	}
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(rctx, userIdentity, ts))

	// Never send the user back to the login page after a successful login.
	redirect := "/"
	if ret.Redirect != "/login" {
		redirect = ret.Redirect
	}
	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}

// loginRedirectOAuth returns the OAuth2 authorize URL: the requested
// "redirect" when the caller is already logged in, or an empty string when
// OAuth2 is disabled.
func (rt *Router) loginRedirectOAuth(c *gin.Context) {
	redirect := ginx.QueryStr(c, "redirect", "/")

	v, exists := c.Get("userid")
	if exists {
		userid := v.(int64)
		user, err := models.UserGetById(rt.Ctx, userid)
		ginx.Dangerous(err)
		if user == nil {
			ginx.Bomb(200, "user not found")
		}

		if user.Username != "" { // already login
			ginx.NewRender(c).Data(redirect, nil)
			return
		}
	}

	if !rt.Sso.OAuth2.Enable {
ginx.NewRender(c).Data("", nil) return } redirect, err := rt.Sso.OAuth2.Authorize(rt.Redis, redirect) ginx.Dangerous(err) ginx.NewRender(c).Data(redirect, err) } func (rt *Router) loginRedirectDingTalk(c *gin.Context) { redirect := ginx.QueryStr(c, "redirect", "/") v, exists := c.Get("userid") if exists { userid := v.(int64) user, err := models.UserGetById(rt.Ctx, userid) ginx.Dangerous(err) if user == nil { ginx.Bomb(200, "user not found") } if user.Username != "" { // already login ginx.NewRender(c).Data(redirect, nil) return } } if !rt.Sso.DingTalk.Enable { ginx.NewRender(c).Data("", nil) return } redirect, err := rt.Sso.DingTalk.Authorize(rt.Redis, redirect) ginx.Dangerous(err) ginx.NewRender(c).Data(redirect, err) } func (rt *Router) loginCallbackDingTalk(c *gin.Context) { rctx := c.Request.Context() code := ginx.QueryStr(c, "code", "") state := ginx.QueryStr(c, "state", "") ret, err := rt.Sso.DingTalk.Callback(rt.Redis, rctx, code, state) if err != nil { logx.Errorf(rctx, "sso_callback DingTalk fail. code:%s, state:%s, get ret: %+v. 
error: %v", code, state, ret, err) ginx.NewRender(c).Data(CallbackOutput{}, err) return } user, err := models.UserGet(rt.Ctx, "username=?", ret.Username) ginx.Dangerous(err) if user != nil { if rt.Sso.DingTalk.DingTalkConfig.CoverAttributes { updatedFields := user.UpdateSsoFields(dingtalk.SsoTypeName, ret.Nickname, ret.Phone, ret.Email) ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...)) } } else { user = new(models.User) user.FullSsoFields(dingtalk.SsoTypeName, ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.DingTalk.DingTalkConfig.DefaultRoles) // create user from dingtalk ginx.Dangerous(user.Add(rt.Ctx)) } // set user login state userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username) ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity) ginx.Dangerous(err) ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts)) redirect := "/" if ret.Redirect != "/login" { redirect = ret.Redirect } ginx.NewRender(c).Data(CallbackOutput{ Redirect: redirect, User: user, AccessToken: ts.AccessToken, RefreshToken: ts.RefreshToken, }, nil) } func (rt *Router) loginRedirectFeiShu(c *gin.Context) { redirect := ginx.QueryStr(c, "redirect", "/") v, exists := c.Get("userid") if exists { userid := v.(int64) user, err := models.UserGetById(rt.Ctx, userid) ginx.Dangerous(err) if user == nil { ginx.Bomb(200, "user not found") } if user.Username != "" { // already login ginx.NewRender(c).Data(redirect, nil) return } } if rt.Sso.FeiShu == nil || !rt.Sso.FeiShu.Enable { ginx.NewRender(c).Data("", nil) return } redirect, err := rt.Sso.FeiShu.Authorize(rt.Redis, redirect) ginx.Dangerous(err) ginx.NewRender(c).Data(redirect, err) } func (rt *Router) loginCallbackFeiShu(c *gin.Context) { rctx := c.Request.Context() code := ginx.QueryStr(c, "code", "") state := ginx.QueryStr(c, "state", "") ret, err := rt.Sso.FeiShu.Callback(rt.Redis, rctx, code, state) if err != nil { logx.Errorf(rctx, "sso_callback FeiShu fail. 
code:%s, state:%s, get ret: %+v. error: %v", code, state, ret, err) ginx.NewRender(c).Data(CallbackOutput{}, err) return } user, err := models.UserGet(rt.Ctx, "username=?", ret.Username) ginx.Dangerous(err) if user != nil { if rt.Sso.FeiShu != nil && rt.Sso.FeiShu.FeiShuConfig != nil && rt.Sso.FeiShu.FeiShuConfig.CoverAttributes { updatedFields := user.UpdateSsoFields(feishu.SsoTypeName, ret.Nickname, ret.Phone, ret.Email) ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...)) } } else { user = new(models.User) defaultRoles := []string{} defaultUserGroups := []int64{} if rt.Sso.FeiShu != nil && rt.Sso.FeiShu.FeiShuConfig != nil { defaultRoles = rt.Sso.FeiShu.FeiShuConfig.DefaultRoles defaultUserGroups = rt.Sso.FeiShu.FeiShuConfig.DefaultUserGroups } user.FullSsoFields(feishu.SsoTypeName, ret.Username, ret.Nickname, ret.Phone, ret.Email, defaultRoles) ginx.Dangerous(user.Add(rt.Ctx)) if len(defaultUserGroups) > 0 { err = user.AddToUserGroups(rt.Ctx, defaultUserGroups) if err != nil { logx.Errorf(rctx, "sso feishu add user group error %v %v", ret, err) } } } // set user login state userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username) ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity) ginx.Dangerous(err) ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts)) redirect := "/" if ret.Redirect != "/login" { redirect = ret.Redirect } ginx.NewRender(c).Data(CallbackOutput{ Redirect: redirect, User: user, AccessToken: ts.AccessToken, RefreshToken: ts.RefreshToken, }, nil) } func (rt *Router) loginCallbackOAuth(c *gin.Context) { rctx := c.Request.Context() code := ginx.QueryStr(c, "code", "") state := ginx.QueryStr(c, "state", "") ret, err := rt.Sso.OAuth2.Callback(rt.Redis, rctx, code, state) if err != nil { logx.Debugf(rctx, "sso.callback() get ret %+v error %v", ret, err) ginx.NewRender(c).Data(CallbackOutput{}, err) return } user, err := models.UserGet(rt.Ctx, "username=?", ret.Username) ginx.Dangerous(err) if user != 
nil {
		if rt.Sso.OAuth2.CoverAttributes {
			updatedFields := user.UpdateSsoFields("oauth2", ret.Nickname, ret.Phone, ret.Email)
			ginx.Dangerous(user.Update(rt.Ctx, "update_at", updatedFields...))
		}
	} else {
		user = new(models.User)
		user.FullSsoFields("oauth2", ret.Username, ret.Nickname, ret.Phone, ret.Email, rt.Sso.OAuth2.DefaultRoles)
		// create user from oauth2
		ginx.Dangerous(user.Add(rt.Ctx))
	}

	// set user login state
	userIdentity := fmt.Sprintf("%d-%s", user.Id, user.Username)
	ts, err := rt.createTokens(rt.HTTP.JWTAuth.SigningKey, userIdentity)
	ginx.Dangerous(err)
	ginx.Dangerous(rt.createAuth(c.Request.Context(), userIdentity, ts))

	// Never send the user back to the login page after a successful login.
	redirect := "/"
	if ret.Redirect != "/login" {
		redirect = ret.Redirect
	}

	ginx.NewRender(c).Data(CallbackOutput{
		Redirect:     redirect,
		User:         user,
		AccessToken:  ts.AccessToken,
		RefreshToken: ts.RefreshToken,
	}, nil)
}

// SsoConfigOutput carries the display name of each SSO provider; a name is
// left empty when that provider is not configured.
type SsoConfigOutput struct {
	OidcDisplayName     string `json:"oidcDisplayName"`
	CasDisplayName      string `json:"casDisplayName"`
	OauthDisplayName    string `json:"oauthDisplayName"`
	DingTalkDisplayName string `json:"dingTalkDisplayName"`
	FeiShuDisplayName   string `json:"feishuDisplayName"`
}

// ssoConfigNameGet returns the display names of the configured SSO providers.
func (rt *Router) ssoConfigNameGet(c *gin.Context) {
	var oidcDisplayName, casDisplayName, oauthDisplayName, dingTalkDisplayName, feiShuDisplayName string
	// Any provider may be nil (unconfigured); its name then stays empty.
	if rt.Sso.OIDC != nil {
		oidcDisplayName = rt.Sso.OIDC.GetDisplayName()
	}

	if rt.Sso.CAS != nil {
		casDisplayName = rt.Sso.CAS.GetDisplayName()
	}

	if rt.Sso.OAuth2 != nil {
		oauthDisplayName = rt.Sso.OAuth2.GetDisplayName()
	}

	if rt.Sso.DingTalk != nil {
		dingTalkDisplayName = rt.Sso.DingTalk.GetDisplayName()
	}

	if rt.Sso.FeiShu != nil {
		feiShuDisplayName = rt.Sso.FeiShu.GetDisplayName()
	}

	ginx.NewRender(c).Data(SsoConfigOutput{
		OidcDisplayName:     oidcDisplayName,
		CasDisplayName:      casDisplayName,
		OauthDisplayName:    oauthDisplayName,
		DingTalkDisplayName: dingTalkDisplayName,
		FeiShuDisplayName:   feiShuDisplayName,
	}, nil)
}

// ssoConfigGets returns all stored SSO configurations. DingTalk and FeiShu
// entries expose their content as structured SettingJson, the others as raw
// Content.
func (rt *Router) ssoConfigGets(c *gin.Context) {
	var ssoConfigs []models.SsoConfig
	lst, err
:= models.SsoConfigGets(rt.Ctx) ginx.Dangerous(err) if len(lst) == 0 { ginx.NewRender(c).Data(ssoConfigs, nil) return } // TODO: dingTalkExist 为了兼容当前前端配置, 后期单点登陆统一调整后不在预先设置默认内容 dingTalkExist := false feiShuExist := false for _, config := range lst { var ssoReqConfig models.SsoConfig ssoReqConfig.Id = config.Id ssoReqConfig.Name = config.Name ssoReqConfig.UpdateAt = config.UpdateAt switch config.Name { case dingtalk.SsoTypeName: dingTalkExist = true err := json.Unmarshal([]byte(config.Content), &ssoReqConfig.SettingJson) ginx.Dangerous(err) case feishu.SsoTypeName: feiShuExist = true err := json.Unmarshal([]byte(config.Content), &ssoReqConfig.SettingJson) ginx.Dangerous(err) default: ssoReqConfig.Content = config.Content } ssoConfigs = append(ssoConfigs, ssoReqConfig) } // TODO: dingTalkExist 为了兼容当前前端配置, 后期单点登陆统一调整后不在预先设置默认内容 if !dingTalkExist { var ssoConfig models.SsoConfig ssoConfig.Name = dingtalk.SsoTypeName ssoConfigs = append(ssoConfigs, ssoConfig) } if !feiShuExist { var ssoConfig models.SsoConfig ssoConfig.Name = feishu.SsoTypeName ssoConfigs = append(ssoConfigs, ssoConfig) } ginx.NewRender(c).Data(ssoConfigs, nil) } func (rt *Router) ssoConfigUpdate(c *gin.Context) { var f models.SsoConfig var ssoConfig models.SsoConfig ginx.BindJSON(c, &ssoConfig) switch ssoConfig.Name { case dingtalk.SsoTypeName: f.Name = ssoConfig.Name setting, err := json.Marshal(ssoConfig.SettingJson) ginx.Dangerous(err) f.Content = string(setting) f.UpdateAt = time.Now().Unix() sso, err := f.Query(rt.Ctx) if !errors.Is(err, gorm.ErrRecordNotFound) { ginx.Dangerous(err) } if errors.Is(err, gorm.ErrRecordNotFound) { err = f.Create(rt.Ctx) } else { f.Id = sso.Id err = f.Update(rt.Ctx) } ginx.Dangerous(err) case feishu.SsoTypeName: f.Name = ssoConfig.Name setting, err := json.Marshal(ssoConfig.SettingJson) ginx.Dangerous(err) f.Content = string(setting) f.UpdateAt = time.Now().Unix() sso, err := f.Query(rt.Ctx) if !errors.Is(err, gorm.ErrRecordNotFound) { ginx.Dangerous(err) } if 
errors.Is(err, gorm.ErrRecordNotFound) { err = f.Create(rt.Ctx) } else { f.Id = sso.Id err = f.Update(rt.Ctx) } ginx.Dangerous(err) default: f.Id = ssoConfig.Id f.Name = ssoConfig.Name f.Content = ssoConfig.Content err := f.Update(rt.Ctx) ginx.Dangerous(err) } switch f.Name { case "LDAP": var config ldapx.Config err := toml.Unmarshal([]byte(f.Content), &config) ginx.Dangerous(err) rt.Sso.LDAP.Reload(config) case "OIDC": var config oidcx.Config err := toml.Unmarshal([]byte(f.Content), &config) ginx.Dangerous(err) rt.Sso.OIDC, err = oidcx.New(config) ginx.Dangerous(err) case "CAS": var config cas.Config err := toml.Unmarshal([]byte(f.Content), &config) ginx.Dangerous(err) rt.Sso.CAS.Reload(config) case "OAuth2": var config oauth2x.Config err := toml.Unmarshal([]byte(f.Content), &config) ginx.Dangerous(err) rt.Sso.OAuth2.Reload(config) case dingtalk.SsoTypeName: var config dingtalk.Config err := json.Unmarshal([]byte(f.Content), &config) ginx.Dangerous(err) if rt.Sso.DingTalk == nil { rt.Sso.DingTalk = dingtalk.New(config) } rt.Sso.DingTalk.Reload(config) case feishu.SsoTypeName: var config feishu.Config err := json.Unmarshal([]byte(f.Content), &config) ginx.Dangerous(err) if rt.Sso.FeiShu == nil { rt.Sso.FeiShu = feishu.New(config) } rt.Sso.FeiShu.Reload(config) } ginx.NewRender(c).Message(nil) } type RSAConfigOutput struct { OpenRSA bool RSAPublicKey string } func (rt *Router) rsaConfigGet(c *gin.Context) { publicKey := "" if len(rt.HTTP.RSA.RSAPublicKey) > 0 { publicKey = base64.StdEncoding.EncodeToString(rt.HTTP.RSA.RSAPublicKey) } ginx.NewRender(c).Data(RSAConfigOutput{ OpenRSA: rt.HTTP.RSA.OpenRSA, RSAPublicKey: publicKey, }, nil) } ================================================ FILE: center/router/router_message_template.go ================================================ package router import ( "bytes" "fmt" "html/template" "net/http" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/slice" 
"github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/tplx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/google/uuid" ) func (rt *Router) messageTemplatesAdd(c *gin.Context) { var lst []*models.MessageTemplate ginx.BindJSON(c, &lst) if len(lst) == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } me := c.MustGet("user").(*models.User) isAdmin := me.IsAdmin() idents := make([]string, 0, len(lst)) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) now := time.Now().Unix() for _, tpl := range lst { // 生成一个唯一的标识符,以后也不允许修改,前端不需要传这个参数 tpl.Ident = uuid.New().String() ginx.Dangerous(tpl.Verify()) if !isAdmin && !slice.HaveIntersection(gids, tpl.UserGroupIds) { ginx.Bomb(http.StatusForbidden, "forbidden") } idents = append(idents, tpl.Ident) tpl.CreateBy = me.Username tpl.CreateAt = now tpl.UpdateBy = me.Username tpl.UpdateAt = now } lstWithSameId, err := models.MessageTemplatesGet(rt.Ctx, "ident IN ?", idents) ginx.Dangerous(err) if len(lstWithSameId) > 0 { ginx.Bomb(http.StatusBadRequest, "ident already exists") } ids := make([]int64, 0, len(lst)) for _, tpl := range lst { err := models.Insert(rt.Ctx, tpl) ginx.Dangerous(err) ids = append(ids, tpl.ID) } ginx.NewRender(c).Data(ids, nil) } func (rt *Router) messageTemplatesDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() lst, err := models.MessageTemplatesGet(rt.Ctx, "id in (?)", f.Ids) ginx.Dangerous(err) notifyRuleIds, err := models.UsedByNotifyRule(rt.Ctx, models.MsgTplList(lst)) ginx.Dangerous(err) if len(notifyRuleIds) > 0 { ginx.NewRender(c).Message(fmt.Errorf("used by notify rule: %v", notifyRuleIds)) return } if me := c.MustGet("user").(*models.User); !me.IsAdmin() { gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) for _, t := range lst { if !slice.HaveIntersection(gids, t.UserGroupIds) { ginx.Bomb(http.StatusForbidden, "forbidden") } } } ginx.NewRender(c).Message(models.DB(rt.Ctx).Delete( 
&models.MessageTemplate{}, "id in (?)", f.Ids).Error) } func (rt *Router) messageTemplatePut(c *gin.Context) { var f models.MessageTemplate ginx.BindJSON(c, &f) mt, err := models.MessageTemplateGet(rt.Ctx, "id <> ? and ident = ?", ginx.UrlParamInt64(c, "id"), f.Ident) ginx.Dangerous(err) if mt != nil { ginx.Bomb(http.StatusBadRequest, "message template ident already exists") } mt, err = models.MessageTemplateGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "id")) ginx.Dangerous(err) if mt == nil { ginx.Bomb(http.StatusNotFound, "message template not found") } me := c.MustGet("user").(*models.User) if !me.IsAdmin() { gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if !slice.HaveIntersection(gids, mt.UserGroupIds) { ginx.Bomb(http.StatusForbidden, "forbidden") } } f.UpdateBy = me.Username ginx.NewRender(c).Message(mt.Update(rt.Ctx, f)) } func (rt *Router) messageTemplateGet(c *gin.Context) { me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) tid := ginx.UrlParamInt64(c, "id") mt, err := models.MessageTemplateGet(rt.Ctx, "id = ?", tid) ginx.Dangerous(err) if mt == nil { ginx.Bomb(http.StatusNotFound, "message template not found") } if mt.Private == 1 && !slice.HaveIntersection(gids, mt.UserGroupIds) { ginx.Bomb(http.StatusForbidden, "forbidden") } ginx.NewRender(c).Data(mt, nil) } func (rt *Router) messageTemplatesGet(c *gin.Context) { var notifyChannelIdents []string if tmp := ginx.QueryStr(c, "notify_channel_idents", ""); tmp != "" { notifyChannelIdents = strings.Split(tmp, ",") } notifyChannelIds := strx.IdsInt64ForAPI(ginx.QueryStr(c, "notify_channel_ids", "")) if len(notifyChannelIds) > 0 { ginx.Dangerous(models.DB(rt.Ctx).Model(models.NotifyChannelConfig{}). 
Where("id in (?)", notifyChannelIds).Pluck("ident", ¬ifyChannelIdents).Error) } me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) lst, err := models.MessageTemplatesGetBy(rt.Ctx, notifyChannelIdents) ginx.Dangerous(err) models.FillUpdateByNicknames(rt.Ctx, lst) if me.IsAdmin() { ginx.NewRender(c).Data(lst, nil) return } res := make([]*models.MessageTemplate, 0) for _, t := range lst { if slice.HaveIntersection[int64](gids, t.UserGroupIds) || t.Private == 0 { res = append(res, t) } } ginx.NewRender(c).Data(res, nil) } type evtMsgReq struct { EventIds []int64 `json:"event_ids"` Tpl struct { Content map[string]string `json:"content"` } `json:"tpl"` } func (rt *Router) eventsMessage(c *gin.Context) { var req evtMsgReq ginx.BindJSON(c, &req) hisEvents, err := models.AlertHisEventGetByIds(rt.Ctx, req.EventIds) ginx.Dangerous(err) if len(hisEvents) == 0 { ginx.Bomb(http.StatusBadRequest, "event not found") } ginx.Dangerous(err) events := make([]*models.AlertCurEvent, len(hisEvents)) for i, he := range hisEvents { events[i] = he.ToCur() } renderData := make(map[string]interface{}) renderData["events"] = events defs := models.GetDefs(renderData) ret := make(map[string]string, len(req.Tpl.Content)) for k, v := range req.Tpl.Content { text := strings.Join(append(defs, v), "") tpl, err := template.New(k).Funcs(tplx.TemplateFuncMap).Parse(text) if err != nil { ret[k] = err.Error() continue } var buf bytes.Buffer err = tpl.Execute(&buf, renderData) if err != nil { ret[k] = err.Error() continue } ret[k] = buf.String() } ginx.NewRender(c).Data(ret, nil) } ================================================ FILE: center/router/router_metric_desc.go ================================================ package router import ( "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) metricsDescGetFile(c *gin.Context) { c.JSON(200, rt.Center.MetricDesc) } // 
前端传过来一个metric数组,后端去查询有没有对应的释义,返回map func (rt *Router) metricsDescGetMap(c *gin.Context) { var arr []string ginx.BindJSON(c, &arr) ret := make(map[string]string) for _, key := range arr { ret[key] = cconf.GetMetricDesc(c.GetHeader("X-Language"), key) } ginx.NewRender(c).Data(ret, nil) } // 页面功能暂时先不要了,直接通过配置文件来维护 // func metricDescriptionGets(c *gin.Context) { // limit := ginx.QueryInt(c, "limit", 20) // query := ginx.QueryStr(c, "query", "") // total, err := models.MetricDescriptionTotal(query) // ginx.Dangerous(err) // list, err := models.MetricDescriptionGets(query, limit, ginx.Offset(c, limit)) // ginx.Dangerous(err) // ginx.NewRender(c).Data(gin.H{ // "list": list, // "total": total, // }, nil) // } // type metricDescriptionAddForm struct { // Data string `json:"data"` // } // func metricDescriptionAdd(c *gin.Context) { // var f metricDescriptionAddForm // ginx.BindJSON(c, &f) // var metricDescriptions []models.MetricDescription // lines := strings.Split(f.Data, "\n") // for _, md := range lines { // arr := strings.SplitN(md, ":", 2) // if len(arr) != 2 { // ginx.Bomb(200, "metric description %s is illegal", md) // } // m := models.MetricDescription{ // Metric: arr[0], // Description: arr[1], // } // metricDescriptions = append(metricDescriptions, m) // } // if len(metricDescriptions) == 0 { // ginx.Bomb(http.StatusBadRequest, "Decoded metric description empty") // } // ginx.NewRender(c).Message(models.MetricDescriptionUpdate(metricDescriptions)) // } // func metricDescriptionDel(c *gin.Context) { // var f idsForm // ginx.BindJSON(c, &f) // f.Verify() // ginx.NewRender(c).Message(models.MetricDescriptionDel(f.Ids)) // } // type metricDescriptionForm struct { // Description string `json:"description"` // } // func metricDescriptionPut(c *gin.Context) { // var f metricDescriptionForm // ginx.BindJSON(c, &f) // md, err := models.MetricDescriptionGet("id=?", ginx.UrlParamInt64(c, "id")) // ginx.Dangerous(err) // if md == nil { // ginx.Bomb(200, "No such metric 
description") // } // ginx.NewRender(c).Message(md.Update(f.Description, time.Now().Unix())) // } ================================================ FILE: center/router/router_metric_view.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) // no param func (rt *Router) metricViewGets(c *gin.Context) { lst, err := models.MetricViewGets(rt.Ctx, c.MustGet("userid")) ginx.NewRender(c).Data(lst, err) } // body: name, configs, cate func (rt *Router) metricViewAdd(c *gin.Context) { var f models.MetricView ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) if !me.IsAdmin() { // 管理员可以选择当前这个视图是公开呢,还是私有,普通用户的话就只能是私有的 f.Cate = 1 } f.Id = 0 f.CreateBy = me.Id ginx.Dangerous(f.Add(rt.Ctx)) ginx.NewRender(c).Data(f, nil) } // body: ids func (rt *Router) metricViewDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() me := c.MustGet("user").(*models.User) if me.IsAdmin() { ginx.NewRender(c).Message(models.MetricViewDel(rt.Ctx, f.Ids)) } else { ginx.NewRender(c).Message(models.MetricViewDel(rt.Ctx, f.Ids, me.Id)) } } // body: id, name, configs, cate func (rt *Router) metricViewPut(c *gin.Context) { var f models.MetricView ginx.BindJSON(c, &f) view, err := models.MetricViewGet(rt.Ctx, "id = ?", f.Id) ginx.Dangerous(err) if view == nil { ginx.NewRender(c).Message("no such item(id: %d)", f.Id) return } me := c.MustGet("user").(*models.User) if !me.IsAdmin() { f.Cate = 1 // 如果是普通用户,只能修改自己的 if view.CreateBy != me.Id { ginx.NewRender(c, http.StatusForbidden).Message("forbidden") return } } ginx.NewRender(c).Message(view.Update(rt.Ctx, f.Name, f.Configs, f.Cate, me.Id)) } ================================================ FILE: center/router/router_mute.go ================================================ package router import ( "net/http" "strings" "time" "github.com/ccfos/nightingale/v6/alert/common" 
"github.com/ccfos/nightingale/v6/alert/mute" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) // Return all, front-end search and paging func (rt *Router) alertMuteGetsByBG(c *gin.Context) { bgid := ginx.UrlParamInt64(c, "id") prods := strings.Fields(ginx.QueryStr(c, "prods", "")) query := ginx.QueryStr(c, "query", "") expired := ginx.QueryInt(c, "expired", -1) lst, err := models.AlertMuteGets(rt.Ctx, prods, bgid, -1, expired, query) if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } func (rt *Router) alertMuteGetsByGids(c *gin.Context) { gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") if len(gids) > 0 { for _, gid := range gids { rt.bgroCheck(c, gid) } } else { me := c.MustGet("user").(*models.User) if !me.IsAdmin() { var err error gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(gids) == 0 { ginx.NewRender(c).Data([]int{}, nil) return } } } lst, err := models.AlertMuteGetsByBGIds(rt.Ctx, gids) if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } func (rt *Router) alertMuteGets(c *gin.Context) { prods := strings.Fields(ginx.QueryStr(c, "prods", "")) bgid := ginx.QueryInt64(c, "bgid", -1) query := ginx.QueryStr(c, "query", "") disabled := ginx.QueryInt(c, "disabled", -1) expired := ginx.QueryInt(c, "expired", -1) lst, err := models.AlertMuteGets(rt.Ctx, prods, bgid, disabled, expired, query) if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } func (rt *Router) activeAlertMuteGets(c *gin.Context) { lst, err := models.AlertMuteGetsAll(rt.Ctx) ginx.NewRender(c).Data(lst, err) } func (rt *Router) alertMuteAdd(c *gin.Context) { var f models.AlertMute ginx.BindJSON(c, &f) username := c.MustGet("username").(string) f.CreateBy = username f.UpdateBy = username f.GroupId 
= ginx.UrlParamInt64(c, "id") ginx.Dangerous(f.Add(rt.Ctx)) ginx.NewRender(c).Data(f.Id, nil) } type MuteTestForm struct { EventId int64 `json:"event_id" binding:"required"` AlertMute models.AlertMute `json:"config" binding:"required"` PassTimeCheck bool `json:"pass_time_check"` } func (rt *Router) alertMuteTryRun(c *gin.Context) { var f MuteTestForm ginx.BindJSON(c, &f) ginx.Dangerous(f.AlertMute.Verify()) hisEvent, err := models.AlertHisEventGetById(rt.Ctx, f.EventId) ginx.Dangerous(err) if hisEvent == nil { ginx.Bomb(http.StatusNotFound, "event not found") } curEvent := *hisEvent.ToCur() curEvent.SetTagsMap() if f.PassTimeCheck { f.AlertMute.MuteTimeType = models.Periodic f.AlertMute.PeriodicMutesJson = []models.PeriodicMute{ { EnableDaysOfWeek: "0 1 2 3 4 5 6", EnableStime: "00:00", EnableEtime: "00:00", }, } } match, err := mute.MatchMute(&curEvent, &f.AlertMute) if err != nil { // 对错误信息进行 i18n 翻译 translatedErr := i18n.Sprintf(c.GetHeader("X-Language"), err.Error()) ginx.Bomb(http.StatusBadRequest, translatedErr) } if !match { ginx.NewRender(c).Data("event not match mute", nil) return } ginx.NewRender(c).Data("event match mute", nil) } // Preview events (alert_cur_event) that match the mute strategy based on the following criteria: // business group ID (group_id, group_id), product (prod, rule_prod), // alert event severity (severities, severity), and event tags (tags, tags). // For products of type not 'host', also consider the category (cate, cate) and datasource ID (datasource_ids, datasource_id). func (rt *Router) alertMutePreview(c *gin.Context) { //Generally the match of events would be less. 
var f models.AlertMute ginx.BindJSON(c, &f) f.GroupId = ginx.UrlParamInt64(c, "id") ginx.Dangerous(f.Verify()) //verify and parse tags json to ITags events, err := models.AlertCurEventGetsFromAlertMute(rt.Ctx, &f) ginx.Dangerous(err) matchEvents := make([]*models.AlertCurEvent, 0, len(events)) for i := 0; i < len(events); i++ { events[i].DB2Mem() if common.MatchTags(events[i].TagsMap, f.ITags) { matchEvents = append(matchEvents, events[i]) } } ginx.NewRender(c).Data(matchEvents, err) } func (rt *Router) alertMuteAddByService(c *gin.Context) { var f models.AlertMute ginx.BindJSON(c, &f) err := f.Add(rt.Ctx) ginx.NewRender(c).Data(f.Id, err) } func (rt *Router) alertMuteDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() ginx.NewRender(c).Message(models.AlertMuteDel(rt.Ctx, f.Ids)) } // alertMuteGet returns the alert mute by ID func (rt *Router) alertMuteGet(c *gin.Context) { amid := ginx.UrlParamInt64(c, "amid") am, err := models.AlertMuteGetById(rt.Ctx, amid) am.DB2FE() ginx.NewRender(c).Data(am, err) } func (rt *Router) alertMutePutByFE(c *gin.Context) { var f models.AlertMute ginx.BindJSON(c, &f) amid := ginx.UrlParamInt64(c, "amid") am, err := models.AlertMuteGetById(rt.Ctx, amid) ginx.Dangerous(err) if am == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such AlertMute") return } rt.bgrwCheck(c, am.GroupId) f.UpdateBy = c.MustGet("username").(string) ginx.NewRender(c).Message(am.Update(rt.Ctx, f)) } type alertMuteFieldForm struct { Ids []int64 `json:"ids"` Fields map[string]interface{} `json:"fields"` } func (rt *Router) alertMutePutFields(c *gin.Context) { var f alertMuteFieldForm ginx.BindJSON(c, &f) if len(f.Fields) == 0 { ginx.Bomb(http.StatusBadRequest, "fields empty") } f.Fields["update_by"] = c.MustGet("username").(string) f.Fields["update_at"] = time.Now().Unix() for i := 0; i < len(f.Ids); i++ { am, err := models.AlertMuteGetById(rt.Ctx, f.Ids[i]) ginx.Dangerous(err) if am == nil { continue } am.FE2DB() 
ginx.Dangerous(am.UpdateFieldsMap(rt.Ctx, f.Fields)) } ginx.NewRender(c).Message(nil) } ================================================ FILE: center/router/router_mw.go ================================================ package router import ( "context" "errors" "fmt" "net/http" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/center/cstats" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/golang-jwt/jwt" "github.com/google/uuid" ) const ( DefaultTokenKey = "X-User-Token" ) type AccessDetails struct { AccessUuid string UserIdentity string } func (rt *Router) handleProxyUser(c *gin.Context) *models.User { headerUserNameKey := rt.HTTP.ProxyAuth.HeaderUserNameKey username := c.GetHeader(headerUserNameKey) if username == "" { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } user, err := models.UserGetByUsername(rt.Ctx, username) if err != nil { ginx.Bomb(http.StatusInternalServerError, err.Error()) } if user == nil { now := time.Now().Unix() user = &models.User{ Username: username, Nickname: username, Roles: strings.Join(rt.HTTP.ProxyAuth.DefaultRoles, " "), CreateAt: now, UpdateAt: now, CreateBy: "system", UpdateBy: "system", } err = user.Add(rt.Ctx) if err != nil { ginx.Bomb(http.StatusInternalServerError, err.Error()) } } return user } func (rt *Router) proxyAuth() gin.HandlerFunc { return func(c *gin.Context) { user := rt.handleProxyUser(c) c.Set("userid", user.Id) c.Set("username", user.Username) c.Next() } } // tokenAuth 支持两种方式的认证,固定 token 和 jwt token // 因为不太好区分用户使用哪个方式,所以两种方式放在一个中间件里 func (rt *Router) tokenAuth() gin.HandlerFunc { return func(c *gin.Context) { // 先验证固定 token if rt.HTTP.TokenAuth.Enable { tokenKey := rt.HTTP.TokenAuth.HeaderUserTokenKey if tokenKey == "" { tokenKey = DefaultTokenKey } token := c.GetHeader(tokenKey) if token != "" { user := rt.UserTokenCache.GetByToken(token) if user != nil && user.Username != "" { c.Set("userid", user.Id) c.Set("username", 
user.Username) c.Next() return } } } // 再验证 jwt token metadata, err := rt.extractTokenMetadata(c.Request) if err != nil { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } userIdentity, err := rt.fetchAuth(c.Request.Context(), metadata.AccessUuid) if err != nil { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } // ${userid}-${username} arr := strings.SplitN(userIdentity, "-", 2) if len(arr) != 2 { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } userid, err := strconv.ParseInt(arr[0], 10, 64) if err != nil { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } c.Set("userid", userid) c.Set("username", arr[1]) c.Next() } } func (rt *Router) Auth() gin.HandlerFunc { return rt.auth() } func (rt *Router) auth() gin.HandlerFunc { if rt.HTTP.ProxyAuth.Enable { return rt.proxyAuth() } else { return rt.tokenAuth() } } // if proxy auth is enabled, mock jwt login/logout/refresh request func (rt *Router) jwtMock() gin.HandlerFunc { return func(c *gin.Context) { if !rt.HTTP.ProxyAuth.Enable { c.Next() return } if strings.Contains(c.FullPath(), "logout") { ginx.Bomb(http.StatusBadRequest, "logout is not supported when proxy auth is enabled") } user := rt.handleProxyUser(c) ginx.NewRender(c).Data(gin.H{ "user": user, "access_token": "", "refresh_token": "", }, nil) c.Abort() } } func (rt *Router) User() gin.HandlerFunc { return rt.user() } func (rt *Router) user() gin.HandlerFunc { return func(c *gin.Context) { username := c.MustGet("username").(string) user, err := models.UserGetByUsername(rt.Ctx, username) if err != nil { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } if user == nil { ginx.Bomb(http.StatusUnauthorized, "unauthorized") } c.Set("user", user) c.Set("isadmin", user.IsAdmin()) // Update user.LastActiveTime rt.UserCache.SetLastActiveTime(user.Id, time.Now().Unix()) c.Next() } } func (rt *Router) userGroupWrite() gin.HandlerFunc { return func(c *gin.Context) { me := c.MustGet("user").(*models.User) ug := UserGroup(rt.Ctx, ginx.UrlParamInt64(c, "id")) 
can, err := me.CanModifyUserGroup(rt.Ctx, ug) ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } c.Set("user_group", ug) c.Next() } } func (rt *Router) bgro() gin.HandlerFunc { return func(c *gin.Context) { me := c.MustGet("user").(*models.User) bg := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id")) can, err := me.CanDoBusiGroup(rt.Ctx, bg) ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } c.Set("busi_group", bg) c.Next() } } // bgrw 逐步要被干掉,不安全 func (rt *Router) Bgrw() gin.HandlerFunc { return rt.bgrw() } func (rt *Router) bgrw() gin.HandlerFunc { return func(c *gin.Context) { me := c.MustGet("user").(*models.User) bg := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id")) can, err := me.CanDoBusiGroup(rt.Ctx, bg, "rw") ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } c.Set("busi_group", bg) c.Next() } } // bgrwCheck 要逐渐替换掉bgrw方法,更安全 func (rt *Router) bgrwCheck(c *gin.Context, bgid int64) { me := c.MustGet("user").(*models.User) bg := BusiGroup(rt.Ctx, bgid) can, err := me.CanDoBusiGroup(rt.Ctx, bg, "rw") ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } c.Set("busi_group", bg) } func (rt *Router) bgrwChecks(c *gin.Context, bgids []int64) { set := make(map[int64]struct{}) for i := 0; i < len(bgids); i++ { if _, has := set[bgids[i]]; has { continue } rt.bgrwCheck(c, bgids[i]) set[bgids[i]] = struct{}{} } } func (rt *Router) bgroCheck(c *gin.Context, bgid int64) { me := c.MustGet("user").(*models.User) bg := BusiGroup(rt.Ctx, bgid) can, err := me.CanDoBusiGroup(rt.Ctx, bg) ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } c.Set("busi_group", bg) } func (rt *Router) Perm(operation string) gin.HandlerFunc { return rt.perm(operation) } func (rt *Router) perm(operation string) gin.HandlerFunc { return func(c *gin.Context) { me := c.MustGet("user").(*models.User) can, err := me.CheckPerm(rt.Ctx, operation) ginx.Dangerous(err) if !can { 
ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Next()
	}
}

// admin returns a middleware that only lets requests through when the user
// identified by the "userid" context value exists and has models.AdminRole
// among its roles. On success the loaded user is stored under "user".
func (rt *Router) admin() gin.HandlerFunc {
	return func(c *gin.Context) {
		userid := c.MustGet("userid").(int64)

		user, err := models.UserGetById(rt.Ctx, userid)
		if err != nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		if user == nil {
			ginx.Bomb(http.StatusUnauthorized, "unauthorized")
		}

		roles := strings.Fields(user.Roles)
		found := false
		for i := 0; i < len(roles); i++ {
			if roles[i] == models.AdminRole {
				found = true
				break
			}
		}

		if !found {
			ginx.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Set("user", user)
		c.Next()
	}
}

// extractTokenMetadata verifies the bearer JWT carried by r and returns the
// access uuid and user identity stored in its claims.
//
// It returns a non-nil error whenever it does not return details: when the
// token fails verification, when either claim is missing or is not a string,
// or when the access uuid is no longer present in Redis. Callers dereference
// the result as soon as the error is nil, so (nil, nil) must never be
// returned.
func (rt *Router) extractTokenMetadata(r *http.Request) (*AccessDetails, error) {
	token, err := rt.verifyToken(rt.HTTP.JWTAuth.SigningKey, rt.extractToken(r))
	if err != nil {
		return nil, err
	}

	claims, ok := token.Claims.(jwt.MapClaims)
	if !ok || !token.Valid {
		return nil, errors.New("invalid jwt token")
	}

	accessUuid, ok := claims["access_uuid"].(string)
	if !ok {
		return nil, errors.New("failed to parse access_uuid from jwt")
	}

	// A checked assertion: a token without this claim must be rejected, not
	// crash the handler.
	userIdentity, ok := claims["user_identity"].(string)
	if !ok {
		return nil, errors.New("failed to parse user_identity from jwt")
	}

	// Only let the request through while the access uuid still exists in Redis.
	val, err := rt.fetchAuth(r.Context(), accessUuid)
	if err != nil || val == "" {
		return nil, errors.New("unauthorized")
	}

	return &AccessDetails{
		AccessUuid:   accessUuid,
		UserIdentity: userIdentity,
	}, nil
}

// extractToken returns the token part of an "Authorization: Bearer <token>"
// header (the scheme is matched case-insensitively), or "" when the header is
// absent or uses another scheme.
func (rt *Router) extractToken(r *http.Request) string {
	tok := r.Header.Get("Authorization")

	if len(tok) > 6 && strings.ToUpper(tok[0:7]) == "BEARER " {
		return tok[7:]
	}

	return ""
}

// createAuth stores the access and refresh uuids of td in Redis, each mapped
// to userIdentity and expiring at the corresponding token expiry.
//
// userIdentity has the form "${userid}-${username}". When single-login mode is
// enabled, every token previously recorded for that username is deleted first
// and the new pair is recorded in the user's set afterwards. It returns an
// error when userIdentity does not contain a "-" separator or when any Redis
// operation fails.
func (rt *Router) createAuth(ctx context.Context, userIdentity string, td *TokenDetails) error {
	// Split only on the first "-" so that usernames which themselves contain
	// "-" are kept intact; this matches how tokenAuth parses the identity.
	parts := strings.SplitN(userIdentity, "-", 2)
	if len(parts) != 2 {
		return fmt.Errorf("invalid user identity %q", userIdentity)
	}
	username := parts[1]

	// If only one session per account is allowed, delete the earlier tokens.
	if rt.HTTP.JWTAuth.SingleLogin {
		delKeys, err := rt.Redis.SMembers(ctx, rt.wrapJwtKey(username)).Result()
		if err != nil {
			return err
		}

		if len(delKeys) > 0 {
			errDel := rt.Redis.Del(ctx, delKeys...).Err()
			if errDel != nil {
				return errDel
			}
		}

		if errDel := rt.Redis.Del(ctx, rt.wrapJwtKey(username)).Err(); errDel != nil {
			return errDel
		}
	}

	at := time.Unix(td.AtExpires, 0)
	rte := time.Unix(td.RtExpires, 0)
	now := time.Now()

	if err := rt.Redis.Set(ctx, rt.wrapJwtKey(td.AccessUuid), userIdentity, at.Sub(now)).Err(); err != nil {
		cstats.RedisOperationLatency.WithLabelValues("set_token", "fail").Observe(time.Since(now).Seconds())
		return err
	}

	if err := rt.Redis.Set(ctx, rt.wrapJwtKey(td.RefreshUuid), userIdentity, rte.Sub(now)).Err(); err != nil {
		cstats.RedisOperationLatency.WithLabelValues("set_token", "fail").Observe(time.Since(now).Seconds())
		return err
	}

	cstats.RedisOperationLatency.WithLabelValues("set_token", "success").Observe(time.Since(now).Seconds())

	if rt.HTTP.JWTAuth.SingleLogin {
		if err := rt.Redis.SAdd(ctx, rt.wrapJwtKey(username), rt.wrapJwtKey(td.AccessUuid), rt.wrapJwtKey(td.RefreshUuid)).Err(); err != nil {
			return err
		}
	}

	return nil
}

// fetchAuth returns the user identity stored in Redis for givenUuid and
// records the lookup latency under the "get_token" label.
func (rt *Router) fetchAuth(ctx context.Context, givenUuid string) (string, error) {
	now := time.Now()
	ret, err := rt.Redis.Get(ctx, rt.wrapJwtKey(givenUuid)).Result()
	if err != nil {
		cstats.RedisOperationLatency.WithLabelValues("get_token", "fail").Observe(time.Since(now).Seconds())
	} else {
		cstats.RedisOperationLatency.WithLabelValues("get_token", "success").Observe(time.Since(now).Seconds())
	}

	return ret, err
}

// deleteAuth removes the Redis entry for givenUuid and records the latency of
// the delete under the "del_token" label.
func (rt *Router) deleteAuth(ctx context.Context, givenUuid string) error {
	// Take the start time before the Redis call so the observed duration
	// covers the operation itself.
	start := time.Now()
	err := rt.Redis.Del(ctx, rt.wrapJwtKey(givenUuid)).Err()

	status := "success"
	if err != nil {
		status = "fail"
	}
	cstats.RedisOperationLatency.WithLabelValues("del_token", status).Observe(time.Since(start).Seconds())

	return err
}

func (rt *Router) deleteTokens(ctx context.Context, authD *AccessDetails) error {
	// get the refresh uuid
	refreshUuid := authD.AccessUuid + "++" + authD.UserIdentity

	// delete access token
	err := rt.Redis.Del(ctx, rt.wrapJwtKey(authD.AccessUuid)).Err()
	if err != nil {
		return err
	}

	// delete refresh token
	err = rt.Redis.Del(ctx, rt.wrapJwtKey(refreshUuid)).Err()
	if err != nil {
		return err
	}

	return nil
} func (rt *Router) wrapJwtKey(key string) string { return rt.HTTP.JWTAuth.RedisKeyPrefix + key } func (rt *Router) wrapIdTokenKey(userId int64) string { return fmt.Sprintf("n9e_id_token_%d", userId) } // saveIdToken 保存用户的 id_token 到 Redis func (rt *Router) saveIdToken(ctx context.Context, userId int64, idToken string) error { if idToken == "" { return nil } // id_token 的过期时间应该与 RefreshToken 保持一致,确保在整个会话期间都可用于登出 expiration := time.Minute * time.Duration(rt.HTTP.JWTAuth.RefreshExpired) return rt.Redis.Set(ctx, rt.wrapIdTokenKey(userId), idToken, expiration).Err() } // fetchIdToken 从 Redis 获取用户的 id_token func (rt *Router) fetchIdToken(ctx context.Context, userId int64) (string, error) { return rt.Redis.Get(ctx, rt.wrapIdTokenKey(userId)).Result() } // deleteIdToken 从 Redis 删除用户的 id_token func (rt *Router) deleteIdToken(ctx context.Context, userId int64) error { return rt.Redis.Del(ctx, rt.wrapIdTokenKey(userId)).Err() } type TokenDetails struct { AccessToken string RefreshToken string AccessUuid string RefreshUuid string AtExpires int64 RtExpires int64 } func (rt *Router) createTokens(signingKey, userIdentity string) (*TokenDetails, error) { td := &TokenDetails{} td.AtExpires = time.Now().Add(time.Minute * time.Duration(rt.HTTP.JWTAuth.AccessExpired)).Unix() td.AccessUuid = uuid.NewString() td.RtExpires = time.Now().Add(time.Minute * time.Duration(rt.HTTP.JWTAuth.RefreshExpired)).Unix() td.RefreshUuid = td.AccessUuid + "++" + userIdentity var err error // Creating Access Token atClaims := jwt.MapClaims{} atClaims["authorized"] = true atClaims["access_uuid"] = td.AccessUuid atClaims["user_identity"] = userIdentity atClaims["exp"] = td.AtExpires at := jwt.NewWithClaims(jwt.SigningMethodHS256, atClaims) td.AccessToken, err = at.SignedString([]byte(signingKey)) if err != nil { return nil, err } // Creating Refresh Token rtClaims := jwt.MapClaims{} rtClaims["refresh_uuid"] = td.RefreshUuid rtClaims["user_identity"] = userIdentity rtClaims["exp"] = td.RtExpires jrt := 
jwt.NewWithClaims(jwt.SigningMethodHS256, rtClaims) td.RefreshToken, err = jrt.SignedString([]byte(signingKey)) if err != nil { return nil, err } return td, nil } func (rt *Router) verifyToken(signingKey, tokenString string) (*jwt.Token, error) { if tokenString == "" { return nil, fmt.Errorf("bearer token not found") } token, err := jwt.Parse(tokenString, func(token *jwt.Token) (interface{}, error) { if _, ok := token.Method.(*jwt.SigningMethodHMAC); !ok { return nil, fmt.Errorf("unexpected jwt signing method: %v", token.Header["alg"]) } return []byte(signingKey), nil }) if err != nil { return nil, err } return token, nil } ================================================ FILE: center/router/router_notification_record.go ================================================ package router import ( "strings" "github.com/ccfos/nightingale/v6/alert/sender" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) type NotificationResponse struct { SubRules []SubRule `json:"sub_rules"` Notifies map[string][]Record `json:"notifies"` } type SubRule struct { SubID int64 `json:"sub_id"` NotifyRuleId int64 `json:"notify_rule_id"` Notifies map[string][]Record `json:"notifies"` } type Record struct { NotifyRuleId int64 `json:"notify_rule_id"` Target string `json:"target"` Username string `json:"username"` Status int `json:"status"` Detail string `json:"detail"` } // notificationRecordAdd func (rt *Router) notificationRecordAdd(c *gin.Context) { var req []*models.NotificationRecord ginx.BindJSON(c, &req) err := sender.PushNotifyRecords(req) ginx.Dangerous(err, 429) ginx.NewRender(c).Data(nil, err) } func (rt *Router) notificationRecordList(c *gin.Context) { eid := ginx.UrlParamInt64(c, "eid") lst, err := models.NotificationRecordsGetByEventId(rt.Ctx, eid) ginx.Dangerous(err) response := buildNotificationResponse(rt.Ctx, lst) 
ginx.NewRender(c).Data(response, nil) } func buildNotificationResponse(ctx *ctx.Context, nl []*models.NotificationRecord) NotificationResponse { response := NotificationResponse{ SubRules: []SubRule{}, Notifies: make(map[string][]Record), } subRuleMap := make(map[int64]*SubRule) // Collect all group IDs groupIdSet := make(map[int64]struct{}) // map[SubId]map[Channel]map[Target]index filter := make(map[int64]map[string]map[string]int) for i, n := range nl { // 对相同的 channel-target 进行合并 for _, gid := range n.GetGroupIds(ctx) { groupIdSet[gid] = struct{}{} } if _, exists := filter[n.SubId]; !exists { filter[n.SubId] = make(map[string]map[string]int) } if _, exists := filter[n.SubId][n.Channel]; !exists { filter[n.SubId][n.Channel] = make(map[string]int) } idx, exists := filter[n.SubId][n.Channel][n.Target] if !exists { filter[n.SubId][n.Channel][n.Target] = i } else { if nl[idx].Status < n.Status { nl[idx].Status = n.Status } nl[idx].Details = nl[idx].Details + ", " + n.Details nl[i] = nil } } // Fill usernames only once usernameByTarget := fillUserNames(ctx, groupIdSet) for _, n := range nl { if n == nil { continue } m := usernameByTarget[n.Target] usernames := make([]string, 0, len(m)) for k := range m { usernames = append(usernames, k) } if !checkChannel(n.Channel) { // Hide sensitive information n.Target = replaceLastEightChars(n.Target) } record := Record{ Target: n.Target, Status: n.Status, Detail: n.Details, NotifyRuleId: n.NotifyRuleID, } record.Username = strings.Join(usernames, ",") if n.SubId > 0 { // Handle SubRules subRule, ok := subRuleMap[n.SubId] if !ok { newSubRule := &SubRule{ NotifyRuleId: n.NotifyRuleID, SubID: n.SubId, } newSubRule.Notifies = make(map[string][]Record) newSubRule.Notifies[n.Channel] = []Record{record} subRuleMap[n.SubId] = newSubRule } else { if _, exists := subRule.Notifies[n.Channel]; !exists { subRule.Notifies[n.Channel] = []Record{record} } else { subRule.Notifies[n.Channel] = append(subRule.Notifies[n.Channel], record) } } 
// replaceLastEightChars hides the tail of s by replacing its last eight
// characters with '*'. Strings of eight characters or fewer are masked
// entirely (one '*' per character).
//
// Masking works on runes rather than bytes: slicing by byte offset could cut a
// multi-byte UTF-8 character in half and produce an invalid string.
func replaceLastEightChars(s string) string {
	const masked = 8

	runes := []rune(s)
	if len(runes) <= masked {
		return strings.Repeat("*", len(runes))
	}
	return string(runes[:len(runes)-masked]) + strings.Repeat("*", masked)
}
lst []*models.NotifyChannelConfig ginx.BindJSON(c, &lst) if len(lst) == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } names := make([]string, 0, len(lst)) for i := range lst { ginx.Dangerous(lst[i].Verify()) names = append(names, lst[i].Name) lst[i].CreateBy = me.Username lst[i].CreateAt = time.Now().Unix() lst[i].UpdateBy = me.Username lst[i].UpdateAt = time.Now().Unix() } lstWithSameName, err := models.NotifyChannelsGet(rt.Ctx, "name IN ?", names) ginx.Dangerous(err) if len(lstWithSameName) > 0 { ginx.Bomb(http.StatusBadRequest, "name already exists") } ids := make([]int64, 0, len(lst)) for _, item := range lst { err := models.Insert(rt.Ctx, item) ginx.Dangerous(err) ids = append(ids, item.ID) } ginx.NewRender(c).Data(ids, nil) } func (rt *Router) notifyChannelsDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() lst, err := models.NotifyChannelsGet(rt.Ctx, "id in (?)", f.Ids) ginx.Dangerous(err) notifyRuleIds, err := models.UsedByNotifyRule(rt.Ctx, models.NotiChList(lst)) ginx.Dangerous(err) if len(notifyRuleIds) > 0 { ginx.NewRender(c).Message(fmt.Errorf("used by notify rule: %v", notifyRuleIds)) return } ginx.NewRender(c).Message(models.DB(rt.Ctx). Delete(&models.NotifyChannelConfig{}, "id in (?)", f.Ids).Error) } func (rt *Router) notifyChannelPut(c *gin.Context) { me := c.MustGet("user").(*models.User) var f models.NotifyChannelConfig ginx.BindJSON(c, &f) lstWithSameName, err := models.NotifyChannelsGet(rt.Ctx, "name = ? 
and id <> ?", f.Name, f.ID) ginx.Dangerous(err) if len(lstWithSameName) > 0 { ginx.Bomb(http.StatusBadRequest, "name already exists") } nc, err := models.NotifyChannelGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "id")) ginx.Dangerous(err) if nc == nil { ginx.Bomb(http.StatusNotFound, "notify channel not found") } f.UpdateBy = me.Username ginx.NewRender(c).Message(nc.Update(rt.Ctx, f)) } func (rt *Router) notifyChannelGet(c *gin.Context) { cid := ginx.UrlParamInt64(c, "id") nc, err := models.NotifyChannelGet(rt.Ctx, "id = ?", cid) ginx.Dangerous(err) if nc == nil { ginx.Bomb(http.StatusNotFound, "notify channel not found") } ginx.NewRender(c).Data(nc, nil) } func (rt *Router) notifyChannelGetBy(c *gin.Context) { ident := ginx.QueryStr(c, "ident") nc, err := models.NotifyChannelGet(rt.Ctx, "ident = ?", ident) ginx.Dangerous(err) if nc == nil { ginx.Bomb(http.StatusNotFound, "notify channel not found") } nc.ParamConfig = &models.NotifyParamConfig{} nc.RequestConfig = &models.RequestConfig{} ginx.NewRender(c).Data(nc, nil) } func (rt *Router) notifyChannelsGet(c *gin.Context) { lst, err := models.NotifyChannelsGet(rt.Ctx, "", nil) if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } func (rt *Router) notifyChannelsGetForNormalUser(c *gin.Context) { lst, err := models.NotifyChannelsGet(rt.Ctx, "") ginx.Dangerous(err) newLst := make([]*models.NotifyChannelConfig, 0, len(lst)) for _, c := range lst { newLst = append(newLst, &models.NotifyChannelConfig{ ID: c.ID, Name: c.Name, Ident: c.Ident, Enable: c.Enable, RequestType: c.RequestType, ParamConfig: c.ParamConfig, }) } ginx.NewRender(c).Data(newLst, nil) } func (rt *Router) notifyChannelIdentsGet(c *gin.Context) { // 获取所有通知渠道 channels, err := models.NotifyChannelsGet(rt.Ctx, "", nil) ginx.Dangerous(err) // ident 去重 idents := make(map[string]struct{}) for _, channel := range channels { if channel.Ident != "" { idents[channel.Ident] = struct{}{} } } lst := make([]string, 0, 
// flushDutyChannelsResponse is the JSON envelope decoded from the FlashDuty
// "channel/list-by-integration" response: an optional error object plus the
// channel items.
type flushDutyChannelsResponse struct {
	Error struct {
		Code    string `json:"code"`
		Message string `json:"message"`
	} `json:"error"`
	Data struct {
		Items []FlashDutyChannel `json:"items"`
		Total int                `json:"total"`
	} `json:"data"`
}

// FlashDutyChannel is one channel entry of a FlashDuty channel listing.
type FlashDutyChannel struct {
	ChannelID   int    `json:"channel_id"`
	ChannelName string `json:"channel_name"`
	Status      string `json:"status"`
}

// getFlashDutyChannels fetches the channel list for a FlashDuty integration.
//
// integrationUrl must carry an "integration_key" query parameter; only its
// scheme and host are reused to build the API URL. jsonData is sent as the
// POST body and timeout bounds the whole HTTP exchange.
//
// An error is returned when the key is missing, the request fails, the API
// reports an error message, or the response has a non-2xx status.
func getFlashDutyChannels(integrationUrl string, jsonData []byte, timeout time.Duration) ([]FlashDutyChannel, error) {
	baseUrl, integrationKey, err := parseIntegrationUrl(integrationUrl)
	if err != nil {
		return nil, err
	}
	if integrationKey == "" {
		return nil, fmt.Errorf("integration_key not found in URL")
	}

	// Build the API URL on the same scheme://host. The key is escaped because
	// url.Values.Get returned it decoded, and it may contain reserved characters.
	apiUrl := fmt.Sprintf("%s/channel/list-by-integration?integration_key=%s", baseUrl, url.QueryEscape(integrationKey))

	req, err := http.NewRequest(http.MethodPost, apiUrl, bytes.NewReader(jsonData))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	httpResp, err := (&http.Client{Timeout: timeout}).Do(req)
	if err != nil {
		return nil, err
	}
	defer httpResp.Body.Close()

	body, err := io.ReadAll(httpResp.Body)
	if err != nil {
		return nil, err
	}

	return decodeFlashDutyChannels(httpResp.StatusCode, body)
}

// decodeFlashDutyChannels turns a FlashDuty response (status code and raw
// body) into the channel list, or into an error when the API reported one.
func decodeFlashDutyChannels(statusCode int, body []byte) ([]FlashDutyChannel, error) {
	failed := statusCode < 200 || statusCode > 299

	var res flushDutyChannelsResponse
	if err := json.Unmarshal(body, &res); err != nil {
		if failed {
			// A non-JSON error page: the status is more useful than the parse error.
			return nil, fmt.Errorf("flashduty api returned http status %d", statusCode)
		}
		return nil, err
	}

	if res.Error.Message != "" {
		// The message is data, not a format string: a '%' in it must survive.
		return nil, fmt.Errorf("%s", res.Error.Message)
	}
	if failed {
		return nil, fmt.Errorf("flashduty api returned http status %d", statusCode)
	}

	return res.Data.Items, nil
}

// parseIntegrationUrl splits urlStr into its "scheme://host" base and the
// value of its "integration_key" query parameter (empty when absent).
func parseIntegrationUrl(urlStr string) (baseUrl string, integrationKey string, err error) {
	parsedUrl, err := url.Parse(urlStr)
	if err != nil {
		return "", "", err
	}

	host := fmt.Sprintf("%s://%s", parsedUrl.Scheme, parsedUrl.Host)
	integrationKey = parsedUrl.Query().Get("integration_key")

	return host, integrationKey, nil
}
// PagerDutyIntegration is an integration entry as decoded from the PagerDuty
// API.
type PagerDutyIntegration struct {
	ID             string `json:"id"`
	IntegrationKey string `json:"integration_key"`
	Self           string `json:"self"` // API URL of the integration
	Summary        string `json:"summary"`
}

// PagerDutyService is a service entry together with its integrations.
type PagerDutyService struct {
	Name         string                 `json:"name"`
	ID           string                 `json:"id"`
	Integrations []PagerDutyIntegration `json:"integrations"`
}

// getPagerDutyServices pages through the PagerDuty services endpoint and
// returns every service with its integrations.
//
// apiKey is sent as the API token and timeout bounds each page request. An
// error is returned when a request fails, the API answers with a non-2xx
// status, or a page cannot be decoded.
func getPagerDutyServices(apiKey string, timeout time.Duration) ([]PagerDutyService, error) {
	const limit = 100 // page size requested from the API

	var (
		offset      uint // pagination offset
		allServices []PagerDutyService
	)

	for {
		pageUrl := fmt.Sprintf("https://api.pagerduty.com/services?limit=%d&offset=%d", limit, offset)
		body, err := pagerDutyGet(pageUrl, apiKey, timeout)
		if err != nil {
			return nil, err
		}

		// Response page including the pagination fields.
		var serviceRes struct {
			Services []PagerDutyService `json:"services"`
			More     bool               `json:"more"` // whether further pages exist
			Limit    uint               `json:"limit"`
			Offset   uint               `json:"offset"`
		}
		if err := json.Unmarshal(body, &serviceRes); err != nil {
			return nil, err
		}
		allServices = append(allServices, serviceRes.Services...)

		// Stop on the last page; a short page also ends the loop.
		if !serviceRes.More || len(serviceRes.Services) < limit {
			break
		}
		offset += limit
	}

	return allServices, nil
}

// getPagerDutyIntegrationKey fetches the integration at integrationUrl (its
// API URL) and returns its integration key.
//
// An error is returned when the request fails, the API answers with a non-2xx
// status, or the body cannot be decoded.
func getPagerDutyIntegrationKey(integrationUrl, apiKey string, timeout time.Duration) (string, error) {
	body, err := pagerDutyGet(integrationUrl, apiKey, timeout)
	if err != nil {
		return "", err
	}

	var integRes struct {
		Integration struct {
			IntegrationKey string `json:"integration_key"`
		} `json:"integration"`
	}
	if err := json.Unmarshal(body, &integRes); err != nil {
		return "", err
	}

	return integRes.Integration.IntegrationKey, nil
}

// pagerDutyGet performs an authenticated GET against the PagerDuty API and
// returns the response body of a successful (2xx) call.
func pagerDutyGet(rawUrl, apiKey string, timeout time.Duration) ([]byte, error) {
	req, err := http.NewRequest(http.MethodGet, rawUrl, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", fmt.Sprintf("Token token=%s", apiKey))
	req.Header.Set("Accept", "application/vnd.pagerduty+json;version=2")

	httpResp, err := (&http.Client{Timeout: timeout}).Do(req)
	if err != nil {
		return nil, err
	}
	defer httpResp.Body.Close()

	body, err := io.ReadAll(httpResp.Body)
	if err != nil {
		return nil, err
	}
	if err := checkPagerDutyStatus(httpResp.StatusCode, body); err != nil {
		return nil, err
	}

	return body, nil
}

// checkPagerDutyStatus returns nil for a 2xx status. Otherwise it returns an
// error carrying the status code and, when the body has one, the API's
// error message. Without this check a rejected API key would decode into
// empty structs and look like an empty but successful result.
func checkPagerDutyStatus(statusCode int, body []byte) error {
	if statusCode >= 200 && statusCode <= 299 {
		return nil
	}

	var apiErr struct {
		Error struct {
			Message string `json:"message"`
		} `json:"error"`
	}
	if err := json.Unmarshal(body, &apiErr); err == nil && apiErr.Error.Message != "" {
		return fmt.Errorf("pagerduty api returned http status %d: %s", statusCode, apiErr.Error.Message)
	}
	return fmt.Errorf("pagerduty api returned http status %d", statusCode)
}
webhookGets(c *gin.Context) { var webhooks []models.Webhook cval, err := models.ConfigsGet(rt.Ctx, models.WEBHOOKKEY) ginx.Dangerous(err) if cval == "" { ginx.NewRender(c).Data(webhooks, nil) return } err = json.Unmarshal([]byte(cval), &webhooks) ginx.NewRender(c).Data(webhooks, err) } func (rt *Router) webhookPuts(c *gin.Context) { var webhooks []models.Webhook ginx.BindJSON(c, &webhooks) for i := 0; i < len(webhooks); i++ { webhooks[i].Headers = []string{} if len(webhooks[i].HeaderMap) > 0 { for k, v := range webhooks[i].HeaderMap { webhooks[i].Headers = append(webhooks[i].Headers, k) webhooks[i].Headers = append(webhooks[i].Headers, v) } } } data, err := json.Marshal(webhooks) ginx.Dangerous(err) username := c.MustGet("username").(string) ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.WEBHOOKKEY, string(data), username)) } func (rt *Router) notifyScriptGet(c *gin.Context) { var notifyScript models.NotifyScript cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYSCRIPT) ginx.Dangerous(err) if cval == "" { ginx.NewRender(c).Data(notifyScript, nil) return } err = json.Unmarshal([]byte(cval), ¬ifyScript) ginx.NewRender(c).Data(notifyScript, err) } func (rt *Router) notifyScriptPut(c *gin.Context) { var notifyScript models.NotifyScript ginx.BindJSON(c, ¬ifyScript) data, err := json.Marshal(notifyScript) ginx.Dangerous(err) username := c.MustGet("username").(string) ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.NOTIFYSCRIPT, string(data), username)) } func (rt *Router) notifyChannelGets(c *gin.Context) { var notifyChannels []models.NotifyChannel cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCHANNEL) ginx.Dangerous(err) if cval == "" { ginx.NewRender(c).Data(notifyChannels, nil) return } err = json.Unmarshal([]byte(cval), ¬ifyChannels) ginx.NewRender(c).Data(notifyChannels, err) } func (rt *Router) notifyChannelPuts(c *gin.Context) { var notifyChannels []models.NotifyChannel ginx.BindJSON(c, ¬ifyChannels) channels := 
[]string{models.Dingtalk, models.Wecom, models.Feishu, models.Mm, models.Telegram, models.Email, models.Lark, models.LarkCard} m := make(map[string]struct{}) for _, v := range notifyChannels { m[v.Ident] = struct{}{} } for _, v := range channels { if _, ok := m[v]; !ok { ginx.Bomb(200, "channel %s ident can not modify", v) } } data, err := json.Marshal(notifyChannels) ginx.Dangerous(err) username := c.MustGet("username").(string) ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.NOTIFYCHANNEL, string(data), username)) } func (rt *Router) notifyContactGets(c *gin.Context) { notifyContacts := []models.NotifyContact{} cval, err := models.ConfigsGet(rt.Ctx, models.NOTIFYCONTACT) ginx.Dangerous(err) if cval == "" { ginx.NewRender(c).Data(notifyContacts, nil) return } err = json.Unmarshal([]byte(cval), ¬ifyContacts) ginx.NewRender(c).Data(notifyContacts, err) } func (rt *Router) notifyContactPuts(c *gin.Context) { var notifyContacts []models.NotifyContact ginx.BindJSON(c, ¬ifyContacts) data, err := json.Marshal(notifyContacts) ginx.Dangerous(err) username := c.MustGet("username").(string) ginx.NewRender(c).Message(models.ConfigsSetWithUname(rt.Ctx, models.NOTIFYCONTACT, string(data), username)) } func (rt *Router) notifyConfigGet(c *gin.Context) { key := ginx.QueryStr(c, "ckey") cval, err := models.ConfigsGet(rt.Ctx, key) if cval == "" { switch key { case models.IBEX: cval = memsto.DefaultIbex case models.SMTP: cval = memsto.DefaultSMTP } } ginx.NewRender(c).Data(cval, err) } func (rt *Router) notifyConfigPut(c *gin.Context) { var f models.Configs ginx.BindJSON(c, &f) userVariableMap := rt.NotifyConfigCache.ConfigCache.Get() text := tplx.ReplaceTemplateUseText(f.Ckey, f.Cval, userVariableMap) switch f.Ckey { case models.SMTP: var smtp aconf.SMTPConfig err := toml.Unmarshal([]byte(text), &smtp) ginx.Dangerous(err) default: ginx.Bomb(200, "key %s can not modify", f.Ckey) } username := c.MustGet("username").(string) //insert or update built-in config 
ginx.Dangerous(models.ConfigsSetWithUname(rt.Ctx, f.Ckey, f.Cval, username)) if f.Ckey == models.SMTP { // 重置邮件发送器 smtp, errSmtp := SmtpValidate(text) ginx.Dangerous(errSmtp) go sender.RestartEmailSender(rt.Ctx, smtp) } ginx.NewRender(c).Message(nil) } func SmtpValidate(text string) (aconf.SMTPConfig, error) { var smtp aconf.SMTPConfig var err error err = toml.Unmarshal([]byte(text), &smtp) if err != nil { return smtp, err } if smtp.Host == "" || smtp.Port == 0 { return smtp, fmt.Errorf("smtp host or port can not be empty") } return smtp, err } type form struct { models.Configs Email string `json:"email"` } // After configuring the aconf.SMTPConfig, users can choose to perform a test. In this test, the function attempts to send an email func (rt *Router) attemptSendEmail(c *gin.Context) { var f form ginx.BindJSON(c, &f) if f.Email = strings.TrimSpace(f.Email); f.Email == "" || !str.IsMail(f.Email) { ginx.Bomb(200, "email(%s) invalid", f.Email) } if f.Ckey != models.SMTP { ginx.Bomb(200, "config(%v) invalid", f) } userVariableMap := rt.NotifyConfigCache.ConfigCache.Get() text := tplx.ReplaceTemplateUseText(f.Ckey, f.Cval, userVariableMap) smtp, err := SmtpValidate(text) ginx.Dangerous(err) ginx.NewRender(c).Message(sender.SendEmail("Email test", "email content", []string{f.Email}, smtp)) } func (rt *Router) notifyChannelConfigGets(c *gin.Context) { id := ginx.QueryInt64(c, "id", 0) name := ginx.QueryStr(c, "name", "") ident := ginx.QueryStr(c, "ident", "") enabled := ginx.QueryInt(c, "enabled", -1) notifyChannels, err := models.NotifyChannelGets(rt.Ctx, id, name, ident, enabled) ginx.NewRender(c).Data(notifyChannels, err) } ================================================ FILE: center/router/router_notify_rule.go ================================================ package router import ( "fmt" "net/http" "time" "github.com/ccfos/nightingale/v6/alert/dispatch" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" 
"github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/slice" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) func (rt *Router) notifyRulesAdd(c *gin.Context) { var lst []*models.NotifyRule ginx.BindJSON(c, &lst) if len(lst) == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } me := c.MustGet("user").(*models.User) isAdmin := me.IsAdmin() gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) now := time.Now().Unix() for _, nr := range lst { ginx.Dangerous(nr.Verify()) if !isAdmin && !slice.HaveIntersection(gids, nr.UserGroupIds) { ginx.Bomb(http.StatusForbidden, "forbidden") } nr.CreateBy = me.Username nr.CreateAt = now nr.UpdateBy = me.Username nr.UpdateAt = now err := models.Insert(rt.Ctx, nr) ginx.Dangerous(err) } ginx.NewRender(c).Data(lst, nil) } func (rt *Router) notifyRulesDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() if me := c.MustGet("user").(*models.User); !me.IsAdmin() { lst, err := models.NotifyRulesGet(rt.Ctx, "id in (?)", f.Ids) ginx.Dangerous(err) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) for _, t := range lst { if !slice.HaveIntersection(gids, t.UserGroupIds) { ginx.Bomb(http.StatusForbidden, "forbidden") } } } ginx.NewRender(c).Message(models.DB(rt.Ctx). 
Delete(&models.NotifyRule{}, "id in (?)", f.Ids).Error) } func (rt *Router) notifyRulePut(c *gin.Context) { var f models.NotifyRule ginx.BindJSON(c, &f) nr, err := models.NotifyRuleGet(rt.Ctx, "id = ?", ginx.UrlParamInt64(c, "id")) ginx.Dangerous(err) if nr == nil { ginx.Bomb(http.StatusNotFound, "notify rule not found") } me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if !slice.HaveIntersection(gids, nr.UserGroupIds) && !me.IsAdmin() { ginx.Bomb(http.StatusForbidden, "forbidden") } f.UpdateBy = me.Username ginx.NewRender(c).Message(nr.Update(rt.Ctx, f)) } func (rt *Router) notifyRuleGet(c *gin.Context) { me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) tid := ginx.UrlParamInt64(c, "id") nr, err := models.NotifyRuleGet(rt.Ctx, "id = ?", tid) ginx.Dangerous(err) if nr == nil { ginx.Bomb(http.StatusNotFound, "notify rule not found") } if !slice.HaveIntersection(gids, nr.UserGroupIds) && !me.IsAdmin() { ginx.Bomb(http.StatusForbidden, "forbidden") } ginx.NewRender(c).Data(nr, nil) } func (rt *Router) notifyRulesGetByService(c *gin.Context) { ginx.NewRender(c).Data(models.NotifyRulesGet(rt.Ctx, "enable = ?", true)) } func (rt *Router) notifyRulesGet(c *gin.Context) { me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) lst, err := models.NotifyRulesGet(rt.Ctx, "", nil) ginx.Dangerous(err) models.FillUpdateByNicknames(rt.Ctx, lst) if me.IsAdmin() { ginx.NewRender(c).Data(lst, nil) return } res := make([]*models.NotifyRule, 0) for _, nr := range lst { if slice.HaveIntersection[int64](gids, nr.UserGroupIds) { res = append(res, nr) } } ginx.NewRender(c).Data(res, nil) } type NotifyTestForm struct { EventIDs []int64 `json:"event_ids" binding:"required"` NotifyConfig models.NotifyConfig `json:"notify_config" binding:"required"` } func (rt *Router) notifyTest(c *gin.Context) { var f NotifyTestForm 
ginx.BindJSON(c, &f) hisEvents, err := models.AlertHisEventGetByIds(rt.Ctx, f.EventIDs) ginx.Dangerous(err) if len(hisEvents) == 0 { ginx.Bomb(http.StatusBadRequest, "event not found") } ginx.Dangerous(err) events := []*models.AlertCurEvent{} for _, he := range hisEvents { event := he.ToCur() event.SetTagsMap() if err := dispatch.NotifyRuleMatchCheck(&f.NotifyConfig, event); err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } events = append(events, event) } resp, err := SendNotifyChannelMessage(rt.Ctx, rt.UserCache, rt.UserGroupCache, f.NotifyConfig, events) if resp == "" { resp = "success" } ginx.NewRender(c).Data(resp, err) } func SendNotifyChannelMessage(ctx *ctx.Context, userCache *memsto.UserCacheType, userGroup *memsto.UserGroupCacheType, notifyConfig models.NotifyConfig, events []*models.AlertCurEvent) (string, error) { notifyChannels, err := models.NotifyChannelGets(ctx, notifyConfig.ChannelID, "", "", -1) if err != nil { return "", fmt.Errorf("failed to get notify channels: %v", err) } if len(notifyChannels) == 0 { return "", fmt.Errorf("notify channel not found") } notifyChannel := notifyChannels[0] if !notifyChannel.Enable { return "", fmt.Errorf("notify channel not enabled, please enable it first") } // 获取站点URL用于模板渲染 siteUrl, _ := models.ConfigsGetSiteUrl(ctx) if siteUrl == "" { siteUrl = "http://127.0.0.1:17000" } tplContent := make(map[string]interface{}) if notifyChannel.RequestType != "flashduty" { messageTemplates, err := models.MessageTemplateGets(ctx, notifyConfig.TemplateID, "", "") if err != nil { return "", fmt.Errorf("failed to get message templates: %v", err) } if len(messageTemplates) == 0 { return "", fmt.Errorf("message template not found") } tplContent = messageTemplates[0].RenderEvent(events, siteUrl) } var contactKey string if notifyChannel.ParamConfig != nil && notifyChannel.ParamConfig.UserInfo != nil { contactKey = notifyChannel.ParamConfig.UserInfo.ContactKey } sendtos, flashDutyChannelIDs, pagerDutyRoutingKeys, 
customParams := dispatch.GetNotifyConfigParams(¬ifyConfig, contactKey, userCache, userGroup) var resp string switch notifyChannel.RequestType { case "flashduty": client, err := models.GetHTTPClient(notifyChannel) if err != nil { return "", fmt.Errorf("failed to get http client: %v", err) } for i := range flashDutyChannelIDs { resp, err = notifyChannel.SendFlashDuty(events, flashDutyChannelIDs[i], client) if err != nil { return "", fmt.Errorf("failed to send flashduty notify: %v", err) } } logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, resp, err) return resp, nil case "pagerduty": client, err := models.GetHTTPClient(notifyChannel) if err != nil { return "", fmt.Errorf("failed to get http client: %v", err) } for _, routingKey := range pagerDutyRoutingKeys { resp, err = notifyChannel.SendPagerDuty(events, routingKey, siteUrl, client) if err != nil { return "", fmt.Errorf("failed to send pagerduty notify: %v", err) } } logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, resp, err) return resp, nil case "http": client, err := models.GetHTTPClient(notifyChannel) if err != nil { return "", fmt.Errorf("failed to get http client: %v", err) } if notifyChannel.RequestConfig == nil { return "", fmt.Errorf("request config is nil") } if notifyChannel.RequestConfig.HTTPRequestConfig == nil { return "", fmt.Errorf("http request config is nil") } if dispatch.NeedBatchContacts(notifyChannel.RequestConfig.HTTPRequestConfig) || len(sendtos) == 0 { resp, err = notifyChannel.SendHTTP(events, tplContent, customParams, sendtos, client) logger.Infof("channel_name: %v, event:%s, sendtos:%+v, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, sendtos, tplContent, customParams, resp, err) if err != nil { return "", fmt.Errorf("failed 
to send http notify: %v", err) } return resp, nil } else { for i := range sendtos { resp, err = notifyChannel.SendHTTP(events, tplContent, customParams, []string{sendtos[i]}, client) logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, sendto:%+v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, sendtos[i], resp, err) if err != nil { return "", fmt.Errorf("failed to send http notify: %v", err) } } return resp, nil } case "smtp": if len(sendtos) == 0 { return "", fmt.Errorf("no valid email address in the user and team") } err := notifyChannel.SendEmailNow(events, tplContent, sendtos) if err != nil { return "", fmt.Errorf("failed to send email notify: %v", err) } return resp, nil case "script": resp, _, err := notifyChannel.SendScript(events, tplContent, customParams, sendtos) logger.Infof("channel_name: %v, event:%s, tplContent:%s, customParams:%v, respBody: %v, err: %v", notifyChannel.Name, events[0].Hash, tplContent, customParams, resp, err) return resp, err default: logger.Errorf("unsupported request type: %v", notifyChannel.RequestType) return "", fmt.Errorf("unsupported request type") } } type paramList struct { Name string `json:"name"` CName string `json:"cname"` Value interface{} `json:"value"` } func (rt *Router) notifyRuleCustomParamsGet(c *gin.Context) { notifyChannelID := ginx.QueryInt64(c, "notify_channel_id") me := c.MustGet("user").(*models.User) gids, err := models.MyGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) notifyChannel, err := models.NotifyChannelGet(rt.Ctx, "id=?", notifyChannelID) ginx.Dangerous(err) keyMap := make(map[string]string) if notifyChannel == nil { ginx.NewRender(c).Data([][]paramList{}, nil) return } if notifyChannel.ParamConfig == nil { ginx.NewRender(c).Data([][]paramList{}, nil) return } for _, param := range notifyChannel.ParamConfig.Custom.Params { keyMap[param.Key] = param.CName } lst, err := models.NotifyRulesGet(rt.Ctx, "", nil) ginx.Dangerous(err) res := 
make([][]paramList, 0)
	filter := make(map[string]struct{})

	for _, nr := range lst {
		if !slice.HaveIntersection[int64](gids, nr.UserGroupIds) {
			continue
		}

		for _, nc := range nr.NotifyConfigs {
			if nc.ChannelID != notifyChannelID {
				continue
			}

			list := make([]paramList, 0)
			filterKey := ""
			for key, value := range nc.Params {
				// Look up the custom variable declared on the notify channel and attach its cname.
				cname, exists := keyMap[key]
				if exists {
					list = append(list, paramList{
						Name:  key,
						CName: cname,
						Value: value,
					})
				}
				filterKey += fmt.Sprintf("%s:%s,", key, value)
			}
			if _, ok := filter[filterKey]; ok {
				continue
			}
			filter[filterKey] = struct{}{}
			res = append(res, list)
		}
	}

	ginx.NewRender(c).Data(res, nil)
}

================================================
FILE: center/router/router_notify_tpl.go
================================================
package router

import (
	"bytes"
	"encoding/json"
	"fmt"
	"html/template"
	"strings"
	"time"

	"github.com/ccfos/nightingale/v6/center/cconf"
	"github.com/ccfos/nightingale/v6/models"
	"github.com/ccfos/nightingale/v6/pkg/tplx"

	"github.com/ccfos/nightingale/v6/pkg/ginx"
	"github.com/gin-gonic/gin"
	"github.com/toolkits/pkg/str"
)

// notifyTplGets lists all notify templates. A template is flagged BuiltIn
// when its Channel is one of models.DefaultChannels or models.EmailSubject.
func (rt *Router) notifyTplGets(c *gin.Context) {
	// Set of channel names that count as built-in.
	m := make(map[string]struct{})
	for _, channel := range models.DefaultChannels {
		m[channel] = struct{}{}
	}
	m[models.EmailSubject] = struct{}{}

	lst, err := models.NotifyTplGets(rt.Ctx)
	ginx.Dangerous(err)
	for i := 0; i < len(lst); i++ {
		if _, exists := m[lst[i].Channel]; exists {
			lst[i].BuiltIn = true
		}
	}

	models.FillUpdateByNicknames(rt.Ctx, lst)

	ginx.NewRender(c).Data(lst, err)
}

// notifyTplUpdateContent validates the posted template and saves it through
// f.UpdateContent. Only the template's creator or an admin may do this.
func (rt *Router) notifyTplUpdateContent(c *gin.Context) {
	user := c.MustGet("user").(*models.User)
	var f models.NotifyTpl
	ginx.BindJSON(c, &f)
	ginx.Dangerous(templateValidate(f))

	// NOTE(review): notifyTpl is dereferenced without a nil check; presumably
	// NotifyTplGet returns an error for a missing id — confirm.
	notifyTpl, err := models.NotifyTplGet(rt.Ctx, f.Id)
	ginx.Dangerous(err)

	if notifyTpl.CreateBy != user.Username && !user.IsAdmin() {
		ginx.Bomb(403, "forbidden")
	}

	f.UpdateAt = time.Now().Unix()
	f.UpdateBy = user.Username

	ginx.NewRender(c).Message(f.UpdateContent(rt.Ctx))
}

// notifyTplUpdate renames an existing template. Only the creator or an admin
// may do this, and the request is refused when another template already uses
// the posted channel or name.
func (rt *Router) notifyTplUpdate(c *gin.Context) {
	var f models.NotifyTpl
	ginx.BindJSON(c, &f)
	ginx.Dangerous(templateValidate(f))

	user := c.MustGet("user").(*models.User)

	notifyTpl, err := models.NotifyTplGet(rt.Ctx, f.Id)
	ginx.Dangerous(err)

	if notifyTpl.CreateBy != user.Username && !user.IsAdmin() {
		ginx.Bomb(403, "forbidden")
	}

	// Count other templates (different id) that share the channel OR the name.
	count, err := models.Count(models.DB(rt.Ctx).Model(&models.NotifyTpl{}).Where("(channel = ? or name = ?) and id <> ?", f.Channel, f.Name, f.Id))
	ginx.Dangerous(err)
	if count != 0 {
		ginx.Bomb(200, "Refuse to create duplicate channel or name")
	}

	// Only the name and the audit fields are copied from the request.
	notifyTpl.UpdateAt = time.Now().Unix()
	notifyTpl.UpdateBy = user.Username
	notifyTpl.Name = f.Name

	ginx.NewRender(c).Message(notifyTpl.Update(rt.Ctx))
}

// templateValidate checks the channel and name of f (length limits and
// str.Dangerous) and, when Content is non-empty, verifies that it parses as
// an html/template using tplx.TemplateFuncMap. It returns nil when f is valid.
func templateValidate(f models.NotifyTpl) error {
	if len(f.Channel) > 32 {
		return fmt.Errorf("channel length should not exceed 32")
	}

	if str.Dangerous(f.Channel) {
		return fmt.Errorf("channel should not contain dangerous characters")
	}

	if len(f.Name) > 255 {
		return fmt.Errorf("name length should not exceed 255")
	}

	if str.Dangerous(f.Name) {
		return fmt.Errorf("name should not contain dangerous characters")
	}

	// Empty content is accepted without parsing.
	if f.Content == "" {
		return nil
	}

	// Predefine $labels and $value so the content may reference them.
	var defs = []string{
		"{{$labels := .TagsMap}}",
		"{{$value := .TriggerValue}}",
	}
	text := strings.Join(append(defs, f.Content), "")

	if _, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(text); err != nil {
		return fmt.Errorf("notify template verify illegal:%s", err.Error())
	}

	return nil
}

// notifyTplPreview renders the posted template content against the example
// event in cconf.EVENT_EXAMPLE and returns the rendered text. When execution
// fails, the error message is returned as the data instead.
func (rt *Router) notifyTplPreview(c *gin.Context) {
	var event models.AlertCurEvent
	err := json.Unmarshal([]byte(cconf.EVENT_EXAMPLE), &event)
	ginx.Dangerous(err)

	var f models.NotifyTpl
	ginx.BindJSON(c, &f)

	var defs = []string{
		"{{$labels := .TagsMap}}",
		"{{$value := .TriggerValue}}",
	}
	text := strings.Join(append(defs, f.Content), "")
	tpl, err := template.New(f.Channel).Funcs(tplx.TemplateFuncMap).Parse(text)
	ginx.Dangerous(err)

	// Build TagsMap from the "key=value" entries of TagsJSON; malformed or
	// empty entries are skipped.
	event.TagsMap = make(map[string]string)
	for i := 0; i < len(event.TagsJSON); i++ {
		pair := strings.TrimSpace(event.TagsJSON[i])
		if pair == "" {
			continue
		}

		arr := strings.SplitN(pair, "=", 2)
		if len(arr) != 2 {
			continue
		}

		event.TagsMap[arr[0]] = arr[1]
	}

	var body bytes.Buffer
	var ret string
	if err := tpl.Execute(&body, event); err != nil {
		ret = err.Error()
	} else {
		ret = body.String()
	}

	ginx.NewRender(c).Data(ret, nil)
}

// notifyTplAdd creates a new notify template owned by the current user. The
// request is refused when a template with the same channel or name exists.
func (rt *Router) notifyTplAdd(c *gin.Context) {
	var f models.NotifyTpl
	ginx.BindJSON(c, &f)

	user := c.MustGet("user").(*models.User)
	f.CreateBy = user.Username
	f.Channel = strings.TrimSpace(f.Channel)
	ginx.Dangerous(templateValidate(f))

	count, err := models.Count(models.DB(rt.Ctx).Model(&models.NotifyTpl{}).Where("channel = ? or name = ?", f.Channel, f.Name))
	ginx.Dangerous(err)
	if count != 0 {
		ginx.Bomb(200, "Refuse to create duplicate channel(unique)")
	}

	f.CreateAt = time.Now().Unix()
	ginx.NewRender(c).Message(f.Create(rt.Ctx))
}

// notifyTplDel deletes the notify template named by the "id" URL parameter.
// Only the creator or an admin may delete it.
// NOTE(review): the original comment says system defaults
// (models.DefaultChannels) cannot be deleted; nothing here enforces that, so
// presumably NotifyTplDelete does — confirm.
func (rt *Router) notifyTplDel(c *gin.Context) {
	f := new(models.NotifyTpl)
	id := ginx.UrlParamInt64(c, "id")
	user := c.MustGet("user").(*models.User)

	notifyTpl, err := models.NotifyTplGet(rt.Ctx, id)
	ginx.Dangerous(err)

	if notifyTpl.CreateBy != user.Username && !user.IsAdmin() {
		ginx.Bomb(403, "forbidden")
	}

	ginx.NewRender(c).Message(f.NotifyTplDelete(rt.Ctx, id))
}

// messageTemplateGets lists message templates filtered by the optional "id",
// "name" and "ident" query parameters.
func (rt *Router) messageTemplateGets(c *gin.Context) {
	id := ginx.QueryInt64(c, "id", 0)
	name := ginx.QueryStr(c, "name", "")
	ident := ginx.QueryStr(c, "ident", "")

	tpls, err := models.MessageTemplateGets(rt.Ctx, id, name, ident)
	if err == nil {
		models.FillUpdateByNicknames(rt.Ctx, tpls)
	}
	ginx.NewRender(c).Data(tpls, err)
}

================================================
FILE: center/router/router_opensearch.go
================================================ package router import ( "github.com/ccfos/nightingale/v6/datasource/opensearch" "github.com/ccfos/nightingale/v6/dscache" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) func (rt *Router) QueryOSIndices(c *gin.Context) { var f IndexReq ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logger.Warningf("cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } indices, err := plug.(*opensearch.OpenSearch).QueryIndices() ginx.Dangerous(err) ginx.NewRender(c).Data(indices, nil) } func (rt *Router) QueryOSFields(c *gin.Context) { var f IndexReq ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logger.Warningf("cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } fields, err := plug.(*opensearch.OpenSearch).QueryFields([]string{f.Index}) ginx.Dangerous(err) ginx.NewRender(c).Data(fields, nil) } func (rt *Router) QueryOSVariable(c *gin.Context) { var f FieldValueReq ginx.BindJSON(c, &f) plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logger.Warningf("cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } fields, err := plug.(*opensearch.OpenSearch).QueryFieldValue([]string{f.Index}, f.Query.Field, f.Query.Query) ginx.Dangerous(err) ginx.NewRender(c).Data(fields, nil) } ================================================ FILE: center/router/router_proxy.go ================================================ package router import ( "context" "fmt" "net" "net/http" "net/http/httputil" "regexp" "strconv" "strings" "sync" "time" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/ccfos/nightingale/v6/pkg/poster" pkgprom "github.com/ccfos/nightingale/v6/pkg/prom" "github.com/ccfos/nightingale/v6/prom" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" 
"github.com/prometheus/common/model" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/net/httplib" ) type QueryFormItem struct { Start int64 `json:"start" binding:"required"` End int64 `json:"end" binding:"required"` Step int64 `json:"step" binding:"required"` Query string `json:"query" binding:"required"` } type BatchQueryForm struct { DatasourceId int64 `json:"datasource_id" binding:"required"` Queries []QueryFormItem `json:"queries" binding:"required"` } func (rt *Router) promBatchQueryRange(c *gin.Context) { var f BatchQueryForm ginx.Dangerous(c.BindJSON(&f)) lst, err := PromBatchQueryRange(c.Request.Context(), rt.PromClients, f) ginx.NewRender(c).Data(lst, err) } func PromBatchQueryRange(ctx context.Context, pc *prom.PromClientMap, f BatchQueryForm) ([]model.Value, error) { var lst []model.Value cli := pc.GetCli(f.DatasourceId) if cli == nil { logx.Warningf(ctx, "no such datasource id: %d", f.DatasourceId) return lst, fmt.Errorf("no such datasource id: %d", f.DatasourceId) } for _, item := range f.Queries { r := pkgprom.Range{ Start: time.Unix(item.Start, 0), End: time.Unix(item.End, 0), Step: time.Duration(item.Step) * time.Second, } resp, _, err := cli.QueryRange(ctx, item.Query, r) if err != nil { logx.Warningf(ctx, "query range error: query:%s err:%v", item.Query, err) return lst, err } lst = append(lst, resp) } return lst, nil } type BatchInstantForm struct { DatasourceId int64 `json:"datasource_id" binding:"required"` Queries []InstantFormItem `json:"queries" binding:"required"` } type InstantFormItem struct { Time int64 `json:"time" binding:"required"` Query string `json:"query" binding:"required"` } func (rt *Router) promBatchQueryInstant(c *gin.Context) { var f BatchInstantForm ginx.Dangerous(c.BindJSON(&f)) lst, err := PromBatchQueryInstant(c.Request.Context(), rt.PromClients, f) ginx.NewRender(c).Data(lst, err) } func PromBatchQueryInstant(ctx context.Context, pc *prom.PromClientMap, f BatchInstantForm) ([]model.Value, error) { var lst 
[]model.Value cli := pc.GetCli(f.DatasourceId) if cli == nil { logx.Warningf(ctx, "no such datasource id: %d", f.DatasourceId) return lst, fmt.Errorf("no such datasource id: %d", f.DatasourceId) } for _, item := range f.Queries { resp, _, err := cli.Query(ctx, item.Query, time.Unix(item.Time, 0)) if err != nil { logx.Warningf(ctx, "query instant error: query:%s err:%v", item.Query, err) return lst, err } lst = append(lst, resp) } return lst, nil } func (rt *Router) dsProxy(c *gin.Context) { dsId := ginx.UrlParamInt64(c, "id") ds := rt.DatasourceCache.GetById(dsId) if ds == nil { c.String(http.StatusBadRequest, "no such datasource") return } target, err := ds.HTTPJson.ParseUrl() if err != nil { c.String(http.StatusInternalServerError, "invalid urls: %s", ds.HTTPJson.GetUrls()) return } director := func(req *http.Request) { req.URL.Scheme = target.Scheme req.URL.Host = target.Host req.Host = target.Host req.Header.Set("Host", target.Host) // fe request e.g. /api/n9e/proxy/:id/* arr := strings.Split(req.URL.Path, "/") if len(arr) < 6 { c.String(http.StatusBadRequest, "invalid url path") return } req.URL.Path = strings.TrimRight(target.Path, "/") + "/" + strings.Join(arr[5:], "/") if target.RawQuery == "" || req.URL.RawQuery == "" { req.URL.RawQuery = target.RawQuery + req.URL.RawQuery } else { req.URL.RawQuery = target.RawQuery + "&" + req.URL.RawQuery } if _, ok := req.Header["User-Agent"]; !ok { req.Header.Set("User-Agent", "") } if ds.AuthJson.BasicAuthUser != "" { req.SetBasicAuth(ds.AuthJson.BasicAuthUser, ds.AuthJson.BasicAuthPassword) } else { req.Header.Del("Authorization") } headerCount := len(ds.HTTPJson.Headers) if headerCount > 0 { for key, value := range ds.HTTPJson.Headers { req.Header.Set(key, value) if key == "Host" { req.Host = value } } } } errFunc := func(w http.ResponseWriter, r *http.Request, err error) { http.Error(w, err.Error(), http.StatusBadGateway) } transport, has := transportGet(dsId, ds.UpdatedAt) if !has { // 使用 TLS 配置(支持 mTLS) 
tlsConfig, err := ds.HTTPJson.TLS.TLSConfig() if err != nil { c.String(http.StatusInternalServerError, "failed to create TLS config: %s", err.Error()) return } transport = &http.Transport{ TLSClientConfig: tlsConfig, Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{ Timeout: time.Duration(ds.HTTPJson.DialTimeout) * time.Millisecond, }).DialContext, ResponseHeaderTimeout: time.Duration(ds.HTTPJson.Timeout) * time.Millisecond, MaxIdleConnsPerHost: ds.HTTPJson.MaxIdleConnsPerHost, } transportPut(dsId, ds.UpdatedAt, transport) } modifyResponse := func(r *http.Response) error { if r.StatusCode == http.StatusUnauthorized { logx.Warningf(c.Request.Context(), "proxy path:%s unauthorized access ", c.Request.URL.Path) return fmt.Errorf("unauthorized access") } return nil } proxy := &httputil.ReverseProxy{ Director: director, Transport: transport, ErrorHandler: errFunc, ModifyResponse: modifyResponse, } proxy.ServeHTTP(c.Writer, c.Request) } var ( transports = map[int64]http.RoundTripper{} updatedAts = map[int64]int64{} transportsLock = &sync.Mutex{} ) func transportGet(dsid, newUpdatedAt int64) (http.RoundTripper, bool) { transportsLock.Lock() defer transportsLock.Unlock() tran, has := transports[dsid] if !has { return nil, false } oldUpdateAt, has := updatedAts[dsid] if !has { oldtran := tran.(*http.Transport) oldtran.CloseIdleConnections() delete(transports, dsid) return nil, false } if oldUpdateAt != newUpdatedAt { oldtran := tran.(*http.Transport) oldtran.CloseIdleConnections() delete(transports, dsid) delete(updatedAts, dsid) return nil, false } return tran, has } func transportPut(dsid, updatedat int64, tran http.RoundTripper) { transportsLock.Lock() transports[dsid] = tran updatedAts[dsid] = updatedat transportsLock.Unlock() } const ( DatasourceTypePrometheus = "Prometheus" DatasourceTypeVictoriaMetrics = "VictoriaMetrics" ) type deleteDatasourceSeriesForm struct { DatasourceID int64 `json:"datasource_id"` Match []string `json:"match"` Start string 
`json:"start"` End string `json:"end"` } func (rt *Router) deleteDatasourceSeries(c *gin.Context) { var ddsf deleteDatasourceSeriesForm ginx.BindJSON(c, &ddsf) ds := rt.DatasourceCache.GetById(ddsf.DatasourceID) if ds == nil { ginx.Bomb(http.StatusBadRequest, "no such datasource") return } // Get datasource type, now only support prometheus and victoriametrics datasourceType, ok := ds.SettingsJson["prometheus.tsdb_type"] if !ok { ginx.Bomb(http.StatusBadRequest, "datasource type not found, please check your datasource settings") return } target, err := ds.HTTPJson.ParseUrl() if err != nil { ginx.Bomb(http.StatusInternalServerError, "invalid urls: %s", ds.HTTPJson.GetUrls()) return } timeout := time.Duration(ds.HTTPJson.DialTimeout) * time.Millisecond matchQueries := make([]string, 0) for _, match := range ddsf.Match { matchQueries = append(matchQueries, fmt.Sprintf("match[]=%s", match)) } matchQuery := strings.Join(matchQueries, "&") switch datasourceType { case DatasourceTypePrometheus: // Prometheus delete api need POST method // https://prometheus.io/docs/prometheus/latest/querying/api/#delete-series url := fmt.Sprintf("http://%s/api/v1/admin/tsdb/delete_series?%s&start=%s&end=%s", target.Host, matchQuery, ddsf.Start, ddsf.End) go func() { resp, _, err := poster.PostJSON(url, timeout, nil) if err != nil { logger.Errorf("delete series error datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, err: %v", ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, err) return } logger.Infof("delete datasource series datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, respBody: %s", ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, string(resp)) }() case DatasourceTypeVictoriaMetrics: // Delete API doesn’t support the deletion of specific time ranges. 
// Refer: https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#how-to-delete-time-series var url string // Check VictoriaMetrics is single node or cluster // Cluster will have /select//prometheus pattern re := regexp.MustCompile(`/select/(\d+)/prometheus`) matches := re.FindStringSubmatch(ds.HTTPJson.Url) if len(matches) > 0 && matches[1] != "" { accountID, err := strconv.Atoi(matches[1]) if err != nil { ginx.Bomb(http.StatusInternalServerError, "invalid accountID: %s", matches[1]) } url = fmt.Sprintf("http://%s/delete/%d/prometheus/api/v1/admin/tsdb/delete_series?%s", target.Host, accountID, matchQuery) } else { url = fmt.Sprintf("http://%s/api/v1/admin/tsdb/delete_series?%s", target.Host, matchQuery) } go func() { resp, err := httplib.Get(url).SetTimeout(timeout).Response() if err != nil { logger.Errorf("delete series failed | datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, err: %v", ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, err) return } logger.Infof("sending delete series request | datasource_id: %d, datasource_name: %s, match: %s, start: %s, end: %s, respBody: %s", ddsf.DatasourceID, ds.Name, ddsf.Match, ddsf.Start, ddsf.End, resp.Body) }() default: ginx.Bomb(http.StatusBadRequest, "not support delete series yet") } ginx.NewRender(c).Data(nil, nil) } ================================================ FILE: center/router/router_query.go ================================================ package router import ( "fmt" "sort" "sync" "github.com/ccfos/nightingale/v6/alert/eval" "github.com/ccfos/nightingale/v6/dscache" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) type CheckDsPermFunc func(c *gin.Context, dsId int64, cate string, q interface{}) bool var CheckDsPerm CheckDsPermFunc = func(c *gin.Context, dsId int64, cate string, q interface{}) bool { // todo: 后续需要根据 cate 判断是否需要权限 return true } 
type QueryFrom struct { Queries []Query `json:"queries"` Exps []Exp `json:"exps"` } type Query struct { Ref string `json:"ref"` Did int64 `json:"ds_id"` DsCate string `json:"ds_cate"` Query interface{} `json:"query"` } type Exp struct { Exp string `json:"exp"` Ref string `json:"ref"` } type LogResp struct { Total int64 `json:"total"` List []interface{} `json:"list"` } func QueryLogBatchConcurrently(anonymousAccess bool, ctx *gin.Context, f QueryFrom) (LogResp, error) { var resp LogResp var mu sync.Mutex var wg sync.WaitGroup var errs []error rctx := ctx.Request.Context() for _, q := range f.Queries { if !anonymousAccess && !CheckDsPerm(ctx, q.Did, q.DsCate, q) { return LogResp{}, fmt.Errorf("forbidden") } plug, exists := dscache.DsCache.Get(q.DsCate, q.Did) if !exists { logx.Warningf(rctx, "cluster:%d not exists query:%+v", q.Did, q) return LogResp{}, fmt.Errorf("cluster not exists") } // 根据数据源类型对 Query 进行模板渲染处理 err := eval.ExecuteQueryTemplate(q.DsCate, q.Query, nil) if err != nil { logx.Warningf(rctx, "query template execute error: %v", err) return LogResp{}, fmt.Errorf("query template execute error: %v", err) } wg.Add(1) go func(query Query) { defer wg.Done() data, total, err := plug.QueryLog(rctx, query.Query) mu.Lock() defer mu.Unlock() if err != nil { errMsg := fmt.Sprintf("query data error: %v query:%v\n ", err, query) logx.Warningf(rctx, "%s", errMsg) errs = append(errs, err) return } m := make(map[string]interface{}) m["ref"] = query.Ref m["ds_id"] = query.Did m["ds_cate"] = query.DsCate m["data"] = data resp.List = append(resp.List, m) resp.Total += total }(q) } wg.Wait() if len(errs) > 0 { return LogResp{}, errs[0] } if len(resp.List) == 0 { return LogResp{}, fmt.Errorf("no data") } return resp, nil } func (rt *Router) QueryLogBatch(c *gin.Context) { var f QueryFrom ginx.BindJSON(c, &f) resp, err := QueryLogBatchConcurrently(rt.Center.AnonymousAccess.PromQuerier, c, f) if err != nil { ginx.Bomb(200, "err:%v", err) } ginx.NewRender(c).Data(resp, nil) } 
func QueryDataConcurrently(anonymousAccess bool, ctx *gin.Context, f models.QueryParam) ([]models.DataResp, error) { var resp []models.DataResp var mu sync.Mutex var wg sync.WaitGroup var errs []error rctx := ctx.Request.Context() for _, q := range f.Queries { if !anonymousAccess && !CheckDsPerm(ctx, f.DatasourceId, f.Cate, q) { return nil, fmt.Errorf("forbidden") } plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logx.Warningf(rctx, "cluster:%d not exists", f.DatasourceId) return nil, fmt.Errorf("cluster not exists") } wg.Add(1) go func(query interface{}) { defer wg.Done() data, err := plug.QueryData(rctx, query) if err != nil { logx.Warningf(rctx, "query data error: req:%+v err:%v", query, err) mu.Lock() errs = append(errs, err) mu.Unlock() return } logx.Debugf(rctx, "query data: req:%+v resp:%+v", query, data) mu.Lock() resp = append(resp, data...) mu.Unlock() }(q) } wg.Wait() if len(errs) > 0 { return nil, errs[0] } // 面向API的统一处理 // 按照 .Metric 排序 // 确保仪表盘中相同图例的曲线颜色相同 if len(resp) > 1 { sort.Slice(resp, func(i, j int) bool { if resp[i].Metric != nil && resp[j].Metric != nil { return resp[i].Metric.String() < resp[j].Metric.String() } return false }) } return resp, nil } func (rt *Router) QueryData(c *gin.Context) { var f models.QueryParam ginx.BindJSON(c, &f) resp, err := QueryDataConcurrently(rt.Center.AnonymousAccess.PromQuerier, c, f) if err != nil { ginx.Bomb(200, "err:%v", err) } ginx.NewRender(c).Data(resp, nil) } // QueryLogConcurrently 并发查询日志 func QueryLogConcurrently(anonymousAccess bool, ctx *gin.Context, f models.QueryParam) (LogResp, error) { var resp LogResp var mu sync.Mutex var wg sync.WaitGroup var errs []error rctx := ctx.Request.Context() for _, q := range f.Queries { if !anonymousAccess && !CheckDsPerm(ctx, f.DatasourceId, f.Cate, q) { return LogResp{}, fmt.Errorf("forbidden") } plug, exists := dscache.DsCache.Get(f.Cate, f.DatasourceId) if !exists { logx.Warningf(rctx, "cluster:%d not exists query:%+v", f.DatasourceId, 
f) return LogResp{}, fmt.Errorf("cluster not exists") } wg.Add(1) go func(query interface{}) { defer wg.Done() data, total, err := plug.QueryLog(rctx, query) logx.Debugf(rctx, "query log: req:%+v resp:%+v", query, data) if err != nil { errMsg := fmt.Sprintf("query data error: %v query:%v\n ", err, query) logx.Warningf(rctx, "%s", errMsg) mu.Lock() errs = append(errs, err) mu.Unlock() return } mu.Lock() resp.List = append(resp.List, data...) resp.Total += total mu.Unlock() }(q) } wg.Wait() if len(errs) > 0 { return LogResp{}, errs[0] } if len(resp.List) == 0 { return LogResp{}, fmt.Errorf("no data") } return resp, nil } func (rt *Router) QueryLogV2(c *gin.Context) { var f models.QueryParam ginx.BindJSON(c, &f) resp, err := QueryLogConcurrently(rt.Center.AnonymousAccess.PromQuerier, c, f) ginx.NewRender(c).Data(resp, err) } func (rt *Router) QueryLog(c *gin.Context) { var f models.QueryParam ginx.BindJSON(c, &f) rctx := c.Request.Context() var resp []interface{} for _, q := range f.Queries { if !rt.Center.AnonymousAccess.PromQuerier && !CheckDsPerm(c, f.DatasourceId, f.Cate, q) { ginx.Bomb(200, "forbidden") } plug, exists := dscache.DsCache.Get("elasticsearch", f.DatasourceId) if !exists { logx.Warningf(rctx, "cluster:%d not exists", f.DatasourceId) ginx.Bomb(200, "cluster not exists") } data, _, err := plug.QueryLog(rctx, q) if err != nil { logx.Warningf(rctx, "query data error: %v", err) ginx.Bomb(200, "err:%v", err) continue } resp = append(resp, data...) 
} ginx.NewRender(c).Data(resp, nil) } ================================================ FILE: center/router/router_recording_rule.go ================================================ package router import ( "encoding/json" "net/http" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) recordingRuleGets(c *gin.Context) { busiGroupId := ginx.UrlParamInt64(c, "id") ars, err := models.RecordingRuleGets(rt.Ctx, busiGroupId) if err == nil { models.FillUpdateByNicknames(rt.Ctx, ars) } ginx.NewRender(c).Data(ars, err) } func (rt *Router) recordingRuleGetsByGids(c *gin.Context) { gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") if len(gids) > 0 { for _, gid := range gids { rt.bgroCheck(c, gid) } } else { me := c.MustGet("user").(*models.User) if !me.IsAdmin() { var err error gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(gids) == 0 { ginx.NewRender(c).Data([]int{}, nil) return } } } ars, err := models.RecordingRuleGetsByBGIds(rt.Ctx, gids) if err == nil { models.FillUpdateByNicknames(rt.Ctx, ars) } ginx.NewRender(c).Data(ars, err) } func (rt *Router) recordingRuleGetsByService(c *gin.Context) { ars, err := models.RecordingRuleEnabledGets(rt.Ctx) ginx.NewRender(c).Data(ars, err) } func (rt *Router) recordingRuleGet(c *gin.Context) { rrid := ginx.UrlParamInt64(c, "rrid") ar, err := models.RecordingRuleGetById(rt.Ctx, rrid) ginx.Dangerous(err) if ar == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such recording rule") return } ginx.NewRender(c).Data(ar, err) } func (rt *Router) recordingRuleAddByFE(c *gin.Context) { username := c.MustGet("username").(string) var lst []models.RecordingRule ginx.BindJSON(c, &lst) count := len(lst) if count == 0 { ginx.Bomb(http.StatusBadRequest, "input json is empty") } for i := range lst { if len(lst[i].DatasourceQueries) == 0 { lst[i].DatasourceQueries = 
[]models.DatasourceQuery{ models.DataSourceQueryAll, } } } bgid := ginx.UrlParamInt64(c, "id") reterr := make(map[string]string) for i := 0; i < count; i++ { lst[i].Id = 0 lst[i].GroupId = bgid lst[i].CreateBy = username lst[i].UpdateBy = username lst[i].FE2DB() if err := lst[i].Add(rt.Ctx); err != nil { reterr[lst[i].Name] = err.Error() } else { reterr[lst[i].Name] = "" } } ginx.NewRender(c).Data(reterr, nil) } func (rt *Router) recordingRulePutByFE(c *gin.Context) { var f models.RecordingRule ginx.BindJSON(c, &f) rrid := ginx.UrlParamInt64(c, "rrid") ar, err := models.RecordingRuleGetById(rt.Ctx, rrid) ginx.Dangerous(err) if ar == nil { ginx.NewRender(c, http.StatusNotFound).Message("No such recording rule") return } rt.bgrwCheck(c, ar.GroupId) rt.bgroCheck(c, f.GroupId) f.UpdateBy = c.MustGet("username").(string) ginx.NewRender(c).Message(ar.Update(rt.Ctx, f)) } func (rt *Router) recordingRuleDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() ginx.NewRender(c).Message(models.RecordingRuleDels(rt.Ctx, f.Ids, ginx.UrlParamInt64(c, "id"))) } type recordRuleFieldForm struct { Ids []int64 `json:"ids"` Fields map[string]interface{} `json:"fields"` } func (rt *Router) recordingRulePutFields(c *gin.Context) { var f recordRuleFieldForm ginx.BindJSON(c, &f) if len(f.Fields) == 0 { ginx.Bomb(http.StatusBadRequest, "fields empty") } f.Fields["update_by"] = c.MustGet("username").(string) f.Fields["update_at"] = time.Now().Unix() if datasourceQueries, ok := f.Fields["datasource_queries"]; ok { bytes, err := json.Marshal(datasourceQueries) ginx.Dangerous(err) f.Fields["datasource_queries"] = string(bytes) } if datasourceIds, ok := f.Fields["datasource_ids"]; ok { bytes, err := json.Marshal(datasourceIds) ginx.Dangerous(err) f.Fields["datasource_ids"] = string(bytes) } for i := 0; i < len(f.Ids); i++ { ar, err := models.RecordingRuleGetById(rt.Ctx, f.Ids[i]) ginx.Dangerous(err) if ar == nil { continue } ginx.Dangerous(ar.UpdateFieldsMap(rt.Ctx, f.Fields)) } 
ginx.NewRender(c).Message(nil) } ================================================ FILE: center/router/router_role.go ================================================ package router import ( "net/http" "strings" "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) rolesGets(c *gin.Context) { lst, err := models.RoleGetsAll(rt.Ctx) ginx.NewRender(c).Data(lst, err) } func (rt *Router) permsGets(c *gin.Context) { user := c.MustGet("user").(*models.User) if user.IsAdmin() { var lst []string for _, ops := range cconf.Operations.Ops { for _, op := range ops.Ops { lst = append(lst, op.Name) } } ginx.NewRender(c).Data(lst, nil) return } lst, err := models.OperationsOfRole(rt.Ctx, strings.Fields(user.Roles)) ginx.NewRender(c).Data(lst, err) } // 创建角色 func (rt *Router) roleAdd(c *gin.Context) { var f models.Role ginx.BindJSON(c, &f) err := f.Add(rt.Ctx) ginx.NewRender(c).Message(err) } // 更新角色 func (rt *Router) rolePut(c *gin.Context) { var f models.Role ginx.BindJSON(c, &f) oldRule, err := models.RoleGet(rt.Ctx, "id=?", f.Id) ginx.Dangerous(err) if oldRule == nil { ginx.Bomb(http.StatusOK, "role not found") } if oldRule.Name == "Admin" { ginx.Bomb(http.StatusOK, "admin role can not be modified") } if oldRule.Name != f.Name { // name changed, check duplication num, err := models.RoleCount(rt.Ctx, "name=? 
and id<>?", f.Name, oldRule.Id) ginx.Dangerous(err) if num > 0 { ginx.Bomb(http.StatusOK, "role name already exists") } } oldRule.Name = f.Name oldRule.Note = f.Note ginx.NewRender(c).Message(oldRule.Update(rt.Ctx, "name", "note")) } func (rt *Router) roleDel(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") target, err := models.RoleGet(rt.Ctx, "id=?", id) ginx.Dangerous(err) if target.Name == "Admin" { ginx.Bomb(http.StatusOK, "admin role can not be modified") } if target == nil { ginx.NewRender(c).Message(nil) return } ginx.NewRender(c).Message(target.Del(rt.Ctx)) } // 角色列表 func (rt *Router) roleGets(c *gin.Context) { lst, err := models.RoleGetsAll(rt.Ctx) ginx.NewRender(c).Data(lst, err) } func (rt *Router) allPerms(c *gin.Context) { roles, err := models.RoleGetsAll(rt.Ctx) ginx.Dangerous(err) m := make(map[string][]string) for _, r := range roles { lst, err := models.OperationsOfRole(rt.Ctx, strings.Fields(r.Name)) if err != nil { continue } m[r.Name] = lst } ginx.NewRender(c).Data(m, err) } ================================================ FILE: center/router/router_role_operation.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) func (rt *Router) operationOfRole(c *gin.Context) { var ( role *models.Role err error res []string roleOperations []string ) id := ginx.UrlParamInt64(c, "id") role, err = models.RoleGet(rt.Ctx, "id=?", id) ginx.Dangerous(err) if role == nil { ginx.Bomb(http.StatusOK, "role not found") } if role.Name == "Admin" { for _, ops := range cconf.Operations.Ops { for i := range ops.Ops { res = append(res, ops.Ops[i].Name) } } } else { roleOperations, err = models.OperationsOfRole(rt.Ctx, []string{role.Name}) res = roleOperations } ginx.NewRender(c).Data(res, err) } func (rt *Router) roleBindOperation(c *gin.Context) { 
id := ginx.UrlParamInt64(c, "id") role, err := models.RoleGet(rt.Ctx, "id=?", id) ginx.Dangerous(err) if role == nil { ginx.Bomb(http.StatusOK, "role not found") } if role.Name == "Admin" { ginx.Bomb(http.StatusOK, "admin role can not be modified") } var ops []string ginx.BindJSON(c, &ops) ginx.NewRender(c).Message(models.RoleOperationBind(rt.Ctx, role.Name, ops)) } func (rt *Router) operations(c *gin.Context) { var ops []cconf.Ops for _, v := range rt.Operations.Ops { newOp := cconf.Ops{ Name: v.Name, Cname: i18n.Sprintf(c.GetHeader("X-Language"), v.Cname), Ops: []cconf.SingleOp{}, } for i := range v.Ops { op := cconf.SingleOp{ Name: v.Ops[i].Name, Cname: i18n.Sprintf(c.GetHeader("X-Language"), v.Ops[i].Cname), } newOp.Ops = append(newOp.Ops, op) } ops = append(ops, newOp) } ginx.NewRender(c).Data(ops, nil) } ================================================ FILE: center/router/router_saved_view.go ================================================ package router import ( "net/http" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/slice" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) savedViewGets(c *gin.Context) { page := ginx.QueryStr(c, "page", "") me := c.MustGet("user").(*models.User) lst, err := models.SavedViewGets(rt.Ctx, page) if err != nil { ginx.NewRender(c).Data(nil, err) return } models.FillUpdateByNicknames(rt.Ctx, lst) userGids, err := models.MyGroupIds(rt.Ctx, me.Id) if err != nil { ginx.NewRender(c).Data(nil, err) return } favoriteMap, err := models.SavedViewFavoriteGetByUserId(rt.Ctx, me.Id) if err != nil { ginx.NewRender(c).Data(nil, err) return } favoriteViews := make([]models.SavedView, 0) normalViews := make([]models.SavedView, 0) for _, view := range lst { visible := view.CreateBy == me.Username || view.PublicCate == 2 || (view.PublicCate == 1 && slice.HaveIntersection[int64](userGids, view.Gids)) if !visible { continue } view.IsFavorite = favoriteMap[view.Id] // 收藏的排前面 if 
view.IsFavorite { favoriteViews = append(favoriteViews, view) } else { normalViews = append(normalViews, view) } } ginx.NewRender(c).Data(append(favoriteViews, normalViews...), nil) } func (rt *Router) savedViewAdd(c *gin.Context) { var f models.SavedView ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) f.Id = 0 f.CreateBy = me.Username f.UpdateBy = me.Username err := models.SavedViewAdd(rt.Ctx, &f) ginx.NewRender(c).Data(f.Id, err) } func (rt *Router) savedViewPut(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") view, err := models.SavedViewGetById(rt.Ctx, id) if err != nil { ginx.NewRender(c).Data(nil, err) return } if view == nil { ginx.NewRender(c, http.StatusNotFound).Message("saved view not found") return } me := c.MustGet("user").(*models.User) // 只有创建者可以更新 if view.CreateBy != me.Username && !me.IsAdmin() { ginx.NewRender(c, http.StatusForbidden).Message("forbidden") return } var f models.SavedView ginx.BindJSON(c, &f) view.Name = f.Name view.Filter = f.Filter view.PublicCate = f.PublicCate view.Gids = f.Gids err = models.SavedViewUpdate(rt.Ctx, view, me.Username) ginx.NewRender(c).Message(err) } func (rt *Router) savedViewDel(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") view, err := models.SavedViewGetById(rt.Ctx, id) if err != nil { ginx.NewRender(c).Data(nil, err) return } if view == nil { ginx.NewRender(c, http.StatusNotFound).Message("saved view not found") return } me := c.MustGet("user").(*models.User) // 只有创建者或管理员可以删除 if view.CreateBy != me.Username && !me.IsAdmin() { ginx.NewRender(c, http.StatusForbidden).Message("forbidden") return } err = models.SavedViewDel(rt.Ctx, id) ginx.NewRender(c).Message(err) } func (rt *Router) savedViewFavoriteAdd(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") me := c.MustGet("user").(*models.User) err := models.UserViewFavoriteAdd(rt.Ctx, id, me.Id) ginx.NewRender(c).Message(err) } func (rt *Router) savedViewFavoriteDel(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") me := 
c.MustGet("user").(*models.User) err := models.UserViewFavoriteDel(rt.Ctx, id, me.Id) ginx.NewRender(c).Message(err) } ================================================ FILE: center/router/router_self.go ================================================ package router import ( "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/flashduty" "github.com/ccfos/nightingale/v6/pkg/ormx" "github.com/ccfos/nightingale/v6/pkg/secu" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/google/uuid" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) func (rt *Router) selfProfileGet(c *gin.Context) { user := c.MustGet("user").(*models.User) if user.IsAdmin() { user.Admin = true } ginx.NewRender(c).Data(user, nil) } type selfProfileForm struct { Nickname string `json:"nickname"` Phone string `json:"phone"` Email string `json:"email"` Portrait string `json:"portrait"` Contacts ormx.JSONObj `json:"contacts"` } func (rt *Router) selfProfilePut(c *gin.Context) { var f selfProfileForm ginx.BindJSON(c, &f) user := c.MustGet("user").(*models.User) oldInfo := models.User{ Username: user.Username, Phone: user.Phone, Email: user.Email, } user.Nickname = f.Nickname user.Phone = f.Phone user.Email = f.Email user.Portrait = f.Portrait user.Contacts = f.Contacts user.UpdateBy = user.Username if flashduty.NeedSyncUser(rt.Ctx) { flashduty.UpdateUser(rt.Ctx, oldInfo, f.Email, f.Phone) } ginx.NewRender(c).Message(user.UpdateAllFields(rt.Ctx)) } type selfPasswordForm struct { OldPass string `json:"oldpass" binding:"required"` NewPass string `json:"newpass" binding:"required"` } func (rt *Router) selfPasswordPut(c *gin.Context) { var f selfPasswordForm ginx.BindJSON(c, &f) user := c.MustGet("user").(*models.User) newPassWord := f.NewPass oldPassWord := f.OldPass if rt.HTTP.RSA.OpenRSA { var err error newPassWord, err = secu.Decrypt(f.NewPass, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord) if err != nil { logger.Errorf("RSA Decrypt failed: %v username: %s", 
err, user.Username) ginx.NewRender(c).Message(err) return } oldPassWord, err = secu.Decrypt(f.OldPass, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord) if err != nil { logger.Errorf("RSA Decrypt failed: %v username: %s", err, user.Username) ginx.NewRender(c).Message(err) return } } ginx.NewRender(c).Message(user.ChangePassword(rt.Ctx, oldPassWord, newPassWord)) } type tokenForm struct { TokenName string `json:"token_name"` Token string `json:"token"` } func (rt *Router) getToken(c *gin.Context) { username := c.MustGet("username").(string) tokens, err := models.GetTokensByUsername(rt.Ctx, username) ginx.NewRender(c).Data(tokens, err) } func (rt *Router) addToken(c *gin.Context) { var f tokenForm ginx.BindJSON(c, &f) username := c.MustGet("username").(string) tokens, err := models.GetTokensByUsername(rt.Ctx, username) ginx.Dangerous(err) for _, token := range tokens { if token.TokenName == f.TokenName { ginx.NewRender(c).Message("token name already exists") return } } token, err := models.AddToken(rt.Ctx, username, uuid.New().String(), f.TokenName) ginx.NewRender(c).Data(token, err) } func (rt *Router) deleteToken(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") username := c.MustGet("username").(string) tokenCount, err := models.CountToken(rt.Ctx, username) ginx.Dangerous(err) if tokenCount <= 1 { ginx.NewRender(c).Message("cannot delete the last token") return } ginx.NewRender(c).Message(models.DeleteToken(rt.Ctx, id)) } ================================================ FILE: center/router/router_server.go ================================================ package router import ( "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) serversGet(c *gin.Context) { list, err := models.AlertingEngineGets(rt.Ctx, "") ginx.NewRender(c).Data(list, err) } func (rt *Router) serverClustersGet(c *gin.Context) { list, err := models.AlertingEngineGetsClusters(rt.Ctx, "") 
ginx.NewRender(c).Data(list, err) } func (rt *Router) serverHeartbeat(c *gin.Context) { var req models.HeartbeatInfo ginx.BindJSON(c, &req) err := models.AlertingEngineHeartbeatWithCluster(rt.Ctx, req.Instance, req.EngineCluster, req.DatasourceId) ginx.NewRender(c).Message(err) } func (rt *Router) serversActive(c *gin.Context) { datasourceId := ginx.QueryInt64(c, "dsid", 0) engineName := ginx.QueryStr(c, "engine_name", "") if engineName != "" { servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "engine_cluster = ? and clock > ?", engineName, time.Now().Unix()-30) ginx.NewRender(c).Data(servers, err) return } if datasourceId == 0 { ginx.NewRender(c).Message("dsid is required") return } servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "datasource_id = ? and clock > ?", datasourceId, time.Now().Unix()-30) ginx.NewRender(c).Data(servers, err) } ================================================ FILE: center/router/router_source_token.go ================================================ package router import ( "net/http" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/google/uuid" "github.com/gin-gonic/gin" ) // sourceTokenAdd 生成新的源令牌 func (rt *Router) sourceTokenAdd(c *gin.Context) { var f models.SourceToken ginx.BindJSON(c, &f) if f.ExpireAt > 0 && f.ExpireAt <= time.Now().Unix() { ginx.Bomb(http.StatusBadRequest, "expire time must be in the future") } token := uuid.New().String() username := c.MustGet("username").(string) f.Token = token f.CreateBy = username f.CreateAt = time.Now().Unix() err := f.Add(rt.Ctx) ginx.Dangerous(err) go models.CleanupExpiredTokens(rt.Ctx) ginx.NewRender(c).Data(token, nil) } ================================================ FILE: center/router/router_target.go ================================================ package router import ( "context" "encoding/json" "fmt" "net/http" "strings" "time" "github.com/ccfos/nightingale/v6/models" 
"github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pushgw/idents" "github.com/ccfos/nightingale/v6/storage" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/prometheus/common/model" "github.com/toolkits/pkg/logger" ) type TargetQuery struct { Filters []models.HostQuery `json:"queries"` P int `json:"p"` Limit int `json:"limit"` } func (rt *Router) targetGetsByHostFilter(c *gin.Context) { var f TargetQuery ginx.BindJSON(c, &f) query := models.GetHostsQuery(f.Filters) hosts, err := models.TargetGetsByFilter(rt.Ctx, query, f.Limit, (f.P-1)*f.Limit) ginx.Dangerous(err) total, err := models.TargetCountByFilter(rt.Ctx, query) ginx.Dangerous(err) models.FillTargetsBeatTime(rt.Redis, hosts) now := time.Now().Unix() for i := 0; i < len(hosts); i++ { if now-hosts[i].BeatTime < 60 { hosts[i].TargetUp = 2 } else if now-hosts[i].BeatTime < 180 { hosts[i].TargetUp = 1 } } ginx.NewRender(c).Data(gin.H{ "list": hosts, "total": total, }, nil) } func (rt *Router) targetGets(c *gin.Context) { bgids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") query := ginx.QueryStr(c, "query", "") limit := ginx.QueryInt(c, "limit", 30) downtime := ginx.QueryInt64(c, "downtime", 0) dsIds := queryDatasourceIds(c) order := ginx.QueryStr(c, "order", "ident") desc := ginx.QueryBool(c, "desc", false) hosts := queryStrListField(c, "hosts", ",", " ", "\n") var err error if len(bgids) > 0 { for _, gid := range bgids { if gid > 0 { rt.bgroCheck(c, gid) } } } else { user := c.MustGet("user").(*models.User) if !user.IsAdmin() { // 如果是非 admin 用户,全部对象的情况,找到用户有权限的业务组 var err error bgids, err = models.MyBusiGroupIds(rt.Ctx, user.Id) ginx.Dangerous(err) // 将未分配业务组的对象也加入到列表中 bgids = append(bgids, 0) } } options := []models.BuildTargetWhereOption{ models.BuildTargetWhereWithBgids(bgids), models.BuildTargetWhereWithDsIds(dsIds), models.BuildTargetWhereWithQuery(query), models.BuildTargetWhereWithHosts(hosts), 
} // downtime 筛选:从缓存获取心跳时间,选择较小的集合用 IN 或 NOT IN 过滤 if downtime != 0 { downtimeOpt, hasMatch := rt.downtimeFilter(downtime) if !hasMatch { ginx.NewRender(c).Data(gin.H{ "list": []*models.Target{}, "total": 0, }, nil) return } if downtimeOpt != nil { options = append(options, downtimeOpt) } } total, err := models.TargetTotal(rt.Ctx, options...) ginx.Dangerous(err) list, err := models.TargetGets(rt.Ctx, limit, ginx.Offset(c, limit), order, desc, options...) ginx.Dangerous(err) tgs, err := models.TargetBusiGroupsGetAll(rt.Ctx) ginx.Dangerous(err) for _, t := range list { t.GroupIds = tgs[t.Ident] } if err == nil { now := time.Now() cache := make(map[int64]*models.BusiGroup) // 从 Redis 补全 BeatTime models.FillTargetsBeatTime(rt.Redis, list) var keys []string for i := 0; i < len(list); i++ { ginx.Dangerous(list[i].FillGroup(rt.Ctx, cache)) keys = append(keys, models.WrapIdent(list[i].Ident)) if now.Unix()-list[i].BeatTime < 60 { list[i].TargetUp = 2 } else if now.Unix()-list[i].BeatTime < 180 { list[i].TargetUp = 1 } } if len(keys) > 0 { metaMap := make(map[string]*models.HostMeta) vals := storage.MGet(context.Background(), rt.Redis, keys) for _, value := range vals { var meta models.HostMeta if value == nil { continue } err := json.Unmarshal(value, &meta) if err != nil { logger.Warningf("unmarshal %v host meta failed: %v", value, err) continue } metaMap[meta.Hostname] = &meta } for i := 0; i < len(list); i++ { if meta, ok := metaMap[list[i].Ident]; ok { list[i].FillMeta(meta) } else { // 未上报过元数据的主机,cpuNum默认为-1, 用于前端展示 unknown list[i].CpuNum = -1 } } } } ginx.NewRender(c).Data(gin.H{ "list": list, "total": total, }, nil) } // downtimeFilter 从缓存获取心跳时间,生成 downtime 筛选条件 // 选择匹配集和非匹配集中较小的一方,用 IN 或 NOT IN 来减少 SQL 参数量 // 返回值: // - option: 筛选条件,nil 表示所有 target 都符合条件(无需过滤) // - hasMatch: 是否有符合条件的 target,false 表示无匹配应返回空结果 func (rt *Router) downtimeFilter(downtime int64) (option models.BuildTargetWhereOption, hasMatch bool) { now := time.Now().Unix() targets := 
rt.TargetCache.GetAll() var matchIdents, nonMatchIdents []string for _, target := range targets { matched := false if downtime > 0 { matched = target.BeatTime < now-downtime } else if downtime < 0 { matched = target.BeatTime > now+downtime } if matched { matchIdents = append(matchIdents, target.Ident) } else { nonMatchIdents = append(nonMatchIdents, target.Ident) } } if len(matchIdents) == 0 { return nil, false } if len(nonMatchIdents) == 0 { return nil, true } if len(matchIdents) <= len(nonMatchIdents) { return models.BuildTargetWhereWithIdents(matchIdents), true } return models.BuildTargetWhereExcludeIdents(nonMatchIdents), true } func (rt *Router) targetExtendInfoByIdent(c *gin.Context) { ident := ginx.QueryStr(c, "ident", "") key := models.WrapExtendIdent(ident) vals := storage.MGet(context.Background(), rt.Redis, []string{key}) if len(vals) > 0 { extInfo := string(vals[0]) if extInfo == "null" { extInfo = "" } ginx.NewRender(c).Data(gin.H{ "extend_info": extInfo, "ident": ident, }, nil) return } ginx.NewRender(c).Data(gin.H{ "extend_info": "", "ident": ident, }, nil) } func (rt *Router) targetGetsByService(c *gin.Context) { lst, err := models.TargetGetsAll(rt.Ctx) ginx.NewRender(c).Data(lst, err) } func (rt *Router) targetGetTags(c *gin.Context) { idents := ginx.QueryStr(c, "idents", "") idents = strings.ReplaceAll(idents, ",", " ") ignoreHostTag := ginx.QueryBool(c, "ignore_host_tag", false) lst, err := models.TargetGetTags(rt.Ctx, strings.Fields(idents), ignoreHostTag, "") ginx.NewRender(c).Data(lst, err) } type targetTagsForm struct { Idents []string `json:"idents" binding:"required_without=HostIps"` HostIps []string `json:"host_ips" binding:"required_without=Idents"` Tags []string `json:"tags" binding:"required"` } func (rt *Router) targetBindTagsByFE(c *gin.Context) { var f targetTagsForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or 
host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } rt.checkTargetPerm(c, f.Idents) ginx.NewRender(c).Data(rt.targetBindTags(f, failedResults)) } func (rt *Router) targetBindTagsByService(c *gin.Context) { var f targetTagsForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(rt.targetBindTags(f, failedResults)) } func (rt *Router) targetBindTags(f targetTagsForm, failedIdents map[string]string) (map[string]string, error) { // 1. Check tags if err := rt.validateTags(f.Tags); err != nil { return nil, err } // 2. Acquire targets by idents targets, err := models.TargetsGetByIdents(rt.Ctx, f.Idents) if err != nil { return nil, err } // 3. 
Add tags to targets for _, target := range targets { if err = rt.addTagsToTarget(target, f.Tags); err != nil { failedIdents[target.Ident] = err.Error() } } return failedIdents, nil } func (rt *Router) validateTags(tags []string) error { for _, tag := range tags { arr := strings.Split(tag, "=") if len(arr) != 2 { return fmt.Errorf("invalid tag format: %s (expected format: key=value)", tag) } key, value := strings.TrimSpace(arr[0]), strings.TrimSpace(arr[1]) if key == "" { return fmt.Errorf("invalid tag: key is empty in tag %s", tag) } if value == "" { return fmt.Errorf("invalid tag: value is empty in tag %s", tag) } if strings.Contains(key, ".") { return fmt.Errorf("invalid tag key: %s (key cannot contain '.')", key) } if strings.Contains(key, "-") { return fmt.Errorf("invalid tag key: %s (key cannot contain '-')", key) } if !model.LabelNameRE.MatchString(key) { return fmt.Errorf("invalid tag key: %s "+ "(key must start with a letter or underscore, followed by letters, digits, or underscores)", key) } } return nil } func (rt *Router) addTagsToTarget(target *models.Target, tags []string) error { for _, tag := range tags { tagKey := strings.Split(tag, "=")[0] if _, exist := target.TagsMap[tagKey]; exist { return fmt.Errorf("duplicate tagkey(%s)", tagKey) } } return target.AddTags(rt.Ctx, tags) } func (rt *Router) targetUnbindTagsByFE(c *gin.Context) { var f targetTagsForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } rt.checkTargetPerm(c, f.Idents) ginx.NewRender(c).Data(rt.targetUnbindTags(f, failedResults)) } func (rt *Router) targetUnbindTagsByService(c *gin.Context) { var f targetTagsForm var err error var 
failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(rt.targetUnbindTags(f, failedResults)) } func (rt *Router) targetUnbindTags(f targetTagsForm, failedIdents map[string]string) (map[string]string, error) { // 1. Acquire targets by idents targets, err := models.TargetsGetByIdents(rt.Ctx, f.Idents) if err != nil { return nil, err } // 2. Remove tags from targets for _, target := range targets { err = target.DelTags(rt.Ctx, f.Tags) if err != nil { failedIdents[target.Ident] = err.Error() continue } } return failedIdents, nil } type targetNoteForm struct { Idents []string `json:"idents" binding:"required_without=HostIps"` HostIps []string `json:"host_ips" binding:"required_without=Idents"` Note string `json:"note"` } func (rt *Router) targetUpdateNote(c *gin.Context) { var f targetNoteForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } rt.checkTargetPerm(c, f.Idents) ginx.NewRender(c).Data(failedResults, models.TargetUpdateNote(rt.Ctx, f.Idents, f.Note)) } func (rt *Router) targetUpdateNoteByService(c *gin.Context) { var f targetNoteForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents 
and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(failedResults, models.TargetUpdateNote(rt.Ctx, f.Idents, f.Note)) } type targetBgidForm struct { Idents []string `json:"idents" binding:"required_without=HostIps"` HostIps []string `json:"host_ips" binding:"required_without=Idents"` Bgid int64 `json:"bgid"` } type targetBgidsForm struct { Idents []string `json:"idents" binding:"required_without=HostIps"` HostIps []string `json:"host_ips" binding:"required_without=Idents"` Bgids []int64 `json:"bgids"` Tags []string `json:"tags"` Action string `json:"action"` // add del reset } func haveNeverGroupedIdent(ctx *ctx.Context, idents []string) (bool, error) { for _, ident := range idents { bgids, err := models.TargetGroupIdsGetByIdent(ctx, ident) if err != nil { return false, err } if len(bgids) <= 0 { return true, nil } } return false, nil } func (rt *Router) targetBindBgids(c *gin.Context) { var f targetBgidsForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } user := c.MustGet("user").(*models.User) if !user.IsAdmin() { // 普通用户,检查用户是否有权限操作所有请求的业务组 existing, _, err := models.SeparateTargetIdents(rt.Ctx, f.Idents) ginx.Dangerous(err) rt.checkTargetPerm(c, existing) var groupIds []int64 if f.Action == "reset" { // 如果是复写,则需要检查用户是否有权限操作机器之前的业务组 bgids, err := models.TargetGroupIdsGetByIdents(rt.Ctx, f.Idents) ginx.Dangerous(err) groupIds = append(groupIds, bgids...) } groupIds = append(groupIds, f.Bgids...) 
for _, bgid := range groupIds { bg := BusiGroup(rt.Ctx, bgid) can, err := user.CanDoBusiGroup(rt.Ctx, bg, "rw") ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } } isNeverGrouped, checkErr := haveNeverGroupedIdent(rt.Ctx, f.Idents) ginx.Dangerous(checkErr) if isNeverGrouped { can, err := user.CheckPerm(rt.Ctx, "/targets/bind") ginx.Dangerous(err) if !can { ginx.Bomb(http.StatusForbidden, "forbidden") } } } switch f.Action { case "add": ginx.NewRender(c).Data(failedResults, models.TargetBindBgids(rt.Ctx, f.Idents, f.Bgids, f.Tags)) case "del": ginx.NewRender(c).Data(failedResults, models.TargetUnbindBgids(rt.Ctx, f.Idents, f.Bgids)) case "reset": ginx.NewRender(c).Data(failedResults, models.TargetOverrideBgids(rt.Ctx, f.Idents, f.Bgids, f.Tags)) default: ginx.Bomb(http.StatusBadRequest, "invalid action") } } func (rt *Router) targetUpdateBgidByService(c *gin.Context) { var f targetBgidForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(failedResults, models.TargetOverrideBgids(rt.Ctx, f.Idents, []int64{f.Bgid}, nil)) } type identsForm struct { Idents []string `json:"idents" binding:"required_without=HostIps"` HostIps []string `json:"host_ips" binding:"required_without=Idents"` } func (rt *Router) targetDel(c *gin.Context) { var f identsForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, 
f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(failedResults, models.TargetDel(rt.Ctx, f.Idents, rt.TargetDeleteHook)) } func (rt *Router) targetDelByService(c *gin.Context) { var f identsForm var err error var failedResults = make(map[string]string) ginx.BindJSON(c, &f) if len(f.Idents) == 0 && len(f.HostIps) == 0 { ginx.Bomb(http.StatusBadRequest, "idents or host_ips must be provided") } // Acquire idents by idents and hostIps failedResults, f.Idents, err = models.TargetsGetIdentsByIdentsAndHostIps(rt.Ctx, f.Idents, f.HostIps) if err != nil { ginx.Bomb(http.StatusBadRequest, err.Error()) } ginx.NewRender(c).Data(failedResults, models.TargetDel(rt.Ctx, f.Idents, rt.TargetDeleteHook)) } func (rt *Router) checkTargetPerm(c *gin.Context, idents []string) { user := c.MustGet("user").(*models.User) nopri, err := user.NopriIdents(rt.Ctx, idents) ginx.Dangerous(err) if len(nopri) > 0 { ginx.Bomb(http.StatusForbidden, "forbidden") } } func (rt *Router) targetsOfAlertRule(c *gin.Context) { engineName := ginx.QueryStr(c, "engine_name", "") m, err := models.GetTargetsOfHostAlertRule(rt.Ctx, engineName) ret := make(map[string]map[int64][]string) for en, v := range m { if en != engineName { continue } ret[en] = make(map[int64][]string) for rid, idents := range v { ret[en][rid] = idents } } ginx.NewRender(c).Data(ret, err) } func (rt *Router) checkTargetsExistByIndent(idents []string) { notExists, err := models.TargetNoExistIdents(rt.Ctx, idents) ginx.Dangerous(err) if len(notExists) > 0 { ginx.Bomb(http.StatusBadRequest, "targets not exist: %s", strings.Join(notExists, ", ")) } } func (rt *Router) targetsOfHostQuery(c *gin.Context) { var queries []models.HostQuery ginx.BindJSON(c, &queries) hostsQuery := models.GetHostsQuery(queries) session := models.TargetFilterQueryBuild(rt.Ctx, hostsQuery, 0, 0) var lst []*models.Target err := session.Find(&lst).Error if err != nil { ginx.Bomb(http.StatusInternalServerError, err.Error()) } 
// usageBucket maps a utilisation value to the label of its histogram bucket.
//
// Buckets are identified by their exclusive upper bound: values below 20 map
// to "20", below 40 to "40", below 60 to "60", below 80 to "80", and
// everything else to "100".
func usageBucket(val float64) string {
	bounds := [...]struct {
		upper float64
		label string
	}{
		{20, "20"},
		{40, "40"},
		{60, "60"},
		{80, "80"},
	}

	for _, b := range bounds {
		if val < b.upper {
			return b.label
		}
	}
	return "100"
}
center/router/router_task.go ================================================ package router import ( "strings" "time" "github.com/ccfos/nightingale/v6/alert/sender" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" ) func (rt *Router) taskGets(c *gin.Context) { bgid := ginx.UrlParamInt64(c, "id") mine := ginx.QueryBool(c, "mine", false) days := ginx.QueryInt64(c, "days", 7) limit := ginx.QueryInt(c, "limit", 20) query := ginx.QueryStr(c, "query", "") user := c.MustGet("user").(*models.User) creator := "" if mine { creator = user.Username } beginTime := time.Now().Unix() - days*24*3600 total, err := models.TaskRecordTotal(rt.Ctx, []int64{bgid}, beginTime, creator, query) ginx.Dangerous(err) list, err := models.TaskRecordGets(rt.Ctx, []int64{bgid}, beginTime, creator, query, limit, ginx.Offset(c, limit)) ginx.Dangerous(err) ginx.NewRender(c).Data(gin.H{ "total": total, "list": list, }, nil) } func (rt *Router) taskGetsByGids(c *gin.Context) { gids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "gids", ""), ",") if len(gids) > 0 { for _, gid := range gids { rt.bgroCheck(c, gid) } } else { me := c.MustGet("user").(*models.User) if !me.IsAdmin() { var err error gids, err = models.MyBusiGroupIds(rt.Ctx, me.Id) ginx.Dangerous(err) if len(gids) == 0 { ginx.NewRender(c).Data([]int{}, nil) return } } } mine := ginx.QueryBool(c, "mine", false) days := ginx.QueryInt64(c, "days", 7) limit := ginx.QueryInt(c, "limit", 20) query := ginx.QueryStr(c, "query", "") user := c.MustGet("user").(*models.User) creator := "" if mine { creator = user.Username } beginTime := time.Now().Unix() - days*24*3600 total, err := models.TaskRecordTotal(rt.Ctx, gids, beginTime, creator, query) ginx.Dangerous(err) list, err := models.TaskRecordGets(rt.Ctx, gids, beginTime, creator, query, limit, ginx.Offset(c, limit)) ginx.Dangerous(err) ginx.NewRender(c).Data(gin.H{ "total": 
total, "list": list, }, nil) } func (rt *Router) taskRecordAdd(c *gin.Context) { var f *models.TaskRecord ginx.BindJSON(c, &f) ginx.NewRender(c).Message(f.Add(rt.Ctx)) } func (rt *Router) taskAdd(c *gin.Context) { if !rt.Ibex.Enable { ginx.Bomb(400, i18n.Sprintf(c.GetHeader("X-Language"), "This functionality has not been enabled. Please contact the system administrator to activate it.")) return } var f models.TaskForm ginx.BindJSON(c, &f) // 把 f.Hosts 中的空字符串过滤掉 hosts := make([]string, 0, len(f.Hosts)) for i := range f.Hosts { if strings.TrimSpace(f.Hosts[i]) != "" { hosts = append(hosts, strings.TrimSpace(f.Hosts[i])) } } f.Hosts = hosts bgid := ginx.UrlParamInt64(c, "id") user := c.MustGet("user").(*models.User) f.Creator = user.Username rt.checkTargetsExistByIndent(f.Hosts) err := f.Verify() ginx.Dangerous(err) f.HandleFH(f.Hosts[0]) // check permission rt.checkTargetPerm(c, f.Hosts) // call ibex taskId, err := sender.TaskAdd(f, user.Username, rt.Ctx.IsCenter) ginx.Dangerous(err) if taskId <= 0 { ginx.Dangerous("created task.id is zero") } // write db record := models.TaskRecord{ Id: taskId, GroupId: bgid, Title: f.Title, Account: f.Account, Batch: f.Batch, Tolerance: f.Tolerance, Timeout: f.Timeout, Pause: f.Pause, Script: f.Script, Args: f.Args, CreateAt: time.Now().Unix(), CreateBy: f.Creator, } err = record.Add(rt.Ctx) ginx.NewRender(c).Data(taskId, err) } ================================================ FILE: center/router/router_task_tpl.go ================================================ package router import ( "net/http" "sort" "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/i18n" "github.com/toolkits/pkg/str" ) func (rt *Router) taskTplGets(c *gin.Context) { query := ginx.QueryStr(c, "query", "") limit := ginx.QueryInt(c, "limit", 20) groupId := ginx.UrlParamInt64(c, "id") total, err := 
// taskTplForm is the JSON request body used when creating or updating a task
// template.
type taskTplForm struct {
	Title     string   `json:"title" binding:"required"`
	Batch     int      `json:"batch"`
	Tolerance int      `json:"tolerance"`
	Timeout   int      `json:"timeout"`
	Pause     string   `json:"pause"`
	Script    string   `json:"script"`
	Args      string   `json:"args"`
	Tags      []string `json:"tags"`
	Account   string   `json:"account"`
	Hosts     []string `json:"hosts"`
}

// Verify normalises the form in place: every host is trimmed of surrounding
// whitespace and blank entries are dropped (the incoming list may look like
// []string{"", "a", "b"}). Hosts is always replaced by a freshly allocated,
// non-nil slice.
func (f *taskTplForm) Verify() {
	cleaned := make([]string, 0, len(f.Hosts))
	for i := range f.Hosts {
		host := strings.TrimSpace(f.Hosts[i])
		if host == "" {
			continue
		}
		cleaned = append(cleaned, host)
	}
	f.Hosts = cleaned
}
taskTplDel(c *gin.Context) { tid := ginx.UrlParamInt64(c, "tid") tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", tid) ginx.Dangerous(err) if tpl == nil { ginx.NewRender(c).Message(nil) return } ids, err := models.GetAlertRuleIdsByTaskId(rt.Ctx, tid) ginx.Dangerous(err) if len(ids) > 0 { ginx.NewRender(c).Message("can't del this task tpl, used by alert rule ids(%v) ", ids) return } ginx.NewRender(c).Message(tpl.Del(rt.Ctx)) } type tplTagsForm struct { Ids []int64 `json:"ids" binding:"required"` Tags []string `json:"tags" binding:"required"` } func (f *tplTagsForm) Verify() { if len(f.Ids) == 0 { ginx.Bomb(http.StatusBadRequest, "arg(ids) empty") } if len(f.Tags) == 0 { ginx.Bomb(http.StatusBadRequest, "arg(tags) empty") } newTags := make([]string, 0, len(f.Tags)) for i := 0; i < len(f.Tags); i++ { tag := strings.TrimSpace(f.Tags[i]) if tag == "" { continue } if str.Dangerous(tag) { ginx.Bomb(http.StatusBadRequest, "arg(tags) invalid") } newTags = append(newTags, tag) } f.Tags = newTags if len(f.Tags) == 0 { ginx.Bomb(http.StatusBadRequest, "arg(tags) empty") } } func (rt *Router) taskTplBindTags(c *gin.Context) { var f tplTagsForm ginx.BindJSON(c, &f) f.Verify() username := c.MustGet("username").(string) for i := 0; i < len(f.Ids); i++ { tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", f.Ids[i]) ginx.Dangerous(err) if tpl == nil { continue } ginx.Dangerous(tpl.AddTags(rt.Ctx, f.Tags, username)) } ginx.NewRender(c).Message(nil) } func (rt *Router) taskTplUnbindTags(c *gin.Context) { var f tplTagsForm ginx.BindJSON(c, &f) f.Verify() username := c.MustGet("username").(string) for i := 0; i < len(f.Ids); i++ { tpl, err := models.TaskTplGet(rt.Ctx, "id = ?", f.Ids[i]) ginx.Dangerous(err) if tpl == nil { continue } ginx.Dangerous(tpl.DelTags(rt.Ctx, f.Tags, username)) } ginx.NewRender(c).Message(nil) } ================================================ FILE: center/router/router_tdengine.go ================================================ package router import ( "fmt" 
"net/http" "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/datasource/tdengine" "github.com/ccfos/nightingale/v6/dscache" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) type databasesQueryForm struct { Cate string `json:"cate" form:"cate"` DatasourceId int64 `json:"datasource_id" form:"datasource_id"` } func (rt *Router) tdengineDatabases(c *gin.Context) { var f databasesQueryForm ginx.BindJSON(c, &f) datasource, hit := dscache.DsCache.Get(f.Cate, f.DatasourceId) if _, ok := datasource.(*tdengine.TDengine); !hit || !ok { ginx.NewRender(c, http.StatusNotFound).Message("No such datasource") return } databases, err := datasource.(*tdengine.TDengine).ShowDatabases(rt.Ctx.Ctx) ginx.NewRender(c).Data(databases, err) } type tablesQueryForm struct { Cate string `json:"cate"` DatasourceId int64 `json:"datasource_id" ` Database string `json:"db"` IsStable bool `json:"is_stable"` } type Column struct { Name string `json:"name"` Type string `json:"type"` Size int `json:"size"` } // get tdengine tables func (rt *Router) tdengineTables(c *gin.Context) { var f tablesQueryForm ginx.BindJSON(c, &f) datasource, hit := dscache.DsCache.Get(f.Cate, f.DatasourceId) if _, ok := datasource.(*tdengine.TDengine); !hit || !ok { ginx.NewRender(c, http.StatusNotFound).Message("No such datasource") return } database := fmt.Sprintf("%s.tables", f.Database) if f.IsStable { database = fmt.Sprintf("%s.stables", f.Database) } tables, err := datasource.(*tdengine.TDengine).ShowTables(rt.Ctx.Ctx, database) ginx.NewRender(c).Data(tables, err) } type columnsQueryForm struct { Cate string `json:"cate"` DatasourceId int64 `json:"datasource_id" ` Database string `json:"db"` Table string `json:"table"` } func (rt *Router) tdengineColumns(c *gin.Context) { var f columnsQueryForm ginx.BindJSON(c, &f) datasource, hit := dscache.DsCache.Get(f.Cate, f.DatasourceId) if _, ok := datasource.(*tdengine.TDengine); !hit 
|| !ok { ginx.NewRender(c, http.StatusNotFound).Message("No such datasource") return } query := map[string]string{ "database": f.Database, "table": f.Table, } columns, err := datasource.(*tdengine.TDengine).DescribeTable(rt.Ctx.Ctx, query) // 对齐前端,后续可以将 tdEngine 的查数据的接口都统一 tdColumns := make([]Column, len(columns)) for i, column := range columns { tdColumns[i] = Column{ Name: column.Field, Type: column.Type, } } ginx.NewRender(c).Data(tdColumns, err) } // query sql template func (rt *Router) QuerySqlTemplate(c *gin.Context) { cate := ginx.QueryStr(c, "cate") m := make(map[string]string) switch cate { case models.TDENGINE: m = cconf.TDengineSQLTpl } ginx.NewRender(c).Data(m, nil) } ================================================ FILE: center/router/router_trace_logs.go ================================================ package router import ( "encoding/json" "fmt" "io" "net/http" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/ccfos/nightingale/v6/pkg/loggrep" "github.com/toolkits/pkg/logger" "github.com/gin-gonic/gin" ) // traceLogsPage renders an HTML log viewer page for trace logs. func (rt *Router) traceLogsPage(c *gin.Context) { traceId := ginx.UrlParamStr(c, "traceid") if !loggrep.IsValidTraceID(traceId) { c.String(http.StatusBadRequest, "invalid trace id format") return } logs, instance, err := rt.getTraceLogs(traceId) if err != nil { c.String(http.StatusInternalServerError, "Error: %v", err) return } c.Header("Content-Type", "text/html; charset=utf-8") err = loggrep.RenderTraceLogsHTML(c.Writer, loggrep.TraceLogsPageData{ TraceID: traceId, Instance: instance, Logs: logs, Total: len(logs), }) if err != nil { c.String(http.StatusInternalServerError, "render error: %v", err) } } // traceLogsJSON returns JSON for trace logs. 
func (rt *Router) traceLogsJSON(c *gin.Context) { traceId := ginx.UrlParamStr(c, "traceid") if !loggrep.IsValidTraceID(traceId) { ginx.Bomb(200, "invalid trace id format") } logs, instance, err := rt.getTraceLogs(traceId) ginx.Dangerous(err) ginx.NewRender(c).Data(loggrep.EventDetailResp{ Logs: logs, Instance: instance, }, nil) } // getTraceLogs finds the same-engine instances and queries each one // until trace logs are found. Trace logs belong to a single instance. func (rt *Router) getTraceLogs(traceId string) ([]string, string, error) { keyword := "trace_id=" + traceId instance := fmt.Sprintf("%s:%d", rt.Alert.Heartbeat.IP, rt.HTTP.Port) engineName := rt.Alert.Heartbeat.EngineName // try local first logs, err := loggrep.GrepLatestLogFiles(rt.LogDir, keyword) if err == nil && len(logs) > 0 { return logs, instance, nil } // find all instances with the same engineName servers, err := models.AlertingEngineGetsInstances(rt.Ctx, "engine_cluster = ? and clock > ?", engineName, time.Now().Unix()-30) if err != nil { return nil, "", err } // loop through remote instances until we find logs for _, node := range servers { if node == instance { continue // already tried local } logs, nodeAddr, err := rt.forwardTraceLogs(node, traceId) if err != nil { logger.Errorf("forwardTraceLogs failed: %v", err) continue } if len(logs) > 0 { return logs, nodeAddr, nil } } return nil, instance, nil } func (rt *Router) forwardTraceLogs(node, traceId string) ([]string, string, error) { url := fmt.Sprintf("http://%s/v1/n9e/trace-logs/%s", node, traceId) req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, node, err } for user, pass := range rt.HTTP.APIForService.BasicAuth { req.SetBasicAuth(user, pass) break } client := &http.Client{Timeout: 15 * time.Second} resp, err := client.Do(req) if err != nil { return nil, node, fmt.Errorf("forward to %s failed: %v", node, err) } defer resp.Body.Close() body, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024)) if err != 
nil { return nil, node, err } var result struct { Dat loggrep.EventDetailResp `json:"dat"` Err string `json:"err"` } if err := json.Unmarshal(body, &result); err != nil { return nil, node, err } if result.Err != "" { return nil, node, fmt.Errorf("%s", result.Err) } return result.Dat.Logs, result.Dat.Instance, nil } ================================================ FILE: center/router/router_user.go ================================================ package router import ( "fmt" "net/http" "strings" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/flashduty" "github.com/ccfos/nightingale/v6/pkg/ormx" "github.com/ccfos/nightingale/v6/pkg/secu" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" "gorm.io/gorm" ) func (rt *Router) userBusiGroupsGets(c *gin.Context) { userid := ginx.QueryInt64(c, "userid", 0) username := ginx.QueryStr(c, "username", "") if userid == 0 && username == "" { ginx.Bomb(http.StatusBadRequest, "userid or username required") } var user *models.User var err error if userid > 0 { user, err = models.UserGetById(rt.Ctx, userid) } else { user, err = models.UserGetByUsername(rt.Ctx, username) } ginx.Dangerous(err) groups, err := user.BusiGroups(rt.Ctx, 10000, "") ginx.NewRender(c).Data(groups, err) } func (rt *Router) userFindAll(c *gin.Context) { list, err := models.UserGetAll(rt.Ctx) ginx.NewRender(c).Data(list, err) } func (rt *Router) userGets(c *gin.Context) { stime, etime := getTimeRange(c) limit := ginx.QueryInt(c, "limit", 20) query := ginx.QueryStr(c, "query", "") order := ginx.QueryStr(c, "order", "username") desc := ginx.QueryBool(c, "desc", false) usernames := strings.Split(ginx.QueryStr(c, "usernames", ""), ",") phones := strings.Split(ginx.QueryStr(c, "phones", ""), ",") emails := strings.Split(ginx.QueryStr(c, "emails", ""), ",") if len(usernames) == 1 && usernames[0] == "" { usernames = []string{} } if len(phones) == 1 && phones[0] == "" { phones = []string{} 
} if len(emails) == 1 && emails[0] == "" { emails = []string{} } go rt.UserCache.UpdateUsersLastActiveTime() total, err := models.UserTotal(rt.Ctx, query, stime, etime) ginx.Dangerous(err) list, err := models.UserGets(rt.Ctx, query, limit, ginx.Offset(c, limit), stime, etime, order, desc, usernames, phones, emails) ginx.Dangerous(err) user := c.MustGet("user").(*models.User) ginx.NewRender(c).Data(gin.H{ "list": list, "total": total, "admin": user.IsAdmin(), }, nil) } type userAddForm struct { Username string `json:"username" binding:"required"` Password string `json:"password" binding:"required"` Nickname string `json:"nickname"` Phone string `json:"phone"` Email string `json:"email"` Portrait string `json:"portrait"` Roles []string `json:"roles" binding:"required"` Contacts ormx.JSONObj `json:"contacts"` } func (rt *Router) userAddPost(c *gin.Context) { var f userAddForm ginx.BindJSON(c, &f) authPassWord := f.Password if rt.HTTP.RSA.OpenRSA { decPassWord, err := secu.Decrypt(f.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord) if err != nil { logger.Errorf("RSA Decrypt failed: %v username: %s", err, f.Username) ginx.NewRender(c).Message(err) return } authPassWord = decPassWord } password, err := models.CryptoPass(rt.Ctx, authPassWord) ginx.Dangerous(err) if len(f.Roles) == 0 { ginx.Bomb(http.StatusBadRequest, "roles empty") } username := Username(c) u := models.User{ Username: f.Username, Password: password, Nickname: f.Nickname, Phone: f.Phone, Email: f.Email, Portrait: f.Portrait, Roles: strings.Join(f.Roles, " "), Contacts: f.Contacts, CreateBy: username, UpdateBy: username, } ginx.Dangerous(u.Verify()) ginx.NewRender(c).Message(u.Add(rt.Ctx)) } func (rt *Router) userProfileGet(c *gin.Context) { user := User(rt.Ctx, ginx.UrlParamInt64(c, "id")) ginx.NewRender(c).Data(user, nil) } type userProfileForm struct { Nickname string `json:"nickname"` Phone string `json:"phone"` Email string `json:"email"` Roles []string `json:"roles"` Contacts ormx.JSONObj 
`json:"contacts"` } func (rt *Router) userProfilePutByService(c *gin.Context) { var f models.User ginx.BindJSON(c, &f) if len(f.RolesLst) == 0 { ginx.Bomb(http.StatusBadRequest, "roles empty") } password, err := models.CryptoPass(rt.Ctx, f.Password) ginx.Dangerous(err) target := User(rt.Ctx, ginx.UrlParamInt64(c, "id")) target.Nickname = f.Nickname target.Password = password target.Phone = f.Phone target.Email = f.Email target.Portrait = f.Portrait target.Roles = strings.Join(f.RolesLst, " ") target.Contacts = f.Contacts target.UpdateBy = Username(c) ginx.NewRender(c).Message(target.UpdateAllFields(rt.Ctx)) } func (rt *Router) userProfilePut(c *gin.Context) { var f userProfileForm ginx.BindJSON(c, &f) if len(f.Roles) == 0 { ginx.Bomb(http.StatusBadRequest, "roles empty") } target := User(rt.Ctx, ginx.UrlParamInt64(c, "id")) oldInfo := models.User{ Username: target.Username, Phone: target.Phone, Email: target.Email, } target.Nickname = f.Nickname target.Phone = f.Phone target.Email = f.Email target.Roles = strings.Join(f.Roles, " ") target.Contacts = f.Contacts target.UpdateBy = c.MustGet("username").(string) if flashduty.NeedSyncUser(rt.Ctx) { flashduty.UpdateUser(rt.Ctx, oldInfo, f.Email, f.Phone) } ginx.NewRender(c).Message(target.UpdateAllFields(rt.Ctx)) } type userPasswordForm struct { Password string `json:"password" binding:"required"` } func (rt *Router) userPasswordPut(c *gin.Context) { var f userPasswordForm ginx.BindJSON(c, &f) target := User(rt.Ctx, ginx.UrlParamInt64(c, "id")) authPassWord := f.Password if rt.HTTP.RSA.OpenRSA { decPassWord, err := secu.Decrypt(f.Password, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord) if err != nil { logger.Errorf("RSA Decrypt failed: %v username: %s", err, target.Username) ginx.NewRender(c).Message(err) return } authPassWord = decPassWord } cryptoPass, err := models.CryptoPass(rt.Ctx, authPassWord) ginx.Dangerous(err) ginx.NewRender(c).Message(target.UpdatePassword(rt.Ctx, cryptoPass, 
c.MustGet("username").(string))) } func (rt *Router) userDel(c *gin.Context) { id := ginx.UrlParamInt64(c, "id") target, err := models.UserGetById(rt.Ctx, id) ginx.Dangerous(err) if target == nil { ginx.NewRender(c).Message(nil) return } // 如果要删除的用户是 admin 角色,检查是否是最后一个 admin if target.IsAdmin() { adminCount, err := models.CountAdminUsers(rt.Ctx) ginx.Dangerous(err) if adminCount <= 1 { ginx.Bomb(http.StatusBadRequest, "Cannot delete the last admin user") } } ginx.NewRender(c).Message(target.Del(rt.Ctx)) } func (rt *Router) installDateGet(c *gin.Context) { rootUser, err := models.UserGetByUsername(rt.Ctx, "root") if err != nil { logger.Errorf("get root user failed: %v", err) ginx.NewRender(c).Data(0, nil) return } if rootUser == nil { logger.Errorf("root user not found") ginx.NewRender(c).Data(0, nil) return } ginx.NewRender(c).Data(rootUser.CreateAt, nil) } // usersPhoneEncrypt 统一手机号加密 func (rt *Router) usersPhoneEncrypt(c *gin.Context) { users, err := models.UserGetAll(rt.Ctx) if err != nil { ginx.NewRender(c).Message(fmt.Errorf("get users failed: %v", err)) return } // 获取RSA密钥 _, publicKey, _, err := models.GetRSAKeys(rt.Ctx) if err != nil { ginx.NewRender(c).Message(fmt.Errorf("get RSA keys failed: %v", err)) return } // 先启用手机号加密功能 err = models.SetPhoneEncryptionEnabled(rt.Ctx, true) if err != nil { ginx.NewRender(c).Message(fmt.Errorf("enable phone encryption failed: %v", err)) return } // 刷新配置缓存 err = models.RefreshPhoneEncryptionCache(rt.Ctx) if err != nil { logger.Errorf("Failed to refresh phone encryption cache: %v", err) // 回滚配置 models.SetPhoneEncryptionEnabled(rt.Ctx, false) ginx.NewRender(c).Message(fmt.Errorf("refresh cache failed: %v", err)) return } successCount := 0 failCount := 0 var failedUsers []string // 使用事务处理所有用户的手机号加密 err = models.DB(rt.Ctx).Transaction(func(tx *gorm.DB) error { // 对每个用户的手机号进行加密 for _, user := range users { if user.Phone == "" { continue } if isPhoneEncrypted(user.Phone) { continue } encryptedPhone, err := 
secu.EncryptValue(user.Phone, publicKey) if err != nil { logger.Errorf("Failed to encrypt phone for user %s: %v", user.Username, err) failCount++ failedUsers = append(failedUsers, user.Username) continue } err = tx.Model(&models.User{}).Where("id = ?", user.Id).Update("phone", encryptedPhone).Error if err != nil { logger.Errorf("Failed to update phone for user %s: %v", user.Username, err) failCount++ failedUsers = append(failedUsers, user.Username) continue } successCount++ logger.Debugf("Successfully encrypted phone for user %s", user.Username) } // 如果有失败的用户,回滚事务 if failCount > 0 { return fmt.Errorf("encrypt failed users: %d, failed users: %v", failCount, failedUsers) } return nil }) if err != nil { // 加密失败,回滚配置 models.SetPhoneEncryptionEnabled(rt.Ctx, false) models.RefreshPhoneEncryptionCache(rt.Ctx) ginx.NewRender(c).Message(fmt.Errorf("encrypt phone failed: %v", err)) return } ginx.NewRender(c).Data(gin.H{ "success_count": successCount, "fail_count": failCount, }, nil) } func (rt *Router) usersPhoneDecryptRefresh(c *gin.Context) { err := models.RefreshPhoneEncryptionCache(rt.Ctx) if err != nil { ginx.NewRender(c).Message(fmt.Errorf("refresh phone encryption cache failed: %v", err)) return } ginx.NewRender(c).Message(nil) } // usersPhoneDecrypt 统一手机号解密 func (rt *Router) usersPhoneDecrypt(c *gin.Context) { // 先关闭手机号加密功能 err := models.SetPhoneEncryptionEnabled(rt.Ctx, false) if err != nil { ginx.NewRender(c).Message(fmt.Errorf("disable phone encryption failed: %v", err)) return } // 刷新配置缓存 err = models.RefreshPhoneEncryptionCache(rt.Ctx) if err != nil { logger.Errorf("Failed to refresh phone encryption cache: %v", err) // 回滚配置 models.SetPhoneEncryptionEnabled(rt.Ctx, true) ginx.NewRender(c).Message(fmt.Errorf("refresh cache failed: %v", err)) return } // 获取所有用户(此时加密开关已关闭,直接读取数据库原始数据) var users []*models.User err = models.DB(rt.Ctx).Find(&users).Error if err != nil { // 回滚配置 models.SetPhoneEncryptionEnabled(rt.Ctx, true) models.RefreshPhoneEncryptionCache(rt.Ctx) 
// isPhoneEncrypted reports whether phone carries the "enc:" marker followed
// by at least one more byte. The bare marker on its own does not count as an
// encrypted value.
func isPhoneEncrypted(phone string) bool {
	const marker = "enc:"
	return len(phone) > len(marker) && strings.HasPrefix(phone, marker)
}
"github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/flashduty" "github.com/ccfos/nightingale/v6/pkg/strx" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" "github.com/toolkits/pkg/logger" ) func (rt *Router) checkBusiGroupPerm(c *gin.Context) { me := c.MustGet("user").(*models.User) bg := BusiGroup(rt.Ctx, ginx.UrlParamInt64(c, "id")) can, err := me.CanDoBusiGroup(rt.Ctx, bg, ginx.UrlParamStr(c, "perm")) ginx.NewRender(c).Data(can, err) } func (rt *Router) userGroupGets(c *gin.Context) { limit := ginx.QueryInt(c, "limit", 1500) query := ginx.QueryStr(c, "query", "") me := c.MustGet("user").(*models.User) lst, err := me.UserGroups(rt.Ctx, limit, query) if err == nil { models.FillUpdateByNicknames(rt.Ctx, lst) } ginx.NewRender(c).Data(lst, err) } func (rt *Router) userGroupGetsByService(c *gin.Context) { ids := strx.IdsInt64ForAPI(ginx.QueryStr(c, "ids", "")) if len(ids) == 0 { lst, err := models.UserGroupGetAll(rt.Ctx) ginx.Dangerous(err) for i := 0; i < len(lst); i++ { ids, err := models.MemberIds(rt.Ctx, lst[i].Id) ginx.Dangerous(err) lst[i].Users, err = models.UserGetsByIds(rt.Ctx, ids) ginx.Dangerous(err) } ginx.NewRender(c).Data(lst, err) return } lst := make([]models.UserGroup, 0) for _, id := range ids { ug := UserGroup(rt.Ctx, id) ids, err := models.MemberIds(rt.Ctx, ug.Id) ginx.Dangerous(err) ug.Users, err = models.UserGetsByIds(rt.Ctx, ids) ginx.Dangerous(err) lst = append(lst, *ug) } ginx.NewRender(c).Data(lst, nil) } // user group member get by service func (rt *Router) userGroupMemberGetsByService(c *gin.Context) { members, err := models.UserGroupMemberGetAll(rt.Ctx) ginx.NewRender(c).Data(members, err) } type userGroupForm struct { Name string `json:"name" binding:"required"` Note string `json:"note"` IsSyncToFlashDuty bool `json:"is_sync_to_flashduty"` } func (rt *Router) userGroupAdd(c *gin.Context) { var f userGroupForm ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) ug := models.UserGroup{ 
Name: f.Name, Note: f.Note, CreateBy: me.Username, UpdateBy: me.Username, } err := ug.Add(rt.Ctx) ginx.Dangerous(err) // Even failure is not a big deal models.UserGroupMemberAdd(rt.Ctx, ug.Id, me.Id) if f.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) { ugs, err := flashduty.NewUserGroupSyncer(rt.Ctx, &ug) ginx.Dangerous(err) err = ugs.SyncUGAdd() ginx.Dangerous(err) } ginx.NewRender(c).Data(ug.Id, err) } func (rt *Router) userGroupPut(c *gin.Context) { var f userGroupForm ginx.BindJSON(c, &f) me := c.MustGet("user").(*models.User) ug := c.MustGet("user_group").(*models.UserGroup) if ug.Name != f.Name { // name changed, check duplication num, err := models.UserGroupCount(rt.Ctx, "name=? and id<>?", f.Name, ug.Id) ginx.Dangerous(err) if num > 0 { ginx.Bomb(http.StatusOK, "UserGroup already exists") } } ug.Name = f.Name ug.Note = f.Note ug.UpdateBy = me.Username ug.UpdateAt = time.Now().Unix() if f.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) { ugs, err := flashduty.NewUserGroupSyncer(rt.Ctx, ug) ginx.Dangerous(err) err = ugs.SyncUGPut() ginx.Dangerous(err) } ginx.NewRender(c).Message(ug.Update(rt.Ctx, "Name", "Note", "UpdateAt", "UpdateBy")) } // Return all members, front-end search and paging func (rt *Router) userGroupGet(c *gin.Context) { ug := UserGroup(rt.Ctx, ginx.UrlParamInt64(c, "id")) ids, err := models.MemberIds(rt.Ctx, ug.Id) ginx.Dangerous(err) logger.Info("userGroupGet", ids) users, err := models.UserGetsByIds(rt.Ctx, ids) ginx.NewRender(c).Data(gin.H{ "users": users, "user_group": ug, }, err) } func (rt *Router) userGroupDel(c *gin.Context) { isSyncToFlashDuty := ginx.QueryBool(c, "is_sync_to_flashduty", false) ug := c.MustGet("user_group").(*models.UserGroup) if isSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) { ugs, err := flashduty.NewUserGroupSyncer(rt.Ctx, ug) ginx.Dangerous(err) err = ugs.SyncUGDel() // 如果team 在 duty 被引用或者已经删除,会报错,可以忽略报错 if err != nil { logger.Warningf("failed to sync user group %s to flashduty's team: %v", 
ug.Name, err) } } ginx.NewRender(c).Message(ug.Del(rt.Ctx)) } func (rt *Router) userGroupMemberAdd(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() me := c.MustGet("user").(*models.User) ug := c.MustGet("user_group").(*models.UserGroup) err := ug.AddMembers(rt.Ctx, f.Ids) ginx.Dangerous(err) if err == nil { ug.UpdateAt = time.Now().Unix() ug.UpdateBy = me.Username ug.Update(rt.Ctx, "UpdateAt", "UpdateBy") } if f.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) { ugs, err := flashduty.NewUserGroupSyncer(rt.Ctx, ug) ginx.Dangerous(err) err = ugs.SyncMembersAdd() ginx.Dangerous(err) } ginx.NewRender(c).Message(err) } func (rt *Router) userGroupMemberDel(c *gin.Context) { var f idsForm ginx.BindJSON(c, &f) f.Verify() me := c.MustGet("user").(*models.User) ug := c.MustGet("user_group").(*models.UserGroup) err := ug.DelMembers(rt.Ctx, f.Ids) if err == nil { ug.UpdateAt = time.Now().Unix() ug.UpdateBy = me.Username ug.Update(rt.Ctx, "UpdateAt", "UpdateBy") } if f.IsSyncToFlashDuty || flashduty.NeedSyncTeam(rt.Ctx) { ugs, err := flashduty.NewUserGroupSyncer(rt.Ctx, ug) ginx.Dangerous(err) err = ugs.SyncMembersDel() ginx.Dangerous(err) } ginx.NewRender(c).Message(err) } ================================================ FILE: center/router/router_user_variable_config.go ================================================ package router import ( "strings" "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ginx" "github.com/gin-gonic/gin" ) func (rt *Router) userVariableConfigGets(context *gin.Context) { userVariables, err := models.ConfigsGetUserVariable(rt.Ctx) ginx.NewRender(context).Data(userVariables, err) } func (rt *Router) userVariableConfigAdd(context *gin.Context) { var f models.Configs ginx.BindJSON(context, &f) f.Ckey = strings.TrimSpace(f.Ckey) //insert external config. 
needs to make sure not plaintext for an encrypted type config username := context.MustGet("username").(string) now := time.Now().Unix() f.CreateBy = username f.UpdateBy = username f.CreateAt = now f.UpdateAt = now ginx.NewRender(context).Message(models.ConfigsUserVariableInsert(rt.Ctx, f)) } func (rt *Router) userVariableConfigPut(context *gin.Context) { var f models.Configs ginx.BindJSON(context, &f) f.Id = ginx.UrlParamInt64(context, "id") f.Ckey = strings.TrimSpace(f.Ckey) f.UpdateBy = context.MustGet("username").(string) f.UpdateAt = time.Now().Unix() user := context.MustGet("user").(*models.User) if !user.IsAdmin() && f.CreateBy != user.Username { // only admin or creator can update ginx.Bomb(403, "forbidden") } ginx.NewRender(context).Message(models.ConfigsUserVariableUpdate(rt.Ctx, f)) } func (rt *Router) userVariableConfigDel(context *gin.Context) { id := ginx.UrlParamInt64(context, "id") configs, err := models.ConfigGet(rt.Ctx, id) ginx.Dangerous(err) user := context.MustGet("user").(*models.User) if !user.IsAdmin() && configs.CreateBy != user.Username { // only admin or creator can delete ginx.Bomb(403, "forbidden") } if configs != nil && configs.External == models.ConfigExternal { ginx.NewRender(context).Message(models.ConfigsDel(rt.Ctx, []int64{id})) } else { ginx.NewRender(context).Message(nil) } } func (rt *Router) userVariableGetDecryptByService(context *gin.Context) { decryptMap, decryptErr := models.ConfigUserVariableGetDecryptMap(rt.Ctx, rt.HTTP.RSA.RSAPrivateKey, rt.HTTP.RSA.RSAPassWord) ginx.NewRender(context).Data(decryptMap, decryptErr) } ================================================ FILE: center/sso/init.go ================================================ package sso import ( "encoding/json" "fmt" "log" "time" "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/cas" "github.com/ccfos/nightingale/v6/pkg/ctx" 
// SsoClient bundles one client per supported single sign-on provider,
// together with the bookkeeping used to decide when they must be reloaded.
type SsoClient struct {
	// Provider clients. Init builds each from the stored SSO config of the
	// same name; OIDC is left nil when its client cannot be created.
	OIDC     *oidcx.SsoClient
	LDAP     *ldapx.SsoClient
	CAS      *cas.SsoClient
	OAuth2   *oauth2x.SsoClient
	DingTalk *dingtalk.SsoClient
	FeiShu   *feishu.SsoClient

	// LastUpdateTime is the SSO config update time seen by the last
	// successful reload; reload compares it with
	// models.SsoConfigLastUpdateTime to detect changes.
	LastUpdateTime int64

	// configCache supplies the user variables substituted into the SSO
	// config content.
	configCache *memsto.ConfigCache
	// configLastUpdateTime is the configCache update time seen by the last
	// successful reload.
	configLastUpdateTime int64
}
[Attributes] Username = 'sub' Nickname = 'nickname' Phone = 'phone_number' Email = 'email' ` const OIDC = ` Enable = false DisplayName = 'OIDC登录' RedirectURL = 'http://n9e.com/callback' SsoAddr = 'http://sso.example.org' SsoLogoutAddr = 'http://sso.example.org/session/end' ClientId = '' ClientSecret = '' CoverAttributes = true DefaultRoles = ['Standard'] Scopes = ['openid', 'profile', 'email', 'phone'] [Attributes] Username = 'sub' Nickname = 'nickname' Phone = 'phone_number' Email = 'email' ` func Init(center cconf.Center, ctx *ctx.Context, configCache *memsto.ConfigCache) *SsoClient { ssoClient := new(SsoClient) m := make(map[string]string) m["LDAP"] = LDAP m["CAS"] = CAS m["OIDC"] = OIDC m["OAuth2"] = OAuth2 for name, config := range m { count, err := models.SsoConfigCountByName(ctx, name) if err != nil { logger.Error(err) continue } if count > 0 { continue } ssoConfig := models.SsoConfig{ Name: name, Content: config, } err = ssoConfig.Create(ctx) if err != nil { log.Fatalln(err) } } if configCache == nil { log.Fatalln(fmt.Errorf("configCache is nil, sso initialization failed")) } ssoClient.configCache = configCache userVariableMap := configCache.Get() configs, err := models.SsoConfigGets(ctx) if err != nil { log.Fatalln(err) } for _, cfg := range configs { cfg.Content = tplx.ReplaceTemplateUseText(cfg.Name, cfg.Content, userVariableMap) switch cfg.Name { case "LDAP": var config ldapx.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { log.Fatalln("init ldap failed", err) } ssoClient.LDAP = ldapx.New(config) case "OIDC": var config oidcx.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { log.Fatalln("init oidc failed:", err) } logger.Info("init oidc..") oidcClient, err := oidcx.New(config) if err != nil { logger.Error("init oidc failed:", err) } else { ssoClient.OIDC = oidcClient } case "CAS": var config cas.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { log.Fatalln("init cas failed:", 
err) } ssoClient.CAS = cas.New(config) case "OAuth2": var config oauth2x.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { log.Fatalln("init oauth2 failed:", err) } ssoClient.OAuth2 = oauth2x.New(config) case dingtalk.SsoTypeName: var config dingtalk.Config err := json.Unmarshal([]byte(cfg.Content), &config) if err != nil { log.Fatalf("init %s failed: %s", dingtalk.SsoTypeName, err) } ssoClient.DingTalk = dingtalk.New(config) case feishu.SsoTypeName: var config feishu.Config err := json.Unmarshal([]byte(cfg.Content), &config) if err != nil { log.Fatalf("init %s failed: %s", feishu.SsoTypeName, err) } ssoClient.FeiShu = feishu.New(config) } } go ssoClient.SyncSsoUsers(ctx) go ssoClient.Reload(ctx) return ssoClient } // 定期更新sso配置 func (s *SsoClient) reload(ctx *ctx.Context) error { lastUpdateTime, err := models.SsoConfigLastUpdateTime(ctx) if err != nil { return err } lastCacheUpdateTime := s.configCache.GetLastUpdateTime() if lastUpdateTime == s.LastUpdateTime && lastCacheUpdateTime == s.configLastUpdateTime { return nil } configs, err := models.SsoConfigGets(ctx) if err != nil { return err } userVariableMap := s.configCache.Get() ssoConfigMap := make(map[string]models.SsoConfig, 0) for _, cfg := range configs { ssoConfigMap[cfg.Name] = cfg cfg.Content = tplx.ReplaceTemplateUseText(cfg.Name, cfg.Content, userVariableMap) switch cfg.Name { case "LDAP": var config ldapx.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { logger.Warning("reload ldap failed", err) continue } s.LDAP.Reload(config) case "OIDC": var config oidcx.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { logger.Warning("reload oidc failed:", err) continue } logger.Info("reload oidc..") err = s.OIDC.Reload(config) if err != nil { logger.Error("reload oidc failed:", err) continue } case "CAS": var config cas.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { logger.Warning("reload cas failed:", err) 
continue } s.CAS.Reload(config) case "OAuth2": var config oauth2x.Config err := toml.Unmarshal([]byte(cfg.Content), &config) if err != nil { logger.Warning("reload oauth2 failed:", err) continue } s.OAuth2.Reload(config) } } if dingTalkConfig, ok := ssoConfigMap[dingtalk.SsoTypeName]; ok { var config dingtalk.Config err := json.Unmarshal([]byte(dingTalkConfig.Content), &config) if err != nil { logger.Warningf("reload %s failed: %s", dingtalk.SsoTypeName, err) } else { if s.DingTalk != nil { s.DingTalk.Reload(config) } else { s.DingTalk = dingtalk.New(config) } } } else { s.DingTalk = nil } if feiShuConfig, ok := ssoConfigMap[feishu.SsoTypeName]; ok { var config feishu.Config err := json.Unmarshal([]byte(feiShuConfig.Content), &config) if err != nil { logger.Warningf("reload %s failed: %s", feishu.SsoTypeName, err) } else { if s.FeiShu != nil { s.FeiShu.Reload(config) } else { s.FeiShu = feishu.New(config) } } } else { s.FeiShu = nil } s.LastUpdateTime = lastUpdateTime s.configLastUpdateTime = lastCacheUpdateTime return nil } func (s *SsoClient) Reload(ctx *ctx.Context) { duration := time.Duration(9000) * time.Millisecond for { time.Sleep(duration) if err := s.reload(ctx); err != nil { logger.Warning("reload sso client err:", err) } } } ================================================ FILE: center/sso/sync.go ================================================ package sso import ( "fmt" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/toolkits/pkg/logger" ) func (s *SsoClient) SyncSsoUsers(ctx *ctx.Context) { if err := s.LDAP.SyncAddAndDelUsers(ctx); err != nil { fmt.Println("failed to sync the addition and deletion of ldap users:", err) } if err := s.LDAP.SyncDelUsers(ctx); err != nil { fmt.Println("failed to sync deletion of ldap users:", err) } go s.loopSyncSsoUsers(ctx) } func (s *SsoClient) loopSyncSsoUsers(ctx *ctx.Context) { for { select { case <-s.LDAP.Ticker.C: lc := s.LDAP.Copy() if err := lc.SyncAddAndDelUsers(ctx); err != nil { logger.Warningf("failed 
to sync the addition and deletion of ldap users: %v", err) } if err := lc.SyncDelUsers(ctx); err != nil { logger.Warningf("failed to sync deletion of ldap users: %v", err) } } } } ================================================ FILE: cli/cli.go ================================================ package cli import ( "github.com/ccfos/nightingale/v6/cli/upgrade" ) func Upgrade(configFile string) error { return upgrade.Upgrade(configFile) } ================================================ FILE: cli/upgrade/config.go ================================================ package upgrade import ( "bytes" "path" "github.com/ccfos/nightingale/v6/pkg/cfg" "github.com/ccfos/nightingale/v6/pkg/ormx" "github.com/ccfos/nightingale/v6/pkg/tlsx" "github.com/koding/multiconfig" ) type Config struct { DB ormx.DBConfig Clusters []ClusterOptions } type ClusterOptions struct { Name string Prom string BasicAuthUser string BasicAuthPass string Headers []string Timeout int64 DialTimeout int64 UseTLS bool tlsx.ClientConfig MaxIdleConnsPerHost int } func Parse(fpath string, configPtr *Config) error { var ( tBuf []byte ) loaders := []multiconfig.Loader{ &multiconfig.TagLoader{}, &multiconfig.EnvironmentLoader{}, } s := cfg.NewFileScanner() s.Read(path.Join(fpath)) tBuf = append(tBuf, s.Data()...) tBuf = append(tBuf, []byte("\n")...) if s.Err() != nil { return s.Err() } if len(tBuf) != 0 { loaders = append(loaders, &multiconfig.TOMLLoader{Reader: bytes.NewReader(tBuf)}) } m := multiconfig.DefaultLoader{ Loader: multiconfig.MultiLoader(loaders...), Validator: multiconfig.MultiValidator(&multiconfig.RequiredValidator{}), } return m.Load(configPtr) } ================================================ FILE: cli/upgrade/readme.md ================================================ # v5 升级 v6 手册 0. 操作之前,记得备注下数据库! 1. 需要先将你正在使用的夜莺数据源表结构更新到和 v5.15.0 一致,[release](https://github.com/ccfos/nightingale/releases) 页面有每个版本表结构的更新说明,可以根据你正在使用的版本,按照说明,逐个执行的更新表结构的语句 2. 
解压 n9e 安装包,导入 upgrade.sql 到 n9e_v5 数据库 ``` mysql -h 127.0.0.1 -u root -p1234 < cli/upgrade/upgrade.sql ``` 3. 执行 n9e-cli 完成数据库表结构升级, webapi.conf 为 v5 版本 n9e-webapi 正在使用的配置文件 ``` ./n9e-cli --upgrade --config webapi.conf ``` 4. 修改 n9e 配置文件中的数据库为 n9e_v5,启动 n9e 进程 ``` nohup ./n9e &> n9e.log & ``` 5. n9e 监听的端口为 17000,需要将之前的 web 端口和数据上报的端口,都调整为 17000 ================================================ FILE: cli/upgrade/upgrade.go ================================================ package upgrade import ( "context" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/storage" "github.com/toolkits/pkg/logger" ) func Upgrade(configFile string) error { var config Config Parse(configFile, &config) db, err := storage.New(config.DB) if err != nil { return err } ctx := ctx.NewContext(context.Background(), db, true) for _, cluster := range config.Clusters { count, err := models.GetDatasourcesCountByName(ctx, cluster.Name) if err != nil { logger.Errorf("get datasource %s count error: %v", cluster.Name, err) continue } if count > 0 { continue } header := make(map[string]string) headerCount := len(cluster.Headers) if headerCount > 0 && headerCount%2 == 0 { for i := 0; i < len(cluster.Headers); i += 2 { header[cluster.Headers[i]] = cluster.Headers[i+1] } } authJson := models.Auth{ BasicAuthUser: cluster.BasicAuthUser, BasicAuthPassword: cluster.BasicAuthPass, } httpJson := models.HTTP{ Timeout: cluster.Timeout, DialTimeout: cluster.DialTimeout, TLS: models.TLS{ SkipTlsVerify: cluster.UseTLS, }, MaxIdleConnsPerHost: cluster.MaxIdleConnsPerHost, Url: cluster.Prom, Headers: header, } datasource := models.Datasource{ PluginId: 1, PluginType: "prometheus", PluginTypeName: "Prometheus Like", Name: cluster.Name, HTTPJson: httpJson, AuthJson: authJson, ClusterName: "default", Status: "enabled", } err = datasource.Add(ctx) if err != nil { logger.Errorf("add datasource %s error: %v", cluster.Name, err) } } datasources, err := 
models.GetDatasources(ctx) if err != nil { return err } m := make(map[string]models.Datasource) for i := 0; i < len(datasources); i++ { m[datasources[i].Name] = datasources[i] } err = models.AlertRuleUpgradeToV6(ctx, m) if err != nil { return err } // alert mute err = models.AlertMuteUpgradeToV6(ctx, m) if err != nil { return err } // alert subscribe err = models.AlertSubscribeUpgradeToV6(ctx, m) if err != nil { return err } // recoding rule err = models.RecordingRuleUpgradeToV6(ctx, m) if err != nil { return err } // alert cur event err = models.AlertCurEventUpgradeToV6(ctx, m) if err != nil { return err } // alert his event err = models.AlertHisEventUpgradeToV6(ctx, m) if err != nil { return err } return nil } ================================================ FILE: cli/upgrade/upgrade.sql ================================================ use n9e_v5; insert into `role_operation`(role_name, operation) values('Guest', '/log/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/trace/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/log/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/trace/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in'); insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies'); insert into `role_operation`(role_name, operation) values('Standard', '/help/servers'); insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate'); insert into `role_operation`(role_name, operation) values('Admin', '/help/source'); insert into `role_operation`(role_name, operation) values('Admin', '/help/sso'); insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-tpls'); insert into `role_operation`(role_name, operation) values('Admin', 
'/help/notification-settings'); alter table `board` add built_in tinyint(1) not null default 0 comment '0:false 1:true'; alter table `board` add hide tinyint(1) not null default 0 comment '0:false 1:true'; alter table `chart_share` add datasource_id bigint unsigned not null default 0; alter table `alert_rule` add datasource_ids varchar(255) not null default ''; alter table `alert_rule` add rule_config text not null comment 'rule_config'; alter table `alert_rule` add annotations text not null comment 'annotations'; alter table `alert_mute` add datasource_ids varchar(255) not null default ''; alter table `alert_mute` add periodic_mutes varchar(4096) not null default '[]'; alter table `alert_mute` add mute_time_type tinyint(1) not null default 0; alter table `alert_subscribe` add datasource_ids varchar(255) not null default ''; alter table `alert_subscribe` add prod varchar(255) not null default ''; alter table `alert_subscribe` add webhooks text; alter table `alert_subscribe` add redefine_webhooks tinyint(1) default 0; alter table `alert_subscribe` add for_duration bigint not null default 0; alter table `recording_rule` add datasource_ids varchar(255) default ''; alter table `target` modify cluster varchar(128) not null default ''; alter table `alert_cur_event` add datasource_id bigint unsigned not null default 0; alter table `alert_cur_event` add annotations text not null comment 'annotations'; alter table `alert_cur_event` add rule_config text not null comment 'rule_config'; alter table `alert_his_event` add datasource_id bigint unsigned not null default 0; alter table `alert_his_event` add annotations text not null comment 'annotations'; alter table `alert_his_event` add rule_config text not null comment 'rule_config'; alter table `alerting_engines` add datasource_id bigint unsigned not null default 0; alter table `alerting_engines` change cluster engine_cluster varchar(128) not null default '' comment 'n9e engine cluster'; alter table `task_record` add event_id 
-- Creates the `datasource` table as part of the v5 -> v6 upgrade script.
-- The `weight` column is not declared here; the statement that follows this
-- one in the script adds it with ALTER TABLE.
CREATE TABLE `datasource`
(
    `id` int unsigned NOT NULL AUTO_INCREMENT,
    `name` varchar(255) not null default '',
    `description` varchar(255) not null default '',
    `category` varchar(255) not null default '',
    `plugin_id` int unsigned not null default 0,
    `plugin_type` varchar(255) not null default '',
    `plugin_type_name` varchar(255) not null default '',
    `cluster_name` varchar(255) not null default '',
    `settings` text not null,
    `status` varchar(255) not null default '',
    `http` varchar(4096) not null default '',
    `auth` varchar(8192) not null default '',
    `created_at` bigint not null default 0,
    `created_by` varchar(64) not null default '',
    `updated_at` bigint not null default 0,
    `updated_by` varchar(64) not null default '',
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
"Show version.") configDir = flag.String("configs", osx.GetEnv("N9E_ALERT_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_ALERT_CONFIGS)") cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.") ) func main() { flag.Parse() if *showVersion { fmt.Println(version.Version) os.Exit(0) } printEnv() cleanFunc, err := alert.Initialize(*configDir, *cryptoKey) if err != nil { log.Fatalln("failed to initialize:", err) } code := 1 sc := make(chan os.Signal, 1) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) EXIT: for { sig := <-sc fmt.Println("received signal:", sig.String()) switch sig { case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT: code = 0 break EXIT case syscall.SIGHUP: // reload configuration? default: break EXIT } } cleanFunc() fmt.Println("process exited") os.Exit(code) } func printEnv() { runner.Init() fmt.Println("runner.cwd:", runner.Cwd) fmt.Println("runner.hostname:", runner.Hostname) fmt.Println("runner.fd_limits:", runner.FdLimits()) fmt.Println("runner.vm_limits:", runner.VMLimits()) } ================================================ FILE: cmd/center/main.go ================================================ package main import ( "flag" "fmt" "log" "os" "os/signal" "syscall" "github.com/ccfos/nightingale/v6/center" "github.com/ccfos/nightingale/v6/pkg/osx" "github.com/ccfos/nightingale/v6/pkg/version" "github.com/toolkits/pkg/net/tcpx" "github.com/toolkits/pkg/runner" ) var ( showVersion = flag.Bool("version", false, "Show version.") configDir = flag.String("configs", osx.GetEnv("N9E_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_CONFIGS)") cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.") ) func main() { flag.Parse() if *showVersion { fmt.Println(version.Version) os.Exit(0) } printEnv() tcpx.WaitHosts() cleanFunc, err := center.Initialize(*configDir, *cryptoKey) if err != nil { 
log.Fatalln("failed to initialize:", err) } code := 1 sc := make(chan os.Signal, 1) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) EXIT: for { sig := <-sc fmt.Println("received signal:", sig.String()) switch sig { case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT: code = 0 break EXIT case syscall.SIGHUP: // reload configuration? default: break EXIT } } cleanFunc() fmt.Println("process exited") os.Exit(code) } func printEnv() { runner.Init() fmt.Println("runner.cwd:", runner.Cwd) fmt.Println("runner.hostname:", runner.Hostname) fmt.Println("runner.fd_limits:", runner.FdLimits()) fmt.Println("runner.vm_limits:", runner.VMLimits()) } ================================================ FILE: cmd/cli/main.go ================================================ package main import ( "flag" "fmt" "os" "github.com/ccfos/nightingale/v6/cli" "github.com/ccfos/nightingale/v6/pkg/version" ) var ( upgrade = flag.Bool("upgrade", false, "Upgrade the database.") showVersion = flag.Bool("version", false, "Show version.") configFile = flag.String("config", "", "Specify webapi.conf of v5.x version") ) func main() { flag.Parse() if *showVersion { fmt.Println(version.Version) os.Exit(0) } if *upgrade { if *configFile == "" { fmt.Println("Please specify the configuration directory.") os.Exit(1) } err := cli.Upgrade(*configFile) if err != nil { fmt.Println(err) os.Exit(1) } fmt.Print("Upgrade successfully.") os.Exit(0) } } ================================================ FILE: cmd/edge/edge.go ================================================ package main import ( "context" "errors" "fmt" "github.com/ccfos/nightingale/v6/alert" "github.com/ccfos/nightingale/v6/alert/astats" "github.com/ccfos/nightingale/v6/alert/dispatch" "github.com/ccfos/nightingale/v6/alert/process" alertrt "github.com/ccfos/nightingale/v6/alert/router" "github.com/ccfos/nightingale/v6/center/metas" "github.com/ccfos/nightingale/v6/conf" "github.com/ccfos/nightingale/v6/dscache" 
// Initialize wires up the edge process: it loads the configuration from
// configDir (cryptoKey is passed through to conf.InitConfig), initialises
// logging, builds the push gateway router and, unless config.Alert.Disable is
// set, the alert engine and its HTTP routes, then starts the HTTP server.
//
// It returns a cleanup function that runs the logging cleanup followed by the
// HTTP cleanup. An error is returned when the configuration cannot be loaded,
// logging cannot be initialised, no CenterApi address is configured, or the
// Redis client cannot be created.
func Initialize(configDir string, cryptoKey string) (func(), error) {
	config, err := conf.InitConfig(configDir, cryptoKey)
	if err != nil {
		return nil, fmt.Errorf("failed to init config: %v", err)
	}

	logxClean, err := logx.Init(config.Log)
	if err != nil {
		return nil, err
	}

	// At least one CenterApi address must be configured.
	if len(config.CenterApi.Addrs) < 1 {
		return nil, errors.New("failed to init config: the CenterApi configuration is missing")
	}
	// The context is created with a nil database handle and the CenterApi
	// settings. NOTE(review): presumably data is fetched through the center
	// API instead of a local DB — confirm against ctx.NewContext.
	ctx := ctx.NewContext(context.Background(), nil, false, config.CenterApi)

	var redis storage.Redis
	redis, err = storage.NewRedis(config.Redis)
	if err != nil {
		return nil, err
	}

	syncStats := memsto.NewSyncStats()

	// Caches and helpers needed by the push gateway routes.
	targetCache := memsto.NewTargetCache(ctx, syncStats, redis)
	busiGroupCache := memsto.NewBusiGroupCache(ctx, syncStats)
	configCvalCache := memsto.NewCvalCache(ctx, syncStats)
	idents := idents.New(ctx, redis, config.Pushgw)
	metas := metas.New(redis)
	writers := writer.NewWriters(config.Pushgw)
	pushgwRouter := pushgwrt.New(config.HTTP, config.Pushgw, config.Alert, targetCache, busiGroupCache, idents, metas, writers, ctx)

	// A single gin engine serves the pushgw routes, the alert routes (when
	// enabled) and the dumper routes.
	r := httpx.GinEngine(config.Global.RunMode, config.HTTP, configCvalCache.PrintBodyPaths, configCvalCache.PrintAccessLog)
	pushgwRouter.Config(r)
	macros.RegisterMacro(macros.MacroInVain)
	dscache.Init(ctx, false)

	if !config.Alert.Disable {
		// Caches consumed by the alert engine.
		configCache := memsto.NewConfigCache(ctx, syncStats, nil, "")
		alertStats := astats.NewSyncStats()
		dsCache := memsto.NewDatasourceCache(ctx, syncStats)
		alertMuteCache := memsto.NewAlertMuteCache(ctx, syncStats)
		alertRuleCache := memsto.NewAlertRuleCache(ctx, syncStats)
		notifyConfigCache := memsto.NewNotifyConfigCache(ctx, configCache)
		userCache := memsto.NewUserCache(ctx, syncStats)
		userGroupCache := memsto.NewUserGroupCache(ctx, syncStats)
		taskTplsCache := memsto.NewTaskTplCache(ctx)
		notifyRuleCache := memsto.NewNotifyRuleCache(ctx, syncStats)
		notifyChannelCache := memsto.NewNotifyChannelCache(ctx, syncStats)
		messageTemplateCache := memsto.NewMessageTemplateCache(ctx, syncStats)

		promClients := prom.NewPromClient(ctx)
		dispatch.InitRegisterQueryFunc(promClients)

		externalProcessors := process.NewExternalProcessors()

		alert.Start(config.Alert, config.Pushgw, syncStats, alertStats, externalProcessors, targetCache, busiGroupCache, alertMuteCache, alertRuleCache, notifyConfigCache, taskTplsCache, dsCache, ctx, promClients, userCache, userGroupCache, notifyRuleCache, notifyChannelCache, messageTemplateCache, configCvalCache)

		alertrtRouter := alertrt.New(config.HTTP, config.Alert, alertMuteCache, targetCache, busiGroupCache, alertStats, ctx, externalProcessors, config.Log.Dir)
		alertrtRouter.Config(r)

		// Ibex is only started when alerting is enabled, because this check
		// sits inside the alert branch.
		if config.Ibex.Enable {
			ibex.ServerStart(false, nil, redis, config.HTTP.APIForService.BasicAuth, config.Alert.Heartbeat, &config.CenterApi, r, nil, config.Ibex, config.HTTP.Port)
		}
	}

	dumper.ConfigRouter(r)
	httpClean := httpx.Init(config.HTTP, r)

	// Cleanup order: logging first, then the HTTP server.
	return func() {
		logxClean()
		httpClean()
	}, nil
}
flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.") ) func main() { flag.Parse() if *showVersion { fmt.Println(version.Version) os.Exit(0) } printEnv() cleanFunc, err := Initialize(*configDir, *cryptoKey) if err != nil { log.Fatalln("failed to initialize:", err) } code := 1 sc := make(chan os.Signal, 1) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) EXIT: for { sig := <-sc fmt.Println("received signal:", sig.String()) switch sig { case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT: code = 0 break EXIT case syscall.SIGHUP: // reload configuration? default: break EXIT } } cleanFunc() fmt.Println("process exited") os.Exit(code) } func printEnv() { runner.Init() fmt.Println("runner.cwd:", runner.Cwd) fmt.Println("runner.hostname:", runner.Hostname) fmt.Println("runner.fd_limits:", runner.FdLimits()) fmt.Println("runner.vm_limits:", runner.VMLimits()) } ================================================ FILE: cmd/pushgw/main.go ================================================ package main import ( "flag" "fmt" "log" "os" "os/signal" "syscall" "github.com/ccfos/nightingale/v6/pkg/osx" "github.com/ccfos/nightingale/v6/pkg/version" "github.com/ccfos/nightingale/v6/pushgw" "github.com/toolkits/pkg/runner" ) var ( showVersion = flag.Bool("version", false, "Show version.") configDir = flag.String("configs", osx.GetEnv("N9E_PUSHGW_CONFIGS", "etc"), "Specify configuration directory.(env:N9E_PUSHGW_CONFIGS)") cryptoKey = flag.String("crypto-key", "", "Specify the secret key for configuration file field encryption.") ) func main() { flag.Parse() if *showVersion { fmt.Println(version.Version) os.Exit(0) } printEnv() cleanFunc, err := pushgw.Initialize(*configDir, *cryptoKey) if err != nil { log.Fatalln("failed to initialize:", err) } code := 1 sc := make(chan os.Signal, 1) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) EXIT: for { sig := <-sc 
fmt.Println("received signal:", sig.String()) switch sig { case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT: code = 0 break EXIT case syscall.SIGHUP: // reload configuration? default: break EXIT } } cleanFunc() fmt.Println("process exited") os.Exit(code) } func printEnv() { runner.Init() fmt.Println("runner.cwd:", runner.Cwd) fmt.Println("runner.hostname:", runner.Hostname) fmt.Println("runner.fd_limits:", runner.FdLimits()) fmt.Println("runner.vm_limits:", runner.VMLimits()) } ================================================ FILE: conf/conf.go ================================================ package conf import ( "fmt" "net" "os" "strings" "github.com/ccfos/nightingale/v6/alert/aconf" "github.com/ccfos/nightingale/v6/center/cconf" "github.com/ccfos/nightingale/v6/pkg/cfg" "github.com/ccfos/nightingale/v6/pkg/httpx" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/ccfos/nightingale/v6/pkg/ormx" "github.com/ccfos/nightingale/v6/pushgw/pconf" "github.com/ccfos/nightingale/v6/storage" ) type ConfigType struct { Global GlobalConfig Log logx.Config HTTP httpx.Config DB ormx.DBConfig Redis storage.RedisConfig CenterApi CenterApi Pushgw pconf.Pushgw Alert aconf.Alert Center cconf.Center Ibex Ibex } type CenterApi struct { Addrs []string BasicAuthUser string BasicAuthPass string Timeout int64 } type GlobalConfig struct { RunMode string } type Ibex struct { Enable bool RPCListen string Output Output } type Output struct { ComeFrom string AgtdPort int } func InitConfig(configDir, cryptoKey string) (*ConfigType, error) { var config = new(ConfigType) if err := cfg.LoadConfigByDir(configDir, config); err != nil { return nil, fmt.Errorf("failed to load configs of directory: %s error: %s", configDir, err) } config.Pushgw.PreCheck() config.Alert.PreCheck(configDir) config.Center.PreCheck() err := decryptConfig(config, cryptoKey) if err != nil { return nil, err } if config.Alert.Heartbeat.IP == "" { // auto detect config.Alert.Heartbeat.IP = fmt.Sprint(GetOutboundIP()) 
if config.Alert.Heartbeat.IP == "" { hostname, err := os.Hostname() if err != nil { fmt.Println("failed to get hostname:", err) os.Exit(1) } if strings.Contains(hostname, "localhost") { fmt.Println("Warning! hostname contains substring localhost, setting a more unique hostname is recommended") } config.Alert.Heartbeat.IP = hostname } } config.Alert.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", config.Alert.Heartbeat.IP, config.HTTP.Port) return config, nil } func GetOutboundIP() net.IP { conn, err := net.Dial("udp", "223.5.5.5:80") if err != nil { fmt.Println("auto get outbound ip fail:", err) return []byte{} } defer conn.Close() localAddr := conn.LocalAddr().(*net.UDPAddr) return localAddr.IP } ================================================ FILE: conf/crypto.go ================================================ package conf import ( "fmt" "github.com/ccfos/nightingale/v6/pkg/secu" ) func decryptConfig(config *ConfigType, cryptoKey string) error { decryptDsn, err := secu.DealWithDecrypt(config.DB.DSN, cryptoKey) if err != nil { return fmt.Errorf("failed to decrypt the db dsn: %s", err) } config.DB.DSN = decryptDsn decryptRedisPwd, err := secu.DealWithDecrypt(config.Redis.Password, cryptoKey) if err != nil { return fmt.Errorf("failed to decrypt the redis password: %s", err) } config.Redis.Password = decryptRedisPwd for k := range config.HTTP.APIForService.BasicAuth { decryptPwd, err := secu.DealWithDecrypt(config.HTTP.APIForService.BasicAuth[k], cryptoKey) if err != nil { return fmt.Errorf("failed to decrypt http basic auth password: %s", err) } config.HTTP.APIForService.BasicAuth[k] = decryptPwd } for k := range config.HTTP.APIForAgent.BasicAuth { decryptPwd, err := secu.DealWithDecrypt(config.HTTP.APIForAgent.BasicAuth[k], cryptoKey) if err != nil { return fmt.Errorf("failed to decrypt http basic auth password: %s", err) } config.HTTP.APIForAgent.BasicAuth[k] = decryptPwd } for i, v := range config.Pushgw.Writers { decryptWriterPwd, err := 
secu.DealWithDecrypt(v.BasicAuthPass, cryptoKey) if err != nil { return fmt.Errorf("failed to decrypt writer basic auth password: %s", err) } config.Pushgw.Writers[i].BasicAuthPass = decryptWriterPwd } return nil } ================================================ FILE: cron/clean_notify_record.go ================================================ package cron import ( "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/robfig/cron/v3" "github.com/toolkits/pkg/logger" ) func cleanNotifyRecord(ctx *ctx.Context, day int) { lastWeek := time.Now().Unix() - 86400*int64(day) err := models.DB(ctx).Model(&models.NotificationRecord{}).Where("created_at < ?", lastWeek).Delete(&models.NotificationRecord{}).Error if err != nil { logger.Errorf("Failed to clean notify record: %v", err) } } // 每天凌晨1点执行清理任务 func CleanNotifyRecord(ctx *ctx.Context, day int) { c := cron.New() if day < 1 { day = 7 } // 使用cron表达式设置每天凌晨1点执行 _, err := c.AddFunc("0 1 * * *", func() { cleanNotifyRecord(ctx, day) }) if err != nil { logger.Errorf("Failed to add clean notify record cron job: %v", err) return } // 启动cron任务 c.Start() } ================================================ FILE: cron/clean_pipeline_execution.go ================================================ package cron import ( "time" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/robfig/cron/v3" "github.com/toolkits/pkg/logger" ) const ( defaultBatchSize = 100 // 每批删除数量 defaultSleepMs = 10 // 每批删除后休眠时间(毫秒) ) // cleanPipelineExecutionInBatches 分批删除执行记录,避免大批量删除影响数据库性能 func cleanPipelineExecutionInBatches(ctx *ctx.Context, day int) { threshold := time.Now().Unix() - 86400*int64(day) var totalDeleted int64 for { deleted, err := models.DeleteEventPipelineExecutionsInBatches(ctx, threshold, defaultBatchSize) if err != nil { logger.Errorf("Failed to clean pipeline execution records in batch: %v", err) return } totalDeleted += deleted // 如果本批删除数量小于 
batchSize,说明已删除完毕 if deleted < int64(defaultBatchSize) { break } // 休眠一段时间,降低数据库压力 time.Sleep(time.Duration(defaultSleepMs) * time.Millisecond) } if totalDeleted > 0 { logger.Infof("Cleaned %d pipeline execution records older than %d days", totalDeleted, day) } } // CleanPipelineExecution starts a cron job to clean old pipeline execution records in batches // Runs daily at 6:00 AM // day: 数据保留天数,默认 7 天 // 使用分批删除方式,每批 100 条,间隔 10ms,避免大批量删除影响数据库性能 func CleanPipelineExecution(ctx *ctx.Context, day int) { c := cron.New() if day < 1 { day = 7 // default retention: 7 days } _, err := c.AddFunc("0 6 * * *", func() { cleanPipelineExecutionInBatches(ctx, day) }) if err != nil { logger.Errorf("Failed to add clean pipeline execution cron job: %v", err) return } c.Start() logger.Infof("Pipeline execution cleanup cron started, retention: %d days, batch_size: %d, sleep_ms: %d", day, defaultBatchSize, defaultSleepMs) } ================================================ FILE: datasource/ck/clickhouse.go ================================================ package ck import ( "context" "fmt" "strings" "github.com/ccfos/nightingale/v6/datasource" ck "github.com/ccfos/nightingale/v6/dskit/clickhouse" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/macros" "github.com/mitchellh/mapstructure" "github.com/toolkits/pkg/logger" "github.com/ccfos/nightingale/v6/pkg/logx" ) const ( CKType = "ck" TimeFieldFormatEpochMilli = "epoch_millis" TimeFieldFormatEpochSecond = "epoch_second" DefaultLimit = 500 ) var ( ckPrivBanned = []string{ "INSERT", "CREATE", "DROP", "DELETE", "UPDATE", "ALL", } ckBannedOp = map[string]struct{}{ "CREATE": {}, "INSERT": {}, "ALTER": {}, "REVOKE": {}, "DROP": {}, "RENAME": {}, "ATTACH": {}, "DETACH": {}, "OPTIMIZE": {}, "TRUNCATE": {}, "SET": {}, } ) func init() { datasource.RegisterDatasource(CKType, new(Clickhouse)) } type CKShard struct { Addr string `json:"ck.addr" mapstructure:"ck.addr"` User string `json:"ck.user" mapstructure:"ck.user"` 
Password string `json:"ck.password" mapstructure:"ck.password"` Database string `json:"ck.db" mapstructure:"ck.db"` IsEncrypted bool `json:"ck.is_encrypt" mapstructure:"ck.is_encrypt"` } type QueryParam struct { Limit int `json:"limit" mapstructure:"limit"` Sql string `json:"sql" mapstructure:"sql"` Ref string `json:"ref" mapstructure:"ref"` From int64 `json:"from" mapstructure:"from"` To int64 `json:"to" mapstructure:"to"` TimeField string `json:"time_field" mapstructure:"time_field"` TimeFormat string `json:"time_format" mapstructure:"time_format"` Keys datasource.Keys `json:"keys" mapstructure:"keys"` Database string `json:"database" mapstructure:"database"` Table string `json:"table" mapstructure:"table"` } type Clickhouse struct { ck.Clickhouse `json:",inline" mapstructure:",squash"` } func (c *Clickhouse) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(Clickhouse) err := mapstructure.Decode(settings, newest) return newest, err } func (c *Clickhouse) InitClient() error { return c.InitCli() } func (c *Clickhouse) Validate(ctx context.Context) error { if len(c.Nodes) == 0 { return fmt.Errorf("ck shard is invalid, please check datasource setting") } addr := c.Nodes[0] if len(strings.Trim(c.User, " ")) == 0 { return fmt.Errorf("ck shard user is invalid, please check datasource setting") } if len(strings.Trim(addr, " ")) == 0 { return fmt.Errorf("ck shard addr is invalid, please check datasource setting") } // if len(strings.Trim(shard.Password, " ")) == 0 { // return fmt.Errorf("ck shard password is empty, please check datasource setting or set password for user") // } return nil } // Equal compares whether two objects are the same, used for caching func (c *Clickhouse) Equal(p datasource.Datasource) bool { plg, ok := p.(*Clickhouse) if !ok { logger.Errorf("unexpected plugin type, expected is ck") return false } // only compare first shard if len(c.Nodes) == 0 { logger.Errorf("ck shard is empty") return false } addr := 
c.Nodes[0] if len(plg.Nodes) == 0 { logger.Errorf("new ck plugin obj shard is empty") return false } newAddr := plg.Nodes[0] if c.User != plg.User { return false } if addr != newAddr { return false } if c.Password != plg.Password { return false } return true } func (c *Clickhouse) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (c *Clickhouse) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (c *Clickhouse) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) { return nil, nil } func (c *Clickhouse) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) { ckQueryParam := new(ck.QueryParam) if err := mapstructure.Decode(query, ckQueryParam); err != nil { return nil, err } if strings.Contains(ckQueryParam.Sql, "$__") { var err error ckQueryParam.Sql, err = macros.Macro(ckQueryParam.Sql, ckQueryParam.From, ckQueryParam.To) if err != nil { return nil, err } } if ckQueryParam.Keys.ValueKey == "" { return nil, fmt.Errorf("valueKey is required") } rows, err := c.QueryTimeseries(ctx, ckQueryParam) if err != nil { logx.Warningf(ctx, "query:%+v get data err:%v", ckQueryParam, err) return nil, err } data := make([]models.DataResp, 0) for i := range rows { data = append(data, models.DataResp{ Ref: ckQueryParam.Ref, Metric: rows[i].Metric, Values: rows[i].Values, }) } return data, nil } func (c *Clickhouse) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) { ckQueryParam := new(QueryParam) if err := mapstructure.Decode(query, ckQueryParam); err != nil { return nil, 0, err } if strings.Contains(ckQueryParam.Sql, "$__") { var err error ckQueryParam.Sql, err = macros.Macro(ckQueryParam.Sql, ckQueryParam.From, ckQueryParam.To) if err != nil { return nil, 0, err } } rows, err := c.Query(ctx, ckQueryParam) if err != nil { 
logx.Warningf(ctx, "query:%+v get data err:%v", ckQueryParam, err) return nil, 0, err } limit := getLimit(len(rows), ckQueryParam.Limit) logs := make([]interface{}, 0) for i := 0; i < limit; i++ { logs = append(logs, rows[i]) } return logs, int64(limit), nil } func getLimit(rowLen, pLimit int) int { limit := DefaultLimit if pLimit > 0 { limit = pLimit } if rowLen > limit { return limit } return rowLen } ================================================ FILE: datasource/commons/eslike/eslike.go ================================================ package eslike import ( "context" "encoding/json" "fmt" "strconv" "strings" "time" "github.com/araddon/dateparse" "github.com/bitly/go-simplejson" "github.com/mitchellh/mapstructure" "github.com/olivere/elastic/v7" "github.com/prometheus/common/model" "github.com/toolkits/pkg/logger" "github.com/ccfos/nightingale/v6/memsto" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/logx" ) type FixedField string const ( FieldIndex FixedField = "_index" FieldId FixedField = "_id" ) // LabelSeparator 用于分隔多个标签的分隔符 // 使用 ASCII 控制字符 Record Separator (0x1E),避免与用户数据中的 "--" 冲突 const LabelSeparator = "\x1e" type Query struct { Ref string `json:"ref" mapstructure:"ref"` IndexType string `json:"index_type" mapstructure:"index_type"` // 普通索引:index 索引模式:index_pattern Index string `json:"index" mapstructure:"index"` IndexPatternId int64 `json:"index_pattern" mapstructure:"index_pattern"` Filter string `json:"filter" mapstructure:"filter"` Offset int64 `json:"offset" mapstructure:"offset"` MetricAggr MetricAggr `json:"value" mapstructure:"value"` GroupBy []GroupBy `json:"group_by" mapstructure:"group_by"` DateField string `json:"date_field" mapstructure:"date_field"` Interval int64 `json:"interval" mapstructure:"interval"` Start int64 `json:"start" mapstructure:"start"` End int64 `json:"end" mapstructure:"end"` P int `json:"page" mapstructure:"page"` // 页码 Limit int `json:"limit" mapstructure:"limit"` // 每页个数 Ascending bool 
// IterGetMap flattens the nested document m into ret.
//
// Nested objects are walked recursively and their keys are joined with "."
// (for example {"host": {"name": "n1"}} becomes "host.name"). Every leaf
// value is stored wrapped in a one-element slice. prefixKey is the dotted
// path of m itself; pass "" for the document root.
//
// Keys never start with a separator: a root-level scalar "message" is stored
// as "message". The previous implementation stored it as ".message" because
// the leaf branch always prepended prefixKey + ".", even for an empty prefix,
// which was inconsistent with the keys produced for nested fields.
func IterGetMap(m, ret map[string]interface{}, prefixKey string) {
	for k, v := range m {
		// Build the dotted path once; it is shared by the leaf and the
		// nested-object case.
		key := k
		if prefixKey != "" {
			key = prefixKey + "." + k
		}

		if nested, ok := v.(map[string]interface{}); ok {
			IterGetMap(nested, ret, key)
			continue
		}
		ret[key] = []interface{}{v}
	}
}
arr[i].(map[string]interface{}) keyAsString, getTs := tmp["key_as_string"] if getTs { ts = getUnixTs(keyAsString.(string)) } keyValue := tmp["key"] switch keyValue.(type) { case json.Number, string: if !getTs { if labels != "" { newlabels = fmt.Sprintf("%s%s%s=%v", labels, LabelSeparator, labelKey, keyValue) } else { newlabels = fmt.Sprintf("%s=%v", labelKey, keyValue) } } default: continue } var finalValue float64 if len(keys) == 0 { // 计算 doc_count 的情况 count := tmp["doc_count"] finalValue, err = count.(json.Number).Float64() if err != nil { logger.Warningf("labelKey:%s get value error:%v", labelKey, err) } newValues := []float64{float64(ts / 1000), finalValue} metrics.Data[newlabels] = append(metrics.Data[newlabels], newValues) continue } innerBuckets, exists := tmp[bucketsKey] if !exists { continue } nextBucketsArr, exists := innerBuckets.(map[string]interface{})["buckets"] if exists { if len(keys[1:]) >= 1 { GetBuckets(bucketsKey, keys[1:], nextBucketsArr.([]interface{}), metrics, newlabels, ts, f) } else { GetBuckets(bucketsKey, []string{}, nextBucketsArr.([]interface{}), metrics, newlabels, ts, f) } } else { // doc_count if f == "count" || f == "nodata" { count := tmp["doc_count"] finalValue, err = count.(json.Number).Float64() if err != nil { logger.Warningf("get %v value error:%v", count, err) } } else { values, exists := innerBuckets.(map[string]interface{})["value"] if exists { switch values.(type) { case json.Number: value, err := values.(json.Number).Float64() if err != nil { logger.Warningf("labelKey:%s get value error:%v", labelKey, err) } finalValue = value } } else { switch values.(type) { case map[string]interface{}: var err error values := innerBuckets.(map[string]interface{})["values"] for _, v := range values.(map[string]interface{}) { finalValue, err = v.(json.Number).Float64() if err != nil { logger.Warningf("labelKey:%s get value error:%v", labelKey, err) } } default: values := innerBuckets.(map[string]interface{})["values"] for _, v := range 
values.(map[string]interface{}) { // Todo 修复 v is nil 导致 panic 情况 finalValue, err = v.(json.Number).Float64() if err != nil { logger.Warningf("labelKey:%s get value error:%v", labelKey, err) } } } } } if _, exists := metrics.Data[newlabels]; !exists { metrics.Data[newlabels] = [][]float64{} } newValues := []float64{float64(ts / 1000), finalValue} metrics.Data[newlabels] = append(metrics.Data[newlabels], newValues) } } } func MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { param := new(Query) if err := mapstructure.Decode(query, param); err != nil { return nil, err } for i := 0; i < len(eventTags); i++ { arr := strings.SplitN(eventTags[i], "=", 2) if len(arr) == 2 { eventTags[i] = fmt.Sprintf("%s:%s", arr[0], strconv.Quote(arr[1])) } } if len(eventTags) > 0 { if param.Filter == "" { param.Filter = strings.Join(eventTags, " AND ") } else { param.Filter = param.Filter + " AND " + strings.Join(eventTags, " AND ") } } param.Start = start param.End = end return param, nil } func MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { param := new(Query) if err := mapstructure.Decode(query, param); err != nil { return nil, err } for i := 0; i < len(eventTags); i++ { arr := strings.SplitN(eventTags[i], "=", 2) if len(arr) == 2 { eventTags[i] = fmt.Sprintf("%s:%s", arr[0], strconv.Quote(arr[1])) } } if len(eventTags) > 0 { if param.Filter == "" { param.Filter = strings.Join(eventTags, " AND ") } else { param.Filter = param.Filter + " AND " + strings.Join(eventTags, " AND ") } } param.Start = start param.End = end return param, nil } var esIndexPatternCache *memsto.EsIndexPatternCacheType func SetEsIndexPatternCacheType(c *memsto.EsIndexPatternCacheType) { esIndexPatternCache = c } func GetEsIndexPatternCacheType() *memsto.EsIndexPatternCacheType { return esIndexPatternCache } func QueryData(ctx context.Context, queryParam interface{}, cliTimeout int64, 
version string, search SearchFunc) ([]models.DataResp, error) { param := new(Query) if err := mapstructure.Decode(queryParam, param); err != nil { return nil, err } if param.Timeout == 0 { param.Timeout = int(cliTimeout) / 1000 } if param.Interval == 0 { param.Interval = 60 } if param.MaxShard < 1 { param.MaxShard = 5 } if param.DateField == "" { param.DateField = "@timestamp" } var indexArr []string if param.IndexType == "index_pattern" { if ip, ok := GetEsIndexPatternCacheType().Get(param.IndexPatternId); ok { param.DateField = ip.TimeField indexArr = []string{ip.Name} param.Index = ip.Name } else { return nil, fmt.Errorf("index pattern:%d not found", param.IndexPatternId) } } else { indexArr = strings.Split(param.Index, ",") } q := elastic.NewRangeQuery(param.DateField) now := time.Now().Unix() var start, end int64 if param.End != 0 && param.Start != 0 { end = param.End start = param.Start } else { end = now start = end - param.Interval } delay, ok := ctx.Value("delay").(int64) if ok && delay != 0 { end = end - delay start = start - delay } if param.Offset > 0 { end = end - param.Offset start = start - param.Offset } q.Gte(time.Unix(start, 0).UnixMilli()) q.Lt(time.Unix(end, 0).UnixMilli()) q.Format("epoch_millis") field := param.MetricAggr.Field groupBys := param.GroupBy queryString := GetQueryString(param.Filter, q) var aggr elastic.Aggregation switch param.MetricAggr.Func { case "avg": aggr = elastic.NewAvgAggregation().Field(field) case "max": aggr = elastic.NewMaxAggregation().Field(field) case "min": aggr = elastic.NewMinAggregation().Field(field) case "sum": aggr = elastic.NewSumAggregation().Field(field) case "count": aggr = elastic.NewValueCountAggregation().Field(field) case "p90": aggr = elastic.NewPercentilesAggregation().Percentiles(90).Field(field) case "p95": aggr = elastic.NewPercentilesAggregation().Percentiles(95).Field(field) case "p99": aggr = elastic.NewPercentilesAggregation().Percentiles(99).Field(field) case "median": aggr = 
elastic.NewPercentilesAggregation().Percentiles(50).Field(field) default: return nil, fmt.Errorf("func %s not support", param.MetricAggr.Func) } tsAggr := elastic.NewDateHistogramAggregation(). Field(param.DateField). MinDocCount(1) versionParts := strings.Split(version, ".") major := 0 if len(versionParts) > 0 { if m, err := strconv.Atoi(versionParts[0]); err == nil { major = m } } minor := 0 if len(versionParts) > 1 { if m, err := strconv.Atoi(versionParts[1]); err == nil { minor = m } } if major >= 7 { // 添加偏移量,使第一个分桶bucket的左边界对齐为 start 时间 offset := (start % param.Interval) - param.Interval // 使用 fixed_interval 的条件:ES 7.2+ 或者任何 major > 7(例如 ES8) if (major > 7) || (major == 7 && minor >= 2) { // ES 7.2+ 以及 ES8+ 使用 fixed_interval tsAggr.FixedInterval(fmt.Sprintf("%ds", param.Interval)).Offset(fmt.Sprintf("%ds", offset)) } else { // 7.0-7.1 使用 interval(带 offset) tsAggr.Interval(fmt.Sprintf("%ds", param.Interval)).Offset(fmt.Sprintf("%ds", offset)) } } else { // 兼容 7.0 以下的版本 // OpenSearch 也使用这个字段 tsAggr.Interval(fmt.Sprintf("%ds", param.Interval)) } // group by var groupByAggregation elastic.Aggregation if len(groupBys) > 0 { groupBy := groupBys[0] if groupBy.MinDocCount == 0 { groupBy.MinDocCount = 1 } if groupBy.Size == 0 { groupBy.Size = 300 } switch groupBy.Cate { case Terms: if param.MetricAggr.Func != "count" { groupByAggregation = elastic.NewTermsAggregation().Field(groupBy.Field).SubAggregation(field, aggr).OrderByKeyDesc().Size(groupBy.Size).MinDocCount(int(groupBy.MinDocCount)) } else { groupByAggregation = elastic.NewTermsAggregation().Field(groupBy.Field).OrderByKeyDesc().Size(groupBy.Size).MinDocCount(int(groupBy.MinDocCount)) } case Histogram: if param.MetricAggr.Func != "count" { groupByAggregation = elastic.NewHistogramAggregation().Field(groupBy.Field).Interval(float64(groupBy.Interval)).SubAggregation(field, aggr) } else { groupByAggregation = elastic.NewHistogramAggregation().Field(groupBy.Field).Interval(float64(groupBy.Interval)) } case Filters: 
for _, filterParam := range groupBy.Params { if param.MetricAggr.Func != "count" { groupByAggregation = elastic.NewFilterAggregation().Filter(elastic.NewTermQuery(filterParam.Query, "true")).SubAggregation(field, aggr) } else { groupByAggregation = elastic.NewFilterAggregation().Filter(elastic.NewTermQuery(filterParam.Query, "true")) } } } for i := 1; i < len(groupBys); i++ { groupBy := groupBys[i] if groupBy.MinDocCount == 0 { groupBy.MinDocCount = 1 } if groupBy.Size == 0 { groupBy.Size = 300 } switch groupBy.Cate { case Terms: groupByAggregation = elastic.NewTermsAggregation().Field(groupBy.Field).SubAggregation(groupBys[i-1].Field, groupByAggregation).OrderByKeyDesc().Size(groupBy.Size).MinDocCount(int(groupBy.MinDocCount)) case Histogram: groupByAggregation = elastic.NewHistogramAggregation().Field(groupBy.Field).Interval(float64(groupBy.Interval)).SubAggregation(groupBys[i-1].Field, groupByAggregation) case Filters: for _, filterParam := range groupBy.Params { groupByAggregation = elastic.NewFilterAggregation().Filter(elastic.NewTermQuery(filterParam.Query, "true")).SubAggregation(groupBys[i-1].Field, groupByAggregation) } } } tsAggr.SubAggregation(groupBys[len(groupBys)-1].Field, groupByAggregation) } else if param.MetricAggr.Func != "count" { tsAggr.SubAggregation(field, aggr) } source, _ := queryString.Source() b, _ := json.Marshal(source) logx.Debugf(ctx, "query_data q:%+v indexArr:%+v tsAggr:%+v query_string:%s", param, indexArr, tsAggr, string(b)) searchSource := elastic.NewSearchSource(). Query(queryString). 
Aggregation("ts", tsAggr) searchSourceString, err := searchSource.Source() if err != nil { logx.Warningf(ctx, "query_data searchSource:%s to string error:%v", searchSourceString, err) } jsonSearchSource, err := json.Marshal(searchSourceString) if err != nil { logx.Warningf(ctx, "query_data searchSource:%s to json error:%v", searchSourceString, err) } result, err := search(ctx, indexArr, searchSource, param.Timeout, param.MaxShard) if err != nil { logx.Warningf(ctx, "query_data searchSource:%s query_data error:%v", searchSourceString, err) return nil, err } // 检查是否有 shard failures,有部分数据时仅记录警告继续处理 if shardErr := checkShardFailures(ctx, result.Shards, "query_data", searchSourceString); shardErr != nil { if len(result.Aggregations["ts"]) == 0 { return nil, shardErr } // 有部分数据,checkShardFailures 已记录警告,继续处理 } logx.Infof(ctx, "query_data searchSource:%s resp:%s", string(jsonSearchSource), string(result.Aggregations["ts"])) js, err := simplejson.NewJson(result.Aggregations["ts"]) if err != nil { return nil, err } bucketsData, err := js.Get("buckets").Array() if err != nil { return nil, err } var keys []string for i := len(groupBys) - 1; i >= 0; i-- { keys = append(keys, groupBys[i].Field) } if param.MetricAggr.Func != "count" { keys = append(keys, field) } metrics := &MetricPtr{Data: make(map[string][][]float64)} GetBuckets("", keys, bucketsData, metrics, "", 0, param.MetricAggr.Func) // Drop the last incomplete bucket to avoid inaccurate values at the boundary. // When the last bucket's time range extends beyond or reaches the query end time, // it may contain only partial data, making aggregated values (count, sum, etc.) artificially low. 
// HitFilter reports whether a field with mapping type typ is filtered out.
// It returns false for the keyword, date and numeric types listed below and
// true for every other type name, including the empty string.
func HitFilter(typ string) bool {
	kept := [...]string{
		"keyword", "date", "long", "integer", "short", "byte",
		"double", "float", "half_float", "scaled_float", "unsigned_long",
	}
	for _, name := range kept {
		if name == typ {
			return false
		}
	}
	return true
}
int(timeout) } var indexArr []string if param.IndexType == "index_pattern" { if ip, ok := GetEsIndexPatternCacheType().Get(param.IndexPatternId); ok { param.DateField = ip.TimeField indexArr = []string{ip.Name} } else { return nil, 0, fmt.Errorf("index pattern:%d not found", param.IndexPatternId) } } else { indexArr = strings.Split(param.Index, ",") } now := time.Now().Unix() var start, end int64 if param.End != 0 && param.Start != 0 { end = param.End start = param.Start } else { end = now start = end - param.Interval } q := elastic.NewRangeQuery(param.DateField) q.Gte(time.Unix(start, 0).UnixMilli()) q.Lt(time.Unix(end, 0).UnixMilli()) q.Format("epoch_millis") queryString := GetQueryString(param.Filter, q) if param.Limit <= 0 { param.Limit = 10 } if param.MaxShard < 1 { param.MaxShard = maxShard } // from+size 分页方式获取日志,受es 的max_result_window参数限制,默认最多返回1w条日志, 可以使用search_after方式获取更多日志 source := elastic.NewSearchSource(). TrackTotalHits(true). Query(queryString). Size(param.Limit) // 是否使用search_after方式 if param.SearchAfter != nil { // 设置默认排序字段 if len(param.SearchAfter.SortFields) == 0 { source = source.Sort(param.DateField, param.Ascending).Sort(string(FieldIndex), true).Sort(string(FieldId), true) } else { for _, field := range param.SearchAfter.SortFields { source = source.Sort(field.Field, field.Ascending) } } if len(param.SearchAfter.SearchAfter) > 0 { source = source.SearchAfter(param.SearchAfter.SearchAfter...) 
// DatasourceType describes one built-in datasource plugin kind. The known
// kinds are registered in the DatasourceTypes map by this package's init.
type DatasourceType struct {
	// Id is the numeric identifier; init stores each entry in
	// DatasourceTypes under this same value.
	Id int64 `json:"id"`
	// Category groups plugin kinds, e.g. "timeseries" or "logging".
	Category string `json:"category"`
	// PluginType is the machine-readable plugin name, e.g. "prometheus".
	PluginType string `json:"type"`
	// PluginTypeName is the human-readable name, e.g. "Prometheus Like".
	PluginTypeName string `json:"type_name"`
}
`json:"timeFormat" mapstructure:"timeFormat"` } var DatasourceTypes map[int64]DatasourceType func init() { DatasourceTypes = make(map[int64]DatasourceType) DatasourceTypes[1] = DatasourceType{ Id: 1, Category: "timeseries", PluginType: "prometheus", PluginTypeName: "Prometheus Like", } DatasourceTypes[2] = DatasourceType{ Id: 2, Category: "logging", PluginType: "elasticsearch", PluginTypeName: "Elasticsearch", } DatasourceTypes[3] = DatasourceType{ Id: 3, Category: "logging", PluginType: "aliyun-sls", PluginTypeName: "SLS", } DatasourceTypes[4] = DatasourceType{ Id: 4, Category: "timeseries", PluginType: "ck", PluginTypeName: "ClickHouse", } DatasourceTypes[5] = DatasourceType{ Id: 5, Category: "timeseries", PluginType: "mysql", PluginTypeName: "MySQL", } DatasourceTypes[6] = DatasourceType{ Id: 6, Category: "timeseries", PluginType: "pgsql", PluginTypeName: "PostgreSQL", } DatasourceTypes[7] = DatasourceType{ Id: 7, Category: "logging", PluginType: "victorialogs", PluginTypeName: "VictoriaLogs", } } type NewDatasourceFn func(settings map[string]interface{}) (Datasource, error) var datasourceRegister = map[string]NewDatasourceFn{} type Datasource interface { Init(settings map[string]interface{}) (Datasource, error) // 初始化配置 InitClient() error // 初始化客户端 Validate(ctx context.Context) error // 参数验证 Equal(p Datasource) bool // 验证是否相等 MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) // 在生成告警事件时,会调用该方法,用于获取额外的数据 QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) } func RegisterDatasource(typ string, p Datasource) { if _, found := datasourceRegister[typ]; found { return } datasourceRegister[typ] = p.Init } func 
GetDatasourceByType(typ string, settings map[string]interface{}) (Datasource, error) { typ = strings.ReplaceAll(typ, ".logging", "") fn, found := datasourceRegister[typ] if !found { return nil, fmt.Errorf("plugin type %s not found", typ) } plug, err := fn(settings) if err != nil { return nil, err } return plug, nil } type DatasourceInfo struct { Id int64 `json:"id"` Name string `json:"name"` Identifier string `json:"identifier"` Description string `json:"description"` ClusterName string `json:"cluster_name"` Category string `json:"category"` PluginId int64 `json:"plugin_id"` Type string `json:"plugin_type"` PluginTypeName string `json:"plugin_type_name"` Settings map[string]interface{} `json:"settings"` HTTPJson models.HTTP `json:"http"` AuthJson models.Auth `json:"auth"` Status string `json:"status"` CreatedAt int64 `json:"created_at"` UpdatedAt int64 `json:"updated_at"` IsDefault bool `json:"is_default"` Weight int `json:"weight"` } ================================================ FILE: datasource/doris/doris.go ================================================ package doris import ( "context" "fmt" "strings" "time" "github.com/ccfos/nightingale/v6/datasource" "github.com/ccfos/nightingale/v6/dskit/doris" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/macros" "github.com/mitchellh/mapstructure" "github.com/toolkits/pkg/logger" "github.com/ccfos/nightingale/v6/pkg/logx" ) const ( DorisType = "doris" ) func init() { datasource.RegisterDatasource(DorisType, new(Doris)) } type Doris struct { doris.Doris `json:",inline" mapstructure:",squash"` } type QueryParam struct { Ref string `json:"ref" mapstructure:"ref"` Database string `json:"database" mapstructure:"database"` Table string `json:"table" mapstructure:"table"` SQL string `json:"sql" mapstructure:"sql"` Keys datasource.Keys `json:"keys" mapstructure:"keys"` Limit int `json:"limit" mapstructure:"limit"` From int64 `json:"from" 
mapstructure:"from"` To int64 `json:"to" mapstructure:"to"` TimeField string `json:"time_field" mapstructure:"time_field"` TimeFormat string `json:"time_format" mapstructure:"time_format"` Interval int64 `json:"interval" mapstructure:"interval"` // 查询时间间隔(秒) Offset int `json:"offset" mapstructure:"offset"` // 延迟计算,不在使用通用配置delay } func (d *Doris) InitClient() error { if len(d.Addr) == 0 { return fmt.Errorf("not found doris addr, please check datasource config") } if _, err := d.NewConn(context.TODO(), ""); err != nil { return err } return nil } func (d *Doris) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(Doris) err := mapstructure.Decode(settings, newest) return newest, err } func (d *Doris) Validate(ctx context.Context) error { if len(d.Addr) == 0 || len(strings.TrimSpace(d.Addr)) == 0 { return fmt.Errorf("doris addr is invalid, please check datasource setting") } if len(strings.TrimSpace(d.User)) == 0 { return fmt.Errorf("doris user is invalid, please check datasource setting") } return nil } // Equal compares whether two objects are the same, used for caching func (d *Doris) Equal(p datasource.Datasource) bool { newest, ok := p.(*Doris) if !ok { logger.Errorf("unexpected plugin type, expected is doris") return false } return d.Addr == newest.Addr && d.FeAddr == newest.FeAddr && d.User == newest.User && d.Password == newest.Password && d.EnableWrite == newest.EnableWrite && d.UserWrite == newest.UserWrite && d.PasswordWrite == newest.PasswordWrite && d.MaxQueryRows == newest.MaxQueryRows && d.Timeout == newest.Timeout && d.MaxIdleConns == newest.MaxIdleConns && d.MaxOpenConns == newest.MaxOpenConns && d.ConnMaxLifetime == newest.ConnMaxLifetime && d.ClusterName == newest.ClusterName } func (d *Doris) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (d *Doris) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, 
end int64) (interface{}, error) { return nil, nil } func (d *Doris) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) { return nil, nil } func (d *Doris) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) { dorisQueryParam := new(QueryParam) if err := mapstructure.Decode(query, dorisQueryParam); err != nil { return nil, err } if dorisQueryParam.Keys.ValueKey == "" { return nil, fmt.Errorf("valueKey is required") } // 设置默认 interval if dorisQueryParam.Interval == 0 { dorisQueryParam.Interval = 60 } // 计算时间范围 now := time.Now().Unix() var start, end int64 if dorisQueryParam.To != 0 && dorisQueryParam.From != 0 { end = dorisQueryParam.To start = dorisQueryParam.From } else { end = now start = end - dorisQueryParam.Interval } if dorisQueryParam.Offset != 0 { end -= int64(dorisQueryParam.Offset) start -= int64(dorisQueryParam.Offset) } dorisQueryParam.From = start dorisQueryParam.To = end if strings.Contains(dorisQueryParam.SQL, "$__") { var err error dorisQueryParam.SQL, err = macros.Macro(dorisQueryParam.SQL, dorisQueryParam.From, dorisQueryParam.To) if err != nil { return nil, err } } items, err := d.QueryTimeseries(ctx, &doris.QueryParam{ Database: dorisQueryParam.Database, Sql: dorisQueryParam.SQL, Keys: types.Keys{ ValueKey: dorisQueryParam.Keys.ValueKey, LabelKey: dorisQueryParam.Keys.LabelKey, TimeKey: dorisQueryParam.Keys.TimeKey, Offset: dorisQueryParam.Offset, }, }) if err != nil { logx.Warningf(ctx, "query:%+v get data err:%v", dorisQueryParam, err) return []models.DataResp{}, err } data := make([]models.DataResp, 0) for i := range items { data = append(data, models.DataResp{ Ref: dorisQueryParam.Ref, Metric: items[i].Metric, Values: items[i].Values, }) } // parse resp to time series data logx.Infof(ctx, "req:%+v keys:%+v \n data:%v", dorisQueryParam, dorisQueryParam.Keys, data) return data, nil } func (d *Doris) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) { 
dorisQueryParam := new(QueryParam) if err := mapstructure.Decode(query, dorisQueryParam); err != nil { return nil, 0, err } // 记录规则预览场景下,只传了interval, 没有传From和To now := time.Now().Unix() if dorisQueryParam.To == 0 && dorisQueryParam.From == 0 && dorisQueryParam.Interval != 0 { dorisQueryParam.To = now dorisQueryParam.From = now - dorisQueryParam.Interval } if dorisQueryParam.Offset != 0 { dorisQueryParam.To -= int64(dorisQueryParam.Offset) dorisQueryParam.From -= int64(dorisQueryParam.Offset) } if strings.Contains(dorisQueryParam.SQL, "$__") { var err error dorisQueryParam.SQL, err = macros.Macro(dorisQueryParam.SQL, dorisQueryParam.From, dorisQueryParam.To) if err != nil { return nil, 0, err } } items, err := d.QueryLogs(ctx, &doris.QueryParam{ Database: dorisQueryParam.Database, Sql: dorisQueryParam.SQL, }) if err != nil { logx.Warningf(ctx, "query:%+v get data err:%v", dorisQueryParam, err) return []interface{}{}, 0, err } logs := make([]interface{}, 0) for i := range items { logs = append(logs, items[i]) } return logs, int64(len(logs)), nil } func (d *Doris) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) { dorisQueryParam := new(QueryParam) if err := mapstructure.Decode(query, dorisQueryParam); err != nil { return nil, err } return d.DescTable(ctx, dorisQueryParam.Database, dorisQueryParam.Table) } ================================================ FILE: datasource/es/es.go ================================================ package es import ( "context" "encoding/json" "fmt" "net" "net/http" "net/url" "reflect" "sort" "strings" "time" "github.com/ccfos/nightingale/v6/datasource" "github.com/ccfos/nightingale/v6/datasource/commons/eslike" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/tlsx" "github.com/mitchellh/mapstructure" "github.com/olivere/elastic/v7" "github.com/ccfos/nightingale/v6/pkg/logx" ) const ( ESType = "elasticsearch" ) type Elasticsearch struct { Addr string `json:"es.addr" 
mapstructure:"es.addr"` Nodes []string `json:"es.nodes" mapstructure:"es.nodes"` Timeout int64 `json:"es.timeout" mapstructure:"es.timeout"` // millis Basic BasicAuth `json:"es.basic" mapstructure:"es.basic"` TLS TLS `json:"es.tls" mapstructure:"es.tls"` Version string `json:"es.version" mapstructure:"es.version"` Headers map[string]string `json:"es.headers" mapstructure:"es.headers"` MinInterval int `json:"es.min_interval" mapstructure:"es.min_interval"` // seconds MaxShard int `json:"es.max_shard" mapstructure:"es.max_shard"` ClusterName string `json:"es.cluster_name" mapstructure:"es.cluster_name"` EnableWrite bool `json:"es.enable_write" mapstructure:"es.enable_write"` // 允许写操作 Client *elastic.Client `json:"es.client" mapstructure:"es.client"` } type TLS struct { SkipTlsVerify bool `json:"es.tls.skip_tls_verify" mapstructure:"es.tls.skip_tls_verify"` } type BasicAuth struct { Enable bool `json:"es.auth.enable" mapstructure:"es.auth.enable"` Username string `json:"es.user" mapstructure:"es.user"` Password string `json:"es.password" mapstructure:"es.password"` } func init() { datasource.RegisterDatasource(ESType, new(Elasticsearch)) } func (e *Elasticsearch) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(Elasticsearch) err := mapstructure.Decode(settings, newest) return newest, err } func (e *Elasticsearch) InitClient() error { transport := &http.Transport{ Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{ Timeout: time.Duration(e.Timeout) * time.Millisecond, }).DialContext, ResponseHeaderTimeout: time.Duration(e.Timeout) * time.Millisecond, } if len(e.Nodes) > 0 { e.Addr = e.Nodes[0] } if strings.Contains(e.Addr, "https") { tlsConfig := tlsx.ClientConfig{ InsecureSkipVerify: e.TLS.SkipTlsVerify, UseTLS: true, } cfg, err := tlsConfig.TLSConfig() if err != nil { return err } transport.TLSClientConfig = cfg } var err error options := []elastic.ClientOptionFunc{ elastic.SetURL(e.Nodes...), } if e.Basic.Username != "" 
{ options = append(options, elastic.SetBasicAuth(e.Basic.Username, e.Basic.Password)) } headers := http.Header{} for k, v := range e.Headers { headers[k] = []string{v} } options = append(options, elastic.SetHeaders(headers)) options = append(options, elastic.SetHttpClient(&http.Client{Transport: transport})) options = append(options, elastic.SetSniff(false)) options = append(options, elastic.SetHealthcheck(false)) e.Client, err = elastic.NewClient(options...) if err != nil { return err } return err } func (e *Elasticsearch) Equal(other datasource.Datasource) bool { sort.Strings(e.Nodes) sort.Strings(other.(*Elasticsearch).Nodes) if strings.Join(e.Nodes, ",") != strings.Join(other.(*Elasticsearch).Nodes, ",") { return false } if e.Basic.Username != other.(*Elasticsearch).Basic.Username { return false } if e.Basic.Password != other.(*Elasticsearch).Basic.Password { return false } if e.TLS.SkipTlsVerify != other.(*Elasticsearch).TLS.SkipTlsVerify { return false } if e.EnableWrite != other.(*Elasticsearch).EnableWrite { return false } if !reflect.DeepEqual(e.Headers, other.(*Elasticsearch).Headers) { return false } return true } func (e *Elasticsearch) Validate(ctx context.Context) (err error) { if len(e.Nodes) == 0 { return fmt.Errorf("need a valid addr") } for _, addr := range e.Nodes { _, err = url.Parse(addr) if err != nil { return fmt.Errorf("parse addr error: %v", err) } } if e.Basic.Enable && (len(e.Basic.Username) == 0 || len(e.Basic.Password) == 0) { return fmt.Errorf("need a valid user, password") } if e.MaxShard == 0 { e.MaxShard = 5 } if e.MinInterval < 10 { e.MinInterval = 10 } if e.Timeout == 0 { e.Timeout = 60000 } return nil } func (e *Elasticsearch) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return eslike.MakeLogQuery(ctx, query, eventTags, start, end) } func (e *Elasticsearch) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, 
error) { return eslike.MakeTSQuery(ctx, query, eventTags, start, end) } func (e *Elasticsearch) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) { search := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) { return e.Client.Search(). Index(indices...). IgnoreUnavailable(true). Source(source). Timeout(fmt.Sprintf("%ds", timeout)). MaxConcurrentShardRequests(maxShard). Do(ctx) } return eslike.QueryData(ctx, queryParam, e.Timeout, e.Version, search) } func (e *Elasticsearch) QueryIndices() ([]string, error) { result, err := e.Client.IndexNames() return result, err } func (e *Elasticsearch) QueryFields(indexes []string) ([]string, error) { var fields []string result, err := elastic.NewGetFieldMappingService(e.Client).Index(indexes...).IgnoreUnavailable(true).Do(context.Background()) if err != nil { return fields, err } fieldMap := make(map[string]struct{}) for _, indexMap := range result { if m, exists := indexMap.(map[string]interface{})["mappings"]; exists { for k, v := range m.(map[string]interface{}) { // 兼容 es6 版本 if k == "doc" && strings.HasPrefix(e.Version, "6") { // if k == "doc" { for kk, vv := range v.(map[string]interface{}) { typ := getFieldType(kk, vv.(map[string]interface{})) if eslike.HitFilter(typ) { continue } if _, exists := fieldMap[kk]; !exists { fieldMap[kk] = struct{}{} fields = append(fields, kk) } } } else { // es7 版本 typ := getFieldType(k, v.(map[string]interface{})) if eslike.HitFilter(typ) { continue } if _, exists := fieldMap[k]; !exists { fieldMap[k] = struct{}{} fields = append(fields, k) } } } } } sort.Strings(fields) return fields, nil } func (e *Elasticsearch) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) { search := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) { // 应该是之前为了获取 fields 字段,做的这个兼容 // fields, err := 
e.QueryFields(indices) // if err != nil { // logger.Warningf("query data error:%v", err) // return nil, err // } // if source != nil && strings.HasPrefix(e.Version, "7") { // source = source.(*elastic.SearchSource).DocvalueFields(fields...) // } return e.Client.Search(). Index(indices...). IgnoreUnavailable(true). MaxConcurrentShardRequests(maxShard). Source(source). Timeout(fmt.Sprintf("%ds", timeout)). Do(ctx) } return eslike.QueryLog(ctx, queryParam, e.Timeout, e.Version, e.MaxShard, search) } func (e *Elasticsearch) QueryFieldValue(indexes []string, field string, query string) ([]string, error) { var values []string search := e.Client.Search(). IgnoreUnavailable(true). Index(indexes...). Size(0) if query != "" { search = search.Query(elastic.NewBoolQuery().Must(elastic.NewQueryStringQuery(query))) } search = search.Aggregation("distinct", elastic.NewTermsAggregation().Field(field).Size(10000)) result, err := search.Do(context.Background()) if err != nil { return values, err } agg, found := result.Aggregations.Terms("distinct") if !found { return values, nil } for _, bucket := range agg.Buckets { values = append(values, bucket.Key.(string)) } return values, nil } func (e *Elasticsearch) Test(ctx context.Context) (err error) { err = e.Validate(ctx) if err != nil { return err } if e.Addr == "" { return fmt.Errorf("addr is invalid") } if e.Version == "7.10+" { options := []elastic.ClientOptionFunc{ elastic.SetURL(e.Addr), } if e.Basic.Enable { options = append(options, elastic.SetBasicAuth(e.Basic.Username, e.Basic.Password)) } client, err := elastic.NewClient(options...) 
// getFieldType extracts the "type" of field key from one entry of a
// field-mapping response.
//
// m is expected to hold a "mapping" object keyed by field name. The lookup
// first tries key verbatim; only when that key is absent does it retry with
// the last dot-separated segment of key. It returns "" when the mapping, the
// field entry or the type is missing or is not of the expected JSON shape, so
// malformed input never causes a type-assertion panic.
func getFieldType(key string, m map[string]interface{}) string {
	fields, ok := m["mapping"].(map[string]interface{})
	if !ok {
		return ""
	}

	entry, found := fields[key]
	if !found {
		// Fall back to the leaf name, e.g. "a.b.host" -> "host".
		segments := strings.Split(key, ".")
		entry, found = fields[segments[len(segments)-1]]
		if !found {
			return ""
		}
	}

	props, ok := entry.(map[string]interface{})
	if !ok {
		return ""
	}

	typ, _ := props["type"].(string)
	return typ
}
Do(ctx) } param := new(eslike.Query) if err := mapstructure.Decode(query, param); err != nil { return nil, err } // 扩大查询范围, 解决上一次查询消耗时间太多,导致本次查询时间范围起止时间,滞后问题 param.Interval += 30 res, _, err := eslike.QueryLog(ctx, param, e.Timeout, e.Version, e.MaxShard, search) if err != nil { return nil, err } var result []map[string]string for _, item := range res { logx.Debugf(ctx, "query:%v item:%v", query, item) if itemMap, ok := item.(*elastic.SearchHit); ok { mItem := make(map[string]string) // 遍历 fields 字段的每个键值对 sourceMap := make(map[string]interface{}) err := json.Unmarshal(itemMap.Source, &sourceMap) if err != nil { logx.Warningf(ctx, "unmarshal source%s error:%v", string(itemMap.Source), err) continue } for k, v := range sourceMap { mItem[k] = fmt.Sprintf("%v", v) } // 将处理好的 map 添加到 m 切片中 result = append(result, mItem) if param.Limit > 0 { continue } // 只取第一条数据 break } } return result, nil } ================================================ FILE: datasource/mysql/mysql.go ================================================ package mysql import ( "context" "fmt" "strings" "time" "github.com/ccfos/nightingale/v6/datasource" "github.com/ccfos/nightingale/v6/dskit/mysql" "github.com/ccfos/nightingale/v6/dskit/sqlbase" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/macros" "github.com/mitchellh/mapstructure" "github.com/toolkits/pkg/logger" "github.com/ccfos/nightingale/v6/pkg/logx" ) const ( MySQLType = "mysql" ) func init() { datasource.RegisterDatasource(MySQLType, new(MySQL)) } type MySQL struct { mysql.MySQL `json:",inline" mapstructure:",squash"` } type QueryParam struct { Ref string `json:"ref" mapstructure:"ref"` Database string `json:"database" mapstructure:"database"` Table string `json:"table" mapstructure:"table"` SQL string `json:"sql" mapstructure:"sql"` Keys datasource.Keys `json:"keys" mapstructure:"keys"` From int64 `json:"from" mapstructure:"from"` To int64 `json:"to" 
mapstructure:"to"` } func (m *MySQL) InitClient() error { if len(m.Shards) == 0 { return fmt.Errorf("not found mysql addr, please check datasource config") } if _, err := m.NewConn(context.TODO(), ""); err != nil { return err } return nil } func (m *MySQL) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(MySQL) err := mapstructure.Decode(settings, newest) return newest, err } func (m *MySQL) Validate(ctx context.Context) error { if len(m.Shards) == 0 || len(strings.TrimSpace(m.Shards[0].Addr)) == 0 { return fmt.Errorf("mysql addr is invalid, please check datasource setting") } if len(strings.TrimSpace(m.Shards[0].User)) == 0 { return fmt.Errorf("mysql user is invalid, please check datasource setting") } return nil } // Equal compares whether two objects are the same, used for caching func (m *MySQL) Equal(p datasource.Datasource) bool { newest, ok := p.(*MySQL) if !ok { logger.Errorf("unexpected plugin type, expected is mysql") return false } if len(m.Shards) == 0 || len(newest.Shards) == 0 { return false } oldShard := m.Shards[0] newShard := newest.Shards[0] if oldShard.Addr != newShard.Addr { return false } if oldShard.User != newShard.User { return false } if oldShard.Password != newShard.Password { return false } if oldShard.MaxQueryRows != newShard.MaxQueryRows { return false } if oldShard.Timeout != newShard.Timeout { return false } if oldShard.MaxIdleConns != newShard.MaxIdleConns { return false } if oldShard.MaxOpenConns != newShard.MaxOpenConns { return false } if oldShard.ConnMaxLifetime != newShard.ConnMaxLifetime { return false } return true } func (m *MySQL) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (m *MySQL) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (m *MySQL) QueryMapData(ctx context.Context, query interface{}) 
([]map[string]string, error) { return nil, nil } func (m *MySQL) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) { mysqlQueryParam := new(QueryParam) if err := mapstructure.Decode(query, mysqlQueryParam); err != nil { return nil, err } if strings.Contains(mysqlQueryParam.SQL, "$__") { var err error mysqlQueryParam.SQL, err = macros.Macro(mysqlQueryParam.SQL, mysqlQueryParam.From, mysqlQueryParam.To) if err != nil { return nil, err } } if mysqlQueryParam.Keys.ValueKey == "" { return nil, fmt.Errorf("valueKey is required") } timeout := m.Shards[0].Timeout if timeout == 0 { timeout = 60 } timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) defer cancel() items, err := m.QueryTimeseries(timeoutCtx, &sqlbase.QueryParam{ Sql: mysqlQueryParam.SQL, Keys: types.Keys{ ValueKey: mysqlQueryParam.Keys.ValueKey, LabelKey: mysqlQueryParam.Keys.LabelKey, TimeKey: mysqlQueryParam.Keys.TimeKey, }, }) if err != nil { logx.Warningf(ctx, "query:%+v get data err:%v", mysqlQueryParam, err) return []models.DataResp{}, err } data := make([]models.DataResp, 0) for i := range items { data = append(data, models.DataResp{ Ref: mysqlQueryParam.Ref, Metric: items[i].Metric, Values: items[i].Values, }) } return data, nil } func (m *MySQL) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) { mysqlQueryParam := new(QueryParam) if err := mapstructure.Decode(query, mysqlQueryParam); err != nil { return nil, 0, err } if strings.Contains(mysqlQueryParam.SQL, "$__") { var err error mysqlQueryParam.SQL, err = macros.Macro(mysqlQueryParam.SQL, mysqlQueryParam.From, mysqlQueryParam.To) if err != nil { return nil, 0, err } } timeout := m.Shards[0].Timeout if timeout == 0 { timeout = 60 } timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) defer cancel() items, err := m.Query(timeoutCtx, &sqlbase.QueryParam{ Sql: mysqlQueryParam.SQL, }) if err != nil { logx.Warningf(ctx, "query:%+v get data 
err:%v", mysqlQueryParam, err) return []interface{}{}, 0, err } logs := make([]interface{}, 0) for i := range items { logs = append(logs, items[i]) } return logs, 0, nil } func (m *MySQL) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) { mysqlQueryParam := new(QueryParam) if err := mapstructure.Decode(query, mysqlQueryParam); err != nil { return nil, err } return m.DescTable(ctx, mysqlQueryParam.Database, mysqlQueryParam.Table) } ================================================ FILE: datasource/opensearch/opensearch.go ================================================ package opensearch import ( "bytes" "context" "encoding/json" "fmt" "io" "net" "net/http" "net/url" "reflect" "regexp" "sort" "strings" "time" "github.com/ccfos/nightingale/v6/datasource" "github.com/ccfos/nightingale/v6/datasource/commons/eslike" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/tlsx" "github.com/mitchellh/mapstructure" "github.com/olivere/elastic/v7" oscliv2 "github.com/opensearch-project/opensearch-go/v2" osapiv2 "github.com/opensearch-project/opensearch-go/v2/opensearchapi" ) const ( OpenSearchType = "opensearch" ) type OpenSearch struct { Addr string `json:"os.addr" mapstructure:"os.addr"` Nodes []string `json:"os.nodes" mapstructure:"os.nodes"` Timeout int64 `json:"os.timeout" mapstructure:"os.timeout"` // millis Basic BasicAuth `json:"os.basic" mapstructure:"os.basic"` TLS TLS `json:"os.tls" mapstructure:"os.tls"` Version string `json:"os.version" mapstructure:"os.version"` Headers map[string]string `json:"os.headers" mapstructure:"os.headers"` MinInterval int `json:"os.min_interval" mapstructure:"os.min_interval"` // seconds MaxShard int `json:"os.max_shard" mapstructure:"os.max_shard"` ClusterName string `json:"os.cluster_name" mapstructure:"os.cluster_name"` Client *oscliv2.Client `json:"os.client" mapstructure:"os.client"` } type TLS struct { SkipTlsVerify bool `json:"os.tls.skip_tls_verify" 
mapstructure:"os.tls.skip_tls_verify"` } type BasicAuth struct { Enable bool `json:"os.auth.enable" mapstructure:"os.auth.enable"` Username string `json:"os.user" mapstructure:"os.user"` Password string `json:"os.password" mapstructure:"os.password"` } func init() { datasource.RegisterDatasource(OpenSearchType, new(OpenSearch)) } func (os *OpenSearch) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(OpenSearch) err := mapstructure.Decode(settings, newest) return newest, err } func (os *OpenSearch) InitClient() error { transport := &http.Transport{ Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{ Timeout: time.Duration(os.Timeout) * time.Millisecond, }).DialContext, ResponseHeaderTimeout: time.Duration(os.Timeout) * time.Millisecond, } if len(os.Nodes) > 0 { os.Addr = os.Nodes[0] } if strings.Contains(os.Addr, "https") { tlsConfig := tlsx.ClientConfig{ InsecureSkipVerify: os.TLS.SkipTlsVerify, UseTLS: true, } cfg, err := tlsConfig.TLSConfig() if err != nil { return err } transport.TLSClientConfig = cfg } headers := http.Header{} for k, v := range os.Headers { headers[k] = []string{v} } options := oscliv2.Config{ Addresses: os.Nodes, Transport: transport, Header: headers, } // 只要有用户名就添加认证,不依赖 Enable 字段 if os.Basic.Username != "" { options.Username = os.Basic.Username options.Password = os.Basic.Password } var err = error(nil) os.Client, err = oscliv2.NewClient(options) return err } func (os *OpenSearch) Equal(other datasource.Datasource) bool { sort.Strings(os.Nodes) sort.Strings(other.(*OpenSearch).Nodes) if strings.Join(os.Nodes, ",") != strings.Join(other.(*OpenSearch).Nodes, ",") { return false } if os.Basic.Username != other.(*OpenSearch).Basic.Username { return false } if os.Basic.Password != other.(*OpenSearch).Basic.Password { return false } if os.TLS.SkipTlsVerify != other.(*OpenSearch).TLS.SkipTlsVerify { return false } if os.Timeout != other.(*OpenSearch).Timeout { return false } if 
!reflect.DeepEqual(os.Headers, other.(*OpenSearch).Headers) { return false } return true } func (os *OpenSearch) Validate(ctx context.Context) (err error) { if len(os.Nodes) == 0 { return fmt.Errorf("need a valid addr") } for _, addr := range os.Nodes { _, err = url.Parse(addr) if err != nil { return fmt.Errorf("parse addr error: %v", err) } } // 如果提供了用户名,必须同时提供密码 if len(os.Basic.Username) > 0 && len(os.Basic.Password) == 0 { return fmt.Errorf("password is required when username is provided") } if os.MaxShard == 0 { os.MaxShard = 5 } if os.MinInterval < 10 { os.MinInterval = 10 } if os.Timeout == 0 { os.Timeout = 6000 } if !strings.HasPrefix(os.Version, "2") { return fmt.Errorf("version must be 2.0+") } return nil } func (os *OpenSearch) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return eslike.MakeLogQuery(ctx, query, eventTags, start, end) } func (os *OpenSearch) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return eslike.MakeTSQuery(ctx, query, eventTags, start, end) } func search(ctx context.Context, indices []string, source interface{}, timeout int, cli *oscliv2.Client) (*elastic.SearchResult, error) { var body *bytes.Buffer = nil if source != nil { body = new(bytes.Buffer) err := json.NewEncoder(body).Encode(source) if err != nil { return nil, err } } req := osapiv2.SearchRequest{ Index: indices, Body: body, } if timeout > 0 { req.Timeout = time.Second * time.Duration(timeout) } resp, err := req.Do(ctx, cli) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { return nil, fmt.Errorf("opensearch response not 2xx, resp is %v", resp) } bs, err := io.ReadAll(resp.Body) if err != nil { return nil, err } result := new(elastic.SearchResult) err = json.Unmarshal(bs, &result) if err != nil { return nil, err } return result, nil } func (os *OpenSearch) QueryData(ctx 
context.Context, queryParam interface{}) ([]models.DataResp, error) { search := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) { return search(ctx, indices, source, timeout, os.Client) } return eslike.QueryData(ctx, queryParam, os.Timeout, os.Version, search) } func (os *OpenSearch) QueryIndices() ([]string, error) { cir := osapiv2.CatIndicesRequest{ Format: "json", } rsp, err := cir.Do(context.Background(), os.Client) if err != nil { return nil, err } defer rsp.Body.Close() bs, err := io.ReadAll(rsp.Body) if err != nil { return nil, err } resp := make([]struct { Index string `json:"index"` }, 0) err = json.Unmarshal(bs, &resp) if err != nil { return nil, err } var ret []string for _, k := range resp { ret = append(ret, k.Index) } return ret, nil } func (os *OpenSearch) QueryFields(indices []string) ([]string, error) { var fields []string mappingRequest := osapiv2.IndicesGetMappingRequest{ Index: indices, } resp, err := mappingRequest.Do(context.Background(), os.Client) if err != nil { return fields, err } defer resp.Body.Close() bs, err := io.ReadAll(resp.Body) if err != nil { return fields, err } result := map[string]interface{}{} err = json.Unmarshal(bs, &result) if err != nil { return fields, err } idx := "" if len(indices) > 0 { idx = indices[0] } mappingIndex := "" indexReg, _ := regexp.Compile(idx) for key, value := range result { mappings, ok := value.(map[string]interface{}) if !ok { continue } if len(mappings) == 0 { continue } if key == idx || strings.Contains(key, idx) || (indexReg != nil && indexReg.MatchString(key)) { mappingIndex = key break } } if len(mappingIndex) == 0 { return fields, nil } fields = propertyMappingRange(result[mappingIndex], 1) sort.Strings(fields) return fields, nil } func propertyMappingRange(v interface{}, depth int) (fields []string) { mapping, ok := v.(map[string]interface{}) if !ok { return } if len(mapping) == 0 { return } for key, value := range 
mapping { if reflect.TypeOf(value).Kind() == reflect.Map { valueMap := value.(map[string]interface{}) if prop, found := valueMap["properties"]; found { subFields := propertyMappingRange(prop, depth+1) for i := range subFields { if depth == 1 { fields = append(fields, subFields[i]) } else { fields = append(fields, key+"."+subFields[i]) } } } else if typ, found := valueMap["type"]; found { if eslike.HitFilter(typ.(string)) { continue } fields = append(fields, key) } } } return } func (os *OpenSearch) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) { search := func(ctx context.Context, indices []string, source interface{}, timeout int, maxShard int) (*elastic.SearchResult, error) { return search(ctx, indices, source, timeout, os.Client) } return eslike.QueryLog(ctx, queryParam, os.Timeout, os.Version, 0, search) } func (os *OpenSearch) QueryFieldValue(indexes []string, field string, query string) ([]string, error) { var values []string source := elastic.NewSearchSource(). 
Size(0) if query != "" { source = source.Query(elastic.NewBoolQuery().Must(elastic.NewQueryStringQuery(query))) } source = source.Aggregation("distinct", elastic.NewTermsAggregation().Field(field).Size(10000)) result, err := search(context.Background(), indexes, source, 0, os.Client) if err != nil { return values, err } agg, found := result.Aggregations.Terms("distinct") if !found { return values, nil } for _, bucket := range agg.Buckets { values = append(values, bucket.Key.(string)) } return values, nil } func (os *OpenSearch) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) { return nil, nil } ================================================ FILE: datasource/postgresql/postgresql.go ================================================ package postgresql import ( "context" "fmt" "regexp" "strings" "time" "github.com/ccfos/nightingale/v6/datasource" "github.com/ccfos/nightingale/v6/pkg/macros" "github.com/ccfos/nightingale/v6/dskit/postgres" "github.com/ccfos/nightingale/v6/dskit/sqlbase" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/ccfos/nightingale/v6/models" "github.com/mitchellh/mapstructure" "github.com/toolkits/pkg/logger" "github.com/ccfos/nightingale/v6/pkg/logx" ) const ( PostgreSQLType = "pgsql" ) var ( regx = `(?i)from\s+((?:"[^"]+"|[a-zA-Z0-9_]+))\.((?:"[^"]+"|[a-zA-Z0-9_]+))\.((?:"[^"]+"|[a-zA-Z0-9_]+))` ) func init() { datasource.RegisterDatasource(PostgreSQLType, new(PostgreSQL)) } type PostgreSQL struct { Shards []*postgres.PostgreSQL `json:"pgsql.shards" mapstructure:"pgsql.shards"` } type QueryParam struct { Ref string `json:"ref" mapstructure:"ref"` Database string `json:"database" mapstructure:"database"` Table string `json:"table" mapstructure:"table"` SQL string `json:"sql" mapstructure:"sql"` Keys datasource.Keys `json:"keys" mapstructure:"keys"` From int64 `json:"from" mapstructure:"from"` To int64 `json:"to" mapstructure:"to"` } func (p *PostgreSQL) InitClient() error { if len(p.Shards) == 0 { 
return fmt.Errorf("not found postgresql addr, please check datasource config") } for _, shard := range p.Shards { if db, err := shard.NewConn(context.TODO(), "postgres"); err != nil { defer sqlbase.CloseDB(db) return err } } return nil } func (p *PostgreSQL) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(PostgreSQL) err := mapstructure.Decode(settings, newest) return newest, err } func (p *PostgreSQL) Validate(ctx context.Context) error { if len(p.Shards) == 0 || len(strings.TrimSpace(p.Shards[0].Addr)) == 0 { return fmt.Errorf("postgresql addr is invalid, please check datasource setting") } if len(strings.TrimSpace(p.Shards[0].User)) == 0 { return fmt.Errorf("postgresql user is invalid, please check datasource setting") } return nil } // Equal compares whether two objects are the same, used for caching func (p *PostgreSQL) Equal(d datasource.Datasource) bool { newest, ok := d.(*PostgreSQL) if !ok { logger.Errorf("unexpected plugin type, expected is postgresql") return false } if len(p.Shards) == 0 || len(newest.Shards) == 0 { return false } oldShard := p.Shards[0] newShard := newest.Shards[0] if oldShard.Addr != newShard.Addr { return false } if oldShard.User != newShard.User { return false } if oldShard.Password != newShard.Password { return false } if oldShard.MaxQueryRows != newShard.MaxQueryRows { return false } if oldShard.Timeout != newShard.Timeout { return false } if oldShard.MaxIdleConns != newShard.MaxIdleConns { return false } if oldShard.MaxOpenConns != newShard.MaxOpenConns { return false } if oldShard.ConnMaxLifetime != newShard.ConnMaxLifetime { return false } return true } func (p *PostgreSQL) ShowDatabases(ctx context.Context) ([]string, error) { return p.Shards[0].ShowDatabases(ctx, "") } func (p *PostgreSQL) ShowTables(ctx context.Context, database string) ([]string, error) { p.Shards[0].DB = database rets, err := p.Shards[0].ShowTables(ctx, "") if err != nil { return nil, err } tables := make([]string, 0, 
len(rets)) for scheme, tabs := range rets { for _, tab := range tabs { tables = append(tables, scheme+"."+tab) } } return tables, nil } func (p *PostgreSQL) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (p *PostgreSQL) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (p *PostgreSQL) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) { return nil, nil } func (p *PostgreSQL) QueryData(ctx context.Context, query interface{}) ([]models.DataResp, error) { postgresqlQueryParam := new(QueryParam) if err := mapstructure.Decode(query, postgresqlQueryParam); err != nil { return nil, err } postgresqlQueryParam.SQL = formatSQLDatabaseNameWithRegex(postgresqlQueryParam.SQL) if strings.Contains(postgresqlQueryParam.SQL, "$__") { var err error postgresqlQueryParam.SQL, err = macros.Macro(postgresqlQueryParam.SQL, postgresqlQueryParam.From, postgresqlQueryParam.To) if err != nil { return nil, err } } if postgresqlQueryParam.Database != "" { p.Shards[0].DB = postgresqlQueryParam.Database } else { db, err := parseDBName(postgresqlQueryParam.SQL) if err != nil { return nil, err } p.Shards[0].DB = db } timeout := p.Shards[0].Timeout if timeout == 0 { timeout = 60 } timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) defer cancel() items, err := p.Shards[0].QueryTimeseries(timeoutCtx, &sqlbase.QueryParam{ Sql: postgresqlQueryParam.SQL, Keys: types.Keys{ ValueKey: postgresqlQueryParam.Keys.ValueKey, LabelKey: postgresqlQueryParam.Keys.LabelKey, TimeKey: postgresqlQueryParam.Keys.TimeKey, }, }) if err != nil { logx.Warningf(ctx, "query:%+v get data err:%v", postgresqlQueryParam, err) return []models.DataResp{}, err } data := make([]models.DataResp, 0) for i := range items { data = append(data, models.DataResp{ Ref: postgresqlQueryParam.Ref, Metric: 
items[i].Metric, Values: items[i].Values, }) } // parse resp to time series data logx.Infof(ctx, "req:%+v keys:%+v \n data:%v", postgresqlQueryParam, postgresqlQueryParam.Keys, data) return data, nil } func (p *PostgreSQL) QueryLog(ctx context.Context, query interface{}) ([]interface{}, int64, error) { postgresqlQueryParam := new(QueryParam) if err := mapstructure.Decode(query, postgresqlQueryParam); err != nil { return nil, 0, err } if postgresqlQueryParam.Database != "" { p.Shards[0].DB = postgresqlQueryParam.Database } else { db, err := parseDBName(postgresqlQueryParam.SQL) if err != nil { return nil, 0, err } p.Shards[0].DB = db } postgresqlQueryParam.SQL = formatSQLDatabaseNameWithRegex(postgresqlQueryParam.SQL) if strings.Contains(postgresqlQueryParam.SQL, "$__") { var err error postgresqlQueryParam.SQL, err = macros.Macro(postgresqlQueryParam.SQL, postgresqlQueryParam.From, postgresqlQueryParam.To) if err != nil { return nil, 0, err } } timeout := p.Shards[0].Timeout if timeout == 0 { timeout = 60 } timeoutCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second) defer cancel() items, err := p.Shards[0].Query(timeoutCtx, &sqlbase.QueryParam{ Sql: postgresqlQueryParam.SQL, }) if err != nil { logx.Warningf(ctx, "query:%+v get data err:%v", postgresqlQueryParam, err) return []interface{}{}, 0, err } logs := make([]interface{}, 0) for i := range items { logs = append(logs, items[i]) } return logs, 0, nil } func (p *PostgreSQL) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) { postgresqlQueryParam := new(QueryParam) if err := mapstructure.Decode(query, postgresqlQueryParam); err != nil { return nil, err } p.Shards[0].DB = postgresqlQueryParam.Database pairs := strings.Split(postgresqlQueryParam.Table, ".") // format: scheme.table_name scheme := "" table := postgresqlQueryParam.Table if len(pairs) == 2 { scheme = pairs[0] table = pairs[1] } return p.Shards[0].DescTable(ctx, scheme, table) } func 
// selectListRe captures the text between SELECT and FROM. The (?s) flag lets
// the select list span several lines. It is matched against lower-cased SQL.
var selectListRe = regexp.MustCompile(`(?s)select\s+(.*?)\s+from`)

// extractColumns returns the column expressions of the first SELECT ... FROM
// clause found in sql.
//
// The statement is lower-cased before matching, so the returned expressions
// are lower-case as well. Each expression is trimmed of surrounding
// whitespace. An error is returned when no SELECT ... FROM clause is found.
func extractColumns(sql string) ([]string, error) {
	matches := selectListRe.FindStringSubmatch(strings.ToLower(sql))
	if len(matches) < 2 {
		return nil, fmt.Errorf("no columns found or invalid SQL syntax")
	}

	columns := splitColumns(matches[1])
	for i, col := range columns {
		columns[i] = strings.TrimSpace(col)
	}

	return columns, nil
}

// splitColumns splits a select list on its top-level commas.
//
// Commas inside parentheses or inside a quoted span do not split. A quoted
// span is closed only by the same quote character that opened it, so a `"`
// inside '...' (or a `'` inside "...") is plain text, and parentheses inside
// a quoted span do not affect nesting. The pieces are returned untrimmed; a
// trailing empty piece is dropped.
func splitColumns(columnsString string) []string {
	var (
		columns []string
		current strings.Builder
		depth   int
		quote   rune // active quote character; 0 while outside a quoted span
	)

	for _, char := range columnsString {
		switch {
		case quote != 0:
			if char == quote {
				quote = 0
			}
		case char == '\'' || char == '"':
			quote = char
		case char == '(':
			depth++
		case char == ')':
			depth--
		case char == ',' && depth == 0:
			columns = append(columns, current.String())
			current.Reset()
			continue
		}
		current.WriteRune(char)
	}

	if current.Len() > 0 {
		columns = append(columns, current.String())
	}

	return columns
}
datasource/prom/prom.go ================================================ package prom type Prometheus struct { PrometheusAddr string `json:"prometheus.addr"` PrometheusBasic struct { PrometheusUser string `json:"prometheus.user"` PrometheusPass string `json:"prometheus.password"` } `json:"prometheus.basic"` Headers map[string]string `json:"prometheus.headers"` PrometheusTimeout int64 `json:"prometheus.timeout"` ClusterName string `json:"prometheus.cluster_name"` WriteAddr string `json:"prometheus.write_addr"` TsdbType string `json:"prometheus.tsdb_type"` InternalAddr string `json:"prometheus.internal_addr"` } ================================================ FILE: datasource/tdengine/tdengine.go ================================================ package tdengine import ( "context" "encoding/json" "fmt" "reflect" "strconv" "strings" "time" "github.com/prometheus/common/model" "github.com/toolkits/pkg/logger" "github.com/ccfos/nightingale/v6/pkg/logx" "github.com/ccfos/nightingale/v6/datasource" td "github.com/ccfos/nightingale/v6/dskit/tdengine" "github.com/ccfos/nightingale/v6/models" "github.com/mitchellh/mapstructure" ) const ( TDEngineType = "tdengine" ) type TDengine struct { td.Tdengine `json:",inline" mapstructure:",squash"` } type TdengineQuery struct { From string `json:"from"` Interval int64 `json:"interval"` Keys Keys `json:"keys"` Query string `json:"query"` // 查询条件 Ref string `json:"ref"` // 变量 To string `json:"to"` } type Keys struct { LabelKey string `json:"labelKey"` // 多个用空格分隔 MetricKey string `json:"metricKey"` // 多个用空格分隔 TimeFormat string `json:"timeFormat"` } func init() { datasource.RegisterDatasource(TDEngineType, new(TDengine)) } func (td *TDengine) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(TDengine) err := mapstructure.Decode(settings, newest) return newest, err } func (td *TDengine) InitClient() error { td.InitCli() return nil } func (td *TDengine) Equal(other datasource.Datasource) bool { otherTD, ok 
:= other.(*TDengine) if !ok { return false } if td.Addr != otherTD.Addr { return false } if td.Basic != nil && otherTD.Basic != nil { if td.Basic.User != otherTD.Basic.User { return false } if td.Basic.Password != otherTD.Basic.Password { return false } } if td.Token != otherTD.Token { return false } if td.Timeout != otherTD.Timeout { return false } if td.DialTimeout != otherTD.DialTimeout { return false } if td.MaxIdleConnsPerHost != otherTD.MaxIdleConnsPerHost { return false } if len(td.Headers) != len(otherTD.Headers) { return false } for k, v := range td.Headers { if otherV, ok := otherTD.Headers[k]; !ok || v != otherV { return false } } return true } func (td *TDengine) Validate(ctx context.Context) (err error) { return nil } func (td *TDengine) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (td *TDengine) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { return nil, nil } func (td *TDengine) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) { return td.Query(ctx, queryParam, 0) } func (td *TDengine) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) { b, err := json.Marshal(queryParam) if err != nil { return nil, 0, err } var q TdengineQuery err = json.Unmarshal(b, &q) if err != nil { return nil, 0, err } if q.Interval == 0 { q.Interval = 60 } if q.From == "" { // 2023-09-21T05:37:30.000Z format to := time.Now().Unix() q.To = time.Unix(to, 0).UTC().Format(time.RFC3339) from := to - q.Interval q.From = time.Unix(from, 0).UTC().Format(time.RFC3339) } replacements := map[string]string{ "$from": fmt.Sprintf("'%s'", q.From), "$to": fmt.Sprintf("'%s'", q.To), "$interval": fmt.Sprintf("%ds", q.Interval), } for key, val := range replacements { q.Query = strings.ReplaceAll(q.Query, key, val) } if !strings.Contains(q.Query, "limit") { q.Query = q.Query + 
" limit 200" } data, err := td.QueryTable(q.Query) if err != nil { return nil, 0, err } return ConvertToTable(data), int64(len(data.Data)), nil } func (td *TDengine) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) { return nil, nil } func (td *TDengine) Query(ctx context.Context, query interface{}, delay ...int) ([]models.DataResp, error) { b, err := json.Marshal(query) if err != nil { return nil, err } var q TdengineQuery err = json.Unmarshal(b, &q) if err != nil { return nil, err } if q.Interval == 0 { q.Interval = 60 } delaySec := 0 if len(delay) > 0 { delaySec = delay[0] } if q.From == "" { // 2023-09-21T05:37:30.000Z format to := time.Now().Unix() - int64(delaySec) q.To = time.Unix(to, 0).UTC().Format(time.RFC3339) from := to - q.Interval q.From = time.Unix(from, 0).UTC().Format(time.RFC3339) } replacements := map[string]string{ "$from": fmt.Sprintf("'%s'", q.From), "$to": fmt.Sprintf("'%s'", q.To), "$interval": fmt.Sprintf("%ds", q.Interval), } for key, val := range replacements { q.Query = strings.ReplaceAll(q.Query, key, val) } data, err := td.QueryTable(q.Query) if err != nil { return nil, err } logx.Debugf(ctx, "tdengine query:%s result: %+v", q.Query, data) return ConvertToTStData(data, q.Keys, q.Ref) } func ConvertToTStData(src td.APIResponse, key Keys, ref string) ([]models.DataResp, error) { metricIdxMap := make(map[string]int) labelIdxMap := make(map[string]int) metricMap := make(map[string]struct{}) if key.MetricKey != "" { metricList := strings.Split(key.MetricKey, " ") for _, metric := range metricList { metricMap[metric] = struct{}{} } } labelMap := make(map[string]string) if key.LabelKey != "" { labelList := strings.Split(key.LabelKey, " ") for _, label := range labelList { labelMap[label] = label } } tsIdx := -1 for colIndex, colData := range src.ColumnMeta { colName := colData[0].(string) var colType string // 处理v2版本数字类型和v3版本字符串类型 switch t := colData[1].(type) { case float64: // v2版本数字类型映射 switch int(t) { case 
1: colType = "BOOL" case 2: colType = "TINYINT" case 3: colType = "SMALLINT" case 4: colType = "INT" case 5: colType = "BIGINT" case 6: colType = "FLOAT" case 7: colType = "DOUBLE" case 8: colType = "BINARY" case 9: colType = "TIMESTAMP" case 10: colType = "NCHAR" default: colType = "UNKNOWN" } case string: // v3版本直接使用字符串类型 colType = t default: logger.Warningf("unexpected column type format: %v", colData[1]) continue } switch colType { case "TIMESTAMP": tsIdx = colIndex case "BIGINT", "INT", "INT UNSIGNED", "BIGINT UNSIGNED", "FLOAT", "DOUBLE", "SMALLINT", "SMALLINT UNSIGNED", "TINYINT", "TINYINT UNSIGNED", "BOOL": if len(metricMap) > 0 { if _, ok := metricMap[colName]; !ok { continue } metricIdxMap[colName] = colIndex } else { metricIdxMap[colName] = colIndex } default: if len(labelMap) > 0 { if _, ok := labelMap[colName]; !ok { continue } labelIdxMap[colName] = colIndex } else { labelIdxMap[colName] = colIndex } } } if tsIdx == -1 { return nil, fmt.Errorf("timestamp column not found, please check your query") } var result []models.DataResp m := make(map[string]*models.DataResp) for _, row := range src.Data { for metricName, metricIdx := range metricIdxMap { value, err := interfaceToFloat64(row[metricIdx]) if err != nil { logger.Warningf("parse %v value failed: %v", row, err) continue } metric := make(model.Metric) for labelName, labelIdx := range labelIdxMap { metric[model.LabelName(labelName)] = model.LabelValue(row[labelIdx].(string)) } metric[model.MetricNameLabel] = model.LabelValue(metricName) // transfer 2022-06-29T05:52:16.603Z to unix timestamp t, err := parseTimeString(row[tsIdx].(string)) if err != nil { logger.Warningf("parse %v timestamp failed: %v", row, err) continue } timestamp := t.Unix() if _, ok := m[metric.String()]; !ok { m[metric.String()] = &models.DataResp{ Metric: metric, Values: [][]float64{ {float64(timestamp), value}, }, } } else { m[metric.String()].Values = append(m[metric.String()].Values, []float64{float64(timestamp), value}) } } } 
for _, v := range m { v.Ref = ref result = append(result, *v) } return result, nil } func interfaceToFloat64(input interface{}) (float64, error) { // Check for the kind of the value first if input == nil { return 0, fmt.Errorf("unsupported type: %T", input) } kind := reflect.TypeOf(input).Kind() switch kind { case reflect.Float64: return input.(float64), nil case reflect.Float32: return float64(input.(float32)), nil case reflect.Int, reflect.Int32, reflect.Int64, reflect.Int8, reflect.Int16: return float64(reflect.ValueOf(input).Int()), nil case reflect.Uint, reflect.Uint32, reflect.Uint64, reflect.Uint8, reflect.Uint16: return float64(reflect.ValueOf(input).Uint()), nil case reflect.String: return strconv.ParseFloat(input.(string), 64) case reflect.Bool: if input.(bool) { return 1.0, nil } return 0.0, nil default: return 0, fmt.Errorf("unsupported type: %T", input) } } func parseTimeString(ts string) (time.Time, error) { // 尝试不同的时间格式 formats := []string{ // 标准格式 time.Layout, // "01/02 03:04:05PM '06 -0700" time.ANSIC, // "Mon Jan _2 15:04:05 2006" time.UnixDate, // "Mon Jan _2 15:04:05 MST 2006" time.RubyDate, // "Mon Jan 02 15:04:05 -0700 2006" time.RFC822, // "02 Jan 06 15:04 MST" time.RFC822Z, // "02 Jan 06 15:04 -0700" time.RFC850, // "Monday, 02-Jan-06 15:04:05 MST" time.RFC1123, // "Mon, 02 Jan 2006 15:04:05 MST" time.RFC1123Z, // "Mon, 02 Jan 2006 15:04:05 -0700" time.RFC3339, // "2006-01-02T15:04:05Z07:00" time.RFC3339Nano, // "2006-01-02T15:04:05.999999999Z07:00" time.Kitchen, // "3:04PM" // 实用时间戳格式 time.Stamp, // "Jan _2 15:04:05" time.StampMilli, // "Jan _2 15:04:05.000" time.StampMicro, // "Jan _2 15:04:05.000000" time.StampNano, // "Jan _2 15:04:05.000000000" time.DateTime, // "2006-01-02 15:04:05" time.DateOnly, // "2006-01-02" time.TimeOnly, // "15:04:05" // 常用自定义格式 "2006-01-02T15:04:05", // 无时区的ISO格式 "2006-01-02T15:04:05.000Z", "2006-01-02T15:04:05Z", "2006-01-02 15:04:05.999999999", // 纳秒 "2006-01-02 15:04:05.999999", // 微秒 "2006-01-02 
15:04:05.999", // 毫秒 "2006/01/02", "20060102", "01/02/2006", "2006年01月02日", "2006年01月02日 15:04:05", } var lastErr error for _, format := range formats { t, err := time.Parse(format, ts) if err == nil { return t, nil } lastErr = err } // 尝试解析 Unix 时间戳 if timestamp, err := strconv.ParseInt(ts, 10, 64); err == nil { switch len(ts) { case 10: // 秒 return time.Unix(timestamp, 0), nil case 13: // 毫秒 return time.Unix(timestamp/1000, (timestamp%1000)*1000000), nil case 16: // 微秒 return time.Unix(timestamp/1000000, (timestamp%1000000)*1000), nil case 19: // 纳秒 return time.Unix(timestamp/1000000000, timestamp%1000000000), nil } } return time.Time{}, fmt.Errorf("failed to parse time with any format: %v", lastErr) } func ConvertToTable(src td.APIResponse) []interface{} { var resp []interface{} for i := range src.Data { cur := make(map[string]interface{}) for j := range src.Data[i] { cur[src.ColumnMeta[j][0].(string)] = src.Data[i][j] } resp = append(resp, cur) } return resp } ================================================ FILE: datasource/victorialogs/victorialogs.go ================================================ package victorialogs import ( "context" "fmt" "net/url" "reflect" "strconv" "time" "github.com/ccfos/nightingale/v6/datasource" "github.com/ccfos/nightingale/v6/dskit/victorialogs" "github.com/ccfos/nightingale/v6/models" "github.com/mitchellh/mapstructure" "github.com/prometheus/common/model" ) const ( VictoriaLogsType = "victorialogs" ) // VictoriaLogs 数据源实现 type VictoriaLogs struct { victorialogs.VictoriaLogs `json:",inline" mapstructure:",squash"` } // Query 查询参数 type Query struct { Query string `json:"query" mapstructure:"query"` // LogsQL 查询语句 Start int64 `json:"start" mapstructure:"start"` // 开始时间(秒) End int64 `json:"end" mapstructure:"end"` // 结束时间(秒) Time int64 `json:"time" mapstructure:"time"` // 单点时间(秒)- 用于告警 Step string `json:"step" mapstructure:"step"` // 步长,如 "1m", "5m" Limit int `json:"limit" mapstructure:"limit"` // 限制返回数量 Ref string `json:"ref" 
mapstructure:"ref"` // 变量引用名(如 A、B) } // IsInstantQuery 判断是否为即时查询(告警场景) func (q *Query) IsInstantQuery() bool { return q.Time > 0 || (q.Start >= 0 && q.Start == q.End) } func init() { datasource.RegisterDatasource(VictoriaLogsType, new(VictoriaLogs)) } // Init 初始化配置 func (vl *VictoriaLogs) Init(settings map[string]interface{}) (datasource.Datasource, error) { newest := new(VictoriaLogs) err := mapstructure.Decode(settings, newest) return newest, err } // InitClient 初始化客户端 func (vl *VictoriaLogs) InitClient() error { if err := vl.InitHTTPClient(); err != nil { return fmt.Errorf("failed to init victorialogs http client: %w", err) } return nil } // Validate 参数验证 func (vl *VictoriaLogs) Validate(ctx context.Context) error { if vl.VictorialogsAddr == "" { return fmt.Errorf("victorialogs.addr is required") } // 验证 URL 格式 _, err := url.Parse(vl.VictorialogsAddr) if err != nil { return fmt.Errorf("invalid victorialogs.addr: %w", err) } // 必须同时提供用户名和密码 if (vl.VictorialogsBasic.VictorialogsUser != "" && vl.VictorialogsBasic.VictorialogsPass == "") || (vl.VictorialogsBasic.VictorialogsUser == "" && vl.VictorialogsBasic.VictorialogsPass != "") { return fmt.Errorf("both username and password must be provided") } // 设置默认值 if vl.Timeout == 0 { vl.Timeout = 10000 // 默认 10 秒 } if vl.MaxQueryRows == 0 { vl.MaxQueryRows = 1000 } return nil } // Equal 验证是否相等 func (vl *VictoriaLogs) Equal(other datasource.Datasource) bool { o, ok := other.(*VictoriaLogs) if !ok { return false } return vl.VictorialogsAddr == o.VictorialogsAddr && vl.VictorialogsBasic.VictorialogsUser == o.VictorialogsBasic.VictorialogsUser && vl.VictorialogsBasic.VictorialogsPass == o.VictorialogsBasic.VictorialogsPass && vl.VictorialogsTls.SkipTlsVerify == o.VictorialogsTls.SkipTlsVerify && vl.Timeout == o.Timeout && reflect.DeepEqual(vl.Headers, o.Headers) } // QueryLog 日志查询 func (vl *VictoriaLogs) QueryLog(ctx context.Context, queryParam interface{}) ([]interface{}, int64, error) { param := new(Query) if err := 
mapstructure.Decode(queryParam, param); err != nil { return nil, 0, fmt.Errorf("decode query param failed: %w", err) } logs, err := vl.Query(ctx, param.Query, param.Start, param.End, param.Limit) if err != nil { return nil, 0, err } // 转换为 interface{} 数组 result := make([]interface{}, len(logs)) for i, log := range logs { result[i] = log } // 调用 HitsLogs 获取真实的 total total, err := vl.HitsLogs(ctx, param.Query, param.Start, param.End) if err != nil { // 如果获取 total 失败,使用当前结果数量 total = int64(len(logs)) } return result, total, nil } // QueryData 指标数据查询 func (vl *VictoriaLogs) QueryData(ctx context.Context, queryParam interface{}) ([]models.DataResp, error) { param := new(Query) if err := mapstructure.Decode(queryParam, param); err != nil { return nil, fmt.Errorf("decode query param failed: %w", err) } // 判断使用哪个 API if param.IsInstantQuery() { return vl.queryDataInstant(ctx, param) } return vl.queryDataRange(ctx, param) } // queryDataInstant 告警场景,调用 /select/logsql/stats_query func (vl *VictoriaLogs) queryDataInstant(ctx context.Context, param *Query) ([]models.DataResp, error) { queryTime := param.Time if queryTime == 0 { queryTime = param.End // 如果没有 time,使用 end 作为查询时间点 } if queryTime == 0 { queryTime = time.Now().Unix() } result, err := vl.StatsQuery(ctx, param.Query, queryTime) if err != nil { return nil, err } return convertPrometheusInstantToDataResp(result, param.Ref), nil } // queryDataRange 看图场景,调用 /select/logsql/stats_query_range func (vl *VictoriaLogs) queryDataRange(ctx context.Context, param *Query) ([]models.DataResp, error) { step := param.Step if step == "" { // 根据时间范围计算合适的步长 duration := param.End - param.Start if duration <= 3600 { step = "1m" // 1 小时内,1 分钟步长 } else if duration <= 86400 { step = "5m" // 1 天内,5 分钟步长 } else { step = "1h" // 超过 1 天,1 小时步长 } } result, err := vl.StatsQueryRange(ctx, param.Query, param.Start, param.End, step) if err != nil { return nil, err } return convertPrometheusRangeToDataResp(result, param.Ref), nil } // 
convertPrometheusInstantToDataResp 将 Prometheus Instant Query 格式转换为 DataResp func convertPrometheusInstantToDataResp(resp *victorialogs.PrometheusResponse, ref string) []models.DataResp { var dataResps []models.DataResp for _, item := range resp.Data.Result { dataResp := models.DataResp{ Ref: ref, } // 转换 Metric dataResp.Metric = make(model.Metric) for k, v := range item.Metric { dataResp.Metric[model.LabelName(k)] = model.LabelValue(v) } if len(item.Value) == 2 { // [timestamp, value] timestamp := item.Value[0].(float64) value, _ := strconv.ParseFloat(item.Value[1].(string), 64) dataResp.Values = [][]float64{ {timestamp, value}, } } dataResps = append(dataResps, dataResp) } return dataResps } // convertPrometheusRangeToDataResp 将 Prometheus Range Query 格式转换为 DataResp func convertPrometheusRangeToDataResp(resp *victorialogs.PrometheusResponse, ref string) []models.DataResp { var dataResps []models.DataResp for _, item := range resp.Data.Result { dataResp := models.DataResp{ Ref: ref, } // 转换 Metric dataResp.Metric = make(model.Metric) for k, v := range item.Metric { dataResp.Metric[model.LabelName(k)] = model.LabelValue(v) } var values [][]float64 for _, v := range item.Values { if len(v) == 2 { timestamp := v[0].(float64) value, _ := strconv.ParseFloat(v[1].(string), 64) values = append(values, []float64{timestamp, value}) } } dataResp.Values = values dataResps = append(dataResps, dataResp) } return dataResps } // MakeLogQuery 构造日志查询参数 func (vl *VictoriaLogs) MakeLogQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { q := &Query{ Start: start, End: end, Limit: 1000, } // 如果 query 是字符串,直接使用 if queryStr, ok := query.(string); ok { q.Query = queryStr } else if queryMap, ok := query.(map[string]interface{}); ok { // 如果是 map,尝试提取 query 字段 if qStr, exists := queryMap["query"]; exists { q.Query = fmt.Sprintf("%v", qStr) } if limit, exists := queryMap["limit"]; exists { if limitInt, ok := limit.(int); ok { q.Limit = 
limitInt } else if limitFloat, ok := limit.(float64); ok { q.Limit = int(limitFloat) } } } return q, nil } // MakeTSQuery 构造时序查询参数 func (vl *VictoriaLogs) MakeTSQuery(ctx context.Context, query interface{}, eventTags []string, start, end int64) (interface{}, error) { q := &Query{ Start: start, End: end, } // 如果 query 是字符串,直接使用 if queryStr, ok := query.(string); ok { q.Query = queryStr } else if queryMap, ok := query.(map[string]interface{}); ok { // 如果是 map,提取相关字段 if qStr, exists := queryMap["query"]; exists { q.Query = fmt.Sprintf("%v", qStr) } if step, exists := queryMap["step"]; exists { q.Step = fmt.Sprintf("%v", step) } } return q, nil } // QueryMapData 用于告警事件生成时获取额外数据 func (vl *VictoriaLogs) QueryMapData(ctx context.Context, query interface{}) ([]map[string]string, error) { param := new(Query) if err := mapstructure.Decode(query, param); err != nil { return nil, err } // 扩大查询范围,解决时间滞后问题 if param.End > 0 && param.Start > 0 { param.Start = param.Start - 30 } // 限制只取 1 条 param.Limit = 1 logs, _, err := vl.QueryLog(ctx, param) if err != nil { return nil, err } var result []map[string]string for _, log := range logs { if logMap, ok := log.(map[string]interface{}); ok { strMap := make(map[string]string) for k, v := range logMap { strMap[k] = fmt.Sprintf("%v", v) } result = append(result, strMap) break // 只取第一条 } } return result, nil } ================================================ FILE: doc/README.bak.md ================================================

nightingale - cloud native monitoring

GitHub latest release Docs Docker pulls GitHub Repo stars GitHub Repo issues GitHub Repo issues closed GitHub forks GitHub contributors GitHub contributors License

All-in-one 的开源观测平台
开箱即用,集数据采集、可视化、监控告警于一体
推荐升级您的 Prometheus + AlertManager + Grafana + ELK + Jaeger 组合方案到夜莺!

[English](./README_en.md) | [中文](./README.md) ## 功能和特点 - **开箱即用** - 支持 Docker、Helm Chart、云服务等多种部署方式,集数据采集、监控告警、可视化为一体,内置多种监控仪表盘、快捷视图、告警规则模板,导入即可快速使用,**大幅降低云原生监控系统的建设成本、学习成本、使用成本**; - **专业告警** - 可视化的告警配置和管理,支持丰富的告警规则,提供屏蔽规则、订阅规则的配置能力,支持告警多种送达渠道,支持告警自愈、告警事件管理等; - **推荐您使用夜莺的同时,无缝搭配[FlashDuty](https://flashcat.cloud/product/flashcat-duty/),实现告警聚合收敛、认领、升级、排班、协同,让告警的触达既高效,又确保告警处理不遗漏、做到件件有回响**。 - **云原生** - 以交钥匙的方式快速构建企业级的云原生监控体系,支持 [Categraf](https://github.com/flashcatcloud/categraf)、Telegraf、Grafana-agent 等多种采集器,支持 Prometheus、VictoriaMetrics、M3DB、ElasticSearch、Jaeger 等多种数据源,兼容支持导入 Grafana 仪表盘,**与云原生生态无缝集成**; - **高性能 高可用** - 得益于夜莺的多数据源管理引擎,和夜莺引擎侧优秀的架构设计,借助于高性能时序库,可以满足数亿时间线的采集、存储、告警分析场景,节省大量成本; - 夜莺监控组件均可水平扩展,无单点,已在上千家企业部署落地,经受了严苛的生产实践检验。众多互联网头部公司,夜莺集群机器达百台,处理数亿级时间线,重度使用夜莺监控; - **灵活扩展 中心化管理** - 夜莺监控,可部署在 1 核 1G 的云主机,可在上百台机器集群化部署,可运行在 K8s 中;也可将时序库、告警引擎等组件下沉到各机房、各 Region,兼顾边缘部署和中心化统一管理,**解决数据割裂,缺乏统一视图的难题**; - **开放社区** - 托管于[中国计算机学会开源发展委员会](https://www.ccf.org.cn/kyfzwyh/),有[快猫星云](https://flashcat.cloud)和众多公司的持续投入,和数千名社区用户的积极参与,以及夜莺监控项目清晰明确的定位,都保证了夜莺开源社区健康、长久的发展。活跃、专业的社区用户也在持续迭代和沉淀更多的最佳实践于产品中; ## 使用场景 1. **如果您希望在一个平台中,统一管理和查看 Metrics、Logging、Tracing 数据,推荐你使用夜莺**: - 请参考阅读:[不止于监控,夜莺 V6 全新升级为开源观测平台](http://flashcat.cloud/blog/nightingale-v6-release/) 2. **如果您在使用 Prometheus 过程中,有以下的一个或者多个需求场景,推荐您无缝升级到夜莺**: - Prometheus、Alertmanager、Grafana 等多个系统较为割裂,缺乏统一视图,无法开箱即用; - 通过修改配置文件来管理 Prometheus、Alertmanager 的方式,学习曲线大,协同有难度; - 数据量过大而无法扩展您的 Prometheus 集群; - 生产环境运行多套 Prometheus 集群,面临管理和使用成本高的问题; 3. **如果您在使用 Zabbix,有以下的场景,推荐您升级到夜莺**: - 监控的数据量太大,希望有更好的扩展解决方案; - 学习曲线高,多人多团队模式下,希望有更好的协同使用效率; - 微服务和云原生架构下,监控数据的生命周期多变、监控数据维度基数高,Zabbix 数据模型不易适配; - 了解更多Zabbix和夜莺监控的对比,推荐您进一步阅读[Zabbix 和夜莺监控选型对比](https://flashcat.cloud/blog/zabbx-vs-nightingale/) 4. 
**如果您在使用 [Open-Falcon](https://github.com/open-falcon/falcon-plus),我们推荐您升级到夜莺:** - 关于 Open-Falcon 和夜莺的详细介绍,请参考阅读:[云原生监控的十个特点和趋势](http://flashcat.cloud/blog/10-trends-of-cloudnative-monitoring/) - 监控系统和可观测平台的区别,请参考阅读:[从监控系统到可观测平台,Gap有多大 ](https://flashcat.cloud/blog/gap-of-monitoring-to-o11y/) 5. **我们推荐您使用 [Categraf](https://github.com/flashcatcloud/categraf) 作为首选的监控数据采集器**: - [Categraf](https://github.com/flashcatcloud/categraf) 是夜莺监控的默认采集器,采用开放插件机制和 All-in-one 的设计理念,同时支持 metric、log、trace、event 的采集。Categraf 不仅可以采集 CPU、内存、网络等系统层面的指标,也集成了众多开源组件的采集能力,支持K8s生态。Categraf 内置了对应的仪表盘和告警规则,开箱即用。 ## 文档 [English Doc](https://n9e.github.io/) | [中文文档](https://flashcat.cloud/docs/) ## 产品示意图 https://user-images.githubusercontent.com/792850/216888712-2565fcea-9df5-47bd-a49e-d60af9bd76e8.mp4 ## 夜莺架构 夜莺监控可以接收各种采集器上报的监控数据(比如 [Categraf](https://github.com/flashcatcloud/categraf)、telegraf、grafana-agent、Prometheus),并写入多种流行的时序数据库中(可以支持Prometheus、M3DB、VictoriaMetrics、Thanos、TDEngine等),提供告警规则、屏蔽规则、订阅规则的配置能力,提供监控数据的查看能力,提供告警自愈机制(告警触发之后自动回调某个webhook地址或者执行某个脚本),提供历史告警事件的存储管理、分组查看的能力。 ### 中心汇聚式部署方案 ![中心汇聚式部署方案](https://download.flashcat.cloud/ulric/20230327133406.png) 夜莺只有一个模块,就是 n9e,可以部署多个 n9e 实例组成集群,n9e 依赖 2 个存储,数据库、Redis,数据库可以使用 MySQL 或 Postgres,自己按需选用。 n9e 提供的是 HTTP 接口,前面负载均衡可以是 4 层的,也可以是 7 层的。一般就选用 Nginx 就可以了。 n9e 这个模块接收到数据之后,需要转发给后端的时序库,相关配置是: ```toml [Pushgw] LabelRewrite = true [[Pushgw.Writers]] Url = "http://127.0.0.1:9090/api/v1/write" ``` > 注意:虽然数据源可以在页面配置了,但是上报转发链路,还是需要在配置文件指定。 所有机房的 agent( 比如 Categraf、Telegraf、 Grafana-agent、Datadog-agent ),都直接推数据给 n9e,这个架构最为简单,维护成本最低。当然,前提是要求机房之间网络链路比较好,一般有专线。如果网络链路不好,则要使用下面的部署方式了。 ### 边缘下沉式混杂部署方案 ![边缘下沉式混杂部署方案](https://download.flashcat.cloud/ulric/20230327135615.png) 这个图尝试解释 3 种不同的情形,比如 A 机房和中心网络链路很好,Categraf 可以直接汇报数据给中心 n9e 模块,另一个机房网络链路不好,就需要把时序库下沉部署,时序库下沉了,对应的告警引擎和转发网关也都要跟随下沉,这样数据不会跨机房传输,比较稳定。但是心跳还是需要往中心心跳,要不然在对象列表里看不到机器的 CPU、内存使用率。还有的时候,可能是接入的一个已有的 Prometheus,数据采集没有走 Categraf,那此时只需要把 Prometheus 
作为数据源接入夜莺即可,可以在夜莺里看图、配告警规则,但是就是在对象列表里看不到,也不能使用告警自愈的功能,问题也不大,核心功能都不受影响。 边缘机房,下沉部署时序库、告警引擎、转发网关的时候,要注意,告警引擎需要依赖数据库,因为要同步告警规则,转发网关也要依赖数据库,因为要注册对象到数据库里去,需要打通相关网络,告警引擎和转发网关都不用Redis,所以无需为 Redis 打通网络。 ### VictoriaMetrics 集群架构 如果单机版本的时序数据库(比如 Prometheus) 性能有瓶颈或容灾较差,我们推荐使用 [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics),VictoriaMetrics 架构较为简单,性能优异,易于部署和运维,架构图如上。VictoriaMetrics 更详尽的文档,还请参考其[官网](https://victoriametrics.com/)。 ## 夜莺社区 开源项目要更有生命力,离不开开放的治理架构和源源不断的开发者和用户共同参与,我们致力于建立开放、中立的开源治理架构,吸纳更多来自企业、高校等各方面对云原生监控感兴趣、有热情的开发者,一起打造有活力的夜莺开源社区。关于《夜莺开源项目和社区治理架构(草案)》,请查阅 [COMMUNITY GOVERNANCE](./doc/community-governance.md). **我们欢迎您以各种方式参与到夜莺开源项目和开源社区中来,工作包括但不限于**: - 补充和完善文档 => [n9e.github.io](https://n9e.github.io/) - 分享您在使用夜莺监控过程中的最佳实践和经验心得 => [文章分享](https://flashcat.cloud/docs/content/flashcat-monitor/nightingale/share/) - 提交产品建议 => [github issue](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md) - 提交代码,让夜莺监控更快、更稳、更好用 => [github pull request](https://github.com/ccfos/nightingale/pulls) **尊重、认可和记录每一位贡献者的工作**是夜莺开源社区的第一指导原则,我们提倡**高效的提问**,这既是对开发者时间的尊重,也是对整个社区知识沉淀的贡献: - 提问之前请先查阅 [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq) - 我们使用[论坛](https://answer.flashcat.cloud/)进行交流,有问题可以到这里搜索、提问 ## Who is using Nightingale 您可以通过在 **[Who is Using Nightingale](https://github.com/ccfos/nightingale/issues/897)** 登记您的使用情况,分享您的使用经验。 ## Stargazers over time [![Stargazers over time](https://starchart.cc/ccfos/nightingale.svg)](https://starchart.cc/ccfos/nightingale) ## Contributors ## License [Apache License V2.0](https://github.com/ccfos/nightingale/blob/main/LICENSE) ## 加入交流群 ================================================ FILE: doc/active-contributors.md ================================================ ## Active Contributors - [xiaoziv](https://github.com/xiaoziv) - [tanxiao1990](https://github.com/tanxiao1990) - [bbaobelief](https://github.com/bbaobelief) - [freedomkk-qfeng](https://github.com/freedomkk-qfeng)
- [lsy1990](https://github.com/lsy1990) ================================================ FILE: doc/committers.md ================================================ ## Committers - [YeningQin](https://github.com/710leo) - [FeiKong](https://github.com/kongfei605) - [XiaqingDai](https://github.com/jsers) ================================================ FILE: doc/community-governance.md ================================================ [夜莺监控](https://github.com/ccfos/nightingale "夜莺监控")是一款开源云原生监控系统,由滴滴设计开发,2020 年 3 月份开源之后,凭借其优秀的产品设计、灵活性架构和明确清晰的定位,夜莺监控快速发展为国内最活跃的企业级云原生监控方案。[截止当前](具体指2022年8月 "截止当前"),在 [Github](https://github.com/ccfos/nightingale "Github") 上已经迭代发布了 **70** 多个版本,获得了 **5K** 多个 Star,**80** 多位代码贡献者。快速的迭代,也让夜莺监控的用户群越来越大,涉及各行各业。 更进一步,夜莺监控于 2022 年 5 月 11 日,正式捐赠予中国计算机学会开源发展委员会 [CCF ODC](https://www.ccf.org.cn/kyfzwyh/ "CCF ODC"),为 CCF ODC 成立后接受捐赠的第一个开源项目。 开源项目要更有生命力,离不开开放的治理架构和源源不断的开发者共同参与。夜莺监控项目加入 CCF 开源大家庭后,能在计算机学会的支持和带动下,进一步结合云原生、可观测、国产化等多个技术发展的需求,建立开放、中立的开源治理架构,打造更专业、有活力的开发者社区。 **今天,我们郑重发布夜莺监控开源社区治理架构,并公示相关的任命和社区荣誉,期待开源的道路上,一起同行。** # 夜莺监控开源社区架构 ### User|用户 > 欢迎任何个人、公司以及组织,使用夜莺监控,并积极的反馈 bug、提交功能需求、以及相互帮助,我们推荐使用 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 来跟踪 bug 和管理需求。 社区用户,可以通过在 **[Who is Using Nightingale](https://github.com/ccfos/nightingale/issues/897 "Who is Using Nightingale")** 登记您的使用情况,并分享您使用夜莺监控的经验,将会自动进入 **[END USERS](https://github.com/ccfos/nightingale/blob/main/doc/end-users.md "END USERS")** 文件列表,并获得社区的 **VIP Support**。 ### Contributor|贡献者 > 欢迎每一位用户,包括但不限于以下方式参与到夜莺开源社区并做出贡献: 1. 在 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 中积极参与讨论,参与社区活动; 1. 提交代码补丁; 1. 翻译、修订、补充和完善[文档](https://n9e.github.io "文档"); 1. 分享夜莺监控的使用经验,积极布道; 1. 
提交建议 / 批评; 年度累计向 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 提交 **5** 个PR(被合并),或者因为其他贡献被**项目管委会**一致认可,将会自动进入到 **[ACTIVE CONTRIBUTORS](https://github.com/ccfos/nightingale/blob/main/doc/active-contributors.md "ACTIVE CONTRIBUTORS")** 列表,并获得夜莺开源社区颁发的证书,享有夜莺开源社区一定的权益和福利。 所有向 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 提交过PR(被合并),或者做出过重要贡献的 Contributor,都会被永久记载于 [CONTRIBUTORS](https://github.com/ccfos/nightingale/blob/main/doc/contributors.md "CONTRIBUTORS") 列表。 ### Committer|提交者 > Committer 是指拥有 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 代码仓库写操作权限的贡献者。原则上 Committer 能够自主决策某个代码补丁是否可以合入到夜莺代码仓库,但是项目管委会拥有最终的决策权。 Committer 承担以下一个或多个职责: - 积极回应 Issues; - Review PRs; - 参加开发者例行会议,积极讨论项目规划和技术方案; - 代表夜莺开源社区出席相关技术会议并做演讲; Committer 记录并公示于 **[COMMITTERS](https://github.com/ccfos/nightingale/blob/main/doc/committers.md "COMMITTERS")** 列表,并获得夜莺开源社区颁发的证书,以及享有夜莺开源社区的各种权益和福利。 ### PMC|项目管委会 > PMC(项目管委会)作为一个实体,来管理和领导夜莺项目,为整个项目的发展全权负责。项目管委会相关内容记录并公示于文件[PMC](https://github.com/ccfos/nightingale/blob/main/doc/pmc.md "PMC"). - 项目管委会成员(PMC Member),从 Contributor 或者 Committer 中选举产生,他们拥有 [CCFOS/NIGHTINGALE](https://github.com/ccfos/nightingale "CCFOS/NIGHTINGALE") 代码仓库的写操作权限,拥有 Nightingale 社区相关事务的投票权、以及提名 Committer 候选人的权利。 - 项目管委会主席(PMC Chair),从项目管委会成员中投票产生。管委会主席是 **[CCF ODC](https://www.ccf.org.cn/kyfzwyh/ "CCF ODC")** 和项目管委会之间的沟通桥梁,履行特定的项目管理职责。 ## Communication|沟通机制 1. 我们推荐使用邮件列表来反馈建议(待发布); 2. 我们推荐使用 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 跟踪 bug 和管理需求; 3. 我们推荐使用 [Github Milestone](https://github.com/ccfos/nightingale/milestones "Github Milestone") 来管理项目进度和规划; 4. 我们推荐使用腾讯会议来定期召开项目例会(会议 ID 待发布); ## Documentation|文档 1. 我们推荐使用 [Github Pages](https://n9e.github.io "Github Pages") 来沉淀文档; 2. 我们推荐使用 [Gitlink Wiki](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq "Gitlink Wiki") 来沉淀 FAQ; ## Operation|运营机制 1. 
我们定期组织用户、贡献者、项目管委会成员之间的沟通会议,讨论项目开发的目标、方案、进度,以及讨论相关需求的合理性、优先级等议题; 2. 我们定期组织 meetup (线上&线下),创造良好的用户交流分享环境,并沉淀相关内容到文档站点; 3. 我们定期组织夜莺开发者大会,分享 [best user story](https://n9e.github.io/docs/prologue/share/ "best user story")、同步年度开发目标和计划、讨论新技术方向等; ## Philosophy|社区指导原则 >尊重、认可和记录每一位贡献者的工作。 ## 关于提问的原则 按照**尊重、认可、记录每一位贡献者的工作**原则,我们提倡**高效的提问**,这既是对开发者时间的尊重,也是对整个社区的知识沉淀的贡献: 1. 提问之前请先查阅 [FAQ](https://www.gitlink.org.cn/ccfos/nightingale/wiki/faq "FAQ") ; 2. 提问之前请先搜索 [Github Issues](https://github.com/ccfos/nightingale/issues "Github Issue"); 3. 我们优先推荐通过提交 [Github Issue](https://github.com/ccfos/nightingale/issues "Github Issue") 来提问,如果[有问题点击这里](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Fbug&template=bug_report.yml "有问题点击这里") | [有需求建议点击这里](https://github.com/ccfos/nightingale/issues/new?assignees=&labels=kind%2Ffeature&template=enhancement.md "有需求建议点击这里"); ================================================ FILE: doc/contributors.md ================================================ ## Contributors ================================================ FILE: doc/end-users.md ================================================ ## End Users - [中移动](https://github.com/ccfos/nightingale/issues/897#issuecomment-1086573166) - [inke](https://github.com/ccfos/nightingale/issues/897#issuecomment-1099840636) - [方正证券](https://github.com/ccfos/nightingale/issues/897#issuecomment-1110492461) ================================================ FILE: doc/pmc.md ================================================ ### PMC Chair - [laiwei](https://github.com/laiwei) ### PMC Co-Chair - [UlricQin](https://github.com/UlricQin) ### PMC Member ================================================ FILE: doc/server-dash.json ================================================ { "name": "夜莺大盘", "tags": "", "configs": { "var": [], "panels": [ { "targets": [ { "refId": "A", "expr": "rate(n9e_server_samples_received_total[1m])" } ], "name": "每秒接收的数据点个数", "options": { "tooltip": { "mode": "all", "sort": 
"none" }, "legend": { "displayMode": "hidden" }, "standardOptions": {}, "thresholds": {} }, "custom": { "drawStyle": "lines", "lineInterpolation": "smooth", "fillOpacity": 0.5, "stack": "off" }, "version": "2.0.0", "type": "timeseries", "layout": { "h": 4, "w": 12, "x": 0, "y": 0, "i": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4", "isResizable": true }, "id": "53fcb9dc-23f9-41e0-bc5e-121eed14c3a4" }, { "targets": [ { "refId": "A", "expr": "rate(n9e_server_alerts_total[10m])" } ], "name": "每秒产生的告警事件个数", "options": { "tooltip": { "mode": "all", "sort": "none" }, "legend": { "displayMode": "hidden" }, "standardOptions": {}, "thresholds": {} }, "custom": { "drawStyle": "lines", "lineInterpolation": "smooth", "fillOpacity": 0.5, "stack": "off" }, "version": "2.0.0", "type": "timeseries", "layout": { "h": 4, "w": 12, "x": 12, "y": 0, "i": "47fc6252-9cc8-4b53-8e27-0c5c59a47269", "isResizable": true }, "id": "f70dcb8b-b58b-4ef9-9e48-f230d9e17140" }, { "targets": [ { "refId": "A", "expr": "n9e_server_alert_queue_size" } ], "name": "告警事件内存队列长度", "options": { "tooltip": { "mode": "all", "sort": "none" }, "legend": { "displayMode": "hidden" }, "standardOptions": {}, "thresholds": {} }, "custom": { "drawStyle": "lines", "lineInterpolation": "smooth", "fillOpacity": 0.5, "stack": "off" }, "version": "2.0.0", "type": "timeseries", "layout": { "h": 4, "w": 12, "x": 0, "y": 4, "i": "ad1af16c-de0c-45f4-8875-cea4e85d51d0", "isResizable": true }, "id": "caf23e58-d907-42b0-9ed6-722c8c6f3c5f" }, { "targets": [ { "refId": "A", "expr": "n9e_server_http_request_duration_seconds_sum/n9e_server_http_request_duration_seconds_count" } ], "name": "数据接收接口平均响应时间(单位:秒)", "options": { "tooltip": { "mode": "all", "sort": "desc" }, "legend": { "displayMode": "hidden" }, "standardOptions": {}, "thresholds": {} }, "custom": { "drawStyle": "lines", "lineInterpolation": "smooth", "fillOpacity": 0.5, "stack": "normal" }, "version": "2.0.0", "type": "timeseries", "layout": { "h": 4, "w": 12, "x": 12, "y": 4, 
"i": "64c3abc2-404c-4462-a82f-c109a21dac91", "isResizable": true }, "id": "6b8d2db1-efca-4b9e-b429-57a9d2272bc5" }, { "targets": [ { "refId": "A", "expr": "n9e_server_sample_queue_size" } ], "name": "内存数据队列长度", "options": { "tooltip": { "mode": "all", "sort": "desc" }, "legend": { "displayMode": "hidden" }, "standardOptions": {}, "thresholds": {} }, "custom": { "drawStyle": "lines", "lineInterpolation": "smooth", "fillOpacity": 0.5, "stack": "off" }, "version": "2.0.0", "type": "timeseries", "layout": { "h": 4, "w": 12, "x": 0, "y": 8, "i": "1c7da942-58c2-40dc-b42f-983e4a35b89b", "isResizable": true }, "id": "bd41677d-40d3-482e-bb6e-fbd25df46d87" }, { "targets": [ { "refId": "A", "expr": "avg(n9e_server_forward_duration_seconds_sum/n9e_server_forward_duration_seconds_count)" } ], "name": "数据发往TSDB平均耗时(单位:秒)", "options": { "tooltip": { "mode": "all", "sort": "desc" }, "legend": { "displayMode": "hidden" }, "standardOptions": { "decimals": 8 }, "thresholds": {} }, "custom": { "drawStyle": "lines", "lineInterpolation": "smooth", "fillOpacity": 0.5, "stack": "normal" }, "version": "2.0.0", "type": "timeseries", "layout": { "h": 4, "w": 12, "x": 12, "y": 8, "i": "eed94a0b-954f-48ac-82e5-a2eada1c8a3d", "isResizable": true }, "id": "c8642e72-f384-46a5-8410-1e6be2953c3c" } ], "version": "2.0.0" } } ================================================ FILE: docker/.dockerignore ================================================ compose-host-network compose-postgres compose-bridge initsql build.sh ================================================ FILE: docker/Dockerfile.goreleaser ================================================ FROM --platform=$TARGETPLATFORM python:3-slim WORKDIR /app ADD n9e /app/ ADD etc /app/etc/ ADD integrations /app/integrations/ RUN pip install requests Jinja2 EXPOSE 17000 CMD ["/app/n9e", "-h"] ================================================ FILE: docker/Dockerfile.goreleaser.arm64 ================================================ FROM 
--platform=$TARGETPLATFORM python:3-slim WORKDIR /app ADD n9e /app/ ADD etc /app/etc/ ADD integrations /app/integrations/ EXPOSE 17000 CMD ["/app/n9e", "-h"] ================================================ FILE: docker/build.sh ================================================ #!/bin/sh if [ $# -ne 1 ]; then echo "$0 " exit 0 fi tag=$1 echo "tag: ${tag}" rm -rf n9e pub cp ../n9e . docker build -t nightingale:${tag} . docker tag nightingale:${tag} ulric2019/nightingale:${tag} docker push ulric2019/nightingale:${tag} rm -rf n9e pub ================================================ FILE: docker/compose-bridge/docker-compose.yaml ================================================ networks: nightingale: driver: bridge services: mysql: image: "mysql:8" container_name: mysql hostname: mysql restart: always environment: TZ: Asia/Shanghai MYSQL_ROOT_PASSWORD: 1234 volumes: - ./mysqldata:/var/lib/mysql/ - ../initsql:/docker-entrypoint-initdb.d/ - ./etc-mysql/my.cnf:/etc/my.cnf networks: - nightingale ports: - "3306:3306" redis: image: "redis:6.2" container_name: redis hostname: redis restart: always environment: TZ: Asia/Shanghai networks: - nightingale ports: - "6379:6379" # prometheus: # image: prom/prometheus # container_name: prometheus # hostname: prometheus # restart: always # environment: # TZ: Asia/Shanghai # volumes: # - ./etc-prometheus:/etc/prometheus # command: # - "--config.file=/etc/prometheus/prometheus.yml" # - "--storage.tsdb.path=/prometheus" # - "--web.console.libraries=/usr/share/prometheus/console_libraries" # - "--web.console.templates=/usr/share/prometheus/consoles" # - "--enable-feature=remote-write-receiver" # - "--query.lookback-delta=2m" # networks: # - nightingale # ports: # - "9090:9090" victoriametrics: image: victoriametrics/victoria-metrics:v1.79.12 container_name: victoriametrics hostname: victoriametrics restart: always environment: TZ: Asia/Shanghai ports: - "8428:8428" networks: - nightingale command: - "--loggerTimezone=Asia/Shanghai" 
nightingale: image: flashcatcloud/nightingale:latest container_name: nightingale hostname: nightingale restart: always environment: GIN_MODE: release TZ: Asia/Shanghai WAIT_HOSTS: mysql:3306, redis:6379 volumes: - ./etc-nightingale:/app/etc networks: - nightingale ports: - "17000:17000" - "20090:20090" depends_on: - mysql - redis - victoriametrics command: - /app/n9e categraf: image: "flashcatcloud/categraf:latest" container_name: "categraf" hostname: "categraf01" restart: always environment: TZ: Asia/Shanghai HOST_PROC: /hostfs/proc HOST_SYS: /hostfs/sys HOST_MOUNT_PREFIX: /hostfs WAIT_HOSTS: nightingale:17000, nightingale:20090 volumes: - ./etc-categraf:/etc/categraf/conf - /:/hostfs networks: - nightingale depends_on: - nightingale ================================================ FILE: docker/compose-bridge/etc-categraf/config.toml ================================================ [global] # whether print configs print_configs = false # add label(agent_hostname) to series # "" -> auto detect hostname # "xx" -> use specified string xx # "$hostname" -> auto detect hostname # "$ip" -> auto detect ip # "$hostname-$ip" -> auto detect hostname and ip to replace the vars hostname = "$HOSTNAME" # will not add label(agent_hostname) if true omit_hostname = false # s | ms precision = "ms" # global collect interval interval = 15 # [global.labels] # source="categraf" # region = "shanghai" # env = "localhost" [writer_opt] # default: 2000 batch = 2000 # channel(as queue) size chan_size = 10000 [[writers]] url = "http://nightingale:17000/prometheus/v1/write" # Basic auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [http] enable = false address = ":9100" print_access = false run_mode = "release" [heartbeat] enable = true # report os version cpu.util mem.util metadata url = "http://nightingale:17000/v1/n9e/heartbeat" # interval, unit: s interval = 10 # Basic 
auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" ## Optional headers # headers = ["X-From", "categraf", "X-Xyz", "abc"] # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [ibex] enable = true ## ibex flush interval interval = "1000ms" ## n9e ibex server rpc address servers = ["nightingale:20090"] ## temp script dir meta_dir = "./meta" ================================================ FILE: docker/compose-bridge/etc-categraf/input.cpu/cpu.toml ================================================ # # collect interval # interval = 15 # # whether collect per cpu # collect_per_cpu = false ================================================ FILE: docker/compose-bridge/etc-categraf/input.disk/disk.toml ================================================ # # collect interval # interval = 15 # # By default stats will be gathered for all mount points. # # Set mount_points will restrict the stats to only the specified mount points. mount_points = ["/"] # Ignore mount points by filesystem type. ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs", "nsfs"] ================================================ FILE: docker/compose-bridge/etc-categraf/input.diskio/diskio.toml ================================================ # # collect interval # interval = 15 # # By default, categraf will gather stats for all devices including disk partitions. # # Setting devices will restrict the stats to the specified devices. 
# devices = ["sda", "sdb", "vd*"] ================================================ FILE: docker/compose-bridge/etc-categraf/input.kernel/kernel.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-bridge/etc-categraf/input.mem/mem.toml ================================================ # # collect interval # interval = 15 # # whether collect platform specified metrics collect_platform_fields = true ================================================ FILE: docker/compose-bridge/etc-categraf/input.mysql/mysql.toml ================================================ [[instances]] address = "mysql:3306" username = "root" password = "1234" # # set tls=custom to enable tls # parameters = "tls=false" # extra_status_metrics = true # extra_innodb_metrics = false # gather_processlist_processes_by_state = false # gather_processlist_processes_by_user = false # gather_schema_size = true # gather_table_size = false # gather_system_table_size = false # gather_slave_status = true # # timeout # timeout_seconds = 3 # # interval = global.interval * interval_times # interval_times = 1 # important! 
use global unique string to specify instance labels = { instance="docker-compose-mysql" } ## Optional TLS Config # use_tls = false # tls_min_version = "1.2" # tls_ca = "/etc/categraf/ca.pem" # tls_cert = "/etc/categraf/cert.pem" # tls_key = "/etc/categraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = true #[[instances.queries]] # measurement = "lock_wait" # metric_fields = [ "total" ] # timeout = "3s" # request = ''' #SELECT count(*) as total FROM information_schema.innodb_trx WHERE trx_state='LOCK WAIT' #''' ================================================ FILE: docker/compose-bridge/etc-categraf/input.net/net.toml ================================================ # # collect interval # interval = 15 # # whether collect protocol stats on Linux # collect_protocol_stats = false # # setting interfaces will tell categraf to gather these explicit interfaces # interfaces = ["eth0"] ================================================ FILE: docker/compose-bridge/etc-categraf/input.netstat/netstat.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-bridge/etc-categraf/input.processes/processes.toml ================================================ # # collect interval # interval = 15 # # force use ps command to gather # force_ps = false # # force use /proc to gather # force_proc = false ================================================ FILE: docker/compose-bridge/etc-categraf/input.prometheus/prometheus.toml ================================================ [[instances]] urls = [ "http://nightingale:17000/metrics" ] ================================================ FILE: docker/compose-bridge/etc-categraf/input.redis/redis.toml ================================================ [[instances]] address = "redis:6379" username = "" password = "" # pool_size = 2 ## 是否开启slowlog 收集 # gather_slowlog = true ## 最多收集少条slowlog # slowlog_max_len = 100 ## 
收集距离现在多少秒以内的slowlog ## 注意插件的采集周期,该参数不要小于采集周期,否则会有slowlog查不到 # slowlog_time_window=30 # 指标 # redis_slow_log{ident=dev-01 client_addr=127.0.0.1:56364 client_name= cmd="info ALL" log_id=983} 74 (单位微秒) # # Optional. Specify redis commands to retrieve values # commands = [ # {command = ["get", "sample-key1"], metric = "custom_metric_name1"}, # {command = ["get", "sample-key2"], metric = "custom_metric_name2"} # ] # # interval = global.interval * interval_times # interval_times = 1 # important! use global unique string to specify instance labels = { instance="docker-compose-redis" } ## Optional TLS Config # use_tls = false # tls_min_version = "1.2" # tls_ca = "/etc/categraf/ca.pem" # tls_cert = "/etc/categraf/cert.pem" # tls_key = "/etc/categraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = true ================================================ FILE: docker/compose-bridge/etc-categraf/input.system/system.toml ================================================ # # collect interval # interval = 15 # # whether collect metric: system_n_users # collect_user_number = false ================================================ FILE: docker/compose-bridge/etc-mysql/my.cnf ================================================ [mysqld] pid-file = /var/run/mysqld/mysqld.pid socket = /var/run/mysqld/mysqld.sock datadir = /var/lib/mysql bind-address = 0.0.0.0 ================================================ FILE: docker/compose-bridge/etc-nightingale/config.toml ================================================ [Global] RunMode = "release" [Log] # log write dir Dir = "logs" # log level: DEBUG INFO WARNING ERROR Level = "INFO" # stdout, stderr, file Output = "stdout" # # rotate by time # KeepHours = 4 # # rotate by size # RotateNum = 3 # # unit: MB # RotateSize = 256 [HTTP] # http listening address Host = "0.0.0.0" # http listening port Port = 17000 # https cert file path CertFile = "" # https key file path KeyFile = "" # whether print access log PrintAccessLog = 
false # whether enable pprof PProf = false # expose prometheus /metrics? ExposeMetrics = true # http graceful shutdown timeout, unit: s ShutdownTimeout = 30 # max content length: 64M MaxContentLength = 67108864 # http server read timeout, unit: s ReadTimeout = 20 # http server write timeout, unit: s WriteTimeout = 40 # http server idle timeout, unit: s IdleTimeout = 120 [HTTP.ShowCaptcha] Enable = false [HTTP.APIForAgent] Enable = true # [HTTP.APIForAgent.BasicAuth] # user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.APIForService] Enable = false [HTTP.APIForService.BasicAuth] user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.JWTAuth] # unit: min AccessExpired = 1500 # unit: min RefreshExpired = 10080 RedisKeyPrefix = "/jwt/" [HTTP.TokenAuth] Enable = false HeaderUserTokenKey = "X-User-Token" [HTTP.ProxyAuth] # if proxy auth enabled, jwt auth is disabled Enable = false # username key in http proxy header HeaderUserNameKey = "X-User-Name" DefaultRoles = ["Standard"] [HTTP.RSA] # open RSA OpenRSA = false [DB] # postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable" DSN="root:1234@tcp(mysql:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true" # enable debug mode or not Debug = false # mysql postgres DBType = "mysql" # unit: s MaxLifetime = 7200 # max open connections MaxOpenConns = 150 # max idle connections MaxIdleConns = 50 [Redis] # address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs) Address = "redis:6379" # Username = "" # Password = "" # DB = 0 # UseTLS = false # TLSMinVersion = "1.2" # standalone cluster sentinel RedisType = "standalone" # Mastername for sentinel type # MasterName = "mymaster" # SentinelUsername = "" # SentinelPassword = "" [Alert] [Alert.Heartbeat] # auto detect if blank IP = "" # unit ms Interval = 1000 EngineName = "default" # [Alert.Alerting] # NotifyConcurrency = 10 [Center] MetricsYamlFile = "./etc/metrics.yaml" I18NHeaderKey = "X-Language" 
[Center.AnonymousAccess] PromQuerier = false AlertDetail = false [Pushgw] # use target labels in database instead of in series LabelRewrite = true ForceUseServerTS = true # [Pushgw.DebugSample] # ident = "xx" # __name__ = "xx" # [Pushgw.WriterOpt] # QueueMaxSize = 1000000 # QueuePopSize = 1000 [[Pushgw.Writers]] # Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write" Url = "http://victoriametrics:8428/api/v1/write" # Basic auth username BasicAuthUser = "" # Basic auth password BasicAuthPass = "" # timeout settings, unit: ms Headers = ["X-From", "n9e"] Timeout = 10000 DialTimeout = 3000 TLSHandshakeTimeout = 30000 ExpectContinueTimeout = 1000 IdleConnTimeout = 90000 # time duration, unit: ms KeepAlive = 30000 MaxConnsPerHost = 0 MaxIdleConns = 100 MaxIdleConnsPerHost = 100 ## Optional TLS Config # UseTLS = false # TLSCA = "/etc/n9e/ca.pem" # TLSCert = "/etc/n9e/cert.pem" # TLSKey = "/etc/n9e/key.pem" # InsecureSkipVerify = false # [[Writers.WriteRelabels]] # Action = "replace" # SourceLabels = ["__address__"] # Regex = "([^:]+)(?::\\d+)?" 
# Replacement = "$1:80" # TargetLabel = "__address__" [Ibex] Enable = true RPCListen = "0.0.0.0:20090" ================================================ FILE: docker/compose-bridge/etc-nightingale/metrics.yaml ================================================ zh: ip_conntrack_count: 连接跟踪表条目总数(单位:int, count) ip_conntrack_max: 连接跟踪表最大容量(单位:int, size) cpu_usage_idle: CPU空闲率(单位:%) cpu_usage_active: CPU使用率(单位:%) cpu_usage_system: CPU内核态时间占比(单位:%) cpu_usage_user: CPU用户态时间占比(单位:%) cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%) cpu_usage_iowait: CPU等待I/O的时间占比(单位:%) cpu_usage_irq: CPU处理硬中断的时间占比(单位:%) cpu_usage_softirq: CPU处理软中断的时间占比(单位:%) cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%) cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%) cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%) disk_free: 硬盘分区剩余量(单位:byte) disk_used: 硬盘分区使用量(单位:byte) disk_used_percent: 硬盘分区使用率(单位:%) disk_total: 硬盘分区总量(单位:byte) disk_inodes_free: 硬盘分区inode剩余量 disk_inodes_used: 硬盘分区inode使用量 disk_inodes_total: 硬盘分区inode总量 diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型 diskio_merged_reads: 相邻读请求merge读的次数,counter类型 diskio_merged_writes: 相邻写请求merge写的次数,counter类型 diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值 diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒) diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值 kernel_boot_time: 内核启动时间 kernel_context_switches: 内核上下文切换次数 kernel_entropy_avail: linux系统内部的熵池 kernel_interrupts: 内核中断次数 kernel_processes_forked: fork的进程数 mem_active: 活跃使用的内存总数(包括cache和buffer内存) mem_available: 可用内存大小(bytes) mem_available_percent: 内存剩余百分比(0~100) mem_buffered: 用来给文件做缓冲大小 
mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache ) mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用 mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和 mem_dirty: 等待被写回到磁盘的内存大小 mem_free: 空闲内存大小(bytes) mem_high_free: 未被使用的高位内存大小 mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存) mem_huge_page_size: 每个大页的大小 mem_huge_pages_free: 池中尚未分配的 HugePages 数量 mem_huge_pages_total: 预留HugePages的总个数 mem_inactive: 空闲的内存数(包括free和available的内存) mem_low_free: 未被使用的低位大小 mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构 mem_mapped: 设备和文件等映射的大小 mem_page_tables: 管理内存分页页面的索引表的大小 mem_shared: 多个进程共享的内存总额 mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗 mem_sreclaimable: 可收回Slab的大小 mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab) mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口 mem_swap_free: 未被使用交换空间的大小 mem_swap_total: 交换空间的总大小 mem_total: 内存总数 mem_used: 已用内存数 mem_used_percent: 已用内存数百分比(0~100) mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域 mem_vmalloc_totalL: 可以vmalloc虚拟内存大小 mem_vmalloc_used: vmalloc已使用的虚拟内存大小 mem_write_back: 正在被写回到磁盘的内存大小 mem_write_back_tmp: FUSE用于临时写回缓冲区的内存 net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数 net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数 net_drop_in: 网卡收丢包数量 net_drop_out: 网卡发丢包数量 net_err_in: 网卡收包错误数量 net_err_out: 网卡发包错误数量 net_packets_recv: 网卡收包数量 net_packets_sent: 网卡发包数量 net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数 net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数 netstat_tcp_established: ESTABLISHED状态的网络链接数 netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数 netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数 netstat_tcp_last_ack: LAST_ACK状态的网络链接数 netstat_tcp_listen: LISTEN状态的网络链接数 netstat_tcp_syn_recv: SYN_RECV状态的网络链接数 netstat_tcp_syn_sent: SYN_SENT状态的网络链接数 netstat_tcp_time_wait: TIME_WAIT状态的网络链接数 netstat_udp_socket: UDP状态的网络链接数 netstat_sockets_used: 已使用的所有协议套接字总量 
netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量 netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数) netstat_tcp_tw: TIME_WAIT状态的TCP连接数 netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量 netstat_tcp_mem: TCP套接字内存Page使用量 netstat_udp_inuse: 在使用的UDP套接字数量 netstat_udp_mem: UDP套接字内存Page使用量 netstat_udplite_inuse: 正在使用的 udp lite 数量 netstat_raw_inuse: 正在使用的 raw socket 数量 netstat_frag_inuse: ip fragment 数量 netstat_frag_memory: ip fragment 已经分配的内存(byte) #[ping] ping_percent_packet_loss: ping数据包丢失百分比(%) ping_result_code: ping返回码('0','1') net_response_result_code: 网络探测结果,0表示正常,非0表示异常 net_response_response_time: 网络探测时延,单位:秒 processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L') processes_dead: 回收中的进程数('X') processes_idle: 挂起的空闲进程数('I') processes_paging: 分页进程数('P') processes_running: 运行中的进程数('R') processes_sleeping: 可中断进程数('S') processes_stopped: 暂停状态进程数('T') processes_total: 总进程数 processes_total_threads: 总线程数 processes_unknown: 未知状态进程数 processes_zombies: 僵尸态进程数('Z') swap_used_percent: Swap空间换出数据量 system_load1: 1分钟平均load值 system_load5: 5分钟平均load值 system_load15: 15分钟平均load值 system_load_norm_1: 1分钟平均load值/逻辑CPU个数 system_load_norm_5: 5分钟平均load值/逻辑CPU个数 system_load_norm_15: 15分钟平均load值/逻辑CPU个数 system_n_users: 用户数 system_n_cpus: CPU核数 system_uptime: 系统启动时间 nginx_accepts: 自nginx启动起,与客户端建立过得连接总数 nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和 nginx_handled: 自nginx启动起,处理过的客户端连接总数 nginx_reading: 正在读取HTTP请求头部的连接总数 nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值 nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数 nginx_upstream_check_rise: upstream_check模块对后端的检测次数 nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0 nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接 nginx_writing: 正在向客户端发送响应的连接总数 http_response_content_length: HTTP消息实体的传输长度 http_response_http_response_code: http响应状态码 http_response_response_time: http响应用时 http_response_result_code: url探测结果0为正常否则url无法访问 # [aws cloudwatch rds] 
cloudwatch_aws_rds_bin_log_disk_usage_average: rds 磁盘使用平均值 cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds 磁盘使用量最大值 cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低 cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数 cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和 cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值 cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值 cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低 cloudwatch_aws_rds_burst_balance_sample_count: rds 突发平衡样本计数 cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和 cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值 cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值 cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低 cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数 cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和 cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值 cloudwatch_aws_rds_database_connections_maximum: rds 数据库连接数最大值 cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小 cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数 cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和 cloudwatch_aws_rds_db_load_average: rds db 平均负载 cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值 cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值 cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值 cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 加载 CPU 样本数 cloudwatch_aws_rds_db_load_cpu_sum: rds db 加载cpu总和 cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值 cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值 cloudwatch_aws_rds_db_load_non_cpu_average: rds 加载非 CPU 平均值 cloudwatch_aws_rds_db_load_non_cpu_maximum: rds 加载非 cpu 最大值 cloudwatch_aws_rds_db_load_non_cpu_minimum: rds 加载非 cpu 最小值 cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds 加载非 cpu 样本计数 cloudwatch_aws_rds_db_load_non_cpu_sum: rds 加载非cpu总和 cloudwatch_aws_rds_db_load_sample_count: rds db 加载样本计数 
cloudwatch_aws_rds_db_load_sum: rds db 负载总和 cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值 cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值 cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值 cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数 cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和 cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值 cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值 cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低 cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数 cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和 cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值 cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值 cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低 cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 平衡样本计数 cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和 cloudwatch_aws_rds_free_storage_space_average: rds 免费存储空间平均 cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间 cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间 cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数 cloudwatch_aws_rds_free_storage_space_sum: rds 免费存储空间总和 cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值 cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存 cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存 cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数 cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和 cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值 cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值 cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低 cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数 cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和 cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值 cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值 
cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低 cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数 cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和 cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均 cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值 cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值 cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数 cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和 cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值 cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大 cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值 cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数 cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和 cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值 cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops cloudwatch_aws_rds_read_iops_minimum: rds 读取 iops 最低 cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数 cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和 cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值 cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值 cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟 cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数 cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和 cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值 cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量 cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量 cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数 cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和 cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值 cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值 cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低 cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用示例计数 cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和 
cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值 cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值 cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低 cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数 cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和 cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值 cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟 cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值 cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数 cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和 cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值 cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量 cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值 cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数 cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和 en: ip_conntrack_count: the number of entries in the conntrack table(unit:int, count) ip_conntrack_max: the max capacity of the conntrack table(unit:int, size) cpu_usage_idle: "CPU idle rate(unit:%)" cpu_usage_active: "CPU usage rate(unit:%)" cpu_usage_system: "CPU kernel state time proportion(unit:%)" cpu_usage_user: "CPU user attitude time proportion(unit:%)" cpu_usage_nice: "The proportion of low priority CPU time, that is, the process NICE value is adjusted to the CPU time between 1-19. 
Note here that the value range of NICE is -20 to 19, the larger the value, the lower the priority, the lower the priority(unit:%)" cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)" cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)" cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)" cpu_usage_steal: "In the virtual machine environment, there is this indicator, which means that the CPU is used by other virtual machines for the proportion of time.(unit:%)" cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)" cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)" disk_free: "The remaining amount of the hard disk partition (unit: byte)" disk_used: "Hard disk partitional use (unit: byte)" disk_used_percent: "Hard disk partitional use rate (unit:%)" disk_total: "Total amount of hard disk partition (unit: byte)" disk_inodes_free: "Hard disk partition INODE remaining amount" disk_inodes_used: "Hard disk partition INODE usage amount" disk_inodes_total: "The total amount of hard disk partition INODE" diskio_io_time: "From the perspective of the device perspective, the total time of I/O request, the I/O request in the queue is count (unit: millisecond), the counter type, you need to use the function to find the value" diskio_iops_in_progress: "IO requests that have been assigned to device -driven and have not yet been completed, not included in the queue but not yet assigned to the device -driven IO request, Gauge type" diskio_merged_reads: "The number of times of adjacent reading request Merge, the counter type" diskio_merged_writes: "The number of times the request Merge writes, the counter type" diskio_read_bytes: "The number of byte reads, the counter type, you need to use the function to find the Rate to use the value" diskio_read_time: "The total time of reading request (unit: 
millisecond), the counter type, you need to use the function to find the Rate to have the value of use" diskio_reads: "Read the number of requests, the counter type, you need to use the function to find the Rate to use the value" diskio_weighted_io_time: "From the perspective of the I/O request perspective, I/O wait for the total time. If there are multiple I/O requests at the same time, the time will be superimposed (unit: millisecond)" diskio_write_bytes: "The number of bytes written, the counter type, you need to use the function to find the Rate to use the value" diskio_write_time: "The total time of the request (unit: millisecond), the counter type, you need to use the function to find the rate to have the value of use" diskio_writes: "Write the number of requests, the counter type, you need to use the function to find the rate to use value" kernel_boot_time: "Kernel startup time" kernel_context_switches: "Number of kernel context switching times" kernel_entropy_avail: "Entropy pool inside the Linux system" kernel_interrupts: "Number of kernel interruption" kernel_processes_forked: "ForK's process number" mem_active: "The total number of memory (including Cache and BUFFER memory)" mem_available: "Application can use memory numbers" mem_available_percent: "Memory remaining percentage (0 ~ 100)" mem_buffered: "Used to make buffer size for the file" mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )" mem_commit_limit: "According to the over allocation ratio ('vm.overCommit _ Ratio'), this is the current total memory that can be allocated on the system." mem_committed_as: "Currently allocated on the system. 
It is the sum of the memory of all process applications" mem_dirty: "Size of memory waiting to be written back to disk" mem_free: "Free memory size (bytes)" mem_high_free: "Unused high memory size" mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )" mem_huge_page_size: "The size of each huge page" mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated" mem_huge_pages_total: "Total number of reserved Huge Pages" mem_inactive: "Free memory (including the memory of free and available)" mem_low_free: "Unused low memory size" mem_low_total: "Total size of low memory; low memory can serve the same purpose as high memory, and it can also be used by the kernel to record some of its own data structures" mem_mapped: "Size of memory used for mapping devices and files" mem_page_tables: "Size of the index tables used to manage memory pages" mem_shared: "The total memory shared by multiple processes" mem_slab: "Size of the kernel data structure cache, which reduces the cost of allocating and freeing memory" mem_sreclaimable: "Size of reclaimable Slab" mem_sunreclaim: "Size of unreclaimable Slab(SUnreclaim+SReclaimable=Slab)" mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile.
Used to be quickly replaced when needed without opening the I/O port again" mem_swap_free: "The size of unused swap space" mem_swap_total: "The total size of swap space" mem_total: "Total memory" mem_used: "Used memory" mem_used_percent: "Percentage of memory used (0 ~ 100)" mem_vmalloc_chunk: "The largest continuous unused vmalloc area" mem_vmalloc_totalL: "Total size of the vmalloc virtual memory area" mem_vmalloc_used: "Size of the vmalloc virtual memory area that is in use" mem_write_back: "The size of memory that is being written back to the disk" mem_write_back_tmp: "Memory used by FUSE for temporary write-back buffers" net_bytes_recv: "Total inbound traffic(bytes) of network card" net_bytes_sent: "Total outbound traffic(bytes) of network card" net_bits_recv: "Total inbound traffic(bits) of network card" net_bits_sent: "Total outbound traffic(bits) of network card" net_drop_in: "Number of inbound packets dropped by the network card" net_drop_out: "Number of outbound packets dropped by the network card" net_err_in: "Number of inbound error packets of the network card" net_err_out: "Number of outbound error packets of the network card" net_packets_recv: "Number of packets received by the network card" net_packets_sent: "Number of packets sent by the network card" netstat_tcp_established: "Number of network connections in ESTABLISHED state" netstat_tcp_fin_wait1: "Number of network connections in FIN_WAIT1 state" netstat_tcp_fin_wait2: "Number of network connections in FIN_WAIT2 state" netstat_tcp_last_ack: "Number of network connections in LAST_ACK state" netstat_tcp_listen: "Number of network connections in LISTEN state" netstat_tcp_syn_recv: "Number of network connections in SYN_RECV state" netstat_tcp_syn_sent: "Number of network connections in SYN_SENT state" netstat_tcp_time_wait: "Number of network connections in TIME_WAIT state" netstat_udp_socket: "Number of UDP sockets" processes_blocked: "The number of processes in the uninterruptible sleep state('U','D','L')" processes_dead: "Number of processes in recycling('X')" 
processes_idle: "Number of idle processes hanging('I')" processes_paging: "Number of paging processes('P')" processes_running: "Number of running processes('R')" processes_sleeping: "Number of processes in interruptible sleep('S')" processes_stopped: "Number of stopped processes('T')" processes_total: "Total process number" processes_total_threads: "Number of threads" processes_unknown: "Unknown status process number" processes_zombies: "Number of zombies('Z')" swap_used_percent: "Percentage of swap space used" system_load1: "1 minute average load value" system_load5: "5 minutes average load value" system_load15: "15 minutes average load value" system_load_norm_1: "1 minute average load value/logical CPU number" system_load_norm_5: "5 minutes average load value/logical CPU number" system_load_norm_15: "15 minutes average load value/logical CPU number" system_n_users: "Number of users" system_n_cpus: "Number of CPU cores" system_uptime: "System uptime (time elapsed since startup)" nginx_accepts: "Total number of client connections accepted since Nginx started" nginx_active: "The number of active connections currently being handled by Nginx, equal to Reading+Writing+Waiting" nginx_handled: "Total number of client connections handled since Nginx started" nginx_reading: "The number of connections on which Nginx is currently reading the http request header" nginx_requests: "Since nginx is started, the total number of client requests processed, due to the existence of HTTP Keep-Alive requests, this value will be greater than the handled value" nginx_upstream_check_fall: "Number of failed backend checks detected by the upstream_check module" nginx_upstream_check_rise: "Number of successful backend checks detected by the upstream_check module" nginx_upstream_check_status_code: "Upstream backend state: 1 when up, 0 when down" nginx_waiting: "When keep-alive is enabled, this value is equal to active – (reading+writing), which means that Nginx has processed the resident 
connection that is waiting for the next request command" nginx_writing: "The number of connections on which Nginx is currently writing a response to the client" http_response_content_length: "HTTP message entity transmission length" http_response_http_response_code: "http response status code" http_response_response_time: "HTTP response time" http_response_result_code: "URL probe result: 0 means normal, any other value means the URL cannot be accessed" # [mysqld_exporter] mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge) mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge) mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter) mysql_global_status_threads_connected: The number of currently open connections.(Gauge) mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter) mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge) mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge) mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter) mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge) mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. 
The cache miss rate can be calculated as Threads_created/Connections.(Counter) mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter) mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter) mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter) mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter) mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter) mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter) mysql_global_status_sort_rows: The number of sorted rows.(Counter) mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter) mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. 
If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter) mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter) mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter) mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter) mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter) mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter) mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter) mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter) mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter) mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge) mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge) mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter) mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. 
This is in fact the layer between the Storage Engine and MySQL.(Counter) mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter) mysql_global_status_open_tables: The number of tables that are open.(Gauge) mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter) mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter) mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge) mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter) mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter) mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter) mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge) mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge) mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. 
The default value is 134217728 bytes (128MB).(Gauge) mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge) mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge) mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge) mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge) mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge) # [redis_exporter] redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize. redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation. redis_allocator_allocated_bytes: Total bytes allocated form the allocator, including internal-fragmentation. Normally the same as used_memory. redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes. redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio). redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting). redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active. redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS. redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any. redis_aof_enabled: Flag indicating AOF logging is activated. 
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation. redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation. redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds. redis_aof_last_write_status: Status of the last write operation to the AOF. redis_aof_rewrite_in_progress: Flag indicating a AOF rewrite operation is on-going. redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete. redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX). redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections. redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections. redis_cluster_enabled: Indicate Redis cluster is enabled. redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter) redis_commands_processed_total: Total number of commands processed by the server.(Counter) redis_commands_total: The number of calls that reached command execution (not rejected).(Counter) redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections. redis_config_maxmemory: The value of the maxmemory configuration directive. redis_connected_clients: Number of client connections (excluding connections from replicas). redis_connected_slaves: Number of connected replicas. 
redis_connections_received_total: Total number of connections accepted by the server.(Counter) redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter) redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter) redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_db_keys: Total number of keys by DB. redis_db_keys_expiring: Total number of expiring keys by DB redis_defrag_hits: Number of value reallocations performed by active the defragmentation process. redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process. redis_defrag_key_hits: Number of keys that were actively defragmented. redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process. redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter) redis_expired_keys_total: Total number of key expiration events.(Counter) redis_expired_stale_percentage: The percentage of keys probably expired. redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early. redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape. redis_exporter_last_scrape_duration_seconds: The last scrape duration. redis_exporter_last_scrape_error: The last scrape error status. 
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter redis_exporter_scrapes_total: Current total redis scrapes.(Counter) redis_instance_info: Information about the Redis instance. redis_keyspace_hits_total: Hits total.(Counter) redis_keyspace_misses_total: Misses total.(Counter) redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds. redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds. redis_latest_fork_seconds: The amount of time needed for last fork, in seconds. redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option). redis_master_repl_offset: The server's current replication offset. redis_mem_clients_normal: Memory used by normal clients.(Gauge) redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage. redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue. redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only includes fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc. redis_mem_not_counted_for_eviction_bytes: (Gauge) redis_memory_max_bytes: Max memory limit in bytes. 
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc) redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory) redis_memory_used_lua_bytes: Number of bytes used by the Lua engine. redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures. redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes) redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1) redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes redis_net_input_bytes_total: Total input bytes(Counter) redis_net_output_bytes_total: Total output bytes(Counter) redis_process_id: Process ID redis_pubsub_channels: Global number of pub/sub channels with client subscriptions redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going redis_rdb_changes_since_last_save: Number of changes since the last dump redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds redis_rdb_last_bgsave_status: Status of the last RDB save operation redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter) 
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer redis_repl_backlog_is_active: Flag indicating replication backlog is active redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge) redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge) redis_replica_resyncs_full: The number of full resyncs with replicas redis_replication_backlog_bytes: Memory used by replication backlog redis_second_repl_offset: The offset up to which replication IDs are accepted. redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge) redis_slowlog_last_id: Last id of slowlog redis_slowlog_length: Total slowlog redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds. redis_target_scrape_request_errors_total: Errors in requests to the exporter redis_up: Flag indicating redis instance is up redis_uptime_in_seconds: Number of seconds since Redis server start # [windows_exporter] windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter) windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge) windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter) windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter) windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter) windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter) windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge) windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the 
processor. On some processors, Processor Performance may exceed 100%(gauge) windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter) windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge) windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge) windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge) windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge) windows_exporter_collector_duration_seconds: Duration of a collection.(gauge) windows_exporter_collector_success: Whether the collector was successful.(gauge) windows_exporter_collector_timeout: Whether the collector timed out.(gauge) windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge) windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge) windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter) windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter) windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter) windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter) windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter) windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter) windows_logical_disk_requests_queued: The number of requests queued to the disk 
(LogicalDisk.CurrentDiskQueueLength)(gauge) windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge) windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter) windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter) windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter) windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter) windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter) windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter) windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter) windows_net_bytes_total: (Network.BytesTotalPerSec)(counter) windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge) windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter) windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter) windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter) windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter) windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter) windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter) windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter) windows_net_packets_total: (Network.PacketsPerSec)(counter) windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge) windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge) windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge) 
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge) windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge) windows_os_processes: OperatingSystem.NumberOfProcesses(gauge) windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge) windows_os_time: OperatingSystem.LocalDateTime(gauge) windows_os_timezone: OperatingSystem.LocalDateTime(gauge) windows_os_users: OperatingSystem.NumberOfUsers(gauge) windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge) windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge) windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge) windows_service_info: A metric with a constant '1' value labeled with service information(gauge) windows_service_start_mode: The start mode of the service (StartMode)(gauge) windows_service_state: The state of the service (State)(gauge) windows_service_status: The status of the service (Status)(gauge) windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter) windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter) windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge) windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter) windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge) windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge) # [node_exporter] # SYSTEM # CPU context switch 次数 node_context_switches_total: context_switches # Interrupts 次数 node_intr_total: Interrupts # 运行的进程数 node_procs_running: Processes in runnable state # 熵池大小 node_entropy_available_bits: Entropy available to random number 
generators node_time_seconds: System time in seconds since epoch (1970) node_boot_time_seconds: Node boot time, in unixtime # CPU node_cpu_seconds_total: Seconds the CPUs spent in each mode node_load1: cpu load 1m node_load5: cpu load 5m node_load15: cpu load 15m # MEM # 内核态 # 内核用于缓存数据结构供自己使用的内存 node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use # slab中可回收的部分 node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches # slab中不可回收的部分 node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure # Vmalloc内存区的大小 node_memory_VmallocTotal_bytes: Total size of vmalloc memory area # vmalloc已分配的内存,虚拟地址空间上的连续的内存 node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used # vmalloc区可用的连续最大快的大小,通过此指标可以知道vmalloc可分配连续内存的最大值 node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free # 内存的硬件故障删除掉的内存页的总大小 node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working # 用于在虚拟和物理内存地址之间映射的内存 node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge) # 内核栈内存,常驻内存,不可回收 node_memory_KernelStack_bytes: Kernel memory stack. 
This is not reclaimable # 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能 node_memory_Bounce_bytes: Memory used for block device bounce buffers #用户态 # 单个巨页大小 node_memory_Hugepagesize_bytes: Huge Page size # 系统分配的常驻巨页数 node_memory_HugePages_Total: Total size of the pool of huge pages # 系统空闲的巨页数 node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated # 进程已申请但未使用的巨页数 node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation # 超过系统设定的常驻HugePages数量的个数 node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages # 透明巨页 Transparent HugePages (THP) node_memory_AnonHugePages_bytes: Memory in anonymous huge pages # inactivelist中的File-backed内存 node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list # inactivelist中的Anonymous内存 node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem) # activelist中的File-backed内存 node_memory_Active_file_bytes: File-backed memory on active LRU list # activelist中的Anonymous内存 node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs # 禁止换出的页,对应 Unevictable 链表 node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons # 共享内存 node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks) # 匿名页内存大小 node_memory_AnonPages_bytes: Memory in user pages not backed by files # 被关联的内存页大小 node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries # file-backed内存页缓存大小 node_memory_Cached_bytes: Parked file data (file content) cache # 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化 node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified # 被mlock()系统调用锁定的内存大小 node_memory_Mlocked_bytes: Size of pages locked to 
memory using the mlock() system call # 块设备(block device)所占用的缓存页 node_memory_Buffers_bytes: Block device (e.g. harddisk) cache node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes node_memory_SwapFree_bytes: Memory information field SwapFree_bytes # DISK node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes node_filesystem_free_bytes: Filesystem free space in bytes node_filesystem_size_bytes: Filesystem size in bytes node_filesystem_files_free: Filesystem total free file nodes node_filesystem_files: Filesystem total file nodes node_filefd_maximum: Max open files node_filefd_allocated: Open files node_filesystem_readonly: Filesystem read-only status node_filesystem_device_error: Whether an error occurred while getting statistics for the given device node_disk_reads_completed_total: The total number of reads completed successfully node_disk_writes_completed_total: The total number of writes completed successfully node_disk_reads_merged_total: The number of reads merged node_disk_writes_merged_total: The number of writes merged node_disk_read_bytes_total: The total number of bytes read successfully node_disk_written_bytes_total: The total number of bytes written successfully node_disk_io_time_seconds_total: Total seconds spent doing I/Os node_disk_read_time_seconds_total: The total number of seconds spent by all reads node_disk_write_time_seconds_total: The total number of seconds spent by all writes node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os # NET node_network_receive_bytes_total: Network device statistic receive_bytes (counter) node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter) node_network_receive_packets_total: Network device statistic receive_packets node_network_transmit_packets_total: Network device statistic transmit_packets node_network_receive_errs_total: Network device statistic receive_errs node_network_transmit_errs_total: Network device 
statistic transmit_errs node_network_receive_drop_total: Network device statistic receive_drop node_network_transmit_drop_total: Network device statistic transmit_drop node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking node_sockstat_TCP_alloc: Number of TCP sockets in state alloc node_sockstat_TCP_inuse: Number of TCP sockets in state inuse node_sockstat_TCP_orphan: Number of TCP sockets in state orphan node_sockstat_TCP_tw: Number of TCP sockets in state tw node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab node_sockstat_sockets_used: Number of IPv4 sockets in use # [kafka_exporter] kafka_brokers: count of kafka_brokers (gauge) kafka_topic_partitions: Number of partitions for this Topic (gauge) kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge) kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge) kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge) kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated # [zookeeper_exporter] zk_znode_count: The total count of znodes stored zk_ephemerals_count: The number of Ephemerals nodes zk_watch_count: The number of watchers setup over Zookeeper nodes. 
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree zk_outstanding_requests: Number of currently executing requests zk_packets_sent: Count of the number of zookeeper packets sent from a server zk_packets_received: Count of the number of zookeeper packets received by a server zk_num_alive_connections: Number of active clients connected to a zookeeper server zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open zk_avg_latency: Average time in milliseconds for requests to be processed zk_min_latency: Minimum time in milliseconds for a request to be processed zk_max_latency: Maximum time in milliseconds for a request to be processed ================================================ FILE: docker/compose-bridge/etc-nightingale/script/notify.bak.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- import sys import json import urllib2 import smtplib from email.mime.text import MIMEText reload(sys) sys.setdefaultencoding('utf8') notify_channel_funcs = { "email":"email", "sms":"sms", "voice":"voice", "dingtalk":"dingtalk", "wecom":"wecom", "feishu":"feishu" } mail_host = "smtp.163.com" mail_port = 994 mail_user = "ulricqin" mail_pass = "password" mail_from = "ulricqin@163.com" class Sender(object): @classmethod def send_email(cls, payload): if mail_user == "ulricqin" and mail_pass == "password": print("invalid smtp configuration") return users = payload.get('event').get("notify_users_obj") emails = {} for u in users: if u.get("email"): emails[u.get("email")] = 1 if not emails: return recipients = emails.keys() mail_body = payload.get('tpls').get("email.tpl", "email.tpl not found") message = MIMEText(mail_body, 'html', 'utf-8') message['From'] = mail_from message['To'] = ", ".join(recipients) message["Subject"] = payload.get('tpls').get("subject.tpl", 
"subject.tpl not found") try: smtp = smtplib.SMTP_SSL(mail_host, mail_port) smtp.login(mail_user, mail_pass) smtp.sendmail(mail_from, recipients, message.as_string()) smtp.close() except smtplib.SMTPException, error: print(error) @classmethod def send_wecom(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} for u in users: contacts = u.get("contacts") if contacts.get("wecom_robot_token", ""): tokens[contacts.get("wecom_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t) body = { "msgtype": "markdown", "markdown": { "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found") } } request = urllib2.Request(url, data=json.dumps(body)) request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_dingtalk(cls, payload): event = payload.get('event') users = event.get("notify_users_obj") rule_name = event.get("rule_name") event_state = "Triggered" if event.get("is_recovered"): event_state = "Recovered" tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("dingtalk_robot_token", ""): tokens[contacts.get("dingtalk_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t) body = { "msgtype": "markdown", "markdown": { "title": "{} - {}".format(event_state, rule_name), "text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@"+i for i in phones.keys()]) }, "at": { "atMobiles": phones.keys(), "isAtAll": False } } request = urllib2.Request(url, data=json.dumps(body)) 
request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_feishu(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("feishu_robot_token", ""): tokens[contacts.get("feishu_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t) body = { "msg_type": "text", "content": { "text": payload.get('tpls').get("feishu.tpl", "feishu.tpl not found") }, "at": { "atMobiles": phones.keys(), "isAtAll": False } } request = urllib2.Request(url, data=json.dumps(body)) request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_sms(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_sms not implemented, phones: {}".format(phones.keys())) @classmethod def send_voice(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_voice not implemented, phones: {}".format(phones.keys())) def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip())) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) 
send_func(payload) def hello(): print("hello nightingale") if __name__ == "__main__": if len(sys.argv) == 1: main() elif sys.argv[1] == "hello": hello() else: print("I am confused") ================================================ FILE: docker/compose-bridge/etc-nightingale/script/notify.py ================================================ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import sys import json class Sender(object): @classmethod def send_email(cls, payload): # already done in go code pass @classmethod def send_wecom(cls, payload): # already done in go code pass @classmethod def send_dingtalk(cls, payload): # already done in go code pass @classmethod def send_feishu(cls, payload): # already done in go code pass @classmethod def send_mm(cls, payload): # already done in go code pass @classmethod def send_sms(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_sms not implemented, phones: {}".format(phones.keys())) @classmethod def send_voice(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_voice not implemented, phones: {}".format(phones.keys())) def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(ch.strip()) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) send_func(payload) def hello(): print("hello nightingale") if __name__ == "__main__": if len(sys.argv) == 1: main() elif sys.argv[1] == "hello": hello() else: print("I am confused") ================================================ FILE: docker/compose-bridge/etc-nightingale/script/notify_feishu.py 
================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- import sys import json import requests class Sender(object): @classmethod def send_email(cls, payload): # already done in go code pass @classmethod def send_wecom(cls, payload): # already done in go code pass @classmethod def send_dingtalk(cls, payload): # already done in go code pass @classmethod def send_ifeishu(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("feishu_robot_token", ""): tokens[contacts.get("feishu_robot_token", "")] = 1 headers = { "Content-Type": "application/json;charset=utf-8", "Host": "open.feishu.cn" } for t in tokens: url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t) body = { "msg_type": "text", "content": { "text": payload.get('tpls').get("feishu", "feishu not found") }, "at": { "atMobiles": list(phones.keys()), "isAtAll": False } } response = requests.post(url, headers=headers, data=json.dumps(body)) print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}") @classmethod def send_mm(cls, payload): # already done in go code pass @classmethod def send_sms(cls, payload): pass @classmethod def send_voice(cls, payload): pass def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(ch.strip()) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) send_func(payload) def hello(): print("hello nightingale") if __name__ == "__main__": if len(sys.argv) == 1: main() elif sys.argv[1] == "hello": hello() else: print("I am confused") ================================================ FILE: 
docker/compose-bridge/etc-nightingale/script/rule_converter.py ================================================ import json import yaml ''' 将promtheus/vmalert的rule转换为n9e中的rule 支持k8s的rule configmap ''' rule_file = 'rules.yaml' def convert_interval(interval): if interval.endswith('s') or interval.endswith('S'): return int(interval[:-1]) if interval.endswith('m') or interval.endswith('M'): return int(interval[:-1]) * 60 if interval.endswith('h') or interval.endswith('H'): return int(interval[:-1]) * 60 * 60 if interval.endswith('d') or interval.endswith('D'): return int(interval[:-1]) * 60 * 60 * 24 return int(interval) def convert_alert(rule, interval): name = rule['alert'] prom_ql = rule['expr'] if 'for' in rule: prom_for_duration = convert_interval(rule['for']) else: prom_for_duration = 0 prom_eval_interval = convert_interval(interval) note = '' if 'annotations' in rule: for v in rule['annotations'].values(): note = v break annotations = {} if 'annotations' in rule: for k, v in rule['annotations'].items(): annotations[k] = v append_tags = [] severity = 2 if 'labels' in rule: for k, v in rule['labels'].items(): if k != 'severity': append_tags.append('{}={}'.format(k, v)) continue if v == 'critical': severity = 1 elif v == 'info': severity = 3 # elif v == 'warning': # severity = 2 n9e_alert_rule = { "name": name, "note": note, "severity": severity, "disabled": 0, "prom_for_duration": prom_for_duration, "prom_ql": prom_ql, "prom_eval_interval": prom_eval_interval, "enable_stime": "00:00", "enable_etime": "23:59", "enable_days_of_week": [ "1", "2", "3", "4", "5", "6", "0" ], "enable_in_bg": 0, "notify_recovered": 1, "notify_channels": [], "notify_repeat_step": 60, "recover_duration": 0, "callbacks": [], "runbook_url": "", "append_tags": append_tags, "annotations":annotations } return n9e_alert_rule def convert_record(rule, interval): name = rule['record'] prom_ql = rule['expr'] prom_eval_interval = convert_interval(interval) note = '' append_tags = [] if 'labels' in 
rule: for k, v in rule['labels'].items(): append_tags.append('{}={}'.format(k, v)) n9e_record_rule = { "name": name, "note": note, "disabled": 0, "prom_ql": prom_ql, "prom_eval_interval": prom_eval_interval, "append_tags": append_tags } return n9e_record_rule ''' example of rule group file --- groups: - name: example rules: - alert: HighRequestLatency expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5 for: 10m labels: severity: page annotations: summary: High request latency ''' def deal_group(group): """ parse single prometheus/vmalert rule group """ alert_rules = [] record_rules = [] for rule_segment in group['groups']: if 'interval' in rule_segment: interval = rule_segment['interval'] else: interval = '15s' for rule in rule_segment['rules']: if 'alert' in rule: alert_rules.append(convert_alert(rule, interval)) else: record_rules.append(convert_record(rule, interval)) return alert_rules, record_rules ''' example of k8s rule configmap --- apiVersion: v1 kind: ConfigMap metadata: name: rulefiles-0 data: etcdrules.yaml: | groups: - name: etcd rules: - alert: etcdInsufficientMembers annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).' 
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) for: 3m labels: severity: critical ''' def deal_configmap(rule_configmap): """ parse rule configmap from k8s """ all_record_rules = [] all_alert_rules = [] for _, rule_group_str in rule_configmap['data'].items(): rule_group = yaml.load(rule_group_str, Loader=yaml.FullLoader) alert_rules, record_rules = deal_group(rule_group) all_alert_rules.extend(alert_rules) all_record_rules.extend(record_rules) return all_alert_rules, all_record_rules def main(): with open(rule_file, 'r') as f: rule_config = yaml.load(f, Loader=yaml.FullLoader) # 如果文件是k8s中的configmap,使用下面的方法 # alert_rules, record_rules = deal_configmap(rule_config) alert_rules, record_rules = deal_group(rule_config) with open("alert-rules.json", 'w') as fw: json.dump(alert_rules, fw, indent=2, ensure_ascii=False) with open("record-rules.json", 'w') as fw: json.dump(record_rules, fw, indent=2, ensure_ascii=False) if __name__ == '__main__': main() ================================================ FILE: docker/compose-host-network/docker-compose.yaml ================================================ version: "3.7" services: mysql: image: "mysql:8" container_name: mysql hostname: mysql restart: always environment: TZ: Asia/Shanghai MYSQL_ROOT_PASSWORD: 1234 volumes: - ./mysqldata:/var/lib/mysql/ - ../initsql:/docker-entrypoint-initdb.d/ - ./etc-mysql/my.cnf:/etc/my.cnf network_mode: host redis: image: "redis:6.2" container_name: redis hostname: redis restart: always environment: TZ: Asia/Shanghai network_mode: host prometheus: image: prom/prometheus:v2.55.1 container_name: prometheus hostname: prometheus restart: always environment: TZ: Asia/Shanghai volumes: - ./etc-prometheus:/etc/prometheus network_mode: host command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - "--web.console.libraries=/usr/share/prometheus/console_libraries" - 
"--web.console.templates=/usr/share/prometheus/consoles" - "--enable-feature=remote-write-receiver" - "--query.lookback-delta=2m" n9e: image: flashcatcloud/nightingale:latest container_name: n9e hostname: n9e restart: always environment: GIN_MODE: release TZ: Asia/Shanghai WAIT_HOSTS: 127.0.0.1:3306, 127.0.0.1:6379 volumes: - ./etc-nightingale:/app/etc network_mode: host depends_on: - mysql - redis - prometheus command: - /app/n9e categraf: image: "flashcatcloud/categraf:latest" container_name: "categraf" hostname: "categraf01" restart: always environment: TZ: Asia/Shanghai HOST_PROC: /hostfs/proc HOST_SYS: /hostfs/sys HOST_MOUNT_PREFIX: /hostfs WAIT_HOSTS: 127.0.0.1:17000, 127.0.0.1:20090 volumes: - ./etc-categraf:/etc/categraf/conf - /:/hostfs network_mode: host depends_on: - n9e ================================================ FILE: docker/compose-host-network/etc-categraf/config.toml ================================================ [global] # whether print configs print_configs = false # add label(agent_hostname) to series # "" -> auto detect hostname # "xx" -> use specified string xx # "$hostname" -> auto detect hostname # "$ip" -> auto detect ip # "$hostname-$ip" -> auto detect hostname and ip to replace the vars hostname = "$HOSTNAME" # will not add label(agent_hostname) if true omit_hostname = false # s | ms precision = "ms" # global collect interval interval = 15 [global.labels] source="categraf" # region = "shanghai" # env = "localhost" [writer_opt] # default: 2000 batch = 2000 # channel(as queue) size chan_size = 10000 [[writers]] url = "http://127.0.0.1:17000/prometheus/v1/write" # Basic auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [http] enable = false address = ":9100" print_access = false run_mode = "release" [heartbeat] enable = true # report os version cpu.util mem.util metadata url = 
"http://127.0.0.1:17000/v1/n9e/heartbeat" # interval, unit: s interval = 10 # Basic auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" ## Optional headers # headers = ["X-From", "categraf", "X-Xyz", "abc"] # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [ibex] enable = true ## ibex flush interval interval = "1000ms" ## n9e ibex server rpc address servers = ["127.0.0.1:20090"] ## temp script dir meta_dir = "./meta" ================================================ FILE: docker/compose-host-network/etc-categraf/input.cpu/cpu.toml ================================================ # # collect interval # interval = 15 # # whether collect per cpu # collect_per_cpu = false ================================================ FILE: docker/compose-host-network/etc-categraf/input.disk/disk.toml ================================================ # # collect interval # interval = 15 # # By default stats will be gathered for all mount points. # # Set mount_points will restrict the stats to only the specified mount points. # mount_points = ["/"] # Ignore mount points by filesystem type. ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] ignore_mount_points = ["/boot"] ================================================ FILE: docker/compose-host-network/etc-categraf/input.diskio/diskio.toml ================================================ # # collect interval # interval = 15 # # By default, categraf will gather stats for all devices including disk partitions. # # Setting devices will restrict the stats to the specified devices. 
# devices = ["sda", "sdb", "vd*"] ================================================ FILE: docker/compose-host-network/etc-categraf/input.kernel/kernel.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-host-network/etc-categraf/input.mem/mem.toml ================================================ # # collect interval # interval = 15 # # whether collect platform specified metrics collect_platform_fields = true ================================================ FILE: docker/compose-host-network/etc-categraf/input.net/net.toml ================================================ # # collect interval # interval = 15 # # whether collect protocol stats on Linux # collect_protocol_stats = false # # setting interfaces will tell categraf to gather these explicit interfaces # interfaces = ["eth0"] ================================================ FILE: docker/compose-host-network/etc-categraf/input.netstat/netstat.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-host-network/etc-categraf/input.processes/processes.toml ================================================ # # collect interval # interval = 15 # # force use ps command to gather # force_ps = false # # force use /proc to gather # force_proc = false ================================================ FILE: docker/compose-host-network/etc-categraf/input.system/system.toml ================================================ # # collect interval # interval = 15 # # whether collect metric: system_n_users # collect_user_number = false ================================================ FILE: docker/compose-host-network/etc-mysql/my.cnf ================================================ [mysqld] pid-file = /var/run/mysqld/mysqld.pid socket = /var/run/mysqld/mysqld.sock datadir = /var/lib/mysql bind-address = 127.0.0.1 
================================================ FILE: docker/compose-host-network/etc-nightingale/config.toml ================================================ [Global] RunMode = "release" [Log] # log write dir Dir = "logs" # log level: DEBUG INFO WARNING ERROR Level = "INFO" # stdout, stderr, file Output = "stdout" # # rotate by time # KeepHours = 4 # # rotate by size # RotateNum = 3 # # unit: MB # RotateSize = 256 [HTTP] # http listening address Host = "0.0.0.0" # http listening port Port = 17000 # https cert file path CertFile = "" # https key file path KeyFile = "" # whether print access log PrintAccessLog = false # whether enable pprof PProf = false # expose prometheus /metrics? ExposeMetrics = true # http graceful shutdown timeout, unit: s ShutdownTimeout = 30 # max content length: 64M MaxContentLength = 67108864 # http server read timeout, unit: s ReadTimeout = 20 # http server write timeout, unit: s WriteTimeout = 40 # http server idle timeout, unit: s IdleTimeout = 120 [HTTP.ShowCaptcha] Enable = false [HTTP.APIForAgent] Enable = true # [HTTP.APIForAgent.BasicAuth] # user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.APIForService] Enable = false [HTTP.APIForService.BasicAuth] user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.JWTAuth] # unit: min AccessExpired = 1500 # unit: min RefreshExpired = 10080 RedisKeyPrefix = "/jwt/" [HTTP.ProxyAuth] # if proxy auth enabled, jwt auth is disabled Enable = false # username key in http proxy header HeaderUserNameKey = "X-User-Name" DefaultRoles = ["Standard"] [HTTP.RSA] # open RSA OpenRSA = false [DB] # postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s # postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable" DSN="root:1234@tcp(127.0.0.1:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true" # enable debug mode or not Debug = false # mysql postgres DBType = "mysql" # unit: s MaxLifetime = 7200 # max open connections MaxOpenConns = 150 
# max idle connections MaxIdleConns = 50 [Redis] # address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs) Address = "127.0.0.1:6379" # Username = "" # Password = "" # DB = 0 # UseTLS = false # TLSMinVersion = "1.2" # standalone cluster sentinel RedisType = "standalone" # Mastername for sentinel type # MasterName = "mymaster" # SentinelUsername = "" # SentinelPassword = "" [Alert] [Alert.Heartbeat] # auto detect if blank IP = "" # unit ms Interval = 1000 EngineName = "default" # [Alert.Alerting] # NotifyConcurrency = 10 [Center] MetricsYamlFile = "./etc/metrics.yaml" I18NHeaderKey = "X-Language" [Center.AnonymousAccess] PromQuerier = true AlertDetail = true [Pushgw] # use target labels in database instead of in series LabelRewrite = true ForceUseServerTS = true # [Pushgw.DebugSample] # ident = "xx" # __name__ = "xx" # [Pushgw.WriterOpt] # QueueMaxSize = 1000000 # QueuePopSize = 1000 [[Pushgw.Writers]] # Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write" Url = "http://127.0.0.1:9090/api/v1/write" # Basic auth username BasicAuthUser = "" # Basic auth password BasicAuthPass = "" # timeout settings, unit: ms Headers = ["X-From", "n9e"] Timeout = 10000 DialTimeout = 3000 TLSHandshakeTimeout = 30000 ExpectContinueTimeout = 1000 IdleConnTimeout = 90000 # time duration, unit: ms KeepAlive = 30000 MaxConnsPerHost = 0 MaxIdleConns = 100 MaxIdleConnsPerHost = 100 ## Optional TLS Config # UseTLS = false # TLSCA = "/etc/n9e/ca.pem" # TLSCert = "/etc/n9e/cert.pem" # TLSKey = "/etc/n9e/key.pem" # InsecureSkipVerify = false # [[Writers.WriteRelabels]] # Action = "replace" # SourceLabels = ["__address__"] # Regex = "([^:]+)(?::\\d+)?" 
# Replacement = "$1:80" # TargetLabel = "__address__" [Ibex] Enable = true RPCListen = "0.0.0.0:20090" ================================================ FILE: docker/compose-host-network/etc-nightingale/metrics.yaml ================================================ zh: ip_conntrack_count: 连接跟踪表条目总数(单位:int, count) ip_conntrack_max: 连接跟踪表最大容量(单位:int, size) cpu_usage_idle: CPU空闲率(单位:%) cpu_usage_active: CPU使用率(单位:%) cpu_usage_system: CPU内核态时间占比(单位:%) cpu_usage_user: CPU用户态时间占比(单位:%) cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%) cpu_usage_iowait: CPU等待I/O的时间占比(单位:%) cpu_usage_irq: CPU处理硬中断的时间占比(单位:%) cpu_usage_softirq: CPU处理软中断的时间占比(单位:%) cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%) cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%) cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%) disk_free: 硬盘分区剩余量(单位:byte) disk_used: 硬盘分区使用量(单位:byte) disk_used_percent: 硬盘分区使用率(单位:%) disk_total: 硬盘分区总量(单位:byte) disk_inodes_free: 硬盘分区inode剩余量 disk_inodes_used: 硬盘分区inode使用量 disk_inodes_total: 硬盘分区inode总量 diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型 diskio_merged_reads: 相邻读请求merge读的次数,counter类型 diskio_merged_writes: 相邻写请求merge写的次数,counter类型 diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值 diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒) diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值 kernel_boot_time: 内核启动时间 kernel_context_switches: 内核上下文切换次数 kernel_entropy_avail: linux系统内部的熵池 kernel_interrupts: 内核中断次数 kernel_processes_forked: fork的进程数 mem_active: 活跃使用的内存总数(包括cache和buffer内存) mem_available: 可用内存大小(bytes) mem_available_percent: 内存剩余百分比(0~100) mem_buffered: 
用来给文件做缓冲大小 mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache ) mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用 mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和 mem_dirty: 等待被写回到磁盘的内存大小 mem_free: 空闲内存大小(bytes) mem_high_free: 未被使用的高位内存大小 mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存) mem_huge_page_size: 每个大页的大小 mem_huge_pages_free: 池中尚未分配的 HugePages 数量 mem_huge_pages_total: 预留HugePages的总个数 mem_inactive: 空闲的内存数(包括free和available的内存) mem_low_free: 未被使用的低位大小 mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构 mem_mapped: 设备和文件等映射的大小 mem_page_tables: 管理内存分页页面的索引表的大小 mem_shared: 多个进程共享的内存总额 mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗 mem_sreclaimable: 可收回Slab的大小 mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab) mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口 mem_swap_free: 未被使用交换空间的大小 mem_swap_total: 交换空间的总大小 mem_total: 内存总数 mem_used: 已用内存数 mem_used_percent: 已用内存数百分比(0~100) mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域 mem_vmalloc_totalL: 可以vmalloc虚拟内存大小 mem_vmalloc_used: vmalloc已使用的虚拟内存大小 mem_write_back: 正在被写回到磁盘的内存大小 mem_write_back_tmp: FUSE用于临时写回缓冲区的内存 net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数 net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数 net_drop_in: 网卡收丢包数量 net_drop_out: 网卡发丢包数量 net_err_in: 网卡收包错误数量 net_err_out: 网卡发包错误数量 net_packets_recv: 网卡收包数量 net_packets_sent: 网卡发包数量 net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数 net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数 netstat_tcp_established: ESTABLISHED状态的网络链接数 netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数 netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数 netstat_tcp_last_ack: LAST_ACK状态的网络链接数 netstat_tcp_listen: LISTEN状态的网络链接数 netstat_tcp_syn_recv: SYN_RECV状态的网络链接数 netstat_tcp_syn_sent: SYN_SENT状态的网络链接数 netstat_tcp_time_wait: TIME_WAIT状态的网络链接数 netstat_udp_socket: UDP状态的网络链接数 netstat_sockets_used: 已使用的所有协议套接字总量 
netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量 netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数) netstat_tcp_tw: TIME_WAIT状态的TCP连接数 netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量 netstat_tcp_mem: TCP套接字内存Page使用量 netstat_udp_inuse: 在使用的UDP套接字数量 netstat_udp_mem: UDP套接字内存Page使用量 netstat_udplite_inuse: 正在使用的 udp lite 数量 netstat_raw_inuse: 正在使用的 raw socket 数量 netstat_frag_inuse: ip fragment 数量 netstat_frag_memory: ip fragment 已经分配的内存(byte) #[ping] ping_percent_packet_loss: ping数据包丢失百分比(%) ping_result_code: ping返回码('0','1') net_response_result_code: 网络探测结果,0表示正常,非0表示异常 net_response_response_time: 网络探测时延,单位:秒 processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L') processes_dead: 回收中的进程数('X') processes_idle: 挂起的空闲进程数('I') processes_paging: 分页进程数('P') processes_running: 运行中的进程数('R') processes_sleeping: 可中断进程数('S') processes_stopped: 暂停状态进程数('T') processes_total: 总进程数 processes_total_threads: 总线程数 processes_unknown: 未知状态进程数 processes_zombies: 僵尸态进程数('Z') swap_used_percent: Swap空间换出数据量 system_load1: 1分钟平均load值 system_load5: 5分钟平均load值 system_load15: 15分钟平均load值 system_load_norm_1: 1分钟平均load值/逻辑CPU个数 system_load_norm_5: 5分钟平均load值/逻辑CPU个数 system_load_norm_15: 15分钟平均load值/逻辑CPU个数 system_n_users: 用户数 system_n_cpus: CPU核数 system_uptime: 系统启动时间 nginx_accepts: 自nginx启动起,与客户端建立过得连接总数 nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和 nginx_handled: 自nginx启动起,处理过的客户端连接总数 nginx_reading: 正在读取HTTP请求头部的连接总数 nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值 nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数 nginx_upstream_check_rise: upstream_check模块对后端的检测次数 nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0 nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接 nginx_writing: 正在向客户端发送响应的连接总数 http_response_content_length: HTTP消息实体的传输长度 http_response_http_response_code: http响应状态码 http_response_response_time: http响应用时 http_response_result_code: url探测结果0为正常否则url无法访问 # [aws cloudwatch rds] 
cloudwatch_aws_rds_bin_log_disk_usage_average: rds 磁盘使用平均值 cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds 磁盘使用量最大值 cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低 cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数 cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和 cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值 cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值 cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低 cloudwatch_aws_rds_burst_balance_sample_count: rds 突发平衡样本计数 cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和 cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值 cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值 cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低 cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数 cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和 cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值 cloudwatch_aws_rds_database_connections_maximum: rds 数据库连接数最大值 cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小 cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数 cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和 cloudwatch_aws_rds_db_load_average: rds db 平均负载 cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值 cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值 cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值 cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 加载 CPU 样本数 cloudwatch_aws_rds_db_load_cpu_sum: rds db 加载cpu总和 cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值 cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值 cloudwatch_aws_rds_db_load_non_cpu_average: rds 加载非 CPU 平均值 cloudwatch_aws_rds_db_load_non_cpu_maximum: rds 加载非 cpu 最大值 cloudwatch_aws_rds_db_load_non_cpu_minimum: rds 加载非 cpu 最小值 cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds 加载非 cpu 样本计数 cloudwatch_aws_rds_db_load_non_cpu_sum: rds 加载非cpu总和 cloudwatch_aws_rds_db_load_sample_count: rds db 加载样本计数 
cloudwatch_aws_rds_db_load_sum: rds db 负载总和 cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值 cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值 cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值 cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数 cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和 cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值 cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值 cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低 cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数 cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和 cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值 cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值 cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低 cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 平衡样本计数 cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和 cloudwatch_aws_rds_free_storage_space_average: rds 免费存储空间平均 cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间 cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间 cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数 cloudwatch_aws_rds_free_storage_space_sum: rds 免费存储空间总和 cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值 cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存 cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存 cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数 cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和 cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值 cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值 cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低 cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数 cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和 cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值 cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值 
cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低 cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数 cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和 cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均 cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值 cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值 cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数 cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和 cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值 cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大 cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值 cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数 cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和 cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值 cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops cloudwatch_aws_rds_read_iops_minimum: rds 读取 iops 最低 cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数 cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和 cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值 cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值 cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟 cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数 cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和 cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值 cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量 cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量 cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数 cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和 cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值 cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值 cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低 cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用示例计数 cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和 
cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值 cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值 cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低 cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数 cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和 cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值 cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟 cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值 cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数 cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和 cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值 cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量 cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值 cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数 cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和 en: ip_conntrack_count: the number of entries in the conntrack table(unit:int, count) ip_conntrack_max: the max capacity of the conntrack table(unit:int, size) cpu_usage_idle: "CPU idle rate(unit:%)" cpu_usage_active: "CPU usage rate(unit:%)" cpu_usage_system: "CPU kernel state time proportion(unit:%)" cpu_usage_user: "CPU user attitude time proportion(unit:%)" cpu_usage_nice: "The proportion of low priority CPU time, that is, the process NICE value is adjusted to the CPU time between 1-19. 
Note here that the value range of NICE is -20 to 19, the larger the value, the lower the priority(unit:%)" cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)" cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)" cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)" cpu_usage_steal: "Only present in virtual machine environments: the proportion of time the CPU is used by other virtual machines(unit:%)" cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)" cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)" disk_free: "The remaining amount of the hard disk partition (unit: byte)" disk_used: "Used amount of the hard disk partition (unit: byte)" disk_used_percent: "Usage rate of the hard disk partition (unit:%)" disk_total: "Total amount of hard disk partition (unit: byte)" disk_inodes_free: "Hard disk partition INODE remaining amount" disk_inodes_used: "Hard disk partition INODE usage amount" disk_inodes_total: "The total amount of hard disk partition INODE" diskio_io_time: "From the device's perspective, the total time of I/O requests, counted whenever there is an I/O request in the queue (unit: millisecond), counter type; apply the rate function to get a usable value" diskio_iops_in_progress: "IO requests that have been assigned to the device driver and have not yet been completed; IO requests that are in the queue but not yet assigned to the device driver are not included, gauge type" diskio_merged_reads: "The number of times adjacent read requests were merged, counter type" diskio_merged_writes: "The number of times adjacent write requests were merged, counter type" diskio_read_bytes: "The number of bytes read, counter type; apply the rate function to get a usable value" diskio_read_time: "The total time of read requests (unit:
millisecond), counter type; apply the rate function to get a usable value" diskio_reads: "The number of read requests, counter type; apply the rate function to get a usable value" diskio_weighted_io_time: "From the I/O requests' perspective, the total I/O wait time. If there are multiple I/O requests at the same time, the time will be superimposed (unit: millisecond)" diskio_write_bytes: "The number of bytes written, counter type; apply the rate function to get a usable value" diskio_write_time: "The total time of write requests (unit: millisecond), counter type; apply the rate function to get a usable value" diskio_writes: "The number of write requests, counter type; apply the rate function to get a usable value" kernel_boot_time: "Kernel startup time" kernel_context_switches: "Number of kernel context switches" kernel_entropy_avail: "Entropy pool inside the Linux system" kernel_interrupts: "Number of kernel interrupts" kernel_processes_forked: "Number of forked processes" mem_active: "The total amount of actively used memory (including cache and buffer memory)" mem_available: "Amount of memory available to applications" mem_available_percent: "Percentage of memory remaining (0 ~ 100)" mem_buffered: "Size of memory used for file buffers" mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus SwapCache)" mem_commit_limit: "Based on the overcommit ratio ('vm.overcommit_ratio'), this is the total amount of memory currently available for allocation on the system." mem_committed_as: "The amount of memory currently allocated on the system.
It is the sum of the memory requested by all processes" mem_dirty: "The size of memory waiting to be written back to the disk" mem_free: "Amount of free memory" mem_high_free: "Unused high memory size" mem_high_total: "The total size of the high memory (Highmem refers to all the physical memory above 860 MB; the Highmem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernel must use different methods to use this section of memory.)" mem_huge_page_size: "The size of each huge page" mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated" mem_huge_pages_total: "The total number of reserved Huge Pages" mem_inactive: "Free memory (including free and available memory)" mem_low_free: "Unused low memory size" mem_low_total: "The total size of the low memory; low memory can serve the same role as high memory, and it can also be used by the kernel to record some of its own data structures" mem_mapped: "The size of memory mapped by devices and files" mem_page_tables: "The size of the index tables used to manage memory pages" mem_shared: "The total memory shared by multiple processes" mem_slab: "The size of the kernel data structure cache, which reduces the overhead of allocating and releasing memory" mem_sreclaimable: "The size of the Slab that can be reclaimed" mem_sunreclaim: "The size of the Slab that cannot be reclaimed(SUnreclaim+SReclaimable=Slab)" mem_swap_cached: "The size of the swap space used by the cache memory, that is, memory that has been swapped out but is still stored in the swapfile.
Used so that it can be quickly swapped back in when needed without opening the I/O port again" mem_swap_free: "The size of unused swap space" mem_swap_total: "The total size of the swap space" mem_total: "Total memory" mem_used: "Amount of used memory" mem_used_percent: "Percentage of memory used (0 ~ 100)" mem_vmalloc_chunk: "The largest continuous unused vmalloc area" mem_vmalloc_total: "Total size of virtual memory available for vmalloc" mem_vmalloc_used: "Size of virtual memory used by vmalloc" mem_write_back: "The size of memory that is being written back to the disk" mem_write_back_tmp: "Memory used by FUSE for temporary writeback buffers" net_bytes_recv: "Total inbound traffic(bytes) of network card" net_bytes_sent: "Total outbound traffic(bytes) of network card" net_bits_recv: "Total inbound traffic(bits) of network card" net_bits_sent: "Total outbound traffic(bits) of network card" net_drop_in: "Number of inbound packets dropped by the network card" net_drop_out: "Number of outbound packets dropped by the network card" net_err_in: "Number of inbound packet errors on the network card" net_err_out: "Number of outbound packet errors on the network card" net_packets_recv: "Number of packets received by the network card" net_packets_sent: "Number of packets sent by the network card" netstat_tcp_established: "Number of network connections in ESTABLISHED state" netstat_tcp_fin_wait1: "Number of network connections in FIN_WAIT1 state" netstat_tcp_fin_wait2: "Number of network connections in FIN_WAIT2 state" netstat_tcp_last_ack: "Number of network connections in LAST_ACK state" netstat_tcp_listen: "Number of network connections in LISTEN state" netstat_tcp_syn_recv: "Number of network connections in SYN_RECV state" netstat_tcp_syn_sent: "Number of network connections in SYN_SENT state" netstat_tcp_time_wait: "Number of network connections in TIME_WAIT state" netstat_udp_socket: "Number of UDP network connections" processes_blocked: "The number of processes in the uninterruptible sleep state('U','D','L')" processes_dead: "Number of processes being reaped('X')"
processes_idle: "Number of suspended idle processes('I')" processes_paging: "Number of paging processes('P')" processes_running: "Number of running processes('R')" processes_sleeping: "Number of processes in interruptible sleep('S')" processes_stopped: "Number of stopped processes('T')" processes_total: "Total number of processes" processes_total_threads: "Total number of threads" processes_unknown: "Number of processes in unknown state" processes_zombies: "Number of zombie processes('Z')" swap_used_percent: "Percentage of swap space used" system_load1: "1 minute average load value" system_load5: "5 minutes average load value" system_load15: "15 minutes average load value" system_load_norm_1: "1 minute average load value/number of logical CPUs" system_load_norm_5: "5 minutes average load value/number of logical CPUs" system_load_norm_15: "15 minutes average load value/number of logical CPUs" system_n_users: "Number of users" system_n_cpus: "Number of CPU cores" system_uptime: "System uptime (time since startup)" nginx_accepts: "The total number of connections established with clients since Nginx started" nginx_active: "The number of active connections that Nginx is currently processing, equal to the sum of Reading/Writing/Waiting" nginx_handled: "The total number of client connections handled since Nginx started" nginx_reading: "The total number of connections currently reading the HTTP request header" nginx_requests: "The total number of client requests processed since Nginx started; due to the existence of HTTP Keep-Alive requests, this value will be greater than the handled value" nginx_upstream_check_fall: "The number of backend failures detected by the upstream_check module" nginx_upstream_check_rise: "The number of backend checks performed by the upstream_check module" nginx_upstream_check_status_code: "The state of the backend upstream: 1 for up, 0 for down" nginx_waiting: "When keep-alive is enabled, this value is equal to active – (reading+writing), which means that Nginx has processed the resident
connection that is waiting for the next request command" nginx_writing: "The total number of connections currently sending a response to the client" http_response_content_length: "Transmission length of the HTTP message entity" http_response_http_response_code: "HTTP response status code" http_response_response_time: "HTTP response time" http_response_result_code: "URL probe result: 0 is normal, otherwise the URL cannot be accessed" # [mysqld_exporter] mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge) mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge) mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter) mysql_global_status_threads_connected: The number of currently open connections.(Gauge) mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter) mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge) mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge) mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter) mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge) mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value.
The cache miss rate can be calculated as Threads_created/Connections.(Counter) mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter) mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter) mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter) mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter) mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter) mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter) mysql_global_status_sort_rows: The number of sorted rows.(Counter) mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter) mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. 
If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter) mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter) mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter) mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter) mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter) mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter) mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter) mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter) mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter) mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge) mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge) mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter) mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. 
This is in fact the layer between the Storage Engine and MySQL.(Counter) mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter) mysql_global_status_open_tables: The number of tables that are open.(Gauge) mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter) mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter) mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge) mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter) mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter) mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter) mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge) mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge) mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. 
The default value is 134217728 bytes (128MB).(Gauge) mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge) mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge) mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge) mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge) mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge) # [redis_exporter] redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize. redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation. redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory. redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes. redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio). redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting). redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active. redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS. redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any. redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation. redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation. redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds. redis_aof_last_write_status: Status of the last write operation to the AOF. redis_aof_rewrite_in_progress: Flag indicating a AOF rewrite operation is on-going. redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete. redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX). redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections. redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections. redis_cluster_enabled: Indicate Redis cluster is enabled. redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter) redis_commands_processed_total: Total number of commands processed by the server.(Counter) redis_commands_total: The number of calls that reached command execution (not rejected).(Counter) redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections. redis_config_maxmemory: The value of the maxmemory configuration directive. redis_connected_clients: Number of client connections (excluding connections from replicas). redis_connected_slaves: Number of connected replicas. 
redis_connections_received_total: Total number of connections accepted by the server.(Counter) redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter) redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter) redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_db_keys: Total number of keys by DB. redis_db_keys_expiring: Total number of expiring keys by DB redis_defrag_hits: Number of value reallocations performed by the active defragmentation process. redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process. redis_defrag_key_hits: Number of keys that were actively defragmented. redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process. redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter) redis_expired_keys_total: Total number of key expiration events.(Counter) redis_expired_stale_percentage: The percentage of keys probably expired. redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early. redis_exporter_last_scrape_connect_time_seconds: The duration (in seconds) of connecting during the last scrape. redis_exporter_last_scrape_duration_seconds: The last scrape duration. redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter redis_exporter_scrapes_total: Current total redis scrapes.(Counter) redis_instance_info: Information about the Redis instance. redis_keyspace_hits_total: Hits total.(Counter) redis_keyspace_misses_total: Misses total.(Counter) redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds. redis_last_slow_execution_duration_seconds: The amount of time needed for the last slow execution, in seconds. redis_latest_fork_seconds: The amount of time needed for the last fork, in seconds. redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option). redis_master_repl_offset: The server's current replication offset. redis_mem_clients_normal: Memory used by normal clients.(Gauge) redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage. redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue. redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only include fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc. redis_mem_not_counted_for_eviction_bytes: (Gauge) redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc) redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory) redis_memory_used_lua_bytes: Number of bytes used by the Lua engine. redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures. redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes) redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1) redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes redis_net_input_bytes_total: Total input bytes(Counter) redis_net_output_bytes_total: Total output bytes(Counter) redis_process_id: Process ID redis_pubsub_channels: Global number of pub/sub channels with client subscriptions redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going redis_rdb_changes_since_last_save: Number of changes since the last dump redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds redis_rdb_last_bgsave_status: Status of the last RDB save operation redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter) 
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer redis_repl_backlog_is_active: Flag indicating replication backlog is active redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge) redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge) redis_replica_resyncs_full: The number of full resyncs with replicas redis_replication_backlog_bytes: Memory used by replication backlog redis_second_repl_offset: The offset up to which replication IDs are accepted. redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge) redis_slowlog_last_id: Last id of slowlog redis_slowlog_length: Total slowlog redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds. redis_target_scrape_request_errors_total: Errors in requests to the exporter redis_up: Flag indicating redis instance is up redis_uptime_in_seconds: Number of seconds since Redis server start # [windows_exporter] windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter) windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge) windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter) windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter) windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter) windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter) windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge) windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the 
processor. On some processors, Processor Performance may exceed 100%(gauge) windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter) windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge) windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge) windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge) windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge) windows_exporter_collector_duration_seconds: Duration of a collection.(gauge) windows_exporter_collector_success: Whether the collector was successful.(gauge) windows_exporter_collector_timeout: Whether the collector timed out.(gauge) windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge) windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge) windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter) windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter) windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter) windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter) windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter) windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter) windows_logical_disk_requests_queued: The number of requests queued to the disk 
(LogicalDisk.CurrentDiskQueueLength)(gauge) windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge) windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter) windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter) windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter) windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter) windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter) windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter) windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter) windows_net_bytes_total: (Network.BytesTotalPerSec)(counter) windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge) windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter) windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter) windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter) windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter) windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter) windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter) windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter) windows_net_packets_total: (Network.PacketsPerSec)(counter) windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge) windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge) windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge) 
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge) windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge) windows_os_processes: OperatingSystem.NumberOfProcesses(gauge) windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge) windows_os_time: OperatingSystem.LocalDateTime(gauge) windows_os_timezone: OperatingSystem.LocalDateTime(gauge) windows_os_users: OperatingSystem.NumberOfUsers(gauge) windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge) windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge) windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge) windows_service_info: A metric with a constant '1' value labeled with service information(gauge) windows_service_start_mode: The start mode of the service (StartMode)(gauge) windows_service_state: The state of the service (State)(gauge) windows_service_status: The status of the service (Status)(gauge) windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter) windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter) windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge) windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter) windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge) windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge) # [node_exporter] # SYSTEM # CPU context switch 次数 node_context_switches_total: context_switches # Interrupts 次数 node_intr_total: Interrupts # 运行的进程数 node_procs_running: Processes in runnable state # 熵池大小 node_entropy_available_bits: Entropy available to random number 
generators node_time_seconds: System time in seconds since epoch (1970) node_boot_time_seconds: Node boot time, in unixtime # CPU node_cpu_seconds_total: Seconds the CPUs spent in each mode node_load1: cpu load 1m node_load5: cpu load 5m node_load15: cpu load 15m # MEM # 内核态 # 内核用于缓存数据结构供自己使用的内存 node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use # slab中可回收的部分 node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches # slab中不可回收的部分 node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure # Vmalloc内存区的大小 node_memory_VmallocTotal_bytes: Total size of vmalloc memory area # vmalloc已分配的内存,虚拟地址空间上的连续的内存 node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used # vmalloc区可用的连续最大快的大小,通过此指标可以知道vmalloc可分配连续内存的最大值 node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free # 内存的硬件故障删除掉的内存页的总大小 node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working # 用于在虚拟和物理内存地址之间映射的内存 node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge) # 内核栈内存,常驻内存,不可回收 node_memory_KernelStack_bytes: Kernel memory stack. 
This is not reclaimable # 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能 node_memory_Bounce_bytes: Memory used for block device bounce buffers #用户态 # 单个巨页大小 node_memory_Hugepagesize_bytes: Huge Page size # 系统分配的常驻巨页数 node_memory_HugePages_Total: Total size of the pool of huge pages # 系统空闲的巨页数 node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated # 进程已申请但未使用的巨页数 node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation # 超过系统设定的常驻HugePages数量的个数 node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages # 透明巨页 Transparent HugePages (THP) node_memory_AnonHugePages_bytes: Memory in anonymous huge pages # inactivelist中的File-backed内存 node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list # inactivelist中的Anonymous内存 node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem) # activelist中的File-backed内存 node_memory_Active_file_bytes: File-backed memory on active LRU list # activelist中的Anonymous内存 node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs # 禁止换出的页,对应 Unevictable 链表 node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons # 共享内存 node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks) # 匿名页内存大小 node_memory_AnonPages_bytes: Memory in user pages not backed by files # 被关联的内存页大小 node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries # file-backed内存页缓存大小 node_memory_Cached_bytes: Parked file data (file content) cache # 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化 node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified # 被mlock()系统调用锁定的内存大小 node_memory_Mlocked_bytes: Size of pages locked to 
memory using the mlock() system call # 块设备(block device)所占用的缓存页 node_memory_Buffers_bytes: Block device (e.g. harddisk) cache node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes node_memory_SwapFree_bytes: Memory information field SwapFree_bytes # DISK node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes node_filesystem_free_bytes: Filesystem free space in bytes node_filesystem_size_bytes: Filesystem size in bytes node_filesystem_files_free: Filesystem total free file nodes node_filesystem_files: Filesystem total file nodes node_filefd_maximum: Max open files node_filefd_allocated: Open files node_filesystem_readonly: Filesystem read-only status node_filesystem_device_error: Whether an error occurred while getting statistics for the given device node_disk_reads_completed_total: The total number of reads completed successfully node_disk_writes_completed_total: The total number of writes completed successfully node_disk_reads_merged_total: The number of reads merged node_disk_writes_merged_total: The number of writes merged node_disk_read_bytes_total: The total number of bytes read successfully node_disk_written_bytes_total: The total number of bytes written successfully node_disk_io_time_seconds_total: Total seconds spent doing I/Os node_disk_read_time_seconds_total: The total number of seconds spent by all reads node_disk_write_time_seconds_total: The total number of seconds spent by all writes node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os # NET node_network_receive_bytes_total: Network device statistic receive_bytes (counter) node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter) node_network_receive_packets_total: Network device statistic receive_packets node_network_transmit_packets_total: Network device statistic transmit_packets node_network_receive_errs_total: Network device statistic receive_errs node_network_transmit_errs_total: Network device
statistic transmit_errs node_network_receive_drop_total: Network device statistic receive_drop node_network_transmit_drop_total: Network device statistic transmit_drop node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking node_sockstat_TCP_alloc: Number of TCP sockets in state alloc node_sockstat_TCP_inuse: Number of TCP sockets in state inuse node_sockstat_TCP_orphan: Number of TCP sockets in state orphan node_sockstat_TCP_tw: Number of TCP sockets in state tw node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab node_sockstat_sockets_used: Number of IPv4 sockets in use # [kafka_exporter] kafka_brokers: count of kafka_brokers (gauge) kafka_topic_partitions: Number of partitions for this Topic (gauge) kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge) kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge) kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge) kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated # [zookeeper_exporter] zk_znode_count: The total count of znodes stored zk_ephemerals_count: The number of Ephemerals nodes zk_watch_count: The number of watchers setup over Zookeeper nodes. 
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree zk_outstanding_requests: Number of currently executing requests zk_packets_sent: Count of the number of zookeeper packets sent from a server zk_packets_received: Count of the number of zookeeper packets received by a server zk_num_alive_connections: Number of active clients connected to a zookeeper server zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open zk_avg_latency: Average time in milliseconds for requests to be processed zk_min_latency: Minimum time in milliseconds for a request to be processed zk_max_latency: Maximum time in milliseconds for a request to be processed ================================================ FILE: docker/compose-host-network/etc-nightingale/script/notify.bak.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- import sys import json import urllib2 import smtplib from email.mime.text import MIMEText reload(sys) sys.setdefaultencoding('utf8') notify_channel_funcs = { "email":"email", "sms":"sms", "voice":"voice", "dingtalk":"dingtalk", "wecom":"wecom", "feishu":"feishu" } mail_host = "smtp.163.com" mail_port = 994 mail_user = "ulricqin" mail_pass = "password" mail_from = "ulricqin@163.com" class Sender(object): @classmethod def send_email(cls, payload): if mail_user == "ulricqin" and mail_pass == "password": print("invalid smtp configuration") return users = payload.get('event').get("notify_users_obj") emails = {} for u in users: if u.get("email"): emails[u.get("email")] = 1 if not emails: return recipients = emails.keys() mail_body = payload.get('tpls').get("email.tpl", "email.tpl not found") message = MIMEText(mail_body, 'html', 'utf-8') message['From'] = mail_from message['To'] = ", ".join(recipients) message["Subject"] = payload.get('tpls').get("subject.tpl",
"subject.tpl not found") try: smtp = smtplib.SMTP_SSL(mail_host, mail_port) smtp.login(mail_user, mail_pass) smtp.sendmail(mail_from, recipients, message.as_string()) smtp.close() except smtplib.SMTPException, error: print(error) @classmethod def send_wecom(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} for u in users: contacts = u.get("contacts") if contacts.get("wecom_robot_token", ""): tokens[contacts.get("wecom_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t) body = { "msgtype": "markdown", "markdown": { "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found") } } request = urllib2.Request(url, data=json.dumps(body)) request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_dingtalk(cls, payload): event = payload.get('event') users = event.get("notify_users_obj") rule_name = event.get("rule_name") event_state = "Triggered" if event.get("is_recovered"): event_state = "Recovered" tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("dingtalk_robot_token", ""): tokens[contacts.get("dingtalk_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t) body = { "msgtype": "markdown", "markdown": { "title": "{} - {}".format(event_state, rule_name), "text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@"+i for i in phones.keys()]) }, "at": { "atMobiles": phones.keys(), "isAtAll": False } } request = urllib2.Request(url, data=json.dumps(body)) 
request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_feishu(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("feishu_robot_token", ""): tokens[contacts.get("feishu_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t) body = { "msg_type": "text", "content": { "text": payload.get('tpls').get("feishu.tpl", "feishu.tpl not found") }, "at": { "atMobiles": phones.keys(), "isAtAll": False } } request = urllib2.Request(url, data=json.dumps(body)) request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_sms(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_sms not implemented, phones: {}".format(phones.keys())) @classmethod def send_voice(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_voice not implemented, phones: {}".format(phones.keys())) def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip())) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) 
send_func(payload) def hello(): print("hello nightingale") if __name__ == "__main__": if len(sys.argv) == 1: main() elif sys.argv[1] == "hello": hello() else: print("I am confused") ================================================ FILE: docker/compose-host-network/etc-nightingale/script/notify.py ================================================ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- import sys import json class Sender(object): @classmethod def send_email(cls, payload): # already done in go code pass @classmethod def send_wecom(cls, payload): # already done in go code pass @classmethod def send_dingtalk(cls, payload): # already done in go code pass @classmethod def send_feishu(cls, payload): # already done in go code pass @classmethod def send_mm(cls, payload): # already done in go code pass @classmethod def send_sms(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_sms not implemented, phones: {}".format(phones.keys())) @classmethod def send_voice(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_voice not implemented, phones: {}".format(phones.keys())) def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(ch.strip()) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) send_func(payload) def hello(): print("hello nightingale") if __name__ == "__main__": if len(sys.argv) == 1: main() elif sys.argv[1] == "hello": hello() else: print("I am confused") ================================================ FILE: docker/compose-host-network/etc-nightingale/script/notify_feishu.py 
================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- import sys import json import requests class Sender(object): @classmethod def send_email(cls, payload): # already done in go code pass @classmethod def send_wecom(cls, payload): # already done in go code pass @classmethod def send_dingtalk(cls, payload): # already done in go code pass @classmethod def send_ifeishu(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("feishu_robot_token", ""): tokens[contacts.get("feishu_robot_token", "")] = 1 headers = { "Content-Type": "application/json;charset=utf-8", "Host": "open.feishu.cn" } for t in tokens: url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t) body = { "msg_type": "text", "content": { "text": payload.get('tpls').get("feishu", "feishu not found") }, "at": { "atMobiles": list(phones.keys()), "isAtAll": False } } response = requests.post(url, headers=headers, data=json.dumps(body)) print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}") @classmethod def send_mm(cls, payload): # already done in go code pass @classmethod def send_sms(cls, payload): pass @classmethod def send_voice(cls, payload): pass def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(ch.strip()) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) send_func(payload) def hello(): print("hello nightingale") if __name__ == "__main__": if len(sys.argv) == 1: main() elif sys.argv[1] == "hello": hello() else: print("I am confused") ================================================ FILE: 
docker/compose-host-network/etc-nightingale/script/rule_converter.py ================================================ import json import yaml ''' 将promtheus/vmalert的rule转换为n9e中的rule 支持k8s的rule configmap ''' rule_file = 'rules.yaml' def convert_interval(interval): if interval.endswith('s') or interval.endswith('S'): return int(interval[:-1]) if interval.endswith('m') or interval.endswith('M'): return int(interval[:-1]) * 60 if interval.endswith('h') or interval.endswith('H'): return int(interval[:-1]) * 60 * 60 if interval.endswith('d') or interval.endswith('D'): return int(interval[:-1]) * 60 * 60 * 24 return int(interval) def convert_alert(rule, interval): name = rule['alert'] prom_ql = rule['expr'] if 'for' in rule: prom_for_duration = convert_interval(rule['for']) else: prom_for_duration = 0 prom_eval_interval = convert_interval(interval) note = '' if 'annotations' in rule: for v in rule['annotations'].values(): note = v break annotations = {} if 'annotations' in rule: for k, v in rule['annotations'].items(): annotations[k] = v append_tags = [] severity = 2 if 'labels' in rule: for k, v in rule['labels'].items(): if k != 'severity': append_tags.append('{}={}'.format(k, v)) continue if v == 'critical': severity = 1 elif v == 'info': severity = 3 # elif v == 'warning': # severity = 2 n9e_alert_rule = { "name": name, "note": note, "severity": severity, "disabled": 0, "prom_for_duration": prom_for_duration, "prom_ql": prom_ql, "prom_eval_interval": prom_eval_interval, "enable_stime": "00:00", "enable_etime": "23:59", "enable_days_of_week": [ "1", "2", "3", "4", "5", "6", "0" ], "enable_in_bg": 0, "notify_recovered": 1, "notify_channels": [], "notify_repeat_step": 60, "recover_duration": 0, "callbacks": [], "runbook_url": "", "append_tags": append_tags, "annotations":annotations } return n9e_alert_rule def convert_record(rule, interval): name = rule['record'] prom_ql = rule['expr'] prom_eval_interval = convert_interval(interval) note = '' append_tags = [] if 'labels' 
in rule: for k, v in rule['labels'].items(): append_tags.append('{}={}'.format(k, v)) n9e_record_rule = { "name": name, "note": note, "disabled": 0, "prom_ql": prom_ql, "prom_eval_interval": prom_eval_interval, "append_tags": append_tags } return n9e_record_rule ''' example of rule group file --- groups: - name: example rules: - alert: HighRequestLatency expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5 for: 10m labels: severity: page annotations: summary: High request latency ''' def deal_group(group): """ parse single prometheus/vmalert rule group """ alert_rules = [] record_rules = [] for rule_segment in group['groups']: if 'interval' in rule_segment: interval = rule_segment['interval'] else: interval = '15s' for rule in rule_segment['rules']: if 'alert' in rule: alert_rules.append(convert_alert(rule, interval)) else: record_rules.append(convert_record(rule, interval)) return alert_rules, record_rules ''' example of k8s rule configmap --- apiVersion: v1 kind: ConfigMap metadata: name: rulefiles-0 data: etcdrules.yaml: | groups: - name: etcd rules: - alert: etcdInsufficientMembers annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).' 
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) for: 3m labels: severity: critical ''' def deal_configmap(rule_configmap): """ parse rule configmap from k8s """ all_record_rules = [] all_alert_rules = [] for _, rule_group_str in rule_configmap['data'].items(): rule_group = yaml.load(rule_group_str, Loader=yaml.FullLoader) alert_rules, record_rules = deal_group(rule_group) all_alert_rules.extend(alert_rules) all_record_rules.extend(record_rules) return all_alert_rules, all_record_rules def main(): with open(rule_file, 'r') as f: rule_config = yaml.load(f, Loader=yaml.FullLoader) # 如果文件是k8s中的configmap,使用下面的方法 # alert_rules, record_rules = deal_configmap(rule_config) alert_rules, record_rules = deal_group(rule_config) with open("alert-rules.json", 'w') as fw: json.dump(alert_rules, fw, indent=2, ensure_ascii=False) with open("record-rules.json", 'w') as fw: json.dump(record_rules, fw, indent=2, ensure_ascii=False) if __name__ == '__main__': main() ================================================ FILE: docker/compose-host-network/etc-prometheus/prometheus.yml ================================================ # my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # Alertmanager configuration alerting: alertmanagers: - static_configs: - targets: # - alertmanager:9093 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: # - "first_rules.yml" # - "second_rules.yml" scrape_configs: # The job name is added as a label `job=` to any timeseries scraped from this config. 
- job_name: 'prometheus' static_configs: - targets: ['localhost:9090'] - job_name: 'nightingale' static_configs: - targets: ['localhost:17000'] ================================================ FILE: docker/compose-host-network-metric-log/docker-compose.yaml ================================================ version: "3.7" services: mysql: image: "mysql:8" container_name: mysql hostname: mysql restart: always environment: TZ: Asia/Shanghai MYSQL_ROOT_PASSWORD: 1234 volumes: - ./mysqldata:/var/lib/mysql/ - ../initsql:/docker-entrypoint-initdb.d/ - ./etc-mysql/my.cnf:/etc/my.cnf network_mode: host redis: image: "redis:6.2" container_name: redis hostname: redis restart: always environment: TZ: Asia/Shanghai network_mode: host prometheus: image: prom/prometheus container_name: prometheus hostname: prometheus restart: always environment: TZ: Asia/Shanghai volumes: - ./etc-prometheus:/etc/prometheus network_mode: host command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - "--web.console.libraries=/usr/share/prometheus/console_libraries" - "--web.console.templates=/usr/share/prometheus/consoles" - "--enable-feature=remote-write-receiver" - "--query.lookback-delta=2m" n9e: image: flashcatcloud/nightingale:latest container_name: n9e hostname: n9e restart: always environment: GIN_MODE: release TZ: Asia/Shanghai WAIT_HOSTS: 127.0.0.1:3306, 127.0.0.1:6379 volumes: - ./etc-nightingale:/app/etc - ./n9e-logs:/app/logs network_mode: host depends_on: - mysql - redis - prometheus command: - /app/n9e categraf: image: "flashcatcloud/categraf:latest" container_name: "categraf" hostname: "categraf01" restart: always environment: TZ: Asia/Shanghai HOST_PROC: /hostfs/proc HOST_SYS: /hostfs/sys HOST_MOUNT_PREFIX: /hostfs WAIT_HOSTS: 127.0.0.1:17000, 127.0.0.1:20090, 127.0.0.1:9092 volumes: - ./etc-categraf:/etc/categraf/conf - ./n9e-logs:/logs - /:/hostfs network_mode: host depends_on: - n9e - kafka zookeeper: image: bitnami/zookeeper:3.9 
container_name: "zookeeper" restart: always environment: - TZ=Asia/Shanghai - ALLOW_ANONYMOUS_LOGIN=yes network_mode: host depends_on: - n9e kafka: image: bitnami/kafka:3.4 container_name: "kafka" restart: always environment: TZ: Asia/Shanghai KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://127.0.0.1:9092 KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 KAFKA_ZOOKEEPER_CONNECT: 127.0.0.1:2181 KAFKA_CFG_MESSAGE_MAX_BYTES: 2000000 network_mode: host depends_on: - zookeeper elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:7.10.1 container_name: "elasticsearch" restart: always environment: - TZ=Asia/Shanghai - discovery.type=single-node network_mode: host depends_on: - kafka logstash: image: docker.elastic.co/logstash/logstash:8.11.3 container_name: "logstash" restart: always environment: - TZ=Asia/Shanghai - LS_JAVA_OPTS=-Xmx256m -Xms256m volumes: - ./etc-logstash/logstash.yaml:/etc/logstash/conf.d/logstash.yaml entrypoint: - logstash - -f - /etc/logstash/conf.d/logstash.yaml network_mode: host depends_on: - elasticsearch - kafka logging: driver: "json-file" options: max-size: "200m" max-file: "3" ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/config.toml ================================================ [global] # whether print configs print_configs = false # add label(agent_hostname) to series # "" -> auto detect hostname # "xx" -> use specified string xx # "$hostname" -> auto detect hostname # "$ip" -> auto detect ip # "$hostname-$ip" -> auto detect hostname and ip to replace the vars hostname = "$HOSTNAME" # will not add label(agent_hostname) if true omit_hostname = false # s | ms precision = "ms" # global collect interval interval = 15 [global.labels] source="categraf" # region = "shanghai" # env = "localhost" [writer_opt] # default: 2000 batch = 2000 # channel(as queue) size chan_size = 10000 [[writers]] url = "http://127.0.0.1:17000/prometheus/v1/write" # Basic auth username basic_auth_user = "" # 
Basic auth password basic_auth_pass = "" # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [http] enable = false address = ":9100" print_access = false run_mode = "release" [heartbeat] enable = true # report os version cpu.util mem.util metadata url = "http://127.0.0.1:17000/v1/n9e/heartbeat" # interval, unit: s interval = 10 # Basic auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" ## Optional headers # headers = ["X-From", "categraf", "X-Xyz", "abc"] # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [ibex] enable = true ## ibex flush interval interval = "1000ms" ## n9e ibex server rpc address servers = ["127.0.0.1:20090"] ## temp script dir meta_dir = "./meta" ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.cpu/cpu.toml ================================================ # # collect interval # interval = 15 # # whether collect per cpu # collect_per_cpu = false ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.disk/disk.toml ================================================ # # collect interval # interval = 15 # # By default stats will be gathered for all mount points. # # Set mount_points will restrict the stats to only the specified mount points. # mount_points = ["/"] # Ignore mount points by filesystem type. ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] ignore_mount_points = ["/boot"] ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.diskio/diskio.toml ================================================ # # collect interval # interval = 15 # # By default, categraf will gather stats for all devices including disk partitions. # # Setting devices will restrict the stats to the specified devices. 
# devices = ["sda", "sdb", "vd*"] ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.kernel/kernel.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.mem/mem.toml ================================================ # # collect interval # interval = 15 # # whether collect platform specified metrics collect_platform_fields = true ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.net/net.toml ================================================ # # collect interval # interval = 15 # # whether collect protocol stats on Linux # collect_protocol_stats = false # # setting interfaces will tell categraf to gather these explicit interfaces # interfaces = ["eth0"] ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.netstat/netstat.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.processes/processes.toml ================================================ # # collect interval # interval = 15 # # force use ps command to gather # force_ps = false # # force use /proc to gather # force_proc = false ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/input.system/system.toml ================================================ # # collect interval # interval = 15 # # whether collect metric: system_n_users # collect_user_number = false ================================================ FILE: docker/compose-host-network-metric-log/etc-categraf/logs.toml ================================================ [logs] ## just a placeholder api_key = 
"ef4ahfbwzwwtlwfpbertgq1i6mq0ab1q" ## enable log collect or not enable = true ## the server receive logs, http/tcp/kafka, only kafka brokers can be multiple ip:ports with concatenation character "," send_to = "127.0.0.1:9092" ## send logs with protocol: http/tcp/kafka send_type = "kafka" topic = "flashcatcloud" ## send logs with compression or not use_compress = false ## use ssl or not send_with_tls = false ## send logs in batches batch_wait = 5 ## save offset in this path run_path = "/opt/categraf/run" ## max files can be open open_files_limit = 100 ## scan config file in 10 seconds scan_period = 10 ## read buffer of udp frame_size = 9000 ## channel size, default 100 ## 读取日志缓冲区,行数 chan_size = 1000 ## pipeline num , default 4 ## 有多少线程处理日志 pipeline=4 ## configuration for kafka ## 指定kafka版本 kafka_version="2.8.1" # 默认0 表示串行,如果对日志顺序有要求,保持默认配置 batch_max_concurrence = 0 # 最大并发批次, 默认100 batch_max_size=100 # 每次最大发送的内容上限 默认1000000 batch_max_contentsize=1000000 # client timeout in seconds producer_timeout= 10 # 是否开启sasl模式 sasl_enable = false sasl_user = "admin" sasl_password = "admin" # PLAIN sasl_mechanism= "PLAIN" # v1 sasl_version=1 # set true sasl_handshake = true # optional # sasl_auth_identity="" # ## # v0.3.39以上版本新增,是否开启pod日志采集 enable_collect_container=false # 是否采集所有pod的stdout stderr collect_container_all = false ## glog processing rules # [[logs.Processing_rules]] ## single log configure [[logs.items]] ## file/journald/tcp/udp type = "file" ## type=file, path is required; type=journald/tcp/udp, port is required path = "/logs/*" source = "n9e" service = "n9e_service" ================================================ FILE: docker/compose-host-network-metric-log/etc-logstash/logstash.yaml ================================================ input { kafka { bootstrap_servers => "127.0.0.1:9092" topics => ["flashcatcloud"] codec => json type => n9e } } filter { grok { match => {"message" => "%{LOGLEVEL:status}"} overwrite => ["status"] } } output { elasticsearch { hosts => 
["127.0.0.1:9200"] index => "n9e-%{+YYYY.MM.DD}" } } ================================================ FILE: docker/compose-host-network-metric-log/etc-mysql/my.cnf ================================================ [mysqld] pid-file = /var/run/mysqld/mysqld.pid socket = /var/run/mysqld/mysqld.sock datadir = /var/lib/mysql bind-address = 127.0.0.1 ================================================ FILE: docker/compose-host-network-metric-log/etc-nightingale/config.toml ================================================ [Global] RunMode = "release" [Log] # log write dir Dir = "logs" # log level: DEBUG INFO WARNING ERROR Level = "INFO" # stdout, stderr, file Output = "file" # # rotate by time KeepHours = 4 # # rotate by size # RotateNum = 3 # # unit: MB # RotateSize = 256 [HTTP] # http listening address Host = "0.0.0.0" # http listening port Port = 17000 # https cert file path CertFile = "" # https key file path KeyFile = "" # whether print access log PrintAccessLog = false # whether enable pprof PProf = false # expose prometheus /metrics? 
ExposeMetrics = true # http graceful shutdown timeout, unit: s ShutdownTimeout = 30 # max content length: 64M MaxContentLength = 67108864 # http server read timeout, unit: s ReadTimeout = 20 # http server write timeout, unit: s WriteTimeout = 40 # http server idle timeout, unit: s IdleTimeout = 120 [HTTP.ShowCaptcha] Enable = false [HTTP.APIForAgent] Enable = true # [HTTP.APIForAgent.BasicAuth] # user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.APIForService] Enable = false [HTTP.APIForService.BasicAuth] user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.JWTAuth] # unit: min AccessExpired = 1500 # unit: min RefreshExpired = 10080 RedisKeyPrefix = "/jwt/" [HTTP.ProxyAuth] # if proxy auth enabled, jwt auth is disabled Enable = false # username key in http proxy header HeaderUserNameKey = "X-User-Name" DefaultRoles = ["Standard"] [HTTP.RSA] # open RSA OpenRSA = false [DB] # postgres: host=%s port=%s user=%s dbname=%s password=%s sslmode=%s # postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable" DSN="root:1234@tcp(127.0.0.1:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true" # enable debug mode or not Debug = false # mysql postgres DBType = "mysql" # unit: s MaxLifetime = 7200 # max open connections MaxOpenConns = 150 # max idle connections MaxIdleConns = 50 [Redis] # address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs) Address = "127.0.0.1:6379" # Username = "" # Password = "" # DB = 0 # UseTLS = false # TLSMinVersion = "1.2" # standalone cluster sentinel RedisType = "standalone" # Mastername for sentinel type # MasterName = "mymaster" # SentinelUsername = "" # SentinelPassword = "" [Alert] [Alert.Heartbeat] # auto detect if blank IP = "" # unit ms Interval = 1000 EngineName = "default" # [Alert.Alerting] # NotifyConcurrency = 10 [Center] MetricsYamlFile = "./etc/metrics.yaml" I18NHeaderKey = "X-Language" [Center.AnonymousAccess] PromQuerier = true AlertDetail = true 
[Pushgw] # use target labels in database instead of in series LabelRewrite = true ForceUseServerTS = true # [Pushgw.DebugSample] # ident = "xx" # __name__ = "xx" # [Pushgw.WriterOpt] # QueueMaxSize = 1000000 # QueuePopSize = 1000 [[Pushgw.Writers]] # Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write" Url = "http://127.0.0.1:9090/api/v1/write" # Basic auth username BasicAuthUser = "" # Basic auth password BasicAuthPass = "" # timeout settings, unit: ms Headers = ["X-From", "n9e"] Timeout = 10000 DialTimeout = 3000 TLSHandshakeTimeout = 30000 ExpectContinueTimeout = 1000 IdleConnTimeout = 90000 # time duration, unit: ms KeepAlive = 30000 MaxConnsPerHost = 0 MaxIdleConns = 100 MaxIdleConnsPerHost = 100 ## Optional TLS Config # UseTLS = false # TLSCA = "/etc/n9e/ca.pem" # TLSCert = "/etc/n9e/cert.pem" # TLSKey = "/etc/n9e/key.pem" # InsecureSkipVerify = false # [[Writers.WriteRelabels]] # Action = "replace" # SourceLabels = ["__address__"] # Regex = "([^:]+)(?::\\d+)?" # Replacement = "$1:80" # TargetLabel = "__address__" [Ibex] Enable = true RPCListen = "0.0.0.0:20090" ================================================ FILE: docker/compose-host-network-metric-log/etc-nightingale/metrics.yaml ================================================ zh: ip_conntrack_count: 连接跟踪表条目总数(单位:int, count) ip_conntrack_max: 连接跟踪表最大容量(单位:int, size) cpu_usage_idle: CPU空闲率(单位:%) cpu_usage_active: CPU使用率(单位:%) cpu_usage_system: CPU内核态时间占比(单位:%) cpu_usage_user: CPU用户态时间占比(单位:%) cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%) cpu_usage_iowait: CPU等待I/O的时间占比(单位:%) cpu_usage_irq: CPU处理硬中断的时间占比(单位:%) cpu_usage_softirq: CPU处理软中断的时间占比(单位:%) cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%) cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%) cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%) disk_free: 硬盘分区剩余量(单位:byte) disk_used: 硬盘分区使用量(单位:byte) disk_used_percent: 硬盘分区使用率(单位:%) disk_total: 硬盘分区总量(单位:byte) disk_inodes_free: 
硬盘分区inode剩余量 disk_inodes_used: 硬盘分区inode使用量 disk_inodes_total: 硬盘分区inode总量 diskio_io_time: 从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型 diskio_merged_reads: 相邻读请求merge读的次数,counter类型 diskio_merged_writes: 相邻写请求merge写的次数,counter类型 diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值 diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒) diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值 kernel_boot_time: 内核启动时间 kernel_context_switches: 内核上下文切换次数 kernel_entropy_avail: linux系统内部的熵池 kernel_interrupts: 内核中断次数 kernel_processes_forked: fork的进程数 mem_active: 活跃使用的内存总数(包括cache和buffer内存) mem_available: 可用内存大小(bytes) mem_available_percent: 内存剩余百分比(0~100) mem_buffered: 用来给文件做缓冲大小 mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache ) mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用 mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和 mem_dirty: 等待被写回到磁盘的内存大小 mem_free: 空闲内存大小(bytes) mem_high_free: 未被使用的高位内存大小 mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存) mem_huge_page_size: 每个大页的大小 mem_huge_pages_free: 池中尚未分配的 HugePages 数量 mem_huge_pages_total: 预留HugePages的总个数 mem_inactive: 空闲的内存数(包括free和available的内存) mem_low_free: 未被使用的低位大小 mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构 mem_mapped: 设备和文件等映射的大小 mem_page_tables: 管理内存分页页面的索引表的大小 mem_shared: 多个进程共享的内存总额 mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗 mem_sreclaimable: 可收回Slab的大小 mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab) mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口 mem_swap_free: 未被使用交换空间的大小 
mem_swap_total: 交换空间的总大小 mem_total: 内存总数 mem_used: 已用内存数 mem_used_percent: 已用内存数百分比(0~100) mem_vmalloc_chunk: 最大的连续未被使用的vmalloc区域 mem_vmalloc_totalL: 可以vmalloc虚拟内存大小 mem_vmalloc_used: vmalloc已使用的虚拟内存大小 mem_write_back: 正在被写回到磁盘的内存大小 mem_write_back_tmp: FUSE用于临时写回缓冲区的内存 net_bytes_recv: 网卡收包总数(bytes),计算每秒速率时需要用到rate/irate函数 net_bytes_sent: 网卡发包总数(bytes),计算每秒速率时需要用到rate/irate函数 net_drop_in: 网卡收丢包数量 net_drop_out: 网卡发丢包数量 net_err_in: 网卡收包错误数量 net_err_out: 网卡发包错误数量 net_packets_recv: 网卡收包数量 net_packets_sent: 网卡发包数量 net_bits_recv: 网卡收包总数(bits),计算每秒速率时需要用到rate/irate函数 net_bits_sent: 网卡发包总数(bits),计算每秒速率时需要用到rate/irate函数 netstat_tcp_established: ESTABLISHED状态的网络链接数 netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数 netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数 netstat_tcp_last_ack: LAST_ACK状态的网络链接数 netstat_tcp_listen: LISTEN状态的网络链接数 netstat_tcp_syn_recv: SYN_RECV状态的网络链接数 netstat_tcp_syn_sent: SYN_SENT状态的网络链接数 netstat_tcp_time_wait: TIME_WAIT状态的网络链接数 netstat_udp_socket: UDP状态的网络链接数 netstat_sockets_used: 已使用的所有协议套接字总量 netstat_tcp_inuse: 正在使用(正在侦听)的TCP套接字数量 netstat_tcp_orphan: 无主(不属于任何进程)的TCP连接数(无用、待销毁的TCP socket数) netstat_tcp_tw: TIME_WAIT状态的TCP连接数 netstat_tcp_alloc: 已分配(已建立、已申请到sk_buff)的TCP套接字数量 netstat_tcp_mem: TCP套接字内存Page使用量 netstat_udp_inuse: 在使用的UDP套接字数量 netstat_udp_mem: UDP套接字内存Page使用量 netstat_udplite_inuse: 正在使用的 udp lite 数量 netstat_raw_inuse: 正在使用的 raw socket 数量 netstat_frag_inuse: ip fragment 数量 netstat_frag_memory: ip fragment 已经分配的内存(byte) #[ping] ping_percent_packet_loss: ping数据包丢失百分比(%) ping_result_code: ping返回码('0','1') net_response_result_code: 网络探测结果,0表示正常,非0表示异常 net_response_response_time: 网络探测时延,单位:秒 processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L') processes_dead: 回收中的进程数('X') processes_idle: 挂起的空闲进程数('I') processes_paging: 分页进程数('P') processes_running: 运行中的进程数('R') processes_sleeping: 可中断进程数('S') processes_stopped: 暂停状态进程数('T') processes_total: 总进程数 processes_total_threads: 总线程数 processes_unknown: 未知状态进程数 processes_zombies: 僵尸态进程数('Z') swap_used_percent: Swap空间换出数据量 
system_load1: 1分钟平均load值 system_load5: 5分钟平均load值 system_load15: 15分钟平均load值 system_load_norm_1: 1分钟平均load值/逻辑CPU个数 system_load_norm_5: 5分钟平均load值/逻辑CPU个数 system_load_norm_15: 15分钟平均load值/逻辑CPU个数 system_n_users: 用户数 system_n_cpus: CPU核数 system_uptime: 系统启动时间 nginx_accepts: 自nginx启动起,与客户端建立过得连接总数 nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和 nginx_handled: 自nginx启动起,处理过的客户端连接总数 nginx_reading: 正在读取HTTP请求头部的连接总数 nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值 nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数 nginx_upstream_check_rise: upstream_check模块对后端的检测次数 nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0 nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接 nginx_writing: 正在向客户端发送响应的连接总数 http_response_content_length: HTTP消息实体的传输长度 http_response_http_response_code: http响应状态码 http_response_response_time: http响应用时 http_response_result_code: url探测结果0为正常否则url无法访问 # [aws cloudwatch rds] cloudwatch_aws_rds_bin_log_disk_usage_average: rds 磁盘使用平均值 cloudwatch_aws_rds_bin_log_disk_usage_maximum: rds 磁盘使用量最大值 cloudwatch_aws_rds_bin_log_disk_usage_minimum: rds binlog 磁盘使用量最低 cloudwatch_aws_rds_bin_log_disk_usage_sample_count: rds binlog 磁盘使用情况样本计数 cloudwatch_aws_rds_bin_log_disk_usage_sum: rds binlog 磁盘使用总和 cloudwatch_aws_rds_burst_balance_average: rds 突发余额平均值 cloudwatch_aws_rds_burst_balance_maximum: rds 突发余额最大值 cloudwatch_aws_rds_burst_balance_minimum: rds 突发余额最低 cloudwatch_aws_rds_burst_balance_sample_count: rds 突发平衡样本计数 cloudwatch_aws_rds_burst_balance_sum: rds 突发余额总和 cloudwatch_aws_rds_cpu_utilization_average: rds cpu 利用率平均值 cloudwatch_aws_rds_cpu_utilization_maximum: rds cpu 利用率最大值 cloudwatch_aws_rds_cpu_utilization_minimum: rds cpu 利用率最低 cloudwatch_aws_rds_cpu_utilization_sample_count: rds cpu 利用率样本计数 cloudwatch_aws_rds_cpu_utilization_sum: rds cpu 利用率总和 cloudwatch_aws_rds_database_connections_average: rds 数据库连接平均值 cloudwatch_aws_rds_database_connections_maximum: rds 
数据库连接数最大值 cloudwatch_aws_rds_database_connections_minimum: rds 数据库连接最小 cloudwatch_aws_rds_database_connections_sample_count: rds 数据库连接样本数 cloudwatch_aws_rds_database_connections_sum: rds 数据库连接总和 cloudwatch_aws_rds_db_load_average: rds db 平均负载 cloudwatch_aws_rds_db_load_cpu_average: rds db 负载 cpu 平均值 cloudwatch_aws_rds_db_load_cpu_maximum: rds db 负载 cpu 最大值 cloudwatch_aws_rds_db_load_cpu_minimum: rds db 负载 cpu 最小值 cloudwatch_aws_rds_db_load_cpu_sample_count: rds db 加载 CPU 样本数 cloudwatch_aws_rds_db_load_cpu_sum: rds db 加载cpu总和 cloudwatch_aws_rds_db_load_maximum: rds 数据库负载最大值 cloudwatch_aws_rds_db_load_minimum: rds 数据库负载最小值 cloudwatch_aws_rds_db_load_non_cpu_average: rds 加载非 CPU 平均值 cloudwatch_aws_rds_db_load_non_cpu_maximum: rds 加载非 cpu 最大值 cloudwatch_aws_rds_db_load_non_cpu_minimum: rds 加载非 cpu 最小值 cloudwatch_aws_rds_db_load_non_cpu_sample_count: rds 加载非 cpu 样本计数 cloudwatch_aws_rds_db_load_non_cpu_sum: rds 加载非cpu总和 cloudwatch_aws_rds_db_load_sample_count: rds db 加载样本计数 cloudwatch_aws_rds_db_load_sum: rds db 负载总和 cloudwatch_aws_rds_disk_queue_depth_average: rds 磁盘队列深度平均值 cloudwatch_aws_rds_disk_queue_depth_maximum: rds 磁盘队列深度最大值 cloudwatch_aws_rds_disk_queue_depth_minimum: rds 磁盘队列深度最小值 cloudwatch_aws_rds_disk_queue_depth_sample_count: rds 磁盘队列深度样本计数 cloudwatch_aws_rds_disk_queue_depth_sum: rds 磁盘队列深度总和 cloudwatch_aws_rds_ebs_byte_balance__average: rds ebs 字节余额平均值 cloudwatch_aws_rds_ebs_byte_balance__maximum: rds ebs 字节余额最大值 cloudwatch_aws_rds_ebs_byte_balance__minimum: rds ebs 字节余额最低 cloudwatch_aws_rds_ebs_byte_balance__sample_count: rds ebs 字节余额样本数 cloudwatch_aws_rds_ebs_byte_balance__sum: rds ebs 字节余额总和 cloudwatch_aws_rds_ebsio_balance__average: rds ebsio 余额平均值 cloudwatch_aws_rds_ebsio_balance__maximum: rds ebsio 余额最大值 cloudwatch_aws_rds_ebsio_balance__minimum: rds ebsio 余额最低 cloudwatch_aws_rds_ebsio_balance__sample_count: rds ebsio 平衡样本计数 cloudwatch_aws_rds_ebsio_balance__sum: rds ebsio 余额总和 cloudwatch_aws_rds_free_storage_space_average: rds 免费存储空间平均 
cloudwatch_aws_rds_free_storage_space_maximum: rds 最大可用存储空间 cloudwatch_aws_rds_free_storage_space_minimum: rds 最低可用存储空间 cloudwatch_aws_rds_free_storage_space_sample_count: rds 可用存储空间样本数 cloudwatch_aws_rds_free_storage_space_sum: rds 免费存储空间总和 cloudwatch_aws_rds_freeable_memory_average: rds 可用内存平均值 cloudwatch_aws_rds_freeable_memory_maximum: rds 最大可用内存 cloudwatch_aws_rds_freeable_memory_minimum: rds 最小可用内存 cloudwatch_aws_rds_freeable_memory_sample_count: rds 可释放内存样本数 cloudwatch_aws_rds_freeable_memory_sum: rds 可释放内存总和 cloudwatch_aws_rds_lvm_read_iops_average: rds lvm 读取 iops 平均值 cloudwatch_aws_rds_lvm_read_iops_maximum: rds lvm 读取 iops 最大值 cloudwatch_aws_rds_lvm_read_iops_minimum: rds lvm 读取 iops 最低 cloudwatch_aws_rds_lvm_read_iops_sample_count: rds lvm 读取 iops 样本计数 cloudwatch_aws_rds_lvm_read_iops_sum: rds lvm 读取 iops 总和 cloudwatch_aws_rds_lvm_write_iops_average: rds lvm 写入 iops 平均值 cloudwatch_aws_rds_lvm_write_iops_maximum: rds lvm 写入 iops 最大值 cloudwatch_aws_rds_lvm_write_iops_minimum: rds lvm 写入 iops 最低 cloudwatch_aws_rds_lvm_write_iops_sample_count: rds lvm 写入 iops 样本计数 cloudwatch_aws_rds_lvm_write_iops_sum: rds lvm 写入 iops 总和 cloudwatch_aws_rds_network_receive_throughput_average: rds 网络接收吞吐量平均 cloudwatch_aws_rds_network_receive_throughput_maximum: rds 网络接收吞吐量最大值 cloudwatch_aws_rds_network_receive_throughput_minimum: rds 网络接收吞吐量最小值 cloudwatch_aws_rds_network_receive_throughput_sample_count: rds 网络接收吞吐量样本计数 cloudwatch_aws_rds_network_receive_throughput_sum: rds 网络接收吞吐量总和 cloudwatch_aws_rds_network_transmit_throughput_average: rds 网络传输吞吐量平均值 cloudwatch_aws_rds_network_transmit_throughput_maximum: rds 网络传输吞吐量最大 cloudwatch_aws_rds_network_transmit_throughput_minimum: rds 网络传输吞吐量最小值 cloudwatch_aws_rds_network_transmit_throughput_sample_count: rds 网络传输吞吐量样本计数 cloudwatch_aws_rds_network_transmit_throughput_sum: rds 网络传输吞吐量总和 cloudwatch_aws_rds_read_iops_average: rds 读取 iops 平均值 cloudwatch_aws_rds_read_iops_maximum: rds 最大读取 iops cloudwatch_aws_rds_read_iops_minimum: rds 
读取 iops 最低 cloudwatch_aws_rds_read_iops_sample_count: rds 读取 iops 样本计数 cloudwatch_aws_rds_read_iops_sum: rds 读取 iops 总和 cloudwatch_aws_rds_read_latency_average: rds 读取延迟平均值 cloudwatch_aws_rds_read_latency_maximum: rds 读取延迟最大值 cloudwatch_aws_rds_read_latency_minimum: rds 最小读取延迟 cloudwatch_aws_rds_read_latency_sample_count: rds 读取延迟样本计数 cloudwatch_aws_rds_read_latency_sum: rds 读取延迟总和 cloudwatch_aws_rds_read_throughput_average: rds 读取吞吐量平均值 cloudwatch_aws_rds_read_throughput_maximum: rds 最大读取吞吐量 cloudwatch_aws_rds_read_throughput_minimum: rds 最小读取吞吐量 cloudwatch_aws_rds_read_throughput_sample_count: rds 读取吞吐量样本计数 cloudwatch_aws_rds_read_throughput_sum: rds 读取吞吐量总和 cloudwatch_aws_rds_swap_usage_average: rds 交换使用平均值 cloudwatch_aws_rds_swap_usage_maximum: rds 交换使用最大值 cloudwatch_aws_rds_swap_usage_minimum: rds 交换使用量最低 cloudwatch_aws_rds_swap_usage_sample_count: rds 交换使用示例计数 cloudwatch_aws_rds_swap_usage_sum: rds 交换使用总和 cloudwatch_aws_rds_write_iops_average: rds 写入 iops 平均值 cloudwatch_aws_rds_write_iops_maximum: rds 写入 iops 最大值 cloudwatch_aws_rds_write_iops_minimum: rds 写入 iops 最低 cloudwatch_aws_rds_write_iops_sample_count: rds 写入 iops 样本计数 cloudwatch_aws_rds_write_iops_sum: rds 写入 iops 总和 cloudwatch_aws_rds_write_latency_average: rds 写入延迟平均值 cloudwatch_aws_rds_write_latency_maximum: rds 最大写入延迟 cloudwatch_aws_rds_write_latency_minimum: rds 写入延迟最小值 cloudwatch_aws_rds_write_latency_sample_count: rds 写入延迟样本计数 cloudwatch_aws_rds_write_latency_sum: rds 写入延迟总和 cloudwatch_aws_rds_write_throughput_average: rds 写入吞吐量平均值 cloudwatch_aws_rds_write_throughput_maximum: rds 最大写入吞吐量 cloudwatch_aws_rds_write_throughput_minimum: rds 写入吞吐量最小值 cloudwatch_aws_rds_write_throughput_sample_count: rds 写入吞吐量样本计数 cloudwatch_aws_rds_write_throughput_sum: rds 写入吞吐量总和 en: ip_conntrack_count: the number of entries in the conntrack table(unit:int, count) ip_conntrack_max: the max capacity of the conntrack table(unit:int, size) cpu_usage_idle: "CPU idle rate(unit:%)" cpu_usage_active: "CPU usage 
rate(unit:%)" cpu_usage_system: "CPU kernel state time proportion(unit:%)" cpu_usage_user: "CPU user attitude time proportion(unit:%)" cpu_usage_nice: "The proportion of low priority CPU time, that is, the process NICE value is adjusted to the CPU time between 1-19. Note here that the value range of NICE is -20 to 19, the larger the value, the lower the priority, the lower the priority(unit:%)" cpu_usage_iowait: "CPU waiting for I/O time proportion(unit:%)" cpu_usage_irq: "CPU processing hard interrupt time proportion(unit:%)" cpu_usage_softirq: "CPU processing soft interrupt time proportion(unit:%)" cpu_usage_steal: "In the virtual machine environment, there is this indicator, which means that the CPU is used by other virtual machines for the proportion of time.(unit:%)" cpu_usage_guest: "The time to run other operating systems by virtualization, that is, the proportion of CPU time running the virtual machine(unit:%)" cpu_usage_guest_nice: "The proportion of time to run the virtual machine at low priority(unit:%)" disk_free: "The remaining amount of the hard disk partition (unit: byte)" disk_used: "Hard disk partitional use (unit: byte)" disk_used_percent: "Hard disk partitional use rate (unit:%)" disk_total: "Total amount of hard disk partition (unit: byte)" disk_inodes_free: "Hard disk partition INODE remaining amount" disk_inodes_used: "Hard disk partition INODE usage amount" disk_inodes_total: "The total amount of hard disk partition INODE" diskio_io_time: "From the perspective of the device perspective, the total time of I/O request, the I/O request in the queue is count (unit: millisecond), the counter type, you need to use the function to find the value" diskio_iops_in_progress: "IO requests that have been assigned to device -driven and have not yet been completed, not included in the queue but not yet assigned to the device -driven IO request, Gauge type" diskio_merged_reads: "The number of times of adjacent reading request Merge, the counter type" 
diskio_merged_writes: "The number of times the request Merge writes, the counter type" diskio_read_bytes: "The number of byte reads, the counter type, you need to use the function to find the Rate to use the value" diskio_read_time: "The total time of reading request (unit: millisecond), the counter type, you need to use the function to find the Rate to have the value of use" diskio_reads: "Read the number of requests, the counter type, you need to use the function to find the Rate to use the value" diskio_weighted_io_time: "From the perspective of the I/O request perspective, I/O wait for the total time. If there are multiple I/O requests at the same time, the time will be superimposed (unit: millisecond)" diskio_write_bytes: "The number of bytes written, the counter type, you need to use the function to find the Rate to use the value" diskio_write_time: "The total time of the request (unit: millisecond), the counter type, you need to use the function to find the rate to have the value of use" diskio_writes: "Write the number of requests, the counter type, you need to use the function to find the rate to use value" kernel_boot_time: "Kernel startup time" kernel_context_switches: "Number of kernel context switching times" kernel_entropy_avail: "Entropy pool inside the Linux system" kernel_interrupts: "Number of kernel interruption" kernel_processes_forked: "ForK's process number" mem_active: "The total number of memory (including Cache and BUFFER memory)" mem_available: "Application can use memory numbers" mem_available_percent: "Memory remaining percentage (0 ~ 100)" mem_buffered: "Used to make buffer size for the file" mem_cached: "The size of the memory used by the cache memory (equal to diskcache minus Swap Cache )" mem_commit_limit: "According to the over allocation ratio ('vm.overCommit _ Ratio'), this is the current total memory that can be allocated on the system." mem_committed_as: "Currently allocated on the system. 
It is the sum of the memory of all process applications" mem_dirty: "Waiting to be written back to the memory size of the disk" mem_free: "Senior memory number" mem_high_free: "Unused high memory size" mem_high_total: "The total memory size of the high memory (Highmem refers to all the physical memory that is higher than 860 MB of memory, the HighMem area is used for user programs, or for page cache. This area is not directly mapped to the kernel space. The kernels must use different methods to use this section of memory. )" mem_huge_page_size: "The size of each big page" mem_huge_pages_free: "The number of Huge Pages in the pool that have not been allocated" mem_huge_pages_total: "Reserve the total number of Huge Pages" mem_inactive: "Free memory (including the memory of free and available)" mem_low_free: "Unused low size" mem_low_total: "The total size of the low memory memory can achieve the same role of high memory, and it can be used by the kernel to record some of its own data structure" mem_mapped: "The size of the mapping of equipment and files" mem_page_tables: "The size of the index table of the management of the memory paging page" mem_shared: "The total memory shared by multiple processes" mem_slab: "The size of the kernel data structure cache can reduce the consumption of application and release memory" mem_sreclaimable: "The size of the SLAB can be recovered" mem_sunreclaim: "The size of the SLAB cannot be recovered(SUnreclaim+SReclaimable=Slab)" mem_swap_cached: "The size of the swap space used by the cache memory (cache memory), the memory that has been swapped out, but is still stored in the swapfile. 
Used to be quickly replaced when needed without opening the I/O port again" mem_swap_free: "The size of the switching space is not used" mem_swap_total: "The total size of the exchange space" mem_total: "Total memory" mem_used: "Memory number" mem_used_percent: "The memory has been used by several percentage (0 ~ 100)" mem_vmalloc_chunk: "The largest continuous unused vmalloc area" mem_vmalloc_totalL: "You can vmalloc virtual memory size" mem_vmalloc_used: "Vmalloc's virtual memory size" mem_write_back: "The memory size of the disk is being written back to the disk" mem_write_back_tmp: "Fuse is used to temporarily write back the memory of the buffer area" net_bytes_recv: "Total inbound traffic(bytes) of network card" net_bytes_sent: "Total outbound traffic(bytes) of network card" net_bits_recv: "Total inbound traffic(bits) of network card" net_bits_sent: "Total outbound traffic(bits) of network card" net_drop_in: "The number of packets for network cards" net_drop_out: "The number of packets issued by the network card" net_err_in: "The number of incorrect packets of the network card" net_err_out: "Number of incorrect number of network cards" net_packets_recv: "Net card collection quantity" net_packets_sent: "Number of network card issuance" netstat_tcp_established: "ESTABLISHED status network link number" netstat_tcp_fin_wait1: "FIN _ WAIT1 status network link number" netstat_tcp_fin_wait2: "FIN _ WAIT2 status number of network links" netstat_tcp_last_ack: "LAST_ ACK status number of network links" netstat_tcp_listen: "Number of network links in Listen status" netstat_tcp_syn_recv: "SYN _ RECV status number of network links" netstat_tcp_syn_sent: "SYN _ SENT status number of network links" netstat_tcp_time_wait: "Time _ WAIT status network link number" netstat_udp_socket: "Number of network links in UDP status" processes_blocked: "The number of processes in the unreproducible sleep state('U','D','L')" processes_dead: "Number of processes in recycling('X')" 
processes_idle: "Number of idle (suspended) processes('I')" processes_paging: "Number of paging processes('P')" processes_running: "Number of running processes('R')" processes_sleeping: "Number of interruptible (sleeping) processes('S')" processes_stopped: "Number of stopped processes('T')" processes_total: "Total number of processes" processes_total_threads: "Total number of threads" processes_unknown: "Number of processes in unknown state" processes_zombies: "Number of zombie processes('Z')" swap_used_percent: "SWAP space replace the data volume" system_load1: "1 minute average load value" system_load5: "5 minutes average load value" system_load15: "15 minutes average load value" system_load_norm_1: "1 minute average load value / number of logical CPUs" system_load_norm_5: "5 minutes average load value / number of logical CPUs" system_load_norm_15: "15 minutes average load value / number of logical CPUs" system_n_users: "Number of users" system_n_cpus: "Number of CPU cores" system_uptime: "System startup time" nginx_accepts: "Total number of connections established with clients since Nginx started" nginx_active: "Number of active connections Nginx is currently handling, equal to the sum of Reading/Writing/Waiting" nginx_handled: "Total number of client connections handled since Nginx started" nginx_reading: "Total number of connections currently reading the HTTP request header" nginx_requests: "Total number of client requests handled since Nginx started; because of HTTP Keep-Alive requests, this value will be greater than the handled value" nginx_upstream_check_fall: "Number of backend failures detected by the upstream_check module" nginx_upstream_check_rise: "Number of backend checks performed by the upstream_check module" nginx_upstream_check_status_code: "Status of the backend upstream, 1 for up and 0 for down" nginx_waiting: "When keep-alive is enabled, this value is equal to active – (reading+writing), which means that Nginx has processed the resident 
connection that is waiting for the next request command" nginx_writing: "Total number of connections currently sending a response to the client" http_response_content_length: "HTTP message entity transmission length" http_response_http_response_code: "http response status code" http_response_response_time: "HTTP response time" http_response_result_code: "URL detection result 0 is normal, otherwise the URL cannot be accessed" # [mysqld_exporter] mysql_global_status_uptime: The number of seconds that the server has been up.(Gauge) mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge) mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter) mysql_global_status_threads_connected: The number of currently open connections.(Gauge) mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter) mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge) mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge) mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter) mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge) mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. 
The cache miss rate can be calculated as Threads_created/Connections.(Counter) mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter) mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements. You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter) mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter) mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter) mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter) mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter) mysql_global_status_sort_rows: The number of sorted rows.(Counter) mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter) mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. 
If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter) mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter) mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. This counter increments regardless of whether the slow query log is enabled.(Counter) mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter) mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter) mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter) mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter) mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter) mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter) mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge) mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge) mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter) mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. 
This is in fact the layer between the Storage Engine and MySQL.(Counter) mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). Parts of the server that open files without using this function do not increment the count.(Counter) mysql_global_status_open_tables: The number of tables that are open.(Gauge) mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter) mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter) mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge) mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter) mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter) mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter) mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge) mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge) mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. 
The default value is 134217728 bytes (128MB).(Gauge) mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge) mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge) mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge) mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge) mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge) # [redis_exporter] redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize. redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation. redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory. redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes. redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio). redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting). redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active. redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS. redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any. redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation. redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation. redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds. redis_aof_last_write_status: Status of the last write operation to the AOF. redis_aof_rewrite_in_progress: Flag indicating a AOF rewrite operation is on-going. redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete. redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX). redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections. redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections. redis_cluster_enabled: Indicate Redis cluster is enabled. redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter) redis_commands_processed_total: Total number of commands processed by the server.(Counter) redis_commands_total: The number of calls that reached command execution (not rejected).(Counter) redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections. redis_config_maxmemory: The value of the maxmemory configuration directive. redis_connected_clients: Number of client connections (excluding connections from replicas). redis_connected_slaves: Number of connected replicas. 
redis_connections_received_total: Total number of connections accepted by the server.(Counter) redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter) redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter) redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_db_keys: Total number of keys by DB. redis_db_keys_expiring: Total number of expiring keys by DB redis_defrag_hits: Number of value reallocations performed by active the defragmentation process. redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process. redis_defrag_key_hits: Number of keys that were actively defragmented. redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process. redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter) redis_expired_keys_total: Total number of key expiration events.(Counter) redis_expired_stale_percentage: The percentage of keys probably expired. redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early. redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape. redis_exporter_last_scrape_duration_seconds: The last scrape duration. redis_exporter_last_scrape_error: The last scrape error status. 
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter redis_exporter_scrapes_total: Current total redis scrapes.(Counter) redis_instance_info: Information about the Redis instance. redis_keyspace_hits_total: Hits total.(Counter) redis_keyspace_misses_total: Misses total.(Counter) redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds. redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds. redis_latest_fork_seconds: The amount of time needed for last fork, in seconds. redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option). redis_master_repl_offset: The server's current replication offset. redis_mem_clients_normal: Memory used by normal clients.(Gauge) redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage. redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue. redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only includes fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc. redis_mem_not_counted_for_eviction_bytes: (Gauge) redis_memory_max_bytes: Max memory limit in bytes. 
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc) redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory) redis_memory_used_lua_bytes: Number of bytes used by the Lua engine. redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures. redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes) redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1) redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes redis_net_input_bytes_total: Total input bytes(Counter) redis_net_output_bytes_total: Total output bytes(Counter) redis_process_id: Process ID redis_pubsub_channels: Global number of pub/sub channels with client subscriptions redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going redis_rdb_changes_since_last_save: Number of changes since the last dump redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds redis_rdb_last_bgsave_status: Status of the last RDB save operation redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter) 
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer redis_repl_backlog_is_active: Flag indicating replication backlog is active redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge) redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge) redis_replica_resyncs_full: The number of full resyncs with replicas redis_replication_backlog_bytes: Memory used by replication backlog redis_second_repl_offset: The offset up to which replication IDs are accepted. redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge) redis_slowlog_last_id: Last id of slowlog redis_slowlog_length: Total slowlog redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds. redis_target_scrape_request_errors_total: Errors in requests to the exporter redis_up: Flag indicating redis instance is up redis_uptime_in_seconds: Number of seconds since Redis server start # [windows_exporter] windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter) windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge) windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter) windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter) windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter) windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter) windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge) windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the 
processor. On some processors, Processor Performance may exceed 100%(gauge) windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter) windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge) windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge) windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge) windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge) windows_exporter_collector_duration_seconds: Duration of a collection.(gauge) windows_exporter_collector_success: Whether the collector was successful.(gauge) windows_exporter_collector_timeout: Whether the collector timed out.(gauge) windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge) windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge) windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter) windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter) windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter) windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter) windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter) windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter) windows_logical_disk_requests_queued: The number of requests queued to the disk 
(LogicalDisk.CurrentDiskQueueLength)(gauge) windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge) windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter) windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter) windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter) windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter) windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter) windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter) windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter) windows_net_bytes_total: (Network.BytesTotalPerSec)(counter) windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge) windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter) windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter) windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter) windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter) windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter) windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter) windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter) windows_net_packets_total: (Network.PacketsPerSec)(counter) windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge) windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge) windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge) 
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge) windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge) windows_os_processes: OperatingSystem.NumberOfProcesses(gauge) windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge) windows_os_time: OperatingSystem.LocalDateTime(gauge) windows_os_timezone: OperatingSystem.LocalDateTime(gauge) windows_os_users: OperatingSystem.NumberOfUsers(gauge) windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge) windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge) windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge) windows_service_info: A metric with a constant '1' value labeled with service information(gauge) windows_service_start_mode: The start mode of the service (StartMode)(gauge) windows_service_state: The state of the service (State)(gauge) windows_service_status: The status of the service (Status)(gauge) windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter) windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter) windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge) windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter) windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge) windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge) # [node_exporter] # SYSTEM # CPU context switch 次数 node_context_switches_total: context_switches # Interrupts 次数 node_intr_total: Interrupts # 运行的进程数 node_procs_running: Processes in runnable state # 熵池大小 node_entropy_available_bits: Entropy available to random number 
generators node_time_seconds: System time in seconds since epoch (1970) node_boot_time_seconds: Node boot time, in unixtime # CPU node_cpu_seconds_total: Seconds the CPUs spent in each mode node_load1: cpu load 1m node_load5: cpu load 5m node_load15: cpu load 15m # MEM # 内核态 # 内核用于缓存数据结构供自己使用的内存 node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use # slab中可回收的部分 node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches # slab中不可回收的部分 node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure # Vmalloc内存区的大小 node_memory_VmallocTotal_bytes: Total size of vmalloc memory area # vmalloc已分配的内存,虚拟地址空间上的连续的内存 node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used # vmalloc区可用的连续最大快的大小,通过此指标可以知道vmalloc可分配连续内存的最大值 node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free # 内存的硬件故障删除掉的内存页的总大小 node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working # 用于在虚拟和物理内存地址之间映射的内存 node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge) # 内核栈内存,常驻内存,不可回收 node_memory_KernelStack_bytes: Kernel memory stack. 
This is not reclaimable # 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能 node_memory_Bounce_bytes: Memory used for block device bounce buffers #用户态 # 单个巨页大小 node_memory_Hugepagesize_bytes: Huge Page size # 系统分配的常驻巨页数 node_memory_HugePages_Total: Total size of the pool of huge pages # 系统空闲的巨页数 node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated # 进程已申请但未使用的巨页数 node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation # 超过系统设定的常驻HugePages数量的个数 node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages # 透明巨页 Transparent HugePages (THP) node_memory_AnonHugePages_bytes: Memory in anonymous huge pages # inactivelist中的File-backed内存 node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list # inactivelist中的Anonymous内存 node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem) # activelist中的File-backed内存 node_memory_Active_file_bytes: File-backed memory on active LRU list # activelist中的Anonymous内存 node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs # 禁止换出的页,对应 Unevictable 链表 node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons # 共享内存 node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks) # 匿名页内存大小 node_memory_AnonPages_bytes: Memory in user pages not backed by files # 被关联的内存页大小 node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries # file-backed内存页缓存大小 node_memory_Cached_bytes: Parked file data (file content) cache # 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化 node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified # 被mlock()系统调用锁定的内存大小 node_memory_Mlocked_bytes: Size of pages locked to 
memory using the mlock() system call # 块设备(block device)所占用的缓存页 node_memory_Buffers_bytes: Block device (e.g. harddisk) cache node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes node_memory_SwapFree_bytes: Memory information field SwapFree_bytes # DISK node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes node_filesystem_free_bytes: Filesystem free space in bytes node_filesystem_size_bytes: Filesystem size in bytes node_filesystem_files_free: Filesystem total free file nodes node_filesystem_files: Filesystem total file nodes node_filefd_maximum: Max open files node_filefd_allocated: Open files node_filesystem_readonly: Filesystem read-only status node_filesystem_device_error: Whether an error occurred while getting statistics for the given device node_disk_reads_completed_total: The total number of reads completed successfully node_disk_writes_completed_total: The total number of writes completed successfully node_disk_reads_merged_total: The number of reads merged node_disk_writes_merged_total: The number of writes merged node_disk_read_bytes_total: The total number of bytes read successfully node_disk_written_bytes_total: The total number of bytes written successfully node_disk_io_time_seconds_total: Total seconds spent doing I/Os node_disk_read_time_seconds_total: The total number of seconds spent by all reads node_disk_write_time_seconds_total: The total number of seconds spent by all writes node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os # NET node_network_receive_bytes_total: Network device statistic receive_bytes (counter) node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter) node_network_receive_packets_total: Network device statistic receive_packets node_network_transmit_packets_total: Network device statistic transmit_packets node_network_receive_errs_total: Network device statistic receive_errs node_network_transmit_errs_total: Network device
statistic transmit_errs node_network_receive_drop_total: Network device statistic receive_drop node_network_transmit_drop_total: Network device statistic transmit_drop node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking node_sockstat_TCP_alloc: Number of TCP sockets in state alloc node_sockstat_TCP_inuse: Number of TCP sockets in state inuse node_sockstat_TCP_orphan: Number of TCP sockets in state orphan node_sockstat_TCP_tw: Number of TCP sockets in state tw node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab node_sockstat_sockets_used: Number of IPv4 sockets in use # [kafka_exporter] kafka_brokers: count of kafka_brokers (gauge) kafka_topic_partitions: Number of partitions for this Topic (gauge) kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge) kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge) kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge) kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated # [zookeeper_exporter] zk_znode_count: The total count of znodes stored zk_ephemerals_count: The number of Ephemerals nodes zk_watch_count: The number of watchers setup over Zookeeper nodes. 
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree zk_outstanding_requests: Number of currently executing requests zk_packets_sent: Count of the number of zookeeper packets sent from a server zk_packets_received: Count of the number of zookeeper packets received by a server zk_num_alive_connections: Number of active clients connected to a zookeeper server zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open zk_avg_latency: Average time in milliseconds for requests to be processed zk_min_latency: Maximum time in milliseconds for a request to be processed zk_max_latency: Minimum time in milliseconds for a request to be processed ================================================ FILE: docker/compose-host-network-metric-log/etc-nightingale/script/notify.bak.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- import sys import json import urllib2 import smtplib from email.mime.text import MIMEText reload(sys) sys.setdefaultencoding('utf8') notify_channel_funcs = { "email":"email", "sms":"sms", "voice":"voice", "dingtalk":"dingtalk", "wecom":"wecom", "feishu":"feishu" } mail_host = "smtp.163.com" mail_port = 994 mail_user = "ulricqin" mail_pass = "password" mail_from = "ulricqin@163.com" class Sender(object): @classmethod def send_email(cls, payload): if mail_user == "ulricqin" and mail_pass == "password": print("invalid smtp configuration") return users = payload.get('event').get("notify_users_obj") emails = {} for u in users: if u.get("email"): emails[u.get("email")] = 1 if not emails: return recipients = emails.keys() mail_body = payload.get('tpls').get("email.tpl", "email.tpl not found") message = MIMEText(mail_body, 'html', 'utf-8') message['From'] = mail_from message['To'] = ", ".join(recipients) message["Subject"] = 
payload.get('tpls').get("subject.tpl", "subject.tpl not found") try: smtp = smtplib.SMTP_SSL(mail_host, mail_port) smtp.login(mail_user, mail_pass) smtp.sendmail(mail_from, recipients, message.as_string()) smtp.close() except smtplib.SMTPException, error: print(error) @classmethod def send_wecom(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} for u in users: contacts = u.get("contacts") if contacts.get("wecom_robot_token", ""): tokens[contacts.get("wecom_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t) body = { "msgtype": "markdown", "markdown": { "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found") } } request = urllib2.Request(url, data=json.dumps(body)) request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_dingtalk(cls, payload): event = payload.get('event') users = event.get("notify_users_obj") rule_name = event.get("rule_name") event_state = "Triggered" if event.get("is_recovered"): event_state = "Recovered" tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("dingtalk_robot_token", ""): tokens[contacts.get("dingtalk_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t) body = { "msgtype": "markdown", "markdown": { "title": "{} - {}".format(event_state, rule_name), "text": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found") + ' '.join(["@"+i for i in phones.keys()]) }, "at": { "atMobiles": phones.keys(), "isAtAll": False } } request = urllib2.Request(url, data=json.dumps(body)) 
request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_feishu(cls, payload): users = payload.get('event').get("notify_users_obj") tokens = {} phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 contacts = u.get("contacts") if contacts.get("feishu_robot_token", ""): tokens[contacts.get("feishu_robot_token", "")] = 1 opener = urllib2.build_opener(urllib2.HTTPHandler()) method = "POST" for t in tokens: url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t) body = { "msg_type": "text", "content": { "text": payload.get('tpls').get("feishu.tpl", "feishu.tpl not found") }, "at": { "atMobiles": phones.keys(), "isAtAll": False } } request = urllib2.Request(url, data=json.dumps(body)) request.add_header("Content-Type",'application/json;charset=utf-8') request.get_method = lambda: method try: connection = opener.open(request) print(connection.read()) except urllib2.HTTPError, error: print(error) @classmethod def send_sms(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_sms not implemented, phones: {}".format(phones.keys())) @classmethod def send_voice(cls, payload): users = payload.get('event').get("notify_users_obj") phones = {} for u in users: if u.get("phone"): phones[u.get("phone")] = 1 if phones: print("send_voice not implemented, phones: {}".format(phones.keys())) def main(): payload = json.load(sys.stdin) with open(".payload", 'w') as f: f.write(json.dumps(payload, indent=4)) for ch in payload.get('event').get('notify_channels'): send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip())) if not hasattr(Sender, send_func_name): print("function: {} not found", send_func_name) continue send_func = getattr(Sender, send_func_name) 
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Notification hook script.

Reads an alert payload (JSON) from stdin, dumps it to ``.payload`` in the
current directory, and calls ``Sender.send_<channel>`` for every channel
listed in ``payload["event"]["notify_channels"]``.
"""
import sys
import json


class Sender(object):
    """Namespace of per-channel senders, looked up by name as ``send_<channel>``."""

    @staticmethod
    def _collect_phones(payload):
        """Return a dict keyed by the non-empty phone numbers of the notified users."""
        # A missing or null "notify_users_obj" is treated as "nobody to notify".
        users = payload.get('event').get("notify_users_obj") or []
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        return phones

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_feishu(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_sms(cls, payload):
        """Placeholder: only reports which phone numbers would receive an SMS."""
        phones = cls._collect_phones(payload)
        if phones:
            print("send_sms not implemented, phones: {}".format(list(phones)))

    @classmethod
    def send_voice(cls, payload):
        """Placeholder: only reports which phone numbers would receive a voice call."""
        phones = cls._collect_phones(payload)
        if phones:
            print("send_voice not implemented, phones: {}".format(list(phones)))


def main():
    """Read the payload from stdin, persist it, and dispatch to each channel sender."""
    payload = json.load(sys.stdin)
    # Keep a copy of the last payload next to the script for troubleshooting.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A missing or null channel list simply means there is nothing to dispatch.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must be passed through format(); passing it as a second
            # print() argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)


def hello():
    print("hello nightingale")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Notification hook script with a custom Feishu robot sender.

Reads an alert payload (JSON) from stdin, dumps it to ``.payload`` in the
current directory, and calls ``Sender.send_<channel>`` for every channel
listed in ``payload["event"]["notify_channels"]``.  The script uses
f-strings, so it needs Python 3.6+.
"""
import sys
import json
import requests

# Seconds to wait for the Feishu webhook before giving up; requests has no
# default timeout, so without it a stalled connection blocks the notifier.
_REQUEST_TIMEOUT = 10


class Sender(object):
    """Namespace of per-channel senders, looked up by name as ``send_<channel>``."""

    @classmethod
    def send_email(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_wecom(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_dingtalk(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_ifeishu(cls, payload):
        """Post the rendered "feishu" template to every user's Feishu robot webhook.

        Robot tokens are read from each user's ``contacts["feishu_robot_token"]``
        and de-duplicated; user phone numbers are sent as ``atMobiles``.
        """
        users = payload.get('event').get("notify_users_obj") or []

        tokens = {}
        phones = {}

        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1

            # A user without any contacts may carry a null here.
            contacts = u.get("contacts") or {}
            if contacts.get("feishu_robot_token", ""):
                tokens[contacts.get("feishu_robot_token", "")] = 1

        headers = {
            "Content-Type": "application/json;charset=utf-8",
            "Host": "open.feishu.cn"
        }

        for t in tokens:
            url = "https://open.feishu.cn/open-apis/bot/v2/hook/{}".format(t)
            body = {
                "msg_type": "text",
                "content": {
                    "text": payload.get('tpls').get("feishu", "feishu not found")
                },
                "at": {
                    "atMobiles": list(phones.keys()),
                    "isAtAll": False
                }
            }

            try:
                response = requests.post(
                    url,
                    headers=headers,
                    data=json.dumps(body),
                    timeout=_REQUEST_TIMEOUT,
                )
            except requests.RequestException as error:
                # Report and move on so one bad webhook does not block the rest.
                print(f"notify_ifeishu: token={t} error={error}")
                continue
            print(f"notify_ifeishu: token={t} status_code={response.status_code} response_text={response.text}")

    @classmethod
    def send_mm(cls, payload):
        # already done in go code
        pass

    @classmethod
    def send_sms(cls, payload):
        pass

    @classmethod
    def send_voice(cls, payload):
        pass


def main():
    """Read the payload from stdin, persist it, and dispatch to each channel sender."""
    payload = json.load(sys.stdin)
    # Keep a copy of the last payload next to the script for troubleshooting.
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    # A missing or null channel list simply means there is nothing to dispatch.
    for ch in payload.get('event').get('notify_channels') or []:
        send_func_name = "send_{}".format(ch.strip())
        if not hasattr(Sender, send_func_name):
            # The name must be passed through format(); passing it as a second
            # print() argument leaves the "{}" placeholder unfilled.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)


def hello():
    print("hello nightingale")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")
"""Convert Prometheus / vmalert rules into Nightingale (n9e) rules.

Reads ``rules.yaml`` and writes ``alert-rules.json`` and
``record-rules.json``.  Both plain rule-group files and Kubernetes
ConfigMaps that embed rule files are supported (see ``main``).
"""
import json
import re

rule_file = 'rules.yaml'

# Milliseconds per Prometheus duration unit.
_UNIT_MS = {
    'ms': 1,
    's': 1000,
    'm': 60 * 1000,
    'h': 60 * 60 * 1000,
    'd': 24 * 60 * 60 * 1000,
    'w': 7 * 24 * 60 * 60 * 1000,
    'y': 365 * 24 * 60 * 60 * 1000,
}
# One "<number><unit>" component; "ms" is listed first so it wins over "m".
_DURATION_PART_RE = re.compile(r'(\d+)(ms|[smhdwy])', re.IGNORECASE)
# A whole duration: one or more components back to back, e.g. "1h30m".
_DURATION_RE = re.compile(r'(?:\d+(?:ms|[smhdwy]))+', re.IGNORECASE)


def convert_interval(interval):
    """Convert a Prometheus duration to whole seconds.

    Accepts single-unit values ("30s", "5m", "2h", "1d", "1w", "1y", "500ms"),
    compound values ("1h30m"), upper-case units, and bare numbers given either
    as a string or as an int (interpreted as seconds).  Sub-second remainders
    are rounded up so a non-zero duration never collapses to 0.

    Raises ValueError when the value is neither a duration nor an integer.
    """
    text = str(interval).strip()
    if _DURATION_RE.fullmatch(text):
        total_ms = sum(
            int(number) * _UNIT_MS[unit.lower()]
            for number, unit in _DURATION_PART_RE.findall(text)
        )
        return (total_ms + 999) // 1000
    return int(text)


def convert_alert(rule, interval):
    """Build an n9e alert rule dict from one Prometheus alerting rule.

    ``rule`` is the parsed rule mapping (``alert``, ``expr`` and optionally
    ``for``, ``labels``, ``annotations``); ``interval`` is the evaluation
    interval of the enclosing group.
    """
    name = rule['alert']
    prom_ql = rule['expr']
    prom_for_duration = convert_interval(rule['for']) if 'for' in rule else 0
    prom_eval_interval = convert_interval(interval)

    # "annotations:" / "labels:" with no children parse as None in YAML.
    annotations = dict(rule.get('annotations') or {})
    # The first annotation value doubles as the rule note.
    note = next(iter(annotations.values()), '')

    append_tags = []
    severity = 2
    for k, v in (rule.get('labels') or {}).items():
        if k != 'severity':
            append_tags.append('{}={}'.format(k, v))
            continue
        # The severity label is mapped to the n9e level instead of becoming a
        # tag; anything other than critical/info keeps the default level 2.
        if v == 'critical':
            severity = 1
        elif v == 'info':
            severity = 3

    n9e_alert_rule = {
        "name": name,
        "note": note,
        "severity": severity,
        "disabled": 0,
        "prom_for_duration": prom_for_duration,
        "prom_ql": prom_ql,
        "prom_eval_interval": prom_eval_interval,
        "enable_stime": "00:00",
        "enable_etime": "23:59",
        "enable_days_of_week": [
            "1",
            "2",
            "3",
            "4",
            "5",
            "6",
            "0"
        ],
        "enable_in_bg": 0,
        "notify_recovered": 1,
        "notify_channels": [],
        "notify_repeat_step": 60,
        "recover_duration": 0,
        "callbacks": [],
        "runbook_url": "",
        "append_tags": append_tags,
        "annotations": annotations
    }
    return n9e_alert_rule


def convert_record(rule, interval):
    """Build an n9e recording rule dict from one Prometheus recording rule."""
    name = rule['record']
    prom_ql = rule['expr']
    prom_eval_interval = convert_interval(interval)
    note = ''
    append_tags = [
        '{}={}'.format(k, v) for k, v in (rule.get('labels') or {}).items()
    ]

    n9e_record_rule = {
        "name": name,
        "note": note,
        "disabled": 0,
        "prom_ql": prom_ql,
        "prom_eval_interval": prom_eval_interval,
        "append_tags": append_tags
    }
    return n9e_record_rule


def deal_group(group):
    """Parse a single Prometheus/vmalert rule group file.

    Returns ``(alert_rules, record_rules)``.  Groups without an ``interval``
    are evaluated every 15s.  Example input:

        groups:
        - name: example
          rules:
          - alert: HighRequestLatency
            expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
            for: 10m
            labels:
              severity: page
            annotations:
              summary: High request latency
    """
    alert_rules = []
    record_rules = []

    for rule_segment in group['groups']:
        interval = rule_segment.get('interval', '15s')
        # A group declared with an empty "rules:" parses as None.
        for rule in rule_segment.get('rules') or []:
            if 'alert' in rule:
                alert_rules.append(convert_alert(rule, interval))
            else:
                record_rules.append(convert_record(rule, interval))

    return alert_rules, record_rules


def deal_configmap(rule_configmap):
    """Parse a Kubernetes ConfigMap whose ``data`` entries are rule group files.

    Returns ``(alert_rules, record_rules)`` accumulated over all entries.
    Example input:

        apiVersion: v1
        kind: ConfigMap
        metadata:
          name: rulefiles-0
        data:
          etcdrules.yaml: |
            groups:
            - name: etcd
              rules:
              - alert: etcdInsufficientMembers
                annotations:
                  message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value}}).'
                expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
                for: 3m
                labels:
                  severity: critical
    """
    # PyYAML is only needed when YAML is actually parsed; importing it here
    # keeps the conversion helpers importable without it.
    import yaml

    all_record_rules = []
    all_alert_rules = []
    for _, rule_group_str in rule_configmap['data'].items():
        # NOTE(review): yaml.load with FullLoader can build more than plain
        # data; prefer yaml.safe_load if rule files may come from untrusted
        # sources.
        rule_group = yaml.load(rule_group_str, Loader=yaml.FullLoader)
        # Skip entries that are empty or are not rule group files.
        if not isinstance(rule_group, dict) or 'groups' not in rule_group:
            continue
        alert_rules, record_rules = deal_group(rule_group)
        all_alert_rules.extend(alert_rules)
        all_record_rules.extend(record_rules)

    return all_alert_rules, all_record_rules


def main():
    """Convert ``rule_file`` and write the alert/record rules as JSON files."""
    import yaml

    with open(rule_file, 'r', encoding='utf-8') as f:
        # NOTE(review): see deal_configmap about yaml.load vs yaml.safe_load.
        rule_config = yaml.load(f, Loader=yaml.FullLoader)

    # If the file is a Kubernetes ConfigMap, use the call below instead:
    # alert_rules, record_rules = deal_configmap(rule_config)
    alert_rules, record_rules = deal_group(rule_config)

    # ensure_ascii=False emits non-ASCII text verbatim, so the output encoding
    # is fixed to UTF-8 rather than left to the platform default.
    with open("alert-rules.json", 'w', encoding='utf-8') as fw:
        json.dump(alert_rules, fw, indent=2, ensure_ascii=False)

    with open("record-rules.json", 'w', encoding='utf-8') as fw:
        json.dump(record_rules, fw, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()
- job_name: 'prometheus' static_configs: - targets: ['localhost:9090'] - job_name: 'nightingale' static_configs: - targets: ['localhost:17000'] ================================================ FILE: docker/compose-postgres/categraf/conf/config.toml ================================================ [global] # whether print configs print_configs = false # add label(agent_hostname) to series # "" -> auto detect hostname # "xx" -> use specified string xx # "$hostname" -> auto detect hostname # "$ip" -> auto detect ip # "$hostname-$ip" -> auto detect hostname and ip to replace the vars hostname = "$HOSTNAME" # will not add label(agent_hostname) if true omit_hostname = false # s | ms precision = "ms" # global collect interval interval = 15 [global.labels] source="categraf" # region = "shanghai" # env = "localhost" [writer_opt] # default: 2000 batch = 2000 # channel(as queue) size chan_size = 10000 [[writers]] url = "http://nightingale:17000/prometheus/v1/write" # Basic auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [http] enable = false address = ":9100" print_access = false run_mode = "release" [heartbeat] enable = true # report os version cpu.util mem.util metadata url = "http://nightingale:17000/v1/n9e/heartbeat" # interval, unit: s interval = 10 # Basic auth username basic_auth_user = "" # Basic auth password basic_auth_pass = "" ## Optional headers # headers = ["X-From", "categraf", "X-Xyz", "abc"] # timeout settings, unit: ms timeout = 5000 dial_timeout = 2500 max_idle_conns_per_host = 100 [ibex] enable = true ## ibex flush interval interval = "1000ms" ## n9e ibex server rpc address servers = ["nightingale:20090"] ## temp script dir meta_dir = "./meta" ================================================ FILE: docker/compose-postgres/categraf/conf/input.cpu/cpu.toml ================================================ # # collect interval # interval = 
15 # # whether collect per cpu # collect_per_cpu = false ================================================ FILE: docker/compose-postgres/categraf/conf/input.disk/disk.toml ================================================ # # collect interval # interval = 15 # # By default stats will be gathered for all mount points. # # Set mount_points will restrict the stats to only the specified mount points. # mount_points = ["/"] # Ignore mount points by filesystem type. ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] ignore_mount_points = ["/boot"] ================================================ FILE: docker/compose-postgres/categraf/conf/input.diskio/diskio.toml ================================================ # # collect interval # interval = 15 # # By default, categraf will gather stats for all devices including disk partitions. # # Setting devices will restrict the stats to the specified devices. # devices = ["sda", "sdb", "vd*"] ================================================ FILE: docker/compose-postgres/categraf/conf/input.docker/docker.toml ================================================ # # collect interval # interval = 15 [[instances]] # # append some labels for series # labels = { region="cloud", product="n9e" } # # interval = global.interval * interval_times # interval_times = 1 ## Docker Endpoint ## To use TCP, set endpoint = "tcp://[ip]:[port]" ## To use environment variables (ie, docker-machine), set endpoint = "ENV" endpoint = "unix:///var/run/docker.sock" ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) gather_services = false gather_extend_memstats = false container_id_label_enable = true container_id_label_short_style = true ## Containers to include and exclude. Globs accepted. ## Note that an empty array for both will include all containers container_name_include = [] container_name_exclude = [] ## Container states to include and exclude. Globs accepted. 
## When empty only containers in the "running" state will be captured. ## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] ## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] # container_state_include = [] # container_state_exclude = [] ## Timeout for docker list, info, and stats commands timeout = "5s" ## Specifies for which classes a per-device metric should be issued ## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) and 'network' (eth0, eth1, ...) ## Please note that this setting has no effect if 'perdevice' is set to 'true' perdevice_include = [] ## Specifies for which classes a total metric should be issued. Total is an aggregate of the 'perdevice' values. ## Possible values are 'cpu', 'blkio' and 'network' ## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin. ## Please note that this setting has no effect if 'total' is set to 'false' total_include = ["cpu", "blkio", "network"] ## Which environment variables should we use as a tag ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] ## docker labels to include and exclude as tags. Globs accepted. 
## Note that an empty array for both will include all labels as tags docker_label_include = [] docker_label_exclude = ["annotation*", "io.kubernetes*", "*description*", "*maintainer*", "*hash", "*author*"] ## Optional TLS Config # use_tls = false # tls_ca = "/etc/telegraf/ca.pem" # tls_cert = "/etc/telegraf/cert.pem" # tls_key = "/etc/telegraf/key.pem" ## Use TLS but skip chain & host verification # insecure_skip_verify = false ================================================ FILE: docker/compose-postgres/categraf/conf/input.kernel/kernel.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-postgres/categraf/conf/input.mem/mem.toml ================================================ # # collect interval # interval = 15 # # whether collect platform specified metrics collect_platform_fields = true ================================================ FILE: docker/compose-postgres/categraf/conf/input.net/net.toml ================================================ # # collect interval # interval = 15 # # whether collect protocol stats on Linux # collect_protocol_stats = false # # setting interfaces will tell categraf to gather these explicit interfaces # interfaces = ["eth0"] ================================================ FILE: docker/compose-postgres/categraf/conf/input.netstat/netstat.toml ================================================ # # collect interval # interval = 15 ================================================ FILE: docker/compose-postgres/categraf/conf/input.processes/processes.toml ================================================ # # collect interval # interval = 15 # # force use ps command to gather # force_ps = false # # force use /proc to gather # force_proc = false ================================================ FILE: docker/compose-postgres/categraf/conf/input.system/system.toml ================================================ # # collect interval # 
interval = 15 # # whether collect metric: system_n_users # collect_user_number = false ================================================ FILE: docker/compose-postgres/categraf/conf/prometheus.toml ================================================ [prometheus] enable=true scrape_config_file="/etc/prometheus/prometheus.yml" ## log level, debug warn info error log_level="info" ## wal file storage path ,default ./data-agent # wal_storage_path="/path/to/storage" ## wal reserve time duration, default value is 2 hour # wal_min_duration=2 ================================================ FILE: docker/compose-postgres/docker-compose.yaml ================================================ version: "3.7" networks: nightingale: driver: bridge services: postgres: # platform: linux/x86_64 image: "postgres:12-alpine" container_name: postgres hostname: postgres restart: always ports: - "5432:5432" environment: TZ: Asia/Shanghai POSTGRES_USER: root POSTGRES_PASSWORD: 1234 POSTGRES_DB: n9e_v6 PGDATA: /var/lib/postgresql/data/pgdata volumes: - ./pgdata:/var/lib/postgresql/data - ./initsql_for_postgres:/docker-entrypoint-initdb.d/ networks: - nightingale redis: image: "redis:7.0-alpine" container_name: redis hostname: redis restart: always ports: - "6379:6379" environment: TZ: Asia/Shanghai networks: - nightingale victoriametrics: image: victoriametrics/victoria-metrics:v1.79.12 container_name: victoriametrics hostname: victoriametrics restart: always environment: TZ: Asia/Shanghai ports: - "8428:8428" networks: - nightingale command: - "--loggerTimezone=Asia/Shanghai" nightingale: image: flashcatcloud/nightingale:latest container_name: nightingale hostname: nightingale restart: always environment: GIN_MODE: release TZ: Asia/Shanghai WAIT_HOSTS: postgres:5432, redis:6379 volumes: - ./n9eetc_pg:/app/etc ports: - "17000:17000" networks: - nightingale depends_on: - postgres - redis - victoriametrics links: - postgres:postgres - redis:redis - victoriametrics:victoriametrics command: - /app/n9e 
categraf: image: "flashcatcloud/categraf:latest" container_name: "categraf" hostname: "categraf01" restart: always environment: TZ: Asia/Shanghai HOST_PROC: /hostfs/proc HOST_SYS: /hostfs/sys HOST_MOUNT_PREFIX: /hostfs WAIT_HOSTS: nightingale:17000, nightingale:20090 volumes: - ./categraf/conf:/etc/categraf/conf - /:/hostfs - /var/run/docker.sock:/var/run/docker.sock - ./prometc_vm:/etc/prometheus # ports: # - "9100:9100/tcp" networks: - nightingale depends_on: - nightingale links: - nightingale:nightingale ================================================ FILE: docker/compose-postgres/initsql_for_postgres/a-n9e-for-Postgres.sql ================================================ CREATE TABLE users ( id bigserial, username varchar(64) not null, nickname varchar(64) not null, password varchar(128) not null default '', phone varchar(16) not null default '', email varchar(64) not null default '', portrait varchar(255) not null default '', roles varchar(255) not null, contacts varchar(1024), maintainer int not null default 0, belong varchar(16) not null default '', last_active_time bigint not null default 0, create_at bigint not null default 0, create_by varchar(64) not null default '', update_at bigint not null default 0, update_by varchar(64) not null default '', PRIMARY KEY (id), UNIQUE (username) ); COMMENT ON COLUMN users.id IS 'id'; COMMENT ON COLUMN users.username IS 'login name, cannot rename'; COMMENT ON COLUMN users.nickname IS 'display name, chinese name'; COMMENT ON COLUMN users.portrait IS 'portrait image url'; COMMENT ON COLUMN users.roles IS 'Admin | Standard | Guest, split by space'; COMMENT ON COLUMN users.contacts IS 'json e.g. 
{wecom:xx, dingtalk_robot_token:yy}'; COMMENT ON COLUMN users.belong IS 'belong'; insert into users(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', date_part('epoch',current_timestamp)::int, 'system', date_part('epoch',current_timestamp)::int, 'system'); CREATE TABLE user_group ( id bigserial, name varchar(128) not null default '', note varchar(255) not null default '', create_at bigint not null default 0, create_by varchar(64) not null default '', update_at bigint not null default 0, update_by varchar(64) not null default '', PRIMARY KEY (id) ) ; CREATE INDEX user_group_create_by_idx ON user_group (create_by); CREATE INDEX user_group_update_at_idx ON user_group (update_at); insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', date_part('epoch',current_timestamp)::int, 'root', date_part('epoch',current_timestamp)::int, 'root'); CREATE TABLE user_group_member ( id bigserial, group_id bigint not null, user_id bigint not null, PRIMARY KEY(id) ) ; CREATE INDEX user_group_member_group_id_idx ON user_group_member (group_id); CREATE INDEX user_group_member_user_id_idx ON user_group_member (user_id); insert into user_group_member(group_id, user_id) values(1, 1); CREATE TABLE configs ( id bigserial, ckey varchar(191) not null, cval text not null default '', note varchar(1024) not null default '', external int not null default 0, encrypted int not null default 0, create_at bigint not null default 0, create_by varchar(64) not null default '', update_at bigint not null default 0, update_by varchar(64) not null default '', PRIMARY KEY (id), UNIQUE (ckey) ); CREATE TABLE role ( id bigserial, name varchar(191) not null default '', note varchar(255) not null default '', PRIMARY KEY (id), UNIQUE (name) ) ; insert into role(name, note) values('Admin', 'Administrator role'); insert into role(name, note) values('Standard', 'Ordinary user role'); 
insert into role(name, note) values('Guest', 'Readonly user role'); CREATE TABLE role_operation( id bigserial, role_name varchar(128) not null, operation varchar(191) not null, PRIMARY KEY(id) ) ; CREATE INDEX role_operation_role_name_idx ON role_operation (role_name); CREATE INDEX role_operation_operation_idx ON role_operation (operation); -- Admin is special, who has no concrete operation but can do anything. insert into role_operation(role_name, operation) values('Guest', '/metric/explorer'); insert into role_operation(role_name, operation) values('Guest', '/object/explorer'); insert into role_operation(role_name, operation) values('Guest', '/log/explorer'); insert into role_operation(role_name, operation) values('Guest', '/trace/explorer'); insert into role_operation(role_name, operation) values('Guest', '/help/version'); insert into role_operation(role_name, operation) values('Guest', '/help/contact'); insert into role_operation(role_name, operation) values('Standard', '/metric/explorer'); insert into role_operation(role_name, operation) values('Standard', '/object/explorer'); insert into role_operation(role_name, operation) values('Standard', '/log/explorer'); insert into role_operation(role_name, operation) values('Standard', '/trace/explorer'); insert into role_operation(role_name, operation) values('Standard', '/help/version'); insert into role_operation(role_name, operation) values('Standard', '/help/contact'); insert into role_operation(role_name, operation) values('Standard', '/help/servers'); insert into role_operation(role_name, operation) values('Standard', '/help/migrate'); insert into role_operation(role_name, operation) values('Standard', '/alert-rules-built-in'); insert into role_operation(role_name, operation) values('Standard', '/dashboards-built-in'); insert into role_operation(role_name, operation) values('Standard', '/trace/dependencies'); insert into role_operation(role_name, operation) values('Admin', '/help/source'); insert into 
role_operation(role_name, operation) values('Admin', '/help/sso'); insert into role_operation(role_name, operation) values('Admin', '/help/notification-tpls'); insert into role_operation(role_name, operation) values('Admin', '/help/notification-settings'); insert into role_operation(role_name, operation) values('Standard', '/users'); insert into role_operation(role_name, operation) values('Standard', '/user-groups'); insert into role_operation(role_name, operation) values('Standard', '/user-groups/add'); insert into role_operation(role_name, operation) values('Standard', '/user-groups/put'); insert into role_operation(role_name, operation) values('Standard', '/user-groups/del'); insert into role_operation(role_name, operation) values('Standard', '/busi-groups'); insert into role_operation(role_name, operation) values('Standard', '/busi-groups/add'); insert into role_operation(role_name, operation) values('Standard', '/busi-groups/put'); insert into role_operation(role_name, operation) values('Standard', '/busi-groups/del'); insert into role_operation(role_name, operation) values('Standard', '/targets'); insert into role_operation(role_name, operation) values('Standard', '/targets/add'); insert into role_operation(role_name, operation) values('Standard', '/targets/put'); insert into role_operation(role_name, operation) values('Standard', '/targets/del'); insert into role_operation(role_name, operation) values('Standard', '/dashboards'); insert into role_operation(role_name, operation) values('Standard', '/dashboards/add'); insert into role_operation(role_name, operation) values('Standard', '/dashboards/put'); insert into role_operation(role_name, operation) values('Standard', '/dashboards/del'); insert into role_operation(role_name, operation) values('Standard', '/alert-rules'); insert into role_operation(role_name, operation) values('Standard', '/alert-rules/add'); insert into role_operation(role_name, operation) values('Standard', '/alert-rules/put'); insert into 
role_operation(role_name, operation) values('Standard', '/alert-rules/del'); insert into role_operation(role_name, operation) values('Standard', '/alert-mutes'); insert into role_operation(role_name, operation) values('Standard', '/alert-mutes/add'); insert into role_operation(role_name, operation) values('Standard', '/alert-mutes/del'); insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes'); insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/add'); insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/put'); insert into role_operation(role_name, operation) values('Standard', '/alert-subscribes/del'); insert into role_operation(role_name, operation) values('Standard', '/alert-cur-events'); insert into role_operation(role_name, operation) values('Standard', '/alert-cur-events/del'); insert into role_operation(role_name, operation) values('Standard', '/alert-his-events'); insert into role_operation(role_name, operation) values('Standard', '/job-tpls'); insert into role_operation(role_name, operation) values('Standard', '/job-tpls/add'); insert into role_operation(role_name, operation) values('Standard', '/job-tpls/put'); insert into role_operation(role_name, operation) values('Standard', '/job-tpls/del'); insert into role_operation(role_name, operation) values('Standard', '/job-tasks'); insert into role_operation(role_name, operation) values('Standard', '/job-tasks/add'); insert into role_operation(role_name, operation) values('Standard', '/job-tasks/put'); insert into role_operation(role_name, operation) values('Standard', '/recording-rules'); insert into role_operation(role_name, operation) values('Standard', '/recording-rules/add'); insert into role_operation(role_name, operation) values('Standard', '/recording-rules/put'); insert into role_operation(role_name, operation) values('Standard', '/recording-rules/del'); -- for alert_rule | collect_rule | mute | 
dashboard grouping CREATE TABLE busi_group ( id bigserial, name varchar(191) not null, label_enable smallint not null default 0, label_value varchar(191) not null default '' , create_at bigint not null default 0, create_by varchar(64) not null default '', update_at bigint not null default 0, update_by varchar(64) not null default '', PRIMARY KEY (id), UNIQUE (name) ) ; COMMENT ON COLUMN busi_group.label_value IS 'if label_enable: label_value can not be blank'; insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', date_part('epoch',current_timestamp)::int, 'root', date_part('epoch',current_timestamp)::int, 'root'); CREATE TABLE busi_group_member ( id bigserial, busi_group_id bigint not null , user_group_id bigint not null , perm_flag char(2) not null , PRIMARY KEY (id) ) ; CREATE INDEX busi_group_member_busi_group_id_idx ON busi_group_member (busi_group_id); CREATE INDEX busi_group_member_user_group_id_idx ON busi_group_member (user_group_id); COMMENT ON COLUMN busi_group_member.busi_group_id IS 'busi group id'; COMMENT ON COLUMN busi_group_member.user_group_id IS 'user group id'; COMMENT ON COLUMN busi_group_member.perm_flag IS 'ro | rw'; insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw'); -- for dashboard new version CREATE TABLE board ( id bigserial, group_id bigint not null default 0 , name varchar(191) not null, ident varchar(200) not null default '', tags varchar(255) not null , public smallint not null default 0 , built_in smallint not null default 0 , hide smallint not null default 0 , public_cate bigint NOT NULL DEFAULT 0, create_at bigint not null default 0, create_by varchar(64) not null default '', update_at bigint not null default 0, update_by varchar(64) not null default '', note varchar(1024) not null default '', PRIMARY KEY (id), UNIQUE (group_id, name) ) ; CREATE INDEX board_ident_idx ON board (ident); COMMENT ON COLUMN board.group_id IS 'busi group id'; 
-- Remaining column comments for the board table.
COMMENT ON COLUMN board.tags IS 'split by space';
COMMENT ON COLUMN board.public IS '0:false 1:true';
COMMENT ON COLUMN board.built_in IS '0:false 1:true';
COMMENT ON COLUMN board.hide IS '0:false 1:true';
COMMENT ON COLUMN board.public_cate IS '0 anonymous 1 login 2 busi';
COMMENT ON COLUMN board.note IS 'note';

-- for dashboard new version
-- board_payload: dashboard body, keyed by the dashboard id.
CREATE TABLE board_payload (
    id bigint not null,
    payload text not null,
    UNIQUE (id)
);
COMMENT ON COLUMN board_payload.id IS 'dashboard id';

-- deprecated
CREATE TABLE dashboard (
    id bigserial,
    group_id bigint not null default 0,
    name varchar(191) not null,
    tags varchar(255) not null,
    configs varchar(8192),
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id),
    UNIQUE (group_id, name)
);
COMMENT ON COLUMN dashboard.group_id IS 'busi group id';
COMMENT ON COLUMN dashboard.tags IS 'split by space';
COMMENT ON COLUMN dashboard.configs IS 'dashboard variables';

-- deprecated
-- auto create the first subclass 'Default chart group' of dashboard
CREATE TABLE chart_group (
    id bigserial,
    dashboard_id bigint not null,
    name varchar(255) not null,
    weight int not null default 0,
    PRIMARY KEY (id)
);
CREATE INDEX chart_group_dashboard_id_idx ON chart_group (dashboard_id);

-- deprecated
CREATE TABLE chart (
    id bigserial,
    group_id bigint not null,
    configs text,
    weight int not null default 0,
    PRIMARY KEY (id)
);
CREATE INDEX chart_group_id_idx ON chart (group_id);
COMMENT ON COLUMN chart.group_id IS 'chart group id';

-- chart_share: stored chart configs, indexed by creation time.
CREATE TABLE chart_share (
    id bigserial,
    cluster varchar(128) not null,
    datasource_id bigint not null default 0,
    configs text,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    primary key (id)
);
CREATE INDEX chart_share_create_at_idx ON chart_share (create_at);

-- alert_rule: alerting rule definitions; the column comments below describe
-- the encoding of the individual fields.
CREATE TABLE alert_rule (
    id bigserial,
    group_id bigint not null default 0,
    cate varchar(128) not null,
    datasource_ids varchar(255) not null default '',
    cluster varchar(128) not null,
    name varchar(255) not null,
    note varchar(1024) not null default '',
    prod varchar(255) not null default '',
    algorithm varchar(255) not null default '',
    algo_params varchar(255),
    delay int not null default 0,
    severity smallint not null,
    disabled smallint not null,
    prom_for_duration int not null,
    rule_config text not null,
    prom_ql text not null,
    prom_eval_interval int not null,
    enable_stime varchar(255) not null default '00:00',
    enable_etime varchar(255) not null default '23:59',
    enable_days_of_week varchar(255) not null default '',
    enable_in_bg smallint not null default 0,
    notify_recovered smallint not null,
    notify_channels varchar(255) not null default '',
    notify_groups varchar(255) not null default '',
    notify_repeat_step int not null default 0,
    notify_max_number int not null default 0,
    recover_duration int not null default 0,
    callbacks varchar(255) not null default '',
    runbook_url varchar(255),
    append_tags varchar(255) not null default '',
    annotations text not null,
    extra_config text not null,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    time_zone varchar(64) not null default '',
    PRIMARY KEY (id)
);
CREATE INDEX alert_rule_group_id_idx ON alert_rule (group_id);
CREATE INDEX alert_rule_update_at_idx ON alert_rule (update_at);
COMMENT ON COLUMN alert_rule.group_id IS 'busi group id';
COMMENT ON COLUMN alert_rule.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_rule.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_rule.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN alert_rule.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_rule.rule_config IS 'rule_config';
COMMENT ON COLUMN alert_rule.prom_ql IS 'promql';
COMMENT ON COLUMN alert_rule.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_rule.enable_stime IS '00:00';
COMMENT ON COLUMN alert_rule.enable_etime IS '23:59';
COMMENT ON COLUMN alert_rule.enable_days_of_week IS 'split by space: 0 1 2 3 4 5 6';
COMMENT ON COLUMN alert_rule.enable_in_bg IS '1: only this bg 0: global';
COMMENT ON COLUMN alert_rule.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_rule.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_rule.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_rule.notify_repeat_step IS 'unit: min';
COMMENT ON COLUMN alert_rule.recover_duration IS 'unit: s';
COMMENT ON COLUMN alert_rule.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_rule.append_tags IS 'split by space: service=n9e mod=api';
COMMENT ON COLUMN alert_rule.annotations IS 'annotations';
COMMENT ON COLUMN alert_rule.extra_config IS 'extra_config';

-- alert_mute: mute definitions; tags is stored as jsonb, btime/etime bound
-- the mute window (see the column comments).
CREATE TABLE alert_mute (
    id bigserial,
    group_id bigint not null default 0,
    prod varchar(255) not null default '',
    note varchar(1024) not null default '',
    cate varchar(128) not null,
    cluster varchar(128) not null,
    datasource_ids varchar(255) not null default '',
    tags jsonb NOT NULL,
    cause varchar(255) not null default '',
    btime bigint not null default 0,
    etime bigint not null default 0,
    disabled smallint not null default 0,
    mute_time_type smallint not null default 0,
    periodic_mutes varchar(4096) not null default '',
    severities varchar(32) not null default '',
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id)
);
CREATE INDEX alert_mute_group_id_idx ON alert_mute (group_id);
CREATE INDEX alert_mute_update_at_idx ON alert_mute (update_at);
COMMENT ON COLUMN alert_mute.group_id IS 'busi group id';
COMMENT ON COLUMN alert_mute.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_mute.tags IS 'json,map,tagkey->regexp|value';
-- Remaining column comments for the alert_mute table.
COMMENT ON COLUMN alert_mute.btime IS 'begin time';
COMMENT ON COLUMN alert_mute.etime IS 'end time';
COMMENT ON COLUMN alert_mute.disabled IS '0:enabled 1:disabled';

-- alert_subscribe: alert subscriptions, optionally redefining the severity,
-- channels and webhooks of the events they match (see the column comments).
CREATE TABLE alert_subscribe (
    id bigserial,
    name varchar(255) not null default '',
    disabled int not null default 0,
    group_id bigint not null default 0,
    prod varchar(255) not null default '',
    cate varchar(128) not null,
    datasource_ids varchar(255) not null default '',
    cluster varchar(128) not null,
    rule_id bigint not null default 0,
    severities varchar(32) not null default '',
    tags varchar(4096) not null default '[]',
    redefine_severity smallint default 0,
    new_severity smallint not null,
    redefine_channels smallint default 0,
    new_channels varchar(255) not null default '',
    user_group_ids varchar(250) not null,
    busi_groups VARCHAR(4096) NOT NULL DEFAULT '[]',
    note VARCHAR(1024) DEFAULT '',
    rule_ids VARCHAR(1024) DEFAULT '',
    webhooks text not null,
    extra_config text not null,
    redefine_webhooks int default 0,
    for_duration bigint not null default 0,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id)
);
CREATE INDEX ON alert_subscribe (update_at);
CREATE INDEX ON alert_subscribe (group_id);
COMMENT ON COLUMN alert_subscribe.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN alert_subscribe.group_id IS 'busi group id';
COMMENT ON COLUMN alert_subscribe.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN alert_subscribe.tags IS 'json,map,tagkey->regexp|value';
COMMENT ON COLUMN alert_subscribe.redefine_severity IS 'is redefine severity?';
-- Uses the same 1..3 scale that is documented for alert_rule.severity and
-- alert_cur_event.severity.
COMMENT ON COLUMN alert_subscribe.new_severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_subscribe.redefine_channels IS 'is redefine channels?';
COMMENT ON COLUMN alert_subscribe.new_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_subscribe.user_group_ids IS 'split by space 1 34 5, notify cc to user_group_ids';
COMMENT ON COLUMN alert_subscribe.note IS 'note';
COMMENT ON COLUMN alert_subscribe.rule_ids IS 'rule_ids';
COMMENT ON COLUMN alert_subscribe.extra_config IS 'extra_config';

-- target: monitored targets; ident is unique.
CREATE TABLE target (
    id bigserial,
    group_id bigint not null default 0,
    ident varchar(191) not null,
    note varchar(255) not null default '',
    tags varchar(512) not null default '',
    host_tags text,
    host_ip varchar(15) default '',
    agent_version varchar(255) default '',
    engine_name varchar(255) default '',
    os varchar(31) default '',
    update_at bigint not null default 0,
    PRIMARY KEY (id),
    UNIQUE (ident)
);
CREATE INDEX ON target (group_id);
CREATE INDEX idx_host_ip ON target (host_ip);
CREATE INDEX idx_agent_version ON target (agent_version);
CREATE INDEX idx_engine_name ON target (engine_name);
CREATE INDEX idx_os ON target (os);
COMMENT ON COLUMN target.group_id IS 'busi group id';
COMMENT ON COLUMN target.ident IS 'target id';
COMMENT ON COLUMN target.note IS 'append to alert event as field';
COMMENT ON COLUMN target.tags IS 'append to series data as tags, split by space, append external space at suffix';
COMMENT ON COLUMN target.host_tags IS 'global labels set in conf file';
COMMENT ON COLUMN target.host_ip IS 'IPv4 string';
COMMENT ON COLUMN target.agent_version IS 'agent version';
COMMENT ON COLUMN target.engine_name IS 'engine_name';
COMMENT ON COLUMN target.os IS 'os type';

-- metric_view: saved metric views; cate distinguishes preset from custom views.
CREATE TABLE metric_view (
    id bigserial,
    name varchar(191) not null default '',
    cate smallint not null,
    configs varchar(8192) not null default '',
    create_at bigint not null default 0,
    create_by bigint not null default 0,
    update_at bigint not null default 0,
    PRIMARY KEY (id)
);
CREATE INDEX metric_view_create_by_idx ON metric_view (create_by);
COMMENT ON COLUMN metric_view.cate IS '0: preset 1: custom';
COMMENT ON COLUMN metric_view.create_by IS 'user id';

-- Seed one preset (cate = 0) view.
insert into metric_view(name, cate, configs) values('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}');

-- recording_rule: recording rule definitions.
CREATE TABLE recording_rule (
    id bigserial,
    group_id bigint not null default '0',
    datasource_ids varchar(255) not null default '',
    cluster varchar(128) not null,
    name varchar(255) not null,
    note varchar(255) not null,
    disabled smallint not null default 0,
    prom_ql varchar(8192) not null,
    prom_eval_interval int not null,
    append_tags varchar(255) default '',
    query_configs text not null,
    create_at bigint default '0',
    create_by varchar(64) default '',
    update_at bigint default '0',
    update_by varchar(64) default '',
    PRIMARY KEY (id)
);
CREATE INDEX recording_rule_group_id_idx ON recording_rule (group_id);
CREATE INDEX recording_rule_update_at_idx ON recording_rule (update_at);
COMMENT ON COLUMN recording_rule.group_id IS 'group_id';
COMMENT ON COLUMN recording_rule.datasource_ids IS 'datasource ids';
COMMENT ON COLUMN recording_rule.name IS 'new metric name';
COMMENT ON COLUMN recording_rule.note IS 'rule note';
COMMENT ON COLUMN recording_rule.disabled IS '0:enabled 1:disabled';
COMMENT ON COLUMN recording_rule.prom_ql IS 'promql';
COMMENT ON COLUMN recording_rule.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN recording_rule.append_tags IS 'split by space: service=n9e mod=api';
COMMENT ON COLUMN recording_rule.query_configs IS 'query configs';

-- alert_aggr_view: saved aggregation rules for alert events.
CREATE TABLE alert_aggr_view (
    id bigserial,
    name varchar(191) not null default '',
    rule varchar(2048) not null default '',
    cate smallint not null,
    create_at bigint not null default 0,
    create_by bigint not null default 0,
    update_at bigint not null default 0,
    PRIMARY KEY (id)
);
CREATE INDEX alert_aggr_view_create_by_idx ON alert_aggr_view (create_by);
COMMENT ON COLUMN alert_aggr_view.cate IS '0: preset 1: custom';
COMMENT ON COLUMN alert_aggr_view.create_by IS 'user id';

-- Seed two preset (cate = 0) aggregation views.
insert into alert_aggr_view(name, rule, cate) values('By BusiGroup, Severity', 'field:group_name::field:severity', 0);
insert into alert_aggr_view(name, rule, cate) values('By RuleName', 'field:rule_name', 0);

-- alert_cur_event: alert events; id is not generated here but taken from
-- alert_his_event (see the column comment on id).
CREATE TABLE alert_cur_event (
    id bigint not null,
    cate varchar(128) not null,
    datasource_id bigint not null default 0,
    cluster varchar(128) not null,
    group_id bigint not null,
    group_name varchar(255) not null default '',
    hash varchar(64) not null,
    rule_id bigint not null,
    rule_name varchar(255) not null,
    rule_note varchar(2048) not null,
    rule_prod varchar(255) not null default '',
    rule_algo varchar(255) not null default '',
    severity smallint not null,
    prom_for_duration int not null,
    prom_ql varchar(8192) not null,
    prom_eval_interval int not null,
    callbacks varchar(255) not null default '',
    runbook_url varchar(255),
    notify_recovered smallint not null,
    notify_channels varchar(255) not null default '',
    notify_groups varchar(255) not null default '',
    notify_repeat_next bigint not null default 0,
    notify_cur_number int not null default 0,
    target_ident varchar(191) not null default '',
    target_note varchar(191) not null default '',
    first_trigger_time bigint,
    trigger_time bigint not null,
    trigger_value varchar(2048) not null,
    annotations text not null,
    rule_config text not null,
    tags varchar(1024) not null default '',
    PRIMARY KEY (id)
);
CREATE INDEX alert_cur_event_hash_idx ON alert_cur_event (hash);
CREATE INDEX alert_cur_event_rule_id_idx ON alert_cur_event (rule_id);
CREATE INDEX alert_cur_event_tg_idx ON alert_cur_event (trigger_time, group_id);
CREATE INDEX alert_cur_event_nrn_idx ON alert_cur_event (notify_repeat_next);
COMMENT ON COLUMN alert_cur_event.id IS 'use alert_his_event.id';
COMMENT ON COLUMN alert_cur_event.datasource_id IS 'datasource id';
COMMENT ON COLUMN alert_cur_event.group_id IS 'busi group id of rule';
COMMENT ON COLUMN alert_cur_event.group_name IS 'busi group name';
COMMENT ON COLUMN alert_cur_event.hash IS 'rule_id + vector_pk';
COMMENT ON COLUMN alert_cur_event.rule_note IS 'alert rule note';
-- Remaining column comments for the alert_cur_event table.
COMMENT ON COLUMN alert_cur_event.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_cur_event.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_cur_event.prom_ql IS 'promql';
COMMENT ON COLUMN alert_cur_event.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_cur_event.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_cur_event.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_cur_event.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_cur_event.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_cur_event.notify_repeat_next IS 'next timestamp to notify, get repeat settings from rule';
COMMENT ON COLUMN alert_cur_event.target_ident IS 'target ident, also in tags';
COMMENT ON COLUMN alert_cur_event.target_note IS 'target note';
COMMENT ON COLUMN alert_cur_event.annotations IS 'annotations';
COMMENT ON COLUMN alert_cur_event.rule_config IS 'rule_config';
COMMENT ON COLUMN alert_cur_event.tags IS 'merge data_tags rule_tags, split by ,,';

-- alert_his_event: alert event history, including recovery state.
CREATE TABLE alert_his_event (
    id bigserial,
    is_recovered smallint not null,
    cate varchar(128) not null,
    datasource_id bigint not null default 0,
    cluster varchar(128) not null,
    group_id bigint not null,
    group_name varchar(255) not null default '',
    hash varchar(64) not null,
    rule_id bigint not null,
    rule_name varchar(255) not null,
    -- The default used to be the literal text 'alert rule note', i.e. the
    -- column description used as a value; an empty string is the neutral default.
    rule_note varchar(2048) not null default '',
    rule_prod varchar(255) not null default '',
    rule_algo varchar(255) not null default '',
    severity smallint not null,
    prom_for_duration int not null,
    prom_ql varchar(8192) not null,
    prom_eval_interval int not null,
    callbacks varchar(255) not null default '',
    runbook_url varchar(255),
    notify_recovered smallint not null,
    notify_channels varchar(255) not null default '',
    notify_groups varchar(255) not null default '',
    notify_cur_number int not null default 0,
    target_ident varchar(191) not null default '',
    target_note varchar(191) not null default '',
    first_trigger_time bigint,
    trigger_time bigint not null,
    trigger_value varchar(2048) not null,
    recover_time bigint not null default 0,
    last_eval_time bigint not null default 0,
    tags varchar(1024) not null default '',
    annotations text not null,
    rule_config text not null,
    PRIMARY KEY (id)
);
CREATE INDEX alert_his_event_hash_idx ON alert_his_event (hash);
CREATE INDEX alert_his_event_rule_id_idx ON alert_his_event (rule_id);
CREATE INDEX alert_his_event_tg_idx ON alert_his_event (trigger_time, group_id);
-- Note: despite the "nrn" in its name, this index covers last_eval_time.
CREATE INDEX alert_his_event_nrn_idx ON alert_his_event (last_eval_time);
COMMENT ON COLUMN alert_his_event.group_id IS 'busi group id of rule';
COMMENT ON COLUMN alert_his_event.datasource_id IS 'datasource id';
COMMENT ON COLUMN alert_his_event.group_name IS 'busi group name';
COMMENT ON COLUMN alert_his_event.hash IS 'rule_id + vector_pk';
COMMENT ON COLUMN alert_his_event.rule_note IS 'alert rule note';
-- Same 1..3 scale as alert_rule.severity and alert_cur_event.severity.
COMMENT ON COLUMN alert_his_event.severity IS '1:Emergency 2:Warning 3:Notice';
COMMENT ON COLUMN alert_his_event.prom_for_duration IS 'prometheus for, unit:s';
COMMENT ON COLUMN alert_his_event.prom_ql IS 'promql';
COMMENT ON COLUMN alert_his_event.prom_eval_interval IS 'evaluate interval';
COMMENT ON COLUMN alert_his_event.callbacks IS 'split by space: http://a.com/api/x http://a.com/api/y';
COMMENT ON COLUMN alert_his_event.notify_recovered IS 'whether notify when recovery';
COMMENT ON COLUMN alert_his_event.notify_channels IS 'split by space: sms voice email dingtalk wecom';
COMMENT ON COLUMN alert_his_event.notify_groups IS 'split by space: 233 43';
COMMENT ON COLUMN alert_his_event.target_ident IS 'target ident, also in tags';
COMMENT ON COLUMN alert_his_event.target_note IS 'target note';
COMMENT ON COLUMN alert_his_event.last_eval_time IS 'for time filter';
COMMENT ON COLUMN alert_his_event.tags IS 'merge data_tags rule_tags, split by ,,';
COMMENT ON COLUMN alert_his_event.annotations IS 'annotations';
COMMENT ON COLUMN alert_his_event.rule_config IS 'rule_config';

-- task_tpl: task templates owned by a business group.
CREATE TABLE task_tpl (
    id serial,
    group_id int not null,
    title varchar(255) not null default '',
    account varchar(64) not null,
    batch int not null default 0,
    tolerance int not null default 0,
    timeout int not null default 0,
    pause varchar(255) not null default '',
    script text not null,
    args varchar(512) not null default '',
    tags varchar(255) not null default '',
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id)
);
CREATE INDEX task_tpl_group_id_idx ON task_tpl (group_id);
COMMENT ON COLUMN task_tpl.group_id IS 'busi group id';
COMMENT ON COLUMN task_tpl.tags IS 'split by space';

-- task_tpl_host: hosts attached to a task template (id is the template id).
CREATE TABLE task_tpl_host (
    ii serial,
    id int not null,
    host varchar(128) not null,
    PRIMARY KEY (ii)
);
CREATE INDEX task_tpl_host_id_host_idx ON task_tpl_host (id, host);
COMMENT ON COLUMN task_tpl_host.id IS 'task tpl id';
COMMENT ON COLUMN task_tpl_host.host IS 'ip or hostname';

-- task_record: records of created tasks; id is the ibex task id.
CREATE TABLE task_record (
    id bigint not null,
    event_id bigint not null default 0,
    group_id bigint not null,
    ibex_address varchar(128) not null,
    ibex_auth_user varchar(128) not null default '',
    ibex_auth_pass varchar(128) not null default '',
    title varchar(255) not null default '',
    account varchar(64) not null,
    batch int not null default 0,
    tolerance int not null default 0,
    timeout int not null default 0,
    pause varchar(255) not null default '',
    script text not null,
    args varchar(512) not null default '',
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    PRIMARY KEY (id)
);
CREATE INDEX task_record_cg_idx ON task_record (create_at, group_id);
CREATE INDEX task_record_create_by_idx ON task_record (create_by);
CREATE INDEX task_record_event_id_idx ON task_record (event_id);
COMMENT ON COLUMN task_record.id IS 'ibex task id';
COMMENT ON COLUMN task_record.group_id IS 'busi group id';
COMMENT ON COLUMN task_record.event_id IS 'event id';

-- alerting_engines: one row per engine instance and datasource, with a clock value.
CREATE TABLE alerting_engines (
    id serial,
    instance varchar(128) not null default '',
    datasource_id bigint not null default 0,
    engine_cluster varchar(128) not null default '',
    clock bigint not null,
    PRIMARY KEY (id)
);
COMMENT ON COLUMN alerting_engines.instance IS 'instance identification, e.g. 10.9.0.9:9090';
COMMENT ON COLUMN alerting_engines.datasource_id IS 'datasource id';
COMMENT ON COLUMN alerting_engines.engine_cluster IS 'target reader cluster';

-- datasource: configured datasources; name is unique.
CREATE TABLE datasource (
    id serial,
    name varchar(191) not null default '',
    identifier varchar(255) not null default '',
    description varchar(255) not null default '',
    category varchar(255) not null default '',
    plugin_id int not null default 0,
    plugin_type varchar(255) not null default '',
    plugin_type_name varchar(255) not null default '',
    cluster_name varchar(255) not null default '',
    settings text not null,
    status varchar(255) not null default '',
    http varchar(4096) not null default '',
    auth varchar(8192) not null default '',
    is_default boolean not null default false,
    weight int not null default 0,
    created_at bigint not null default 0,
    created_by varchar(64) not null default '',
    updated_at bigint not null default 0,
    updated_by varchar(64) not null default '',
    UNIQUE (name),
    PRIMARY KEY (id)
);

CREATE TABLE builtin_cate (
    id bigserial,
    name varchar(191) not null,
    user_id bigint not null default 0,
    PRIMARY KEY (id)
);

-- notify_tpl: notification templates; at most one row per channel.
CREATE TABLE notify_tpl (
    id bigserial,
    channel varchar(32) not null,
    name varchar(255) not null,
    content text not null,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default '',
    PRIMARY KEY (id),
    UNIQUE (channel)
);

-- sso_config: named SSO configuration blobs; name is unique.
CREATE TABLE sso_config (
    id bigserial,
    name varchar(191) not null,
    content text not null,
    update_at bigint not null default 0,
    PRIMARY KEY (id),
    UNIQUE (name)
);

-- es_index_pattern: index patterns per datasource; (datasource_id, name) is unique.
CREATE TABLE es_index_pattern (
    id bigserial,
    datasource_id bigint not null default 0,
    name varchar(191) not null,
    time_field varchar(128) not null default '@timestamp',
    allow_hide_system_indices smallint not null default 0,
    fields_format varchar(4096) not null default '',
    cross_cluster_enabled int not null default 0,
    create_at bigint default '0',
    create_by varchar(64) default '',
    update_at bigint default '0',
    update_by varchar(64) default '',
    note varchar(4096) not null default '',
    PRIMARY KEY (id),
    UNIQUE (datasource_id, name)
);
COMMENT ON COLUMN es_index_pattern.datasource_id IS 'datasource id';
-- The previous text ('description of metric in Chinese') was copied from
-- builtin_metrics.note and does not describe this column.
COMMENT ON COLUMN es_index_pattern.note IS 'note of the index pattern';

-- builtin_metrics: built-in metric descriptions; unique per (lang, collector, typ, name).
CREATE TABLE builtin_metrics (
    id bigserial,
    collector varchar(191) NOT NULL,
    typ varchar(191) NOT NULL,
    name varchar(191) NOT NULL,
    unit varchar(191) NOT NULL,
    lang varchar(191) NOT NULL DEFAULT '',
    note varchar(4096) NOT NULL,
    expression varchar(4096) NOT NULL,
    expression_type varchar(32) NOT NULL DEFAULT 'promql',
    metric_type varchar(191) NOT NULL DEFAULT '',
    extra_fields text,
    created_at bigint NOT NULL DEFAULT 0,
    created_by varchar(191) NOT NULL DEFAULT '',
    updated_at bigint NOT NULL DEFAULT 0,
    updated_by varchar(191) NOT NULL DEFAULT '',
    uuid BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (id),
    UNIQUE (lang, collector, typ, name)
);
CREATE INDEX idx_collector ON builtin_metrics (collector);
CREATE INDEX idx_typ ON builtin_metrics (typ);
CREATE INDEX idx_name ON builtin_metrics (name);
CREATE INDEX idx_lang ON builtin_metrics (lang);
COMMENT ON COLUMN builtin_metrics.id IS 'unique identifier';
COMMENT ON COLUMN builtin_metrics.collector IS 'type of collector';
COMMENT ON COLUMN builtin_metrics.typ IS 'type of metric';
COMMENT ON COLUMN builtin_metrics.name IS 'name of metric';
COMMENT ON COLUMN builtin_metrics.unit IS 'unit of metric';
COMMENT ON COLUMN builtin_metrics.lang IS 'language of metric';
COMMENT ON COLUMN builtin_metrics.note IS 'description of metric in Chinese';
COMMENT ON COLUMN builtin_metrics.expression IS 'expression of metric';
COMMENT ON COLUMN builtin_metrics.expression_type IS 'expression type: metric_name or promql';
COMMENT ON COLUMN builtin_metrics.metric_type IS 'metric type like counter/gauge';
COMMENT ON COLUMN builtin_metrics.extra_fields IS 'custom extra fields';
COMMENT ON COLUMN builtin_metrics.created_at IS 'create time';
COMMENT ON COLUMN builtin_metrics.created_by IS 'creator';
COMMENT ON COLUMN builtin_metrics.updated_at IS 'update time';
COMMENT ON COLUMN builtin_metrics.updated_by IS 'updater';
COMMENT ON COLUMN builtin_metrics.uuid IS 'unique identifier';

CREATE TABLE metric_filter (
    id BIGSERIAL PRIMARY KEY,
    name VARCHAR(191) NOT NULL,
    configs VARCHAR(4096) NOT NULL,
    groups_perm TEXT,
    create_at BIGINT NOT NULL DEFAULT 0,
    create_by VARCHAR(191) NOT NULL DEFAULT '',
    update_at BIGINT NOT NULL DEFAULT 0,
    update_by VARCHAR(191) NOT NULL DEFAULT ''
);
CREATE INDEX idx_metric_filter_name ON metric_filter (name);

-- board_busigroup: pairs of (busi_group_id, board_id); the pair is the primary key.
CREATE TABLE board_busigroup (
    busi_group_id BIGINT NOT NULL DEFAULT 0,
    board_id BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (busi_group_id, board_id)
);

CREATE TABLE builtin_components (
    id BIGSERIAL PRIMARY KEY,
    ident VARCHAR(191) NOT NULL,
    logo VARCHAR(191) NOT NULL,
    readme TEXT NOT NULL,
    disabled INT NOT NULL DEFAULT 0,
    created_at BIGINT NOT NULL DEFAULT 0,
    created_by VARCHAR(191) NOT NULL DEFAULT '',
    updated_at BIGINT NOT NULL DEFAULT 0,
    updated_by VARCHAR(191) NOT NULL DEFAULT ''
);
CREATE INDEX idx_ident ON builtin_components (ident);

CREATE TABLE builtin_payloads (
    id BIGSERIAL PRIMARY KEY,
    type VARCHAR(191) NOT NULL,
    uuid BIGINT NOT NULL DEFAULT 0,
    component VARCHAR(191) NOT NULL,
    cate VARCHAR(191) NOT NULL,
    name VARCHAR(191) NOT NULL,
    tags VARCHAR(191) NOT NULL DEFAULT '',
    content TEXT NOT NULL,
    note VARCHAR(1024) NOT NULL DEFAULT '',
    created_at BIGINT NOT NULL DEFAULT 0,
    created_by VARCHAR(191) NOT NULL DEFAULT '',
    updated_at BIGINT NOT NULL DEFAULT 0,
    updated_by VARCHAR(191) NOT NULL DEFAULT ''
);
CREATE INDEX idx_component ON builtin_payloads (component);
CREATE INDEX idx_builtin_payloads_name ON builtin_payloads (name);
CREATE INDEX idx_cate ON builtin_payloads (cate);
CREATE INDEX idx_type ON builtin_payloads (type);

-- dash_annotation: annotations attached to a dashboard panel over a time range.
CREATE TABLE dash_annotation (
    id bigserial PRIMARY KEY,
    dashboard_id bigint not null,
    panel_id varchar(191) not null,
    tags text,
    description text,
    config text,
    time_start bigint not null default 0,
    time_end bigint not null default 0,
    create_at bigint not null default 0,
    create_by varchar(64) not null default '',
    update_at bigint not null default 0,
    update_by varchar(64) not null default ''
);

CREATE TABLE source_token (
    id bigserial PRIMARY KEY,
    source_type varchar(64) NOT NULL DEFAULT '',
    source_id varchar(255) NOT NULL DEFAULT '',
    token varchar(255) NOT NULL DEFAULT '',
    expire_at bigint NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT ''
);
CREATE INDEX idx_source_token_type_id_token ON source_token (source_type, source_id, token);

-- notification_record: one row per notification, keyed to an event history id.
CREATE TABLE notification_record (
    id BIGSERIAL PRIMARY KEY,
    notify_rule_id BIGINT NOT NULL DEFAULT 0,
    event_id bigint NOT NULL,
    sub_id bigint DEFAULT NULL,
    channel varchar(255) NOT NULL,
    status bigint DEFAULT NULL,
    target varchar(1024) NOT NULL,
    details varchar(2048) DEFAULT '',
    created_at bigint NOT NULL
);
CREATE INDEX idx_evt ON notification_record (event_id);
COMMENT ON COLUMN notification_record.event_id IS 'event history id';
COMMENT ON COLUMN notification_record.sub_id IS 'subscribed rule id';
COMMENT ON COLUMN notification_record.channel IS 'notification channel name';
COMMENT ON COLUMN notification_record.status IS 'notification status';
COMMENT ON COLUMN notification_record.target IS 'notification target';
COMMENT ON COLUMN notification_record.details IS 'notification other info';
COMMENT ON COLUMN notification_record.created_at IS 'create time';

-- target_busi_group: target-to-group links; (target_ident, group_id) is unique.
CREATE TABLE target_busi_group (
    id BIGSERIAL PRIMARY KEY,
    target_ident varchar(191) NOT NULL,
    group_id bigint NOT NULL,
    update_at bigint NOT NULL
);
CREATE UNIQUE INDEX idx_target_group ON target_busi_group (target_ident, group_id);

CREATE TABLE user_token (
    id BIGSERIAL PRIMARY KEY,
    username varchar(255) NOT NULL DEFAULT '',
    token_name varchar(255) NOT NULL DEFAULT '',
    token varchar(255) NOT NULL DEFAULT '',
    create_at bigint NOT NULL DEFAULT 0,
    last_used bigint NOT NULL DEFAULT 0
);

CREATE TABLE notify_rule (
    id bigserial PRIMARY KEY,
    name varchar(255) NOT NULL,
    description text,
    enable boolean DEFAULT false,
    user_group_ids varchar(255) NOT NULL DEFAULT '',
    notify_configs text,
    pipeline_configs text,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT ''
);

CREATE TABLE notify_channel (
    id bigserial PRIMARY KEY,
    name varchar(255) NOT NULL,
    ident varchar(255) NOT NULL,
    description text,
    enable boolean DEFAULT false,
    param_config text,
    request_type varchar(50) NOT NULL,
    request_config text,
    weight int NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT ''
);

CREATE TABLE message_template (
    id bigserial PRIMARY KEY,
    name varchar(64) NOT NULL,
    ident varchar(64) NOT NULL,
    content text,
    user_group_ids varchar(64),
    notify_channel_ident varchar(64) NOT NULL DEFAULT '',
    private int NOT NULL DEFAULT 0,
    weight int NOT NULL DEFAULT 0,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT ''
);

CREATE TABLE event_pipeline (
    id bigserial PRIMARY KEY,
    name varchar(128) NOT NULL,
    team_ids text,
    description varchar(255) NOT NULL DEFAULT '',
    filter_enable smallint NOT NULL DEFAULT 0,
    label_filters text,
    attribute_filters text,
    processors text,
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT ''
);

CREATE TABLE embedded_product (
    id bigserial PRIMARY KEY,
    name varchar(255) DEFAULT NULL,
    url varchar(255) DEFAULT NULL,
    is_private boolean DEFAULT NULL,
    team_ids varchar(255),
    create_at bigint NOT NULL DEFAULT 0,
    create_by varchar(64) NOT NULL DEFAULT '',
    update_at bigint NOT NULL DEFAULT 0,
    update_by varchar(64) NOT NULL DEFAULT ''
);

================================================
FILE: docker/compose-postgres/initsql_for_postgres/b-ibex-for-Postgres.sql
================================================
-- task_meta: task definitions (script, arguments, batching and timeout settings).
CREATE TABLE task_meta (
    id bigserial,
    title varchar(255) not null default '',
    account varchar(64) not null,
    batch int not null default 0,
    tolerance int not null default 0,
    timeout int not null default 0,
    pause varchar(255) not null default '',
    script text not null,
    args varchar(512) not null default '',
    stdin varchar(1024) not null default '',
    creator varchar(64) not null default '',
    created timestamp not null default CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);
CREATE INDEX task_meta_creator_idx ON task_meta (creator);
CREATE INDEX task_meta_created_idx ON task_meta (created);

/* start|cancel|kill|pause */
CREATE TABLE task_action (
    id bigint not null,
    action varchar(32) not null,
    clock bigint not null default 0,
    PRIMARY KEY (id)
);

CREATE TABLE task_scheduler (
    id bigint not null,
    scheduler varchar(128) not null default ''
);
CREATE INDEX task_scheduler_id_scheduler_idx ON task_scheduler (id, scheduler);

-- task_scheduler_health: one row per scheduler with a clock value.
CREATE TABLE task_scheduler_health (
    scheduler varchar(128) not null,
    clock bigint not null,
    UNIQUE (scheduler)
);
CREATE INDEX task_scheduler_health_clock_idx ON task_scheduler_health (clock);

CREATE TABLE task_host_doing (
    id bigint not null,
    host varchar(128) not null,
    clock bigint not null default 0,
    action varchar(16) not null
);
CREATE INDEX task_host_doing_id_idx ON task_host_doing (id);
CREATE INDEX task_host_doing_host_idx ON task_host_doing (host);

-- task_host_N: identically shaped per-host status/output tables; (id, host) is
-- unique within each table.
CREATE TABLE task_host_0 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_1 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_2 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_3 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_4 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_5 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_6 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_7 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_8 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_9 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_10 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_11 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_12 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_13 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_14 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_15 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_16 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_17 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_18 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_19 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_20 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_21 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_22 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_23 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_24 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_25 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_26 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_27 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_28 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_29 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_30 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
CREATE TABLE task_host_31 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) );
-- The statement below is intentionally left open: its remaining clauses are on
-- the following line of the file.
CREATE TABLE task_host_32 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text,
UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_33 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_34 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_35 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_36 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_37 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_38 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_39 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_40 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_41 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_42 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_43 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) 
not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_44 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_45 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_46 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_47 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_48 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_49 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_50 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_51 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_52 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_53 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_54 ( ii bigserial, id bigint not null, host 
varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_55 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_56 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_57 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_58 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_59 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_60 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_61 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_62 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_63 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_64 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_65 ( ii 
bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_66 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_67 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_68 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_69 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_70 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_71 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_72 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_73 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_74 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_75 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) 
) ; CREATE TABLE task_host_76 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_77 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_78 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_79 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_80 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_81 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_82 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_83 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_84 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_85 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_86 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, 
UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_87 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_88 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_89 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_90 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_91 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_92 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_93 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_94 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_95 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_96 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_97 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) 
not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_98 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; CREATE TABLE task_host_99 ( ii bigserial, id bigint not null, host varchar(128) not null, status varchar(32) not null, stdout text, stderr text, UNIQUE (id, host), PRIMARY KEY (ii) ) ; ================================================ FILE: docker/compose-postgres/n9eetc_pg/config.toml ================================================ [Global] RunMode = "release" [Log] # log write dir Dir = "logs" # log level: DEBUG INFO WARNING ERROR Level = "INFO" # stdout, stderr, file Output = "stdout" # # rotate by time # KeepHours = 4 # # rotate by size # RotateNum = 3 # # unit: MB # RotateSize = 256 [HTTP] # http listening address Host = "0.0.0.0" # http listening port Port = 17000 # https cert file path CertFile = "" # https key file path KeyFile = "" # whether print access log PrintAccessLog = false # whether enable pprof PProf = false # expose prometheus /metrics? 
ExposeMetrics = true # http graceful shutdown timeout, unit: s ShutdownTimeout = 30 # max content length: 64M MaxContentLength = 67108864 # http server read timeout, unit: s ReadTimeout = 20 # http server write timeout, unit: s WriteTimeout = 40 # http server idle timeout, unit: s IdleTimeout = 120 [HTTP.ShowCaptcha] Enable = false [HTTP.APIForAgent] Enable = true # [HTTP.APIForAgent.BasicAuth] # user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.APIForService] Enable = false [HTTP.APIForService.BasicAuth] user001 = "ccc26da7b9aba533cbb263a36c07dcc5" [HTTP.JWTAuth] # unit: min AccessExpired = 1500 # unit: min RefreshExpired = 10080 RedisKeyPrefix = "/jwt/" [HTTP.ProxyAuth] # if proxy auth enabled, jwt auth is disabled Enable = false # username key in http proxy header HeaderUserNameKey = "X-User-Name" DefaultRoles = ["Standard"] [HTTP.RSA] # open RSA OpenRSA = false # RSA public key RSAPublicKeyPath = "/etc/n9e/public.pem" # RSA private key RSAPrivateKeyPath = "/etc/n9e/private.pem" # RSA private key password RSAPassWord = "" [DB] DSN="host=postgres port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable" # enable debug mode or not Debug = false # mysql postgres DBType = "postgres" # unit: s MaxLifetime = 7200 # max open connections MaxOpenConns = 150 # max idle connections MaxIdleConns = 50 [Redis] # address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs) Address = "redis:6379" # Username = "" # Password = "" # DB = 0 # UseTLS = false # TLSMinVersion = "1.2" # standalone cluster sentinel RedisType = "standalone" # Mastername for sentinel type # MasterName = "mymaster" # SentinelUsername = "" # SentinelPassword = "" [Alert] [Alert.Heartbeat] # auto detect if blank IP = "" # unit ms Interval = 1000 EngineName = "default" # [Alert.Alerting] # NotifyConcurrency = 10 [Center] MetricsYamlFile = "./etc/metrics.yaml" I18NHeaderKey = "X-Language" [Center.AnonymousAccess] PromQuerier = true AlertDetail = true [Pushgw] # use target labels in 
database instead of in series LabelRewrite = true ForceUseServerTS = true # [Pushgw.DebugSample] # ident = "xx" # __name__ = "xx" # [Pushgw.WriterOpt] # QueueMaxSize = 1000000 # QueuePopSize = 1000 [[Pushgw.Writers]] # Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write" Url = "http://victoriametrics:8428/api/v1/write" # Basic auth username BasicAuthUser = "" # Basic auth password BasicAuthPass = "" # timeout settings, unit: ms Headers = ["X-From", "n9e"] Timeout = 10000 DialTimeout = 3000 TLSHandshakeTimeout = 30000 ExpectContinueTimeout = 1000 IdleConnTimeout = 90000 # time duration, unit: ms KeepAlive = 30000 MaxConnsPerHost = 0 MaxIdleConns = 100 MaxIdleConnsPerHost = 100 ## Optional TLS Config # UseTLS = false # TLSCA = "/etc/n9e/ca.pem" # TLSCert = "/etc/n9e/cert.pem" # TLSKey = "/etc/n9e/key.pem" # InsecureSkipVerify = false # [[Writers.WriteRelabels]] # Action = "replace" # SourceLabels = ["__address__"] # Regex = "([^:]+)(?::\\d+)?" # Replacement = "$1:80" # TargetLabel = "__address__" [Ibex] Enable = true RPCListen = "0.0.0.0:20090" ================================================ FILE: docker/compose-postgres/n9eetc_pg/metrics.yaml ================================================ cpu_usage_idle: CPU空闲率(单位:%) cpu_usage_active: CPU使用率(单位:%) cpu_usage_system: CPU内核态时间占比(单位:%) cpu_usage_user: CPU用户态时间占比(单位:%) cpu_usage_nice: 低优先级用户态CPU时间占比,也就是进程nice值被调整为1-19之间的CPU时间。这里注意,nice可取值范围是-20到19,数值越大,优先级反而越低(单位:%) cpu_usage_iowait: CPU等待I/O的时间占比(单位:%) cpu_usage_irq: CPU处理硬中断的时间占比(单位:%) cpu_usage_softirq: CPU处理软中断的时间占比(单位:%) cpu_usage_steal: 在虚拟机环境下有该指标,表示CPU被其他虚拟机争用的时间占比,超过20就表示争抢严重(单位:%) cpu_usage_guest: 通过虚拟化运行其他操作系统的时间,也就是运行虚拟机的CPU时间占比(单位:%) cpu_usage_guest_nice: 以低优先级运行虚拟机的时间占比(单位:%) disk_free: 硬盘分区剩余量(单位:byte) disk_used: 硬盘分区使用量(单位:byte) disk_used_percent: 硬盘分区使用率(单位:%) disk_total: 硬盘分区总量(单位:byte) disk_inodes_free: 硬盘分区inode剩余量 disk_inodes_used: 硬盘分区inode使用量 disk_inodes_total: 硬盘分区inode总量 diskio_io_time: 
从设备视角来看I/O请求总时间,队列中有I/O请求就计数(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_iops_in_progress: 已经分配给设备驱动且尚未完成的IO请求,不包含在队列中但尚未分配给设备驱动的IO请求,gauge类型 diskio_merged_reads: 相邻读请求merge读的次数,counter类型 diskio_merged_writes: 相邻写请求merge写的次数,counter类型 diskio_read_bytes: 读取的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_read_time: 读请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_reads: 读请求次数,counter类型,需要用函数求rate才有使用价值 diskio_weighted_io_time: 从I/O请求视角来看I/O等待总时间,如果同时有多个I/O请求,时间会叠加(单位:毫秒) diskio_write_bytes: 写入的byte数量,counter类型,需要用函数求rate才有使用价值 diskio_write_time: 写请求总时间(单位:毫秒),counter类型,需要用函数求rate才有使用价值 diskio_writes: 写请求次数,counter类型,需要用函数求rate才有使用价值 kernel_boot_time: 内核启动时间 kernel_context_switches: 内核上下文切换次数 kernel_entropy_avail: linux系统内部的熵池 kernel_interrupts: 内核中断次数 kernel_processes_forked: fork的进程数 mem_active: 活跃使用的内存总数(包括cache和buffer内存) mem_available: 应用程序可用内存数 mem_available_percent: 内存剩余百分比(0~100) mem_buffered: 用来给文件做缓冲大小 mem_cached: 被高速缓冲存储器(cache memory)用的内存的大小(等于 diskcache minus SwapCache ) mem_commit_limit: 根据超额分配比率('vm.overcommit_ratio'),这是当前在系统上分配可用的内存总量,这个限制只是在模式2('vm.overcommit_memory')时启用 mem_committed_as: 目前在系统上分配的内存量。是所有进程申请的内存的总和 mem_dirty: 等待被写回到磁盘的内存大小 mem_free: 空闲内存数 mem_high_free: 未被使用的高位内存大小 mem_high_total: 高位内存总大小(Highmem是指所有内存高于860MB的物理内存,Highmem区域供用户程序使用,或用于页面缓存。该区域不是直接映射到内核空间。内核必须使用不同的手法使用该段内存) mem_huge_page_size: 每个大页的大小 mem_huge_pages_free: 池中尚未分配的 HugePages 数量 mem_huge_pages_total: 预留HugePages的总个数 mem_inactive: 空闲的内存数(包括free和available的内存) mem_low_free: 未被使用的低位大小 mem_low_total: 低位内存总大小,低位可以达到高位内存一样的作用,而且它还能够被内核用来记录一些自己的数据结构 mem_mapped: 设备和文件等映射的大小 mem_page_tables: 管理内存分页页面的索引表的大小 mem_shared: 多个进程共享的内存总额 mem_slab: 内核数据结构缓存的大小,可以减少申请和释放内存带来的消耗 mem_sreclaimable: 可收回Slab的大小 mem_sunreclaim: 不可收回Slab的大小(SUnreclaim+SReclaimable=Slab) mem_swap_cached: 被高速缓冲存储器(cache memory)用的交换空间的大小,已经被交换出来的内存,但仍然被存放在swapfile中。用来在需要的时候很快的被替换而不需要再次打开I/O端口 mem_swap_free: 未被使用交换空间的大小 mem_swap_total: 交换空间的总大小 mem_total: 内存总数 mem_used: 已用内存数 mem_used_percent: 已用内存数百分比(0~100) mem_vmalloc_chunk: 
最大的连续未被使用的vmalloc区域 mem_vmalloc_totalL: 可以vmalloc虚拟内存大小 mem_vmalloc_used: vmalloc已使用的虚拟内存大小 mem_write_back: 正在被写回到磁盘的内存大小 mem_write_back_tmp: FUSE用于临时写回缓冲区的内存 net_bytes_recv: 网卡收包总数(bytes) net_bytes_sent: 网卡发包总数(bytes) net_drop_in: 网卡收丢包数量 net_drop_out: 网卡发丢包数量 net_err_in: 网卡收包错误数量 net_err_out: 网卡发包错误数量 net_packets_recv: 网卡收包数量 net_packets_sent: 网卡发包数量 netstat_tcp_established: ESTABLISHED状态的网络链接数 netstat_tcp_fin_wait1: FIN_WAIT1状态的网络链接数 netstat_tcp_fin_wait2: FIN_WAIT2状态的网络链接数 netstat_tcp_last_ack: LAST_ACK状态的网络链接数 netstat_tcp_listen: LISTEN状态的网络链接数 netstat_tcp_syn_recv: SYN_RECV状态的网络链接数 netstat_tcp_syn_sent: SYN_SENT状态的网络链接数 netstat_tcp_time_wait: TIME_WAIT状态的网络链接数 netstat_udp_socket: UDP状态的网络链接数 processes_blocked: 不可中断的睡眠状态下的进程数('U','D','L') processes_dead: 回收中的进程数('X') processes_idle: 挂起的空闲进程数('I') processes_paging: 分页进程数('P') processes_running: 运行中的进程数('R') processes_sleeping: 可中断进程数('S') processes_stopped: 暂停状态进程数('T') processes_total: 总进程数 processes_total_threads: 总线程数 processes_unknown: 未知状态进程数 processes_zombies: 僵尸态进程数('Z') swap_used_percent: Swap空间换出数据量 system_load1: 1分钟平均load值 system_load5: 5分钟平均load值 system_load15: 15分钟平均load值 system_n_users: 用户数 system_n_cpus: CPU核数 system_uptime: 系统启动时间 nginx_accepts: 自nginx启动起,与客户端建立过的连接总数 nginx_active: 当前nginx正在处理的活动连接数,等于Reading/Writing/Waiting总和 nginx_handled: 自nginx启动起,处理过的客户端连接总数 nginx_reading: 正在读取HTTP请求头部的连接总数 nginx_requests: 自nginx启动起,处理过的客户端请求总数,由于存在HTTP Keep-Alive请求,该值会大于handled值 nginx_upstream_check_fall: upstream_check模块检测到后端失败的次数 nginx_upstream_check_rise: upstream_check模块对后端的检测次数 nginx_upstream_check_status_code: 后端upstream的状态,up为1,down为0 nginx_waiting: 开启 keep-alive 的情况下,这个值等于 active – (reading+writing), 意思就是 Nginx 已经处理完正在等候下一次请求指令的驻留连接 nginx_writing: 正在向客户端发送响应的连接总数 http_response_content_length: HTTP消息实体的传输长度 http_response_http_response_code: http响应状态码 http_response_response_time: http响应用时 http_response_result_code: url探测结果0为正常否则url无法访问 # [mysqld_exporter] mysql_global_status_uptime: The number of
seconds that the server has been up.(Gauge) mysql_global_status_uptime_since_flush_status: The number of seconds since the most recent FLUSH STATUS statement.(Gauge) mysql_global_status_queries: The number of statements executed by the server. This variable includes statements executed within stored programs, unlike the Questions variable. It does not count COM_PING or COM_STATISTICS commands.(Counter) mysql_global_status_threads_connected: The number of currently open connections.(Gauge) mysql_global_status_connections: The number of connection attempts (successful or not) to the MySQL server.(Counter) mysql_global_status_max_used_connections: The maximum number of connections that have been in use simultaneously since the server started.(Gauge) mysql_global_status_threads_running: The number of threads that are not sleeping.(Gauge) mysql_global_status_questions: The number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries variable. This variable does not count COM_PING, COM_STATISTICS, COM_STMT_PREPARE, COM_STMT_CLOSE, or COM_STMT_RESET commands.(Counter) mysql_global_status_threads_cached: The number of threads in the thread cache.(Gauge) mysql_global_status_threads_created: The number of threads created to handle connections. If Threads_created is big, you may want to increase the thread_cache_size value. The cache miss rate can be calculated as Threads_created/Connections.(Counter) mysql_global_status_created_tmp_tables: The number of internal temporary tables created by the server while executing statements.(Counter) mysql_global_status_created_tmp_disk_tables: The number of internal on-disk temporary tables created by the server while executing statements.
You can compare the number of internal on-disk temporary tables created to the total number of internal temporary tables created by comparing Created_tmp_disk_tables and Created_tmp_tables values.(Counter) mysql_global_status_created_tmp_files: How many temporary files mysqld has created.(Counter) mysql_global_status_select_full_join: The number of joins that perform table scans because they do not use indexes. If this value is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_full_range_join: The number of joins that used a range search on a reference table.(Counter) mysql_global_status_select_range: The number of joins that used ranges on the first table. This is normally not a critical issue even if the value is quite large.(Counter) mysql_global_status_select_range_check: The number of joins without keys that check for key usage after each row. If this is not 0, you should carefully check the indexes of your tables.(Counter) mysql_global_status_select_scan: The number of joins that did a full scan of the first table.(Counter) mysql_global_status_sort_rows: The number of sorted rows.(Counter) mysql_global_status_sort_range: The number of sorts that were done using ranges.(Counter) mysql_global_status_sort_merge_passes: The number of merge passes that the sort algorithm has had to do. If this value is large, you should consider increasing the value of the sort_buffer_size system variable.(Counter) mysql_global_status_sort_scan: The number of sorts that were done by scanning the table.(Counter) mysql_global_status_slow_queries: The number of queries that have taken more than long_query_time seconds. 
This counter increments regardless of whether the slow query log is enabled.(Counter) mysql_global_status_aborted_connects: The number of failed attempts to connect to the MySQL server.(Counter) mysql_global_status_aborted_clients: The number of connections that were aborted because the client died without closing the connection properly.(Counter) mysql_global_status_table_locks_immediate: The number of times that a request for a table lock could be granted immediately. Locks Immediate rising and falling is normal activity.(Counter) mysql_global_status_table_locks_waited: The number of times that a request for a table lock could not be granted immediately and a wait was needed. If this is high and you have performance problems, you should first optimize your queries, and then either split your table or tables or use replication.(Counter) mysql_global_status_bytes_received: The number of bytes received from all clients.(Counter) mysql_global_status_bytes_sent: The number of bytes sent to all clients.(Counter) mysql_global_status_innodb_page_size: InnoDB page size (default 16KB). Many values are counted in pages; the page size enables them to be easily converted to bytes.(Gauge) mysql_global_status_buffer_pool_pages: The number of pages in the InnoDB buffer pool.(Gauge) mysql_global_status_commands_total: The number of times each xxx statement has been executed.(Counter) mysql_global_status_handlers_total: Handler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes. This is in fact the layer between the Storage Engine and MySQL.(Counter) mysql_global_status_opened_files: The number of files that have been opened with my_open() (a mysys library function). 
Parts of the server that open files without using this function do not increment the count.(Counter) mysql_global_status_open_tables: The number of tables that are open.(Gauge) mysql_global_status_opened_tables: The number of tables that have been opened. If Opened_tables is big, your table_open_cache value is probably too small.(Counter) mysql_global_status_table_open_cache_hits: The number of hits for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_misses: The number of misses for open tables cache lookups.(Counter) mysql_global_status_table_open_cache_overflows: The number of overflows for the open tables cache.(Counter) mysql_global_status_innodb_num_open_files: The number of files InnoDB currently holds open.(Gauge) mysql_global_status_connection_errors_total: These variables provide information about errors that occur during the client connection process.(Counter) mysql_global_status_innodb_buffer_pool_read_requests: The number of logical read requests.(Counter) mysql_global_status_innodb_buffer_pool_reads: The number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk.(Counter) mysql_global_variables_thread_cache_size: How many threads the server should cache for reuse.(Gauge) mysql_global_variables_max_connections: The maximum permitted number of simultaneous client connections.(Gauge) mysql_global_variables_innodb_buffer_pool_size: The size in bytes of the buffer pool, the memory area where InnoDB caches table and index data. 
The default value is 134217728 bytes (128MB).(Gauge) mysql_global_variables_innodb_log_buffer_size: The size in bytes of the buffer that InnoDB uses to write to the log files on disk.(Gauge) mysql_global_variables_key_buffer_size: Index blocks for MyISAM tables are buffered and are shared by all threads.(Gauge) mysql_global_variables_query_cache_size: The amount of memory allocated for caching query results.(Gauge) mysql_global_variables_table_open_cache: The number of open tables for all threads.(Gauge) mysql_global_variables_open_files_limit: The number of file descriptors available to mysqld from the operating system.(Gauge) # [redis_exporter] redis_active_defrag_running: When activedefrag is enabled, this indicates whether defragmentation is currently active, and the CPU percentage it intends to utilize. redis_allocator_active_bytes: Total bytes in the allocator active pages, this includes external-fragmentation. redis_allocator_allocated_bytes: Total bytes allocated from the allocator, including internal-fragmentation. Normally the same as used_memory. redis_allocator_frag_bytes: Delta between allocator_active and allocator_allocated. See note about mem_fragmentation_bytes. redis_allocator_frag_ratio: Ratio between allocator_active and allocator_allocated. This is the true (external) fragmentation metric (not mem_fragmentation_ratio). redis_allocator_resident_bytes: Total bytes resident (RSS) in the allocator, this includes pages that can be released to the OS (by MEMORY PURGE, or just waiting). redis_allocator_rss_bytes: Delta between allocator_resident and allocator_active. redis_allocator_rss_ratio: Ratio between allocator_resident and allocator_active. This usually indicates pages that the allocator can and probably will soon release back to the OS. redis_aof_current_rewrite_duration_sec: Duration of the on-going AOF rewrite operation if any. redis_aof_enabled: Flag indicating AOF logging is activated.
redis_aof_last_bgrewrite_status: Status of the last AOF rewrite operation. redis_aof_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last AOF rewrite operation. redis_aof_last_rewrite_duration_sec: Duration of the last AOF rewrite operation in seconds. redis_aof_last_write_status: Status of the last write operation to the AOF. redis_aof_rewrite_in_progress: Flag indicating an AOF rewrite operation is on-going. redis_aof_rewrite_scheduled: Flag indicating an AOF rewrite operation will be scheduled once the on-going RDB save is complete. redis_blocked_clients: Number of clients pending on a blocking call (BLPOP, BRPOP, BRPOPLPUSH, BLMOVE, BZPOPMIN, BZPOPMAX). redis_client_recent_max_input_buffer_bytes: Biggest input buffer among current client connections. redis_client_recent_max_output_buffer_bytes: Biggest output buffer among current client connections. redis_cluster_enabled: Indicate Redis cluster is enabled. redis_commands_duration_seconds_total: The total CPU time consumed by these commands.(Counter) redis_commands_processed_total: Total number of commands processed by the server.(Counter) redis_commands_total: The number of calls that reached command execution (not rejected).(Counter) redis_config_maxclients: The value of the maxclients configuration directive. This is the upper limit for the sum of connected_clients, connected_slaves and cluster_connections. redis_config_maxmemory: The value of the maxmemory configuration directive. redis_connected_clients: Number of client connections (excluding connections from replicas). redis_connected_slaves: Number of connected replicas.
redis_connections_received_total: Total number of connections accepted by the server.(Counter) redis_cpu_sys_children_seconds_total: System CPU consumed by the background processes.(Counter) redis_cpu_sys_seconds_total: System CPU consumed by the Redis server, which is the sum of system CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_cpu_user_children_seconds_total: User CPU consumed by the background processes.(Counter) redis_cpu_user_seconds_total: User CPU consumed by the Redis server, which is the sum of user CPU consumed by all threads of the server process (main thread and background threads).(Counter) redis_db_keys: Total number of keys by DB. redis_db_keys_expiring: Total number of expiring keys by DB redis_defrag_hits: Number of value reallocations performed by the active defragmentation process. redis_defrag_misses: Number of aborted value reallocations started by the active defragmentation process. redis_defrag_key_hits: Number of keys that were actively defragmented. redis_defrag_key_misses: Number of keys that were skipped by the active defragmentation process. redis_evicted_keys_total: Number of evicted keys due to maxmemory limit.(Counter) redis_expired_keys_total: Total number of key expiration events.(Counter) redis_expired_stale_percentage: The percentage of keys probably expired. redis_expired_time_cap_reached_total: The count of times that active expiry cycles have stopped early. redis_exporter_last_scrape_connect_time_seconds: The duration(in seconds) to connect when scrape. redis_exporter_last_scrape_duration_seconds: The last scrape duration. redis_exporter_last_scrape_error: The last scrape error status.
redis_exporter_scrape_duration_seconds_count: Durations of scrapes by the exporter redis_exporter_scrape_duration_seconds_sum: Durations of scrapes by the exporter redis_exporter_scrapes_total: Current total redis scrapes.(Counter) redis_instance_info: Information about the Redis instance. redis_keyspace_hits_total: Hits total.(Counter) redis_keyspace_misses_total: Misses total.(Counter) redis_last_key_groups_scrape_duration_milliseconds: Duration of the last key group metrics scrape in milliseconds. redis_last_slow_execution_duration_seconds: The amount of time needed for last slow execution, in seconds. redis_latest_fork_seconds: The amount of time needed for last fork, in seconds. redis_lazyfree_pending_objects: The number of objects waiting to be freed (as a result of calling UNLINK, or FLUSHDB and FLUSHALL with the ASYNC option). redis_master_repl_offset: The server's current replication offset. redis_mem_clients_normal: Memory used by normal clients.(Gauge) redis_mem_clients_slaves: Memory used by replica clients - Starting Redis 7.0, replica buffers share memory with the replication backlog, so this field can show 0 when replicas don't trigger an increase of memory usage. redis_mem_fragmentation_bytes: Delta between used_memory_rss and used_memory. Note that when the total fragmentation bytes is low (few megabytes), a high ratio (e.g. 1.5 and above) is not an indication of an issue. redis_mem_fragmentation_ratio: Ratio between used_memory_rss and used_memory. Note that this doesn't only include fragmentation, but also other process overheads (see the allocator_* metrics), and also overheads like code, shared libraries, stack, etc. redis_mem_not_counted_for_eviction_bytes: (Gauge) redis_memory_max_bytes: Max memory limit in bytes.
redis_memory_used_bytes: Total number of bytes allocated by Redis using its allocator (either standard libc, jemalloc, or an alternative allocator such as tcmalloc) redis_memory_used_dataset_bytes: The size in bytes of the dataset (used_memory_overhead subtracted from used_memory) redis_memory_used_lua_bytes: Number of bytes used by the Lua engine. redis_memory_used_overhead_bytes: The sum in bytes of all overheads that the server allocated for managing its internal data structures. redis_memory_used_peak_bytes: Peak memory consumed by Redis (in bytes) redis_memory_used_rss_bytes: Number of bytes that Redis allocated as seen by the operating system (a.k.a resident set size). This is the number reported by tools such as top(1) and ps(1) redis_memory_used_scripts_bytes: Number of bytes used by cached Lua scripts redis_memory_used_startup_bytes: Initial amount of memory consumed by Redis at startup in bytes redis_migrate_cached_sockets_total: The number of sockets open for MIGRATE purposes redis_net_input_bytes_total: Total input bytes(Counter) redis_net_output_bytes_total: Total output bytes(Counter) redis_process_id: Process ID redis_pubsub_channels: Global number of pub/sub channels with client subscriptions redis_pubsub_patterns: Global number of pub/sub pattern with client subscriptions redis_rdb_bgsave_in_progress: Flag indicating a RDB save is on-going redis_rdb_changes_since_last_save: Number of changes since the last dump redis_rdb_current_bgsave_duration_sec: Duration of the on-going RDB save operation if any redis_rdb_last_bgsave_duration_sec: Duration of the last RDB save operation in seconds redis_rdb_last_bgsave_status: Status of the last RDB save operation redis_rdb_last_cow_size_bytes: The size in bytes of copy-on-write memory during the last RDB save operation redis_rdb_last_save_timestamp_seconds: Epoch-based timestamp of last successful RDB save redis_rejected_connections_total: Number of connections rejected because of maxclients limit(Counter) 
redis_repl_backlog_first_byte_offset: The master offset of the replication backlog buffer redis_repl_backlog_history_bytes: Size in bytes of the data in the replication backlog buffer redis_repl_backlog_is_active: Flag indicating replication backlog is active redis_replica_partial_resync_accepted: The number of accepted partial resync requests(Gauge) redis_replica_partial_resync_denied: The number of denied partial resync requests(Gauge) redis_replica_resyncs_full: The number of full resyncs with replicas redis_replication_backlog_bytes: Memory used by replication backlog redis_second_repl_offset: The offset up to which replication IDs are accepted. redis_slave_expires_tracked_keys: The number of keys tracked for expiry purposes (applicable only to writable replicas)(Gauge) redis_slowlog_last_id: Last id of slowlog redis_slowlog_length: Total slowlog redis_start_time_seconds: Start time of the Redis instance since unix epoch in seconds. redis_target_scrape_request_errors_total: Errors in requests to the exporter redis_up: Flag indicating redis instance is up redis_uptime_in_seconds: Number of seconds since Redis server start # [windows_exporter] windows_cpu_clock_interrupts_total: Total number of received and serviced clock tick interrupts(counter) windows_cpu_core_frequency_mhz: Core frequency in megahertz(gauge) windows_cpu_cstate_seconds_total: Time spent in low-power idle state(counter) windows_cpu_dpcs_total: Total number of received and serviced deferred procedure calls (DPCs)(counter) windows_cpu_idle_break_events_total: Total number of time processor was woken from idle(counter) windows_cpu_interrupts_total: Total number of received and serviced hardware interrupts(counter) windows_cpu_parking_status: Parking Status represents whether a processor is parked or not(gauge) windows_cpu_processor_performance: Processor Performance is the average performance of the processor while it is executing instructions, as a percentage of the nominal performance of the 
processor. On some processors, Processor Performance may exceed 100%(gauge) windows_cpu_time_total: Time that processor spent in different modes (idle, user, system, ...)(counter) windows_cs_hostname: Labeled system hostname information as provided by ComputerSystem.DNSHostName and ComputerSystem.Domain(gauge) windows_cs_logical_processors: ComputerSystem.NumberOfLogicalProcessors(gauge) windows_cs_physical_memory_bytes: ComputerSystem.TotalPhysicalMemory(gauge) windows_exporter_build_info: A metric with a constant '1' value labeled by version, revision, branch, and goversion from which windows_exporter was built.(gauge) windows_exporter_collector_duration_seconds: Duration of a collection.(gauge) windows_exporter_collector_success: Whether the collector was successful.(gauge) windows_exporter_collector_timeout: Whether the collector timed out.(gauge) windows_exporter_perflib_snapshot_duration_seconds: Duration of perflib snapshot capture(gauge) windows_logical_disk_free_bytes: Free space in bytes (LogicalDisk.PercentFreeSpace)(gauge) windows_logical_disk_idle_seconds_total: Seconds that the disk was idle (LogicalDisk.PercentIdleTime)(counter) windows_logical_disk_read_bytes_total: The number of bytes transferred from the disk during read operations (LogicalDisk.DiskReadBytesPerSec)(counter) windows_logical_disk_read_latency_seconds_total: Shows the average time, in seconds, of a read operation from the disk (LogicalDisk.AvgDiskSecPerRead)(counter) windows_logical_disk_read_seconds_total: Seconds that the disk was busy servicing read requests (LogicalDisk.PercentDiskReadTime)(counter) windows_logical_disk_read_write_latency_seconds_total: Shows the time, in seconds, of the average disk transfer (LogicalDisk.AvgDiskSecPerTransfer)(counter) windows_logical_disk_reads_total: The number of read operations on the disk (LogicalDisk.DiskReadsPerSec)(counter) windows_logical_disk_requests_queued: The number of requests queued to the disk 
(LogicalDisk.CurrentDiskQueueLength)(gauge) windows_logical_disk_size_bytes: Total space in bytes (LogicalDisk.PercentFreeSpace_Base)(gauge) windows_logical_disk_split_ios_total: The number of I/Os to the disk were split into multiple I/Os (LogicalDisk.SplitIOPerSec)(counter) windows_logical_disk_write_bytes_total: The number of bytes transferred to the disk during write operations (LogicalDisk.DiskWriteBytesPerSec)(counter) windows_logical_disk_write_latency_seconds_total: Shows the average time, in seconds, of a write operation to the disk (LogicalDisk.AvgDiskSecPerWrite)(counter) windows_logical_disk_write_seconds_total: Seconds that the disk was busy servicing write requests (LogicalDisk.PercentDiskWriteTime)(counter) windows_logical_disk_writes_total: The number of write operations on the disk (LogicalDisk.DiskWritesPerSec)(counter) windows_net_bytes_received_total: (Network.BytesReceivedPerSec)(counter) windows_net_bytes_sent_total: (Network.BytesSentPerSec)(counter) windows_net_bytes_total: (Network.BytesTotalPerSec)(counter) windows_net_current_bandwidth: (Network.CurrentBandwidth)(gauge) windows_net_packets_outbound_discarded_total: (Network.PacketsOutboundDiscarded)(counter) windows_net_packets_outbound_errors_total: (Network.PacketsOutboundErrors)(counter) windows_net_packets_received_discarded_total: (Network.PacketsReceivedDiscarded)(counter) windows_net_packets_received_errors_total: (Network.PacketsReceivedErrors)(counter) windows_net_packets_received_total: (Network.PacketsReceivedPerSec)(counter) windows_net_packets_received_unknown_total: (Network.PacketsReceivedUnknown)(counter) windows_net_packets_sent_total: (Network.PacketsSentPerSec)(counter) windows_net_packets_total: (Network.PacketsPerSec)(counter) windows_os_info: OperatingSystem.Caption, OperatingSystem.Version(gauge) windows_os_paging_free_bytes: OperatingSystem.FreeSpaceInPagingFiles(gauge) windows_os_paging_limit_bytes: OperatingSystem.SizeStoredInPagingFiles(gauge) 
windows_os_physical_memory_free_bytes: OperatingSystem.FreePhysicalMemory(gauge) windows_os_process_memory_limix_bytes: OperatingSystem.MaxProcessMemorySize(gauge) windows_os_processes: OperatingSystem.NumberOfProcesses(gauge) windows_os_processes_limit: OperatingSystem.MaxNumberOfProcesses(gauge) windows_os_time: OperatingSystem.LocalDateTime(gauge) windows_os_timezone: OperatingSystem.LocalDateTime(gauge) windows_os_users: OperatingSystem.NumberOfUsers(gauge) windows_os_virtual_memory_bytes: OperatingSystem.TotalVirtualMemorySize(gauge) windows_os_virtual_memory_free_bytes: OperatingSystem.FreeVirtualMemory(gauge) windows_os_visible_memory_bytes: OperatingSystem.TotalVisibleMemorySize(gauge) windows_service_info: A metric with a constant '1' value labeled with service information(gauge) windows_service_start_mode: The start mode of the service (StartMode)(gauge) windows_service_state: The state of the service (State)(gauge) windows_service_status: The status of the service (Status)(gauge) windows_system_context_switches_total: Total number of context switches (WMI source is PerfOS_System.ContextSwitchesPersec)(counter) windows_system_exception_dispatches_total: Total number of exceptions dispatched (WMI source is PerfOS_System.ExceptionDispatchesPersec)(counter) windows_system_processor_queue_length: Length of processor queue (WMI source is PerfOS_System.ProcessorQueueLength)(gauge) windows_system_system_calls_total: Total number of system calls (WMI source is PerfOS_System.SystemCallsPersec)(counter) windows_system_system_up_time: System boot time (WMI source is PerfOS_System.SystemUpTime)(gauge) windows_system_threads: Current number of threads (WMI source is PerfOS_System.Threads)(gauge) # [node_exporter] # SYSTEM # CPU context switch 次数 node_context_switches_total: context_switches # Interrupts 次数 node_intr_total: Interrupts # 运行的进程数 node_procs_running: Processes in runnable state # 熵池大小 node_entropy_available_bits: Entropy available to random number 
generators node_time_seconds: System time in seconds since epoch (1970) node_boot_time_seconds: Node boot time, in unixtime # CPU node_cpu_seconds_total: Seconds the CPUs spent in each mode node_load1: cpu load 1m node_load5: cpu load 5m node_load15: cpu load 15m # MEM # 内核态 # 用于追踪已从交换区获取但尚未修改的页面的内存 node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified # 内核用于缓存数据结构供自己使用的内存 node_memory_Slab_bytes: Memory used by the kernel to cache data structures for its own use # slab中可回收的部分 node_memory_SReclaimable_bytes: SReclaimable - Part of Slab, that might be reclaimed, such as caches # slab中不可回收的部分 node_memory_SUnreclaim_bytes: Part of Slab, that cannot be reclaimed on memory pressure # Vmalloc内存区的大小 node_memory_VmallocTotal_bytes: Total size of vmalloc memory area # vmalloc已分配的内存,虚拟地址空间上的连续的内存 node_memory_VmallocUsed_bytes: Amount of vmalloc area which is used # vmalloc区可用的连续最大块的大小,通过此指标可以知道vmalloc可分配连续内存的最大值 node_memory_VmallocChunk_bytes: Largest contiguous block of vmalloc area which is free # 内存的硬件故障删除掉的内存页的总大小 node_memory_HardwareCorrupted_bytes: Amount of RAM that the kernel identified as corrupted / not working # 用于在虚拟和物理内存地址之间映射的内存 node_memory_PageTables_bytes: Memory used to map between virtual and physical memory addresses (gauge) # 内核栈内存,常驻内存,不可回收 node_memory_KernelStack_bytes: Kernel memory stack.
This is not reclaimable # 用来访问高端内存,复制高端内存的临时buffer,称为“bounce buffering”,会降低I/O 性能 node_memory_Bounce_bytes: Memory used for block device bounce buffers #用户态 # 单个巨页大小 node_memory_Hugepagesize_bytes: Huge Page size # 系统分配的常驻巨页数 node_memory_HugePages_Total: Total size of the pool of huge pages # 系统空闲的巨页数 node_memory_HugePages_Free: Huge pages in the pool that are not yet allocated # 进程已申请但未使用的巨页数 node_memory_HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation # 超过系统设定的常驻HugePages数量的个数 node_memory_HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages # 透明巨页 Transparent HugePages (THP) node_memory_AnonHugePages_bytes: Memory in anonymous huge pages # inactivelist中的File-backed内存 node_memory_Inactive_file_bytes: File-backed memory on inactive LRU list # inactivelist中的Anonymous内存 node_memory_Inactive_anon_bytes: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem) # activelist中的File-backed内存 node_memory_Active_file_bytes: File-backed memory on active LRU list # activelist中的Anonymous内存 node_memory_Active_anon_bytes: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs # 禁止换出的页,对应 Unevictable 链表 node_memory_Unevictable_bytes: Amount of unevictable memory that can't be swapped out for a variety of reasons # 共享内存 node_memory_Shmem_bytes: Used shared memory (shared between several processes, thus including RAM disks) # 匿名页内存大小 node_memory_AnonPages_bytes: Memory in user pages not backed by files # 被关联的内存页大小 node_memory_Mapped_bytes: Used memory in mapped pages files which have been mapped, such as libraries # file-backed内存页缓存大小 node_memory_Cached_bytes: Parked file data (file content) cache # 系统中有多少匿名页曾经被swap-out、现在又被swap-in并且swap-in之后页面中的内容一直没发生变化 node_memory_SwapCached_bytes: Memory that keeps track of pages that have been fetched from swap but not yet been modified # 被mlock()系统调用锁定的内存大小 node_memory_Mlocked_bytes: Size of pages locked to 
memory using the mlock() system call # 块设备(block device)所占用的缓存页 node_memory_Buffers_bytes: Block device (e.g. harddisk) cache node_memory_SwapTotal_bytes: Memory information field SwapTotal_bytes node_memory_SwapFree_bytes: Memory information field SwapFree_bytes # DISK node_filesystem_avail_bytes: Filesystem space available to non-root users in bytes node_filesystem_free_bytes: Filesystem free space in bytes node_filesystem_size_bytes: Filesystem size in bytes node_filesystem_files_free: Filesystem total free file nodes node_filesystem_files: Filesystem total file nodes node_filefd_maximum: Max open files node_filefd_allocated: Open files node_filesystem_readonly: Filesystem read-only status node_filesystem_device_error: Whether an error occurred while getting statistics for the given device node_disk_reads_completed_total: The total number of reads completed successfully node_disk_writes_completed_total: The total number of writes completed successfully node_disk_reads_merged_total: The number of reads merged node_disk_writes_merged_total: The number of writes merged node_disk_read_bytes_total: The total number of bytes read successfully node_disk_written_bytes_total: The total number of bytes written successfully node_disk_io_time_seconds_total: Total seconds spent doing I/Os node_disk_read_time_seconds_total: The total number of seconds spent by all reads node_disk_write_time_seconds_total: The total number of seconds spent by all writes node_disk_io_time_weighted_seconds_total: The weighted number of seconds spent doing I/Os # NET node_network_receive_bytes_total: Network device statistic receive_bytes (counter) node_network_transmit_bytes_total: Network device statistic transmit_bytes (counter) node_network_receive_packets_total: Network device statistic receive_packets node_network_transmit_packets_total: Network device statistic transmit_packets node_network_receive_errs_total: Network device statistic receive_errs node_network_transmit_errs_total: Network device
statistic transmit_errs node_network_receive_drop_total: Network device statistic receive_drop node_network_transmit_drop_total: Network device statistic transmit_drop node_nf_conntrack_entries: Number of currently allocated flow entries for connection tracking node_sockstat_TCP_alloc: Number of TCP sockets in state alloc node_sockstat_TCP_inuse: Number of TCP sockets in state inuse node_sockstat_TCP_orphan: Number of TCP sockets in state orphan node_sockstat_TCP_tw: Number of TCP sockets in state tw node_netstat_Tcp_CurrEstab: Statistic TcpCurrEstab node_sockstat_sockets_used: Number of IPv4 sockets in use # [kafka_exporter] kafka_brokers: count of kafka_brokers (gauge) kafka_topic_partitions: Number of partitions for this Topic (gauge) kafka_topic_partition_current_offset: Current Offset of a Broker at Topic/Partition (gauge) kafka_consumergroup_current_offset: Current Offset of a ConsumerGroup at Topic/Partition (gauge) kafka_consumer_lag_millis: Current approximation of consumer lag for a ConsumerGroup at Topic/Partition (gauge) kafka_topic_partition_under_replicated_partition: 1 if Topic/Partition is under Replicated # [zookeeper_exporter] zk_znode_count: The total count of znodes stored zk_ephemerals_count: The number of Ephemerals nodes zk_watch_count: The number of watchers setup over Zookeeper nodes. 
zk_approximate_data_size: Size of data in bytes that a zookeeper server has in its data tree zk_outstanding_requests: Number of currently executing requests zk_packets_sent: Count of the number of zookeeper packets sent from a server zk_packets_received: Count of the number of zookeeper packets received by a server zk_num_alive_connections: Number of active clients connected to a zookeeper server zk_open_file_descriptor_count: Number of file descriptors that a zookeeper server has open zk_max_file_descriptor_count: Maximum number of file descriptors that a zookeeper server can open zk_avg_latency: Average time in milliseconds for requests to be processed zk_min_latency: Minimum time in milliseconds for a request to be processed zk_max_latency: Maximum time in milliseconds for a request to be processed ================================================ FILE: docker/compose-postgres/prometc_vm/prometheus.yml ================================================ # my global config global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. scrape_configs: # The job name is added as a label `job=` to any timeseries scraped from this config. - job_name: 'victoriametrics' # metrics_path defaults to '/metrics' # scheme defaults to 'http'.
static_configs: - targets: ['victoriametrics:8428'] - job_name: 'n9e' # static_configs: # - targets: ['n9e:17000'] file_sd_configs: - files: - targets.json remote_write: - url: 'http://n9e:17000/prometheus/v1/write' ================================================ FILE: docker/compose-postgres/prometc_vm/targets.json ================================================ [ { "targets": [ "n9e:17000" ] } ] ================================================ FILE: docker/initsql/a-n9e.sql ================================================ set names utf8mb4; -- drop database if exists n9e_v6; create database n9e_v6; use n9e_v6; CREATE TABLE `users` ( `id` bigint unsigned not null auto_increment, `username` varchar(64) not null comment 'login name, cannot rename', `nickname` varchar(64) not null comment 'display name, chinese name', `password` varchar(128) not null default '', `phone` varchar(16) not null default '', `email` varchar(64) not null default '', `portrait` varchar(255) not null default '' comment 'portrait image url', `roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space', `contacts` varchar(1024) comment 'json e.g. 
{wecom:xx, dingtalk_robot_token:yy}', `maintainer` tinyint(1) not null default 0, `belong` varchar(191) DEFAULT '' COMMENT 'belong', `last_active_time` bigint DEFAULT 0 COMMENT 'last_active_time', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`), UNIQUE KEY (`username`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; insert into `users`(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', unix_timestamp(now()), 'system', unix_timestamp(now()), 'system'); CREATE TABLE `user_group` ( `id` bigint unsigned not null auto_increment, `name` varchar(128) not null default '', `note` varchar(255) not null default '', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`), KEY (`create_by`), KEY (`update_at`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', unix_timestamp(now()), 'root', unix_timestamp(now()), 'root'); CREATE TABLE `user_group_member` ( `id` bigint unsigned not null auto_increment, `group_id` bigint unsigned not null, `user_id` bigint unsigned not null, KEY (`group_id`), KEY (`user_id`), PRIMARY KEY(`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; insert into user_group_member(group_id, user_id) values(1, 1); CREATE TABLE `configs` ( `id` bigint unsigned not null auto_increment, `ckey` varchar(191) not null, `note` varchar(1024) NOT NULL DEFAULT '' COMMENT 'note', `cval` text COMMENT 'config value', `external` bigint DEFAULT 0 COMMENT '0\\:built-in 1\\:external', `encrypted` bigint DEFAULT 0 COMMENT '0\\:plaintext 1\\:ciphertext', `create_at` bigint DEFAULT 0 COMMENT 'create_at', `create_by` varchar(64) NOT NULL 
DEFAULT '' COMMENT 'create_by', `update_at` bigint DEFAULT 0 COMMENT 'update_at', `update_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'update_by', PRIMARY KEY (`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE `role` ( `id` bigint unsigned not null auto_increment, `name` varchar(191) not null default '', `note` varchar(255) not null default '', PRIMARY KEY (`id`), UNIQUE KEY (`name`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; insert into `role`(name, note) values('Admin', 'Administrator role'); insert into `role`(name, note) values('Standard', 'Ordinary user role'); insert into `role`(name, note) values('Guest', 'Readonly user role'); CREATE TABLE `role_operation`( `id` bigint unsigned not null auto_increment, `role_name` varchar(128) not null, `operation` varchar(191) not null, KEY (`role_name`), KEY (`operation`), PRIMARY KEY(`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; -- Admin is special, who has no concrete operation but can do anything. insert into `role_operation`(role_name, operation) values('Guest', '/metric/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/object/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/log/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/trace/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/help/version'); insert into `role_operation`(role_name, operation) values('Guest', '/help/contact'); insert into `role_operation`(role_name, operation) values('Standard', '/metric/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/object/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/log/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/trace/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/help/version'); insert into `role_operation`(role_name, operation) 
values('Standard', '/help/contact'); insert into `role_operation`(role_name, operation) values('Standard', '/help/servers'); insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in'); insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies'); insert into `role_operation`(role_name, operation) values('Standard', '/users'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/add'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/put'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/del'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/add'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/put'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/del'); insert into `role_operation`(role_name, operation) values('Standard', '/targets'); insert into `role_operation`(role_name, operation) values('Standard', '/targets/add'); insert into `role_operation`(role_name, operation) values('Standard', '/targets/put'); insert into `role_operation`(role_name, operation) values('Standard', '/targets/del'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/add'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/put'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/del'); 
insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/add'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/put'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/add'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/add'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/put'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-his-events'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/add'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/put'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/del'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/add'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/put'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules'); insert 
into `role_operation`(role_name, operation) values('Standard', '/recording-rules/add'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/put'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/del'); -- for alert_rule | collect_rule | mute | dashboard grouping CREATE TABLE `busi_group` ( `id` bigint unsigned not null auto_increment, `name` varchar(191) not null, `label_enable` tinyint(1) not null default 0, `label_value` varchar(191) not null default '' comment 'if label_enable: label_value can not be blank', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`), UNIQUE KEY (`name`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', unix_timestamp(now()), 'root', unix_timestamp(now()), 'root'); CREATE TABLE `busi_group_member` ( `id` bigint unsigned not null auto_increment, `busi_group_id` bigint not null comment 'busi group id', `user_group_id` bigint not null comment 'user group id', `perm_flag` char(2) not null comment 'ro | rw', PRIMARY KEY (`id`), KEY (`busi_group_id`), KEY (`user_group_id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw'); -- for dashboard new version CREATE TABLE `board` ( `id` bigint unsigned not null auto_increment, `group_id` bigint not null default 0 comment 'busi group id', `name` varchar(191) not null, `ident` varchar(200) not null default '', `tags` varchar(255) not null comment 'split by space', `public` tinyint(1) not null default 0 comment '0:false 1:true', `built_in` tinyint(1) not null default 0 comment '0:false 1:true', `hide` tinyint(1) not null default 0 comment '0:false 1:true', `create_at` bigint not null default 0, `create_by` 
varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    `note` varchar(1024) not null default '' comment 'note',
    -- The NOT NULL attribute was written twice on this column; once is enough.
    `public_cate` bigint NOT NULL DEFAULT 0 COMMENT '0 anonymous 1 login 2 busi',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`group_id`, `name`),
    KEY(`ident`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- for dashboard new version
-- Dashboard payload; `id` holds the dashboard id and is unique.
CREATE TABLE `board_payload` (
    `id` bigint unsigned not null comment 'dashboard id',
    `payload` mediumtext not null,
    UNIQUE KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- deprecated
CREATE TABLE `dashboard` (
    `id` bigint unsigned not null auto_increment,
    `group_id` bigint not null default 0 comment 'busi group id',
    `name` varchar(191) not null,
    `tags` varchar(255) not null comment 'split by space',
    `configs` varchar(8192) comment 'dashboard variables',
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`group_id`, `name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- deprecated
-- auto create the first subclass 'Default chart group' of dashboard
CREATE TABLE `chart_group` (
    `id` bigint unsigned not null auto_increment,
    `dashboard_id` bigint unsigned not null,
    `name` varchar(255) not null,
    `weight` int not null default 0,
    PRIMARY KEY (`id`),
    KEY (`dashboard_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- deprecated
CREATE TABLE `chart` (
    `id` bigint unsigned not null auto_increment,
    `group_id` bigint unsigned not null comment 'chart group id',
    `configs` text,
    `weight` int not null default 0,
    PRIMARY KEY (`id`),
    KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Shared chart configs; indexed by create_at.
CREATE TABLE `chart_share` (
    `id` bigint unsigned not null auto_increment,
    `cluster` varchar(128) not null,
    -- The NOT NULL attribute was written twice on this column; once is enough.
    `datasource_id` bigint NOT NULL DEFAULT 0 COMMENT 'datasource id',
    `configs` text,
    `create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
    primary key (`id`),
    key (`create_at`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Alert rule definitions; indexed by group_id and update_at.
CREATE TABLE `alert_rule` (
    `id` bigint unsigned not null auto_increment,
    `group_id` bigint not null default 0 comment 'busi group id',
    `cate` varchar(128) not null,
    `datasource_ids` varchar(255) not null default '' comment 'datasource ids',
    `cluster` varchar(128) not null,
    `name` varchar(255) not null,
    `note` varchar(1024) not null default '',
    `prod` varchar(255) not null default '',
    `algorithm` varchar(255) not null default '',
    `algo_params` varchar(255),
    `delay` int not null default 0,
    `severity` tinyint(1) not null comment '1:Emergency 2:Warning 3:Notice',
    `disabled` tinyint(1) not null comment '0:enabled 1:disabled',
    `prom_for_duration` int not null comment 'prometheus for, unit:s',
    `rule_config` text not null comment 'rule_config',
    `prom_ql` text not null comment 'promql',
    `prom_eval_interval` int not null comment 'evaluate interval',
    `enable_stime` varchar(255) not null default '00:00',
    `enable_etime` varchar(255) not null default '23:59',
    `enable_days_of_week` varchar(255) not null default '' comment 'split by space: 0 1 2 3 4 5 6',
    `enable_in_bg` tinyint(1) not null default 0 comment '1: only this bg 0: global',
    `notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
    `notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
    `notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
    `notify_repeat_step` int not null default 0 comment 'unit: min',
    `notify_max_number` int not null default 0 comment '',
    `recover_duration` int not null default 0 comment 'unit: s',
    `callbacks` varchar(4096) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
    `runbook_url` varchar(4096),
    `append_tags` varchar(255) not null default '' comment 'split by space: service=n9e mod=api',
    `annotations` text not null comment
'annotations',
    `extra_config` text,
    `notify_rule_ids` varchar(1024) DEFAULT '',
    `notify_version` int DEFAULT 0,
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    `cron_pattern` varchar(64),
    `time_zone` varchar(64) not null default '',
    `datasource_queries` text,
    PRIMARY KEY (`id`),
    KEY (`group_id`),
    KEY (`update_at`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;

-- Alert mute definitions with a begin/end time window; indexed by create_at and group_id.
CREATE TABLE `alert_mute` (
    `id` bigint unsigned not null auto_increment,
    `group_id` bigint not null default 0 comment 'busi group id',
    `prod` varchar(255) not null default '',
    `note` varchar(1024) not null default '',
    `cate` varchar(128) not null,
    `cluster` varchar(128) not null,
    `datasource_ids` varchar(255) not null default '' comment 'datasource ids',
    `tags` varchar(4096) default '[]' comment 'json,map,tagkey->regexp|value',
    `cause` varchar(255) not null default '',
    `btime` bigint not null default 0 comment 'begin time',
    `etime` bigint not null default 0 comment 'end time',
    `disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
    `mute_time_type` tinyint(1) not null default 0,
    `periodic_mutes` varchar(4096) not null default '',
    `severities` varchar(32) not null default '',
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`),
    KEY (`create_at`),
    KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Alert subscriptions; indexed by update_at and group_id.
CREATE TABLE `alert_subscribe` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(255) not null default '',
    `disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
    `group_id` bigint not null default 0 comment 'busi group id',
    `prod` varchar(255) not null default '',
    `cate` varchar(128) not null,
    `datasource_ids` varchar(255) not null default '' comment 'datasource ids',
    `cluster` varchar(128) not null,
    `rule_id`
bigint not null default 0,
    `rule_ids` varchar(1024),
    `severities` varchar(32) not null default '',
    `tags` varchar(4096) not null default '' comment 'json,map,tagkey->regexp|value',
    `redefine_severity` tinyint(1) default 0 comment 'is redefine severity?',
    `new_severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
    `redefine_channels` tinyint(1) default 0 comment 'is redefine channels?',
    `new_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
    `user_group_ids` varchar(250) not null comment 'split by space 1 34 5, notify cc to user_group_ids',
    `busi_groups` varchar(4096),
    `note` VARCHAR(1024) DEFAULT '' COMMENT 'note',
    `webhooks` text not null,
    `extra_config` text,
    `redefine_webhooks` tinyint(1) default 0,
    `for_duration` bigint not null default 0,
    `notify_rule_ids` varchar(1024) DEFAULT '',
    `notify_version` int DEFAULT 0,
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`),
    KEY (`update_at`),
    KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Targets, uniquely identified by `ident`.
-- NOTE(review): host_ip is varchar(15) and commented 'IPv4 string'; it cannot hold an IPv6 address - confirm that is intended.
CREATE TABLE `target` (
    `id` bigint unsigned not null auto_increment,
    `group_id` bigint not null default 0 comment 'busi group id',
    `ident` varchar(191) not null comment 'target id',
    `note` varchar(255) not null default '' comment 'append to alert event as field',
    `tags` varchar(512) not null default '' comment 'append to series data as tags, split by space, append external space at suffix',
    `host_tags` text COMMENT 'global labels set in conf file',
    `host_ip` varchar(15) default '' COMMENT 'IPv4 string',
    `agent_version` varchar(255) default '' COMMENT 'agent version',
    `engine_name` varchar(255) DEFAULT '' COMMENT 'engine name',
    `os` VARCHAR(31) DEFAULT '' COMMENT 'os type',
    `update_at` bigint not null default 0,
    PRIMARY KEY (`id`),
    UNIQUE KEY (`ident`),
    KEY (`group_id`),
    INDEX `idx_host_ip` (`host_ip`),
    INDEX
`idx_agent_version` (`agent_version`),
    INDEX `idx_engine_name` (`engine_name`),
    INDEX `idx_os` (`os`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Metric views; cate distinguishes preset (0) from custom (1); indexed by creator user id.
CREATE TABLE `metric_view` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(191) not null default '',
    `cate` tinyint(1) not null comment '0: preset 1: custom',
    `configs` varchar(8192) not null default '',
    `create_at` bigint not null default 0,
    `create_by` bigint not null default 0 comment 'user id',
    `update_at` bigint not null default 0,
    PRIMARY KEY (`id`),
    KEY (`create_by`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;

-- Seed one preset view (cate 0) named 'Host View'.
insert into metric_view(name, cate, configs) values('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}');

-- Recording rules; indexed by group_id and update_at.
CREATE TABLE `recording_rule` (
    `id` bigint unsigned not null auto_increment,
    `group_id` bigint not null default '0' comment 'group_id',
    `datasource_ids` varchar(255) not null default '' comment 'datasource ids',
    `cluster` varchar(128) not null,
    `name` varchar(255) not null comment 'new metric name',
    `note` varchar(255) not null comment 'rule note',
    `disabled` tinyint(1) not null default 0 comment '0:enabled 1:disabled',
    `prom_ql` varchar(8192) not null comment 'promql',
    `prom_eval_interval` int not null comment 'evaluate interval',
    `cron_pattern` varchar(255) default '' comment 'cron pattern',
    `append_tags` varchar(255) default '' comment 'split by space: service=n9e mod=api',
    `query_configs` text NOT NULL,
    `create_at` bigint default '0',
    `create_by` varchar(64) default '',
    `update_at` bigint default '0',
    `update_by` varchar(64) default '',
    `datasource_queries` text,
    PRIMARY KEY (`id`),
    KEY `group_id` (`group_id`),
    KEY `update_at` (`update_at`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;

-- Alert aggregation views; indexed by creator user id.
CREATE TABLE `alert_aggr_view` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(191) not null default '',
    `rule` varchar(2048) not null default '',
    `cate` tinyint(1) not null comment '0: 
preset 1: custom',
    `create_at` bigint not null default 0,
    `create_by` bigint not null default 0 comment 'user id',
    `update_at` bigint not null default 0,
    PRIMARY KEY (`id`),
    KEY (`create_by`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;

-- Seed two preset (cate 0) aggregation views.
insert into alert_aggr_view(name, rule, cate) values('By BusiGroup, Severity', 'field:group_name::field:severity', 0);
insert into alert_aggr_view(name, rule, cate) values('By RuleName', 'field:rule_name', 0);

-- Current alert events; `id` reuses alert_his_event.id (no auto_increment here).
-- NOTE(review): the severity comment below says 0..2 while alert_rule.severity
-- documents 1..3 - confirm which numbering is correct.
CREATE TABLE `alert_cur_event` (
    `id` bigint unsigned not null comment 'use alert_his_event.id',
    `cate` varchar(128) not null,
    `datasource_id` bigint not null default 0 comment 'datasource id',
    `cluster` varchar(128) not null,
    `group_id` bigint unsigned not null comment 'busi group id of rule',
    `group_name` varchar(255) not null default '' comment 'busi group name',
    `hash` varchar(64) not null comment 'rule_id + vector_pk',
    `rule_id` bigint unsigned not null,
    `rule_name` varchar(255) not null,
    `rule_note` varchar(2048) not null default 'alert rule note',
    `rule_prod` varchar(255) not null default '',
    `rule_algo` varchar(255) not null default '',
    `severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
    `prom_for_duration` int not null comment 'prometheus for, unit:s',
    `prom_ql` varchar(8192) not null comment 'promql',
    `prom_eval_interval` int not null comment 'evaluate interval',
    `callbacks` varchar(2048) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
    `runbook_url` varchar(255),
    `notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
    `notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
    `notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
    `notify_repeat_next` bigint not null default 0 comment 'next timestamp to notify, get repeat settings from rule',
    `notify_cur_number` int not null default 0 comment '',
    `target_ident` varchar(191) not null default '' comment 'target 
ident, also in tags',
    `target_note` varchar(191) not null default '' comment 'target note',
    `first_trigger_time` bigint,
    `trigger_time` bigint not null,
    `trigger_value` text not null,
    `annotations` text not null comment 'annotations',
    -- Column comment was copy-pasted as 'annotations'; alert_rule.rule_config uses 'rule_config'.
    `rule_config` text not null comment 'rule_config',
    `tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
    `original_tags` text comment 'labels key=val,,k2=v2',
    `notify_rule_ids` text COMMENT 'notify rule ids',
    PRIMARY KEY (`id`),
    KEY (`hash`),
    KEY (`rule_id`),
    KEY (`trigger_time`, `group_id`),
    KEY (`notify_repeat_next`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;

-- Alert event history; `is_recovered` flags recovery events.
-- NOTE(review): the severity comment below says 0..2 while alert_rule.severity
-- documents 1..3 - confirm which numbering is correct.
CREATE TABLE `alert_his_event` (
    `id` bigint unsigned not null AUTO_INCREMENT,
    `is_recovered` tinyint(1) not null,
    `cate` varchar(128) not null,
    `datasource_id` bigint not null default 0 comment 'datasource id',
    `cluster` varchar(128) not null,
    `group_id` bigint unsigned not null comment 'busi group id of rule',
    `group_name` varchar(255) not null default '' comment 'busi group name',
    `hash` varchar(64) not null comment 'rule_id + vector_pk',
    `rule_id` bigint unsigned not null,
    `rule_name` varchar(255) not null,
    `rule_note` varchar(2048) not null default 'alert rule note',
    `rule_prod` varchar(255) not null default '',
    `rule_algo` varchar(255) not null default '',
    `severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
    `prom_for_duration` int not null comment 'prometheus for, unit:s',
    `prom_ql` varchar(8192) not null comment 'promql',
    `prom_eval_interval` int not null comment 'evaluate interval',
    `callbacks` varchar(2048) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
    `runbook_url` varchar(255),
    `notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
    `notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
    `notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
    `notify_cur_number` int not
null default 0 comment '',
    `target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
    `target_note` varchar(191) not null default '' comment 'target note',
    `first_trigger_time` bigint,
    `trigger_time` bigint not null,
    `trigger_value` text not null,
    `recover_time` bigint not null default 0,
    `last_eval_time` bigint not null default 0 comment 'for time filter',
    `tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
    `original_tags` text comment 'labels key=val,,k2=v2',
    `annotations` text not null comment 'annotations',
    -- Column comment was copy-pasted as 'annotations'; alert_rule.rule_config uses 'rule_config'.
    `rule_config` text not null comment 'rule_config',
    `notify_rule_ids` text COMMENT 'notify rule ids',
    PRIMARY KEY (`id`),
    INDEX `idx_last_eval_time` (`last_eval_time`),
    KEY (`hash`),
    KEY (`rule_id`),
    KEY (`trigger_time`, `group_id`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;

-- Join table between business groups and boards; the pair is the primary key.
CREATE TABLE `board_busigroup` (
    `busi_group_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'busi group id',
    `board_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'board id',
    PRIMARY KEY (`busi_group_id`, `board_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Built-in components (logo, readme); indexed by ident.
-- The tripled quotes in COMMENT strings store a literal single quote on each side.
CREATE TABLE `builtin_components` (
    `id` bigint UNSIGNED NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
    `ident` varchar(191) NOT NULL,
    `logo` mediumtext COMMENT '''logo of component''',
    `readme` text NOT NULL COMMENT '''readme of component''',
    `created_at` bigint NOT NULL DEFAULT 0 COMMENT '''create time''',
    `created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
    `updated_at` bigint NOT NULL DEFAULT 0 COMMENT '''update time''',
    `updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
    `disabled` int NOT NULL DEFAULT 0 COMMENT '''is disabled or not''',
    PRIMARY KEY (`id`),
    KEY (`ident`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Built-in payloads belonging to a component.
CREATE TABLE `builtin_payloads` (
    `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''',
    `component_id` bigint NOT NULL DEFAULT 0 COMMENT '''component_id of payload''',
    `uuid` bigint(20) NOT NULL COMMENT
'''uuid of payload''',
    `type` varchar(191) NOT NULL COMMENT '''type of payload''',
    `component` varchar(191) NOT NULL COMMENT '''component of payload''',
    `cate` varchar(191) NOT NULL COMMENT '''category of payload''',
    `name` varchar(191) NOT NULL COMMENT '''name of payload''',
    `tags` varchar(191) NOT NULL DEFAULT '' COMMENT '''tags of payload''',
    `content` longtext NOT NULL COMMENT '''content of payload''',
    `note` varchar(1024) NOT NULL DEFAULT '' COMMENT '''note of payload''',
    `created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''',
    `created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
    `updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''',
    `updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
    PRIMARY KEY (`id`),
    KEY `idx_component` (`component`),
    KEY `idx_name` (`name`),
    KEY `idx_cate` (`cate`),
    KEY `idx_uuid` (`uuid`),
    KEY `idx_type` (`type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Notification records, one row per notification; indexed by event_id.
CREATE TABLE notification_record (
    `id` BIGINT PRIMARY KEY AUTO_INCREMENT,
    `notify_rule_id` BIGINT NOT NULL DEFAULT 0,
    `event_id` bigint NOT NULL COMMENT 'event history id',
    `sub_id` bigint COMMENT 'subscribed rule id',
    `channel` varchar(255) NOT NULL COMMENT 'notification channel name',
    `status` bigint COMMENT 'notification status',
    `target` varchar(1024) NOT NULL COMMENT 'notification target',
    `details` varchar(2048) DEFAULT '' COMMENT 'notification other info',
    `created_at` bigint NOT NULL COMMENT 'create time',
    INDEX idx_evt (event_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Task templates (script, args, batch/tolerance/timeout settings); indexed by group_id.
CREATE TABLE `task_tpl` (
    `id` int unsigned NOT NULL AUTO_INCREMENT,
    `group_id` int unsigned not null comment 'busi group id',
    `title` varchar(255) not null default '',
    `account` varchar(64) not null,
    `batch` int unsigned not null default 0,
    `tolerance` int unsigned not null default 0,
    `timeout` int unsigned not null default 0,
    `pause` varchar(255) not null default '',
    `script` text not null,
    `args` varchar(512) not null default '',
    `tags` varchar(255)
not null default '' comment 'split by space',
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`),
    KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Hosts attached to a task template; `id` is the task tpl id, `ii` the row key.
CREATE TABLE `task_tpl_host` (
    `ii` int unsigned NOT NULL AUTO_INCREMENT,
    `id` int unsigned not null comment 'task tpl id',
    `host` varchar(128) not null comment 'ip or hostname',
    PRIMARY KEY (`ii`),
    KEY (`id`, `host`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Task records; `id` is the ibex task id (no auto_increment here).
CREATE TABLE `task_record` (
    `id` bigint unsigned not null comment 'ibex task id',
    `event_id` bigint not null comment 'event id' default 0,
    `group_id` bigint not null comment 'busi group id',
    `ibex_address` varchar(128) not null,
    `ibex_auth_user` varchar(128) not null default '',
    `ibex_auth_pass` varchar(128) not null default '',
    `title` varchar(255) not null default '',
    `account` varchar(64) not null,
    `batch` int unsigned not null default 0,
    `tolerance` int unsigned not null default 0,
    `timeout` int unsigned not null default 0,
    `pause` varchar(255) not null default '',
    `script` text not null,
    `args` varchar(512) not null default '',
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    PRIMARY KEY (`id`),
    KEY (`create_at`, `group_id`),
    KEY (`create_by`),
    INDEX `idx_event_id` (`event_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Alerting engine instances; indexed by instance and clock.
CREATE TABLE `alerting_engines` (
    `id` int unsigned NOT NULL AUTO_INCREMENT,
    `instance` varchar(128) not null default '' comment 'instance identification, e.g. 
10.9.0.9:9090',
    `datasource_id` bigint not null default 0 comment 'datasource id',
    `engine_cluster` varchar(128) not null default '' comment 'n9e-alert cluster',
    `clock` bigint not null,
    PRIMARY KEY (`id`),
    INDEX `idx_inst` (`instance`),
    INDEX `idx_clock` (`clock`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Datasources; `name` is unique.
CREATE TABLE `datasource` (
    `id` int unsigned NOT NULL AUTO_INCREMENT,
    `name` varchar(191) not null default '',
    `identifier` varchar(255) not null default '',
    `description` varchar(255) not null default '',
    `category` varchar(255) not null default '',
    `plugin_id` int unsigned not null default 0,
    `plugin_type` varchar(255) not null default '',
    `plugin_type_name` varchar(255) not null default '',
    `cluster_name` varchar(255) not null default '',
    `settings` text not null,
    `status` varchar(255) not null default '',
    `http` varchar(4096) not null default '',
    `auth` varchar(8192) not null default '',
    `is_default` boolean COMMENT 'is default datasource',
    `weight` int not null default 0,
    `created_at` bigint not null default 0,
    `created_by` varchar(64) not null default '',
    `updated_at` bigint not null default 0,
    `updated_by` varchar(64) not null default '',
    UNIQUE KEY (`name`),
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Built-in categories, each with a name and a user_id.
CREATE TABLE `builtin_cate` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(191) not null,
    `user_id` bigint not null default 0,
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Notification templates; at most one row per channel (unique key).
CREATE TABLE `notify_tpl` (
    `id` bigint unsigned not null auto_increment,
    `channel` varchar(32) not null,
    `name` varchar(255) not null,
    `content` text not null,
    `create_at` bigint DEFAULT 0 COMMENT 'create_at',
    `create_by` varchar(64) DEFAULT '' COMMENT 'create_by',
    `update_at` bigint DEFAULT 0 COMMENT 'update_at',
    `update_by` varchar(64) DEFAULT '' COMMENT 'update_by',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`channel`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- SSO configuration entries; `name` is unique.
CREATE TABLE `sso_config` (
    `id` bigint unsigned not null auto_increment,
`name` varchar(191) not null,
    `content` text not null,
    `update_at` bigint DEFAULT 0 COMMENT 'update_at',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Index patterns per datasource; unique per (datasource_id, name).
CREATE TABLE `es_index_pattern` (
    `id` bigint unsigned not null auto_increment,
    `datasource_id` bigint not null default 0 comment 'datasource id',
    `name` varchar(191) not null,
    `time_field` varchar(128) not null default '@timestamp',
    `allow_hide_system_indices` tinyint(1) not null default 0,
    `fields_format` varchar(4096) not null default '',
    `cross_cluster_enabled` int not null default 0,
    `note` varchar(1024) not null default '',
    `create_at` bigint default '0',
    `create_by` varchar(64) default '',
    `update_at` bigint default '0',
    `update_by` varchar(64) default '',
    PRIMARY KEY (`id`),
    UNIQUE KEY (`datasource_id`, `name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Built-in metric definitions (collector, type, name, unit, language, expression).
CREATE TABLE `builtin_metrics` (
    `id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
    `collector` varchar(191) NOT NULL COMMENT '''type of collector''',
    `typ` varchar(191) NOT NULL COMMENT '''type of metric''',
    `name` varchar(191) NOT NULL COMMENT '''name of metric''',
    `unit` varchar(191) NOT NULL COMMENT '''unit of metric''',
    `lang` varchar(191) NOT NULL DEFAULT 'zh' COMMENT '''language''',
    `note` varchar(4096) NOT NULL COMMENT '''description of metric''',
    `expression` varchar(4096) NOT NULL COMMENT '''expression of metric''',
    `expression_type` varchar(32) NOT NULL DEFAULT 'promql' COMMENT '''expression type: metric_name or promql''',
    `metric_type` varchar(191) NOT NULL DEFAULT '' COMMENT '''metric type like counter/gauge''',
    `extra_fields` text COMMENT '''custom extra fields''',
    `created_at` bigint NOT NULL DEFAULT 0 COMMENT '''create time''',
    `created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
    `updated_at` bigint NOT NULL DEFAULT 0 COMMENT '''update time''',
    `updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
    `uuid` bigint NOT NULL DEFAULT 0 COMMENT '''uuid''',
PRIMARY KEY (`id`),
    INDEX `idx_uuid` (`uuid`),
    INDEX `idx_collector` (`collector`),
    INDEX `idx_typ` (`typ`),
    INDEX `idx_builtinmetric_name` (`name` ASC),
    INDEX `idx_lang` (`lang`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Metric filters; indexed by name.
CREATE TABLE `metric_filter` (
    `id` bigint NOT NULL AUTO_INCREMENT COMMENT 'unique identifier',
    `name` varchar(191) NOT NULL COMMENT '''name of metric filter''',
    `configs` varchar(4096) NOT NULL COMMENT '''configuration of metric filter''',
    `groups_perm` text,
    `create_at` bigint NOT NULL DEFAULT 0 COMMENT '''create time''',
    `create_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''',
    `update_at` bigint NOT NULL DEFAULT 0 COMMENT '''update time''',
    `update_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''',
    PRIMARY KEY (`id`),
    INDEX `idx_metricfilter_name` (`name` ASC)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Mapping of target ident to group id; each (target_ident, group_id) pair is unique.
CREATE TABLE `target_busi_group` (
    `id` bigint NOT NULL AUTO_INCREMENT,
    `target_ident` varchar(191) NOT NULL,
    `group_id` bigint NOT NULL,
    `update_at` bigint NOT NULL,
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_target_group` (`target_ident`,`group_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Dashboard annotations covering a start/end timestamp range; indexed by dashboard_id.
CREATE TABLE `dash_annotation` (
    `id` bigint unsigned not null auto_increment,
    `dashboard_id` bigint not null comment 'dashboard id',
    `panel_id` varchar(191) not null comment 'panel id',
    `tags` text comment 'tags array json string',
    `description` text comment 'annotation description',
    `config` text comment 'annotation config',
    `time_start` bigint not null default 0 comment 'start timestamp',
    `time_end` bigint not null default 0 comment 'end timestamp',
    `create_at` bigint not null default 0 comment 'create time',
    `create_by` varchar(64) not null default '' comment 'creator',
    `update_at` bigint not null default 0 comment 'update time',
    `update_by` varchar(64) not null default '' comment 'updater',
    PRIMARY KEY (`id`),
    KEY `idx_dashboard_id` (`dashboard_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Per-user tokens with creation and last-used timestamps.
CREATE TABLE `user_token` (
    `id` bigint NOT NULL
AUTO_INCREMENT,
    `username` varchar(255) NOT NULL DEFAULT '',
    `token_name` varchar(255) NOT NULL DEFAULT '',
    `token` varchar(255) NOT NULL DEFAULT '',
    `create_at` bigint NOT NULL DEFAULT 0,
    `last_used` bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Notification rules with notify and pipeline configs.
CREATE TABLE `notify_rule` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(255) not null,
    `description` text,
    `enable` tinyint(1) not null default 0,
    `user_group_ids` varchar(255) not null default '',
    `notify_configs` text,
    `pipeline_configs` text,
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Notification channels with request type/config and parameter config.
CREATE TABLE `notify_channel` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(255) not null,
    `ident` varchar(255) not null,
    `description` text,
    `enable` tinyint(1) not null default 0,
    `param_config` text,
    `request_type` varchar(50) not null,
    `request_config` text,
    `weight` int not null default 0,
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Message templates, each carrying a notify_channel_ident.
CREATE TABLE `message_template` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(64) not null,
    `ident` varchar(64) not null,
    `content` text,
    `user_group_ids` varchar(64),
    `notify_channel_ident` varchar(64) not null default '',
    `private` int not null default 0,
    `weight` int not null default 0,
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Event pipelines with label/attribute filters and processor configs.
CREATE TABLE `event_pipeline` (
    `id` bigint unsigned not null auto_increment,
    `name` varchar(128) not null,
    `team_ids` text,
`description` varchar(255) not null default '',
    `filter_enable` tinyint(1) not null default 0,
    `label_filters` text,
    `attr_filters` text,
    `processor_configs` text,
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Embedded products (name, url, privacy flag, team ids).
CREATE TABLE `embedded_product` (
    `id` bigint unsigned NOT NULL AUTO_INCREMENT,
    `name` varchar(255) DEFAULT NULL,
    `url` varchar(255) DEFAULT NULL,
    `is_private` boolean DEFAULT NULL,
    `team_ids` varchar(255),
    `create_at` bigint not null default 0,
    `create_by` varchar(64) not null default '',
    `update_at` bigint not null default 0,
    `update_by` varchar(64) not null default '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Task metadata (script, args, stdin, batch/tolerance/timeout); indexed by creator and created.
CREATE TABLE `task_meta` (
    `id` bigint unsigned NOT NULL AUTO_INCREMENT,
    `title` varchar(255) not null default '',
    `account` varchar(64) not null,
    `batch` bigint not null default 0,
    `tolerance` bigint not null default 0,
    `timeout` bigint not null default 0,
    `pause` varchar(255) not null default '',
    `script` text not null,
    `args` varchar(512) not null default '',
    `stdin` varchar(1024) not null default '',
    `creator` varchar(64) not null default '',
    `created` timestamp not null default CURRENT_TIMESTAMP,
    PRIMARY KEY (`id`),
    KEY `idx_task_meta_creator` (`creator`),
    KEY `idx_task_meta_created` (`created`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

/* start|cancel|kill|pause */
CREATE TABLE `task_action` (
    `id` bigint unsigned not null,
    `action` varchar(32) not null,
    `clock` bigint not null default 0,
    PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Pairs an id with a scheduler name; it has a composite index but no primary key.
CREATE TABLE `task_scheduler` (
    `id` bigint unsigned not null,
    `scheduler` varchar(128) not null default '',
    KEY (`id`, `scheduler`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- One row per scheduler (unique) with a clock value.
CREATE TABLE `task_scheduler_health` (
    `scheduler` varchar(128) NOT NULL,
    `clock` bigint not null,
    UNIQUE KEY
`idx_task_scheduler_health_scheduler` (`scheduler`),
    KEY (`clock`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- Per-host action rows (id, host, clock, action); indexed by id and by host.
CREATE TABLE `task_host_doing` (
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `clock` bigint not null default 0,
    `action` varchar(16) not null,
    KEY `idx_task_host_doing_id` (`id`),
    KEY `idx_task_host_doing_host` (`host`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- task_host_N: identically structured tables holding per-host status/stdout/stderr,
-- unique per (id, host).
-- NOTE(review): presumably rows are sharded across these tables by id - confirm against the code that picks the table.
CREATE TABLE task_host_0 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_1 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_2 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_3 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_4 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_5 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
`host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

-- task_host_6 .. task_host_11: same layout as the other task_host_N tables
-- (per-host status/stdout/stderr, unique per (id, host)).
CREATE TABLE task_host_6 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_7 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_8 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_9 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_10 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

CREATE TABLE task_host_11 (
    `ii` bigint unsigned NOT NULL AUTO_INCREMENT,
    `id` bigint unsigned not null,
    `host` varchar(128) not null,
    `status` varchar(32) not null,
    `stdout` text,
    `stderr` text,
    UNIQUE KEY `idx_id_host` (`id`, `host`),
    PRIMARY KEY (`ii`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE task_host_12 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_13 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_14 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_15 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_16 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_17 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_18 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` 
text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_19 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_20 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_21 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_22 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_23 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_24 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_25 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` 
bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_26 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_27 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_28 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_29 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_30 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_31 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = 
InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_32 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_33 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_34 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_35 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_36 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_37 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_38 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not 
null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_39 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_40 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_41 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_42 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_43 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_44 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_45 ( `ii` bigint unsigned NOT 
NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_46 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_47 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_48 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_49 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_50 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_51 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY 
KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_52 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_53 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_54 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_55 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_56 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_57 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_58 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, 
`status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_59 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_60 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_61 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_62 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_63 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_64 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_65 ( 
`ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_66 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_67 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_68 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_69 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_70 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_71 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` 
(`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_72 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_73 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_74 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_75 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_76 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_77 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_78 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` 
varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_79 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_80 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_81 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_82 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_83 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_84 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; 
CREATE TABLE task_host_85 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_86 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_87 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_88 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_89 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_90 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_91 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` 
text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_92 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_93 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_94 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_95 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_96 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_97 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_98 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` 
bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE task_host_99 ( `ii` bigint unsigned NOT NULL AUTO_INCREMENT, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, UNIQUE KEY `idx_id_host` (`id`, `host`), PRIMARY KEY (`ii`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE `source_token` ( `id` bigint unsigned NOT NULL AUTO_INCREMENT, `source_type` varchar(64) NOT NULL DEFAULT '' COMMENT 'source type', `source_id` varchar(255) NOT NULL DEFAULT '' COMMENT 'source identifier', `token` varchar(255) NOT NULL DEFAULT '' COMMENT 'access token', `expire_at` bigint NOT NULL DEFAULT 0 COMMENT 'expire timestamp', `create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp', `create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'creator', PRIMARY KEY (`id`), KEY `idx_source_type_id_token` (`source_type`, `source_id`, `token`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; ================================================ FILE: docker/initsql/c-init.sql ================================================ CREATE USER IF NOT EXISTS 'root'@'127.0.0.1' IDENTIFIED BY '1234'; GRANT ALL PRIVILEGES ON *.* TO 'root'@'127.0.0.1' WITH GRANT OPTION; CREATE USER IF NOT EXISTS 'root'@'localhost' IDENTIFIED BY '1234'; GRANT ALL PRIVILEGES ON *.* TO 'root'@'localhost' WITH GRANT OPTION; CREATE USER IF NOT EXISTS 'root'@'%' IDENTIFIED BY '1234'; GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION; FLUSH PRIVILEGES; ================================================ FILE: docker/migratesql/migrate.sql ================================================ /* v7.0.0-beta.3 */ CREATE TABLE `builtin_metrics` ( `id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT 'unique identifier', `collector` varchar(191) NOT NULL COMMENT 'type of collector', 
`typ` varchar(191) NOT NULL COMMENT 'type of metric', `name` varchar(191) NOT NULL COMMENT 'name of metric', `unit` varchar(191) NOT NULL COMMENT 'unit of metric', `lang` varchar(191) NOT NULL DEFAULT '' COMMENT 'language of metric', `note` varchar(4096) NOT NULL COMMENT 'description of metric in Chinese', `expression` varchar(4096) NOT NULL COMMENT 'expression of metric', `created_at` bigint NOT NULL DEFAULT 0 COMMENT 'create time', `created_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'creator', `updated_at` bigint NOT NULL DEFAULT 0 COMMENT 'update time', `updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'updater', PRIMARY KEY (`id`), INDEX `idx_collector` (`collector`), INDEX `idx_typ` (`typ`), INDEX `idx_name` (`name`), INDEX `idx_lang` (`lang`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; CREATE TABLE `metric_filter` ( `id` bigint NOT NULL AUTO_INCREMENT COMMENT 'unique identifier', `name` varchar(191) NOT NULL COMMENT 'name of metric filter', `configs` varchar(4096) NOT NULL COMMENT 'configuration of metric filter', `groups_perm` text, `create_at` bigint NOT NULL DEFAULT '0' COMMENT 'create time', `create_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'creator', `update_at` bigint NOT NULL DEFAULT '0' COMMENT 'update time', `update_by` varchar(191) NOT NULL DEFAULT '' COMMENT 'updater', PRIMARY KEY (`id`), KEY `idx_name` (`name`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; CREATE TABLE `board_busigroup` ( `busi_group_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'busi group id', `board_id` bigint(20) NOT NULL DEFAULT '0' COMMENT 'board id', PRIMARY KEY (`busi_group_id`, `board_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; /* v7.0.0-beta.6 */ CREATE TABLE `builtin_components` ( `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''', `ident` varchar(191) NOT NULL COMMENT '''identifier of component''', `logo` varchar(191) NOT NULL COMMENT '''logo of component''', `readme` text NOT NULL COMMENT '''readme of component''', `created_at` bigint(20) NOT 
NULL DEFAULT 0 COMMENT '''create time''', `created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''', `updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''', `updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''', PRIMARY KEY (`id`), KEY `idx_ident` (`ident`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; CREATE TABLE `builtin_payloads` ( `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '''unique identifier''', `uuid` bigint(20) NOT NULL COMMENT '''uuid of payload''', `type` varchar(191) NOT NULL COMMENT '''type of payload''', `component` varchar(191) NOT NULL COMMENT '''component of payload''', `cate` varchar(191) NOT NULL COMMENT '''category of payload''', `name` varchar(191) NOT NULL COMMENT '''name of payload''', `tags` varchar(191) NOT NULL DEFAULT '' COMMENT '''tags of payload''', `content` longtext NOT NULL COMMENT '''content of payload''', `created_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''create time''', `created_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''creator''', `updated_at` bigint(20) NOT NULL DEFAULT 0 COMMENT '''update time''', `updated_by` varchar(191) NOT NULL DEFAULT '' COMMENT '''updater''', PRIMARY KEY (`id`), KEY `idx_component` (`component`), KEY `idx_name` (`name`), KEY `idx_cate` (`cate`), KEY `idx_uuid` (`uuid`), KEY `idx_type` (`type`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; /* v7.0.0-beta.7 */ ALTER TABLE users ADD COLUMN last_active_time BIGINT NOT NULL DEFAULT 0; /* v7.0.0-beta.13 */ ALTER TABLE recording_rule ADD COLUMN cron_pattern VARCHAR(255) DEFAULT '' COMMENT 'cron pattern'; /* v7.0.0-beta.14 */ ALTER TABLE alert_cur_event ADD COLUMN original_tags TEXT COMMENT 'labels key=val,,k2=v2'; ALTER TABLE alert_his_event ADD COLUMN original_tags TEXT COMMENT 'labels key=val,,k2=v2'; /* v7.1.0 */ ALTER TABLE target ADD COLUMN os VARCHAR(31) DEFAULT '' COMMENT 'os type'; /* v7.2.0 */ CREATE TABLE notification_record ( `id` BIGINT PRIMARY KEY AUTO_INCREMENT, `event_id` BIGINT NOT NULL, `sub_id` BIGINT 
NOT NULL, `channel` VARCHAR(255) NOT NULL, `status` TINYINT NOT NULL DEFAULT 0, `target` VARCHAR(1024) NOT NULL, `details` VARCHAR(2048), `created_at` BIGINT NOT NULL, INDEX idx_evt (event_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; /* v7.3.0 2024-08-26 */ ALTER TABLE `target` ADD COLUMN `host_tags` TEXT COMMENT 'global labels set in conf file'; /* v7.3.4 2024-08-28 */ ALTER TABLE `builtin_payloads` ADD COLUMN `component_id` bigint(20) NOT NULL DEFAULT 0 COMMENT 'component_id'; /* v7.4.0 2024-09-20 */ CREATE TABLE `target_busi_group` ( `id` bigint NOT NULL AUTO_INCREMENT, `target_ident` varchar(191) NOT NULL, `group_id` bigint NOT NULL, `update_at` bigint NOT NULL, PRIMARY KEY (`id`), UNIQUE KEY `idx_target_group` (`target_ident`,`group_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; /* v7.7.0 2024-11-13 */ ALTER TABLE `recording_rule` ADD COLUMN `datasource_queries` TEXT; ALTER TABLE `alert_rule` ADD COLUMN `datasource_queries` TEXT; /* v7.7.2 2024-12-02 */ ALTER TABLE alert_subscribe MODIFY COLUMN rule_ids varchar(1024); ALTER TABLE alert_subscribe MODIFY COLUMN busi_groups varchar(4096); /* v8.0.0-beta.1 2024-12-13 */ ALTER TABLE `alert_rule` ADD COLUMN `cron_pattern` VARCHAR(64); ALTER TABLE `builtin_components` MODIFY COLUMN `logo` mediumtext COMMENT '''logo of component'''; /* v8.0.0-beta.2 2024-12-26 */ ALTER TABLE `es_index_pattern` ADD COLUMN `cross_cluster_enabled` int not null default 0; /* v8.0.0-beta.3 2025-01-03 */ ALTER TABLE `builtin_components` ADD COLUMN `disabled` INT NOT NULL DEFAULT 0 COMMENT 'is disabled or not'; CREATE TABLE `dash_annotation` ( `id` bigint unsigned not null auto_increment, `dashboard_id` bigint not null comment 'dashboard id', `panel_id` varchar(191) not null comment 'panel id', `tags` text comment 'tags array json string', `description` text comment 'annotation description', `config` text comment 'annotation config', `time_start` bigint not null default 0 comment 'start timestamp', `time_end` bigint not null default 0 comment 
'end timestamp', `create_at` bigint not null default 0 comment 'create time', `create_by` varchar(64) not null default '' comment 'creator', `update_at` bigint not null default 0 comment 'update time', `update_by` varchar(64) not null default '' comment 'updater', PRIMARY KEY (`id`), KEY `idx_dashboard_id` (`dashboard_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; /* v8.0.0-beta.5 2025-02-05 */ CREATE TABLE `user_token` ( `id` bigint NOT NULL AUTO_INCREMENT, `username` varchar(255) NOT NULL DEFAULT '', `token_name` varchar(255) NOT NULL DEFAULT '', `token` varchar(255) NOT NULL DEFAULT '', `create_at` bigint NOT NULL DEFAULT 0, `last_used` bigint NOT NULL DEFAULT 0, PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; /* v8.0.0-beta.7 2025-03-01 */ CREATE TABLE `notify_rule` ( `id` bigint unsigned not null auto_increment, `name` varchar(255) not null, `description` text, `enable` tinyint(1) not null default 0, `user_group_ids` varchar(255) not null default '', `notify_configs` text, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE `notify_channel` ( `id` bigint unsigned not null auto_increment, `name` varchar(255) not null, `ident` varchar(255) not null, `description` text, `enable` tinyint(1) not null default 0, `param_config` text, `request_type` varchar(50) not null, `request_config` text, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; CREATE TABLE `message_template` ( `id` bigint unsigned not null auto_increment, `name` varchar(64) not null, `ident` varchar(64) not null, `content` text, `user_group_ids` varchar(64), `notify_channel_ident` varchar(64) not null default '', `private` 
int not null default 0, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; ALTER TABLE `alert_rule` ADD COLUMN `notify_rule_ids` varchar(1024) DEFAULT ''; ALTER TABLE `alert_rule` ADD COLUMN `notify_version` int DEFAULT 0; ALTER TABLE `alert_subscribe` ADD COLUMN `notify_rule_ids` varchar(1024) DEFAULT ''; ALTER TABLE `alert_subscribe` ADD COLUMN `notify_version` int DEFAULT 0; ALTER TABLE `notification_record` ADD COLUMN `notify_rule_id` BIGINT NOT NULL DEFAULT 0; /* v8.0.0-beta.9 2025-03-17 */ ALTER TABLE `message_template` ADD COLUMN `weight` int not null default 0; ALTER TABLE `notify_channel` ADD COLUMN `weight` int not null default 0; /* v8.0.0-beta.11 2025-04-10 */ ALTER TABLE `es_index_pattern` ADD COLUMN `note` varchar(1024) not null default ''; ALTER TABLE `datasource` ADD COLUMN `identifier` varchar(255) not null default ''; /* v8.0.0-beta.11 2025-05-15 */ ALTER TABLE `notify_rule` ADD COLUMN `pipeline_configs` text; CREATE TABLE `event_pipeline` ( `id` bigint unsigned not null auto_increment, `name` varchar(128) not null, `team_ids` text, `description` varchar(255) not null default '', `filter_enable` tinyint(1) not null default 0, `attr_filters` text, `processor_configs` text, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', PRIMARY KEY (`id`) ) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4; /* v8.0.0 2025-05-15 */ CREATE TABLE `embedded_product` ( `id` bigint unsigned NOT NULL AUTO_INCREMENT, `name` varchar(255) DEFAULT NULL, `url` varchar(255) DEFAULT NULL, `is_private` boolean DEFAULT NULL, `team_ids` varchar(255), `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) 
not null default '',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

/* v8.0.0 2025-05-29 */
/* source_token: access tokens keyed by (source_type, source_id), each with an expiry timestamp. */
CREATE TABLE `source_token` (
    `id` bigint unsigned NOT NULL AUTO_INCREMENT,
    `source_type` varchar(64) NOT NULL DEFAULT '' COMMENT 'source type',
    `source_id` varchar(255) NOT NULL DEFAULT '' COMMENT 'source identifier',
    `token` varchar(255) NOT NULL DEFAULT '' COMMENT 'access token',
    `expire_at` bigint NOT NULL DEFAULT 0 COMMENT 'expire timestamp',
    `create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp',
    `create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'creator',
    PRIMARY KEY (`id`),
    KEY `idx_source_type_id_token` (`source_type`, `source_id`, `token`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

/* v8.0.0-beta.12 2025-06-03 */
ALTER TABLE `alert_his_event` ADD COLUMN `notify_rule_ids` text COMMENT 'notify rule ids';
ALTER TABLE `alert_cur_event` ADD COLUMN `notify_rule_ids` text COMMENT 'notify rule ids';

/* v8.0.0-beta.13 */
/*
 * Drop the idx_collector_typ_name unique index from the builtin_metrics table.
 *
 * Written without IF EXISTS on purpose: `DROP INDEX IF EXISTS` is accepted by
 * MariaDB only and is a syntax error on MySQL. If the index has already been
 * removed, this statement fails with a "check that column/key exists" error
 * and can simply be skipped.
 */
DROP INDEX `idx_collector_typ_name` ON `builtin_metrics`;

/* v8.0.0 2025-07-03 */
ALTER TABLE `builtin_metrics` ADD COLUMN `translation` TEXT COMMENT 'translation of metric' AFTER `lang`;

/* v8.4.0 2025-10-15 */
ALTER TABLE `notify_rule` ADD COLUMN `extra_config` text COMMENT 'extra config';

/* v8.4.1 2025-11-10 */
ALTER TABLE `alert_rule` ADD COLUMN `pipeline_configs` text COMMENT 'pipeline configs';

/* v8.4.2 2025-11-13 */
ALTER TABLE `board` ADD COLUMN `note` varchar(1024) not null default '' comment 'note';
ALTER TABLE `builtin_payloads` ADD COLUMN `note` varchar(1024) not null default '' comment 'note of payload';

/* v9 2026-01-09 */
ALTER TABLE `event_pipeline` ADD COLUMN `typ` varchar(128) NOT NULL DEFAULT '' COMMENT 'pipeline type: builtin, user-defined';
ALTER TABLE `event_pipeline` ADD COLUMN `use_case` varchar(128) NOT NULL DEFAULT '' COMMENT 'use case: metric_explorer, event_summary, event_pipeline';
ALTER TABLE `event_pipeline` ADD COLUMN `trigger_mode`
varchar(128) NOT NULL DEFAULT 'event' COMMENT 'trigger mode: event, api, cron'; ALTER TABLE `event_pipeline` ADD COLUMN `disabled` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'disabled flag'; ALTER TABLE `event_pipeline` ADD COLUMN `nodes` text COMMENT 'workflow nodes (JSON)'; ALTER TABLE `event_pipeline` ADD COLUMN `connections` text COMMENT 'node connections (JSON)'; ALTER TABLE `event_pipeline` ADD COLUMN `input_variables` text COMMENT 'input variables (JSON)'; ALTER TABLE `event_pipeline` ADD COLUMN `label_filters` text COMMENT 'label filters (JSON)'; CREATE TABLE `event_pipeline_execution` ( `id` varchar(36) NOT NULL COMMENT 'execution id', `pipeline_id` bigint NOT NULL COMMENT 'pipeline id', `pipeline_name` varchar(128) DEFAULT '' COMMENT 'pipeline name snapshot', `event_id` bigint DEFAULT 0 COMMENT 'related alert event id', `mode` varchar(16) NOT NULL DEFAULT 'event' COMMENT 'trigger mode: event/api/cron', `status` varchar(16) NOT NULL DEFAULT 'running' COMMENT 'status: running/success/failed', `node_results` mediumtext COMMENT 'node execution results (JSON)', `error_message` varchar(1024) DEFAULT '' COMMENT 'error message', `error_node` varchar(36) DEFAULT '' COMMENT 'error node id', `created_at` bigint NOT NULL DEFAULT 0 COMMENT 'start timestamp', `finished_at` bigint DEFAULT 0 COMMENT 'finish timestamp', `duration_ms` bigint DEFAULT 0 COMMENT 'duration in milliseconds', `trigger_by` varchar(64) DEFAULT '' COMMENT 'trigger by', `inputs_snapshot` text COMMENT 'inputs snapshot', PRIMARY KEY (`id`), KEY `idx_pipeline_id` (`pipeline_id`), KEY `idx_event_id` (`event_id`), KEY `idx_mode` (`mode`), KEY `idx_status` (`status`), KEY `idx_created_at` (`created_at`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='event pipeline execution records'; /* v8.5.0 builtin_metrics new fields */ ALTER TABLE `builtin_metrics` ADD COLUMN `expression_type` varchar(32) NOT NULL DEFAULT 'promql' COMMENT 'expression type: metric_name or promql'; ALTER TABLE `builtin_metrics` ADD COLUMN 
`metric_type` varchar(191) NOT NULL DEFAULT '' COMMENT 'metric type like counter/gauge'; ALTER TABLE `builtin_metrics` ADD COLUMN `extra_fields` text COMMENT 'custom extra fields'; /* v9 2026-01-16 saved_view */ CREATE TABLE `saved_view` ( `id` bigint NOT NULL AUTO_INCREMENT, `name` varchar(255) NOT NULL COMMENT 'view name', `page` varchar(64) NOT NULL COMMENT 'page identifier', `filter` text COMMENT 'filter config (JSON)', `public_cate` int NOT NULL DEFAULT 0 COMMENT 'public category: 0-self, 1-team, 2-all', `gids` text COMMENT 'team group ids (JSON)', `create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp', `create_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'creator', `update_at` bigint NOT NULL DEFAULT 0 COMMENT 'update timestamp', `update_by` varchar(64) NOT NULL DEFAULT '' COMMENT 'updater', PRIMARY KEY (`id`), KEY `idx_page` (`page`), KEY `idx_create_by` (`create_by`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='saved views for pages'; CREATE TABLE `user_view_favorite` ( `id` bigint NOT NULL AUTO_INCREMENT, `view_id` bigint NOT NULL COMMENT 'saved view id', `user_id` bigint NOT NULL COMMENT 'user id', `create_at` bigint NOT NULL DEFAULT 0 COMMENT 'create timestamp', PRIMARY KEY (`id`), KEY `idx_view_id` (`view_id`), KEY `idx_user_id` (`user_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='user favorite views'; /* v9 2026-01-20 datasource weight */ ALTER TABLE `datasource` ADD COLUMN `weight` int not null default 0 COMMENT 'weight for sorting'; /* v9 2026-01-20 alert_rule time_zone support */ ALTER TABLE `alert_rule` ADD COLUMN `time_zone` varchar(64) not null default ''; ================================================ FILE: docker/sqlite.sql ================================================ CREATE TABLE `users` ( `id` integer primary key autoincrement, `username` varchar(64) not null unique, `nickname` varchar(64) not null, `password` varchar(128) not null default '', `phone` varchar(16) not null default '', `email` varchar(64) not null 
default '', `portrait` varchar(255) not null default '', `roles` varchar(255) not null, `contacts` varchar(1024), `maintainer` tinyint(1) not null default 0, `belong` varchar(16) not null default '', `last_active_time` bigint not null default 0, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE UNIQUE INDEX idx_users_username ON `users` (username); insert into `users`(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', strftime('%s', 'now'), 'system', strftime('%s', 'now'), 'system'); CREATE TABLE `user_group` ( `id` integer primary key autoincrement, `name` varchar(128) not null default '', `note` varchar(255) not null default '', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE INDEX `idx_user_group_create_by` ON `user_group` (`create_by` asc); CREATE INDEX `idx_user_group_update_at` ON `user_group` (`update_at` asc); insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', strftime('%s', 'now'), 'root', strftime('%s', 'now'), 'root'); CREATE TABLE `user_group_member` ( `id` integer primary key autoincrement, `group_id` bigint unsigned not null, `user_id` bigint unsigned not null ); CREATE INDEX `idx_user_group_member_group_id` ON `user_group_member` (`group_id` asc); CREATE INDEX `idx_user_group_member_user_id` ON `user_group_member` (`user_id` asc); insert into user_group_member(group_id, user_id) values(1, 1); CREATE TABLE `configs` ( `id` integer primary key autoincrement, `ckey` varchar(191) not null, `cval` text not null, `note` varchar(1024) not null default '', `external` tinyint(1) not null default 0, `encrypted` tinyint(1) not null default 0, `create_at` bigint not null default 
0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE TABLE `role` ( `id` integer primary key autoincrement, `name` varchar(191) not null unique default '', `note` varchar(255) not null default '' ); insert into `role`(name, note) values('Admin', 'Administrator role'); insert into `role`(name, note) values('Standard', 'Ordinary user role'); insert into `role`(name, note) values('Guest', 'Readonly user role'); CREATE TABLE `role_operation`( `id` integer primary key autoincrement, `role_name` varchar(128) not null, `operation` varchar(191) not null ); CREATE INDEX `idx_role_operation_role_name` ON `role_operation` (`role_name` asc); CREATE INDEX `idx_role_operation_operation` ON `role_operation` (`operation` asc); -- Admin is special, who has no concrete operation but can do anything. insert into `role_operation`(role_name, operation) values('Guest', '/metric/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/object/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/log/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/trace/explorer'); insert into `role_operation`(role_name, operation) values('Guest', '/help/version'); insert into `role_operation`(role_name, operation) values('Guest', '/help/contact'); insert into `role_operation`(role_name, operation) values('Standard', '/metric/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/object/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/log/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/trace/explorer'); insert into `role_operation`(role_name, operation) values('Standard', '/help/version'); insert into `role_operation`(role_name, operation) values('Standard', '/help/contact'); insert into `role_operation`(role_name, operation) 
values('Standard', '/help/servers'); insert into `role_operation`(role_name, operation) values('Standard', '/help/migrate'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules-built-in'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards-built-in'); insert into `role_operation`(role_name, operation) values('Standard', '/trace/dependencies'); insert into `role_operation`(role_name, operation) values('Admin', '/help/source'); insert into `role_operation`(role_name, operation) values('Admin', '/help/sso'); insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-tpls'); insert into `role_operation`(role_name, operation) values('Admin', '/help/notification-settings'); insert into `role_operation`(role_name, operation) values('Standard', '/users'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/add'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/put'); insert into `role_operation`(role_name, operation) values('Standard', '/user-groups/del'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/add'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/put'); insert into `role_operation`(role_name, operation) values('Standard', '/busi-groups/del'); insert into `role_operation`(role_name, operation) values('Standard', '/targets'); insert into `role_operation`(role_name, operation) values('Standard', '/targets/add'); insert into `role_operation`(role_name, operation) values('Standard', '/targets/put'); insert into `role_operation`(role_name, operation) values('Standard', '/targets/del'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards'); 
insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/add'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/put'); insert into `role_operation`(role_name, operation) values('Standard', '/dashboards/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/add'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/put'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-rules/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/add'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-mutes/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/add'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/put'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-subscribes/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-cur-events/del'); insert into `role_operation`(role_name, operation) values('Standard', '/alert-his-events'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/add'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/put'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tpls/del'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks'); insert 
into `role_operation`(role_name, operation) values('Standard', '/job-tasks/add'); insert into `role_operation`(role_name, operation) values('Standard', '/job-tasks/put'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/add'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/put'); insert into `role_operation`(role_name, operation) values('Standard', '/recording-rules/del'); -- for alert_rule | collect_rule | mute | dashboard grouping CREATE TABLE `busi_group` ( `id` integer primary key autoincrement, `name` varchar(191) not null unique, `label_enable` tinyint(1) not null default 0, `label_value` varchar(191) not null default '', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Busi Group', strftime('%s', 'now'), 'root', strftime('%s', 'now'), 'root'); CREATE TABLE `busi_group_member` ( `id` integer primary key autoincrement, `busi_group_id` bigint not null, `user_group_id` bigint not null, `perm_flag` char(2) not null ); CREATE INDEX `idx_busi_group_member_busi_group_id` ON `busi_group_member` (`busi_group_id` asc); CREATE INDEX `idx_busi_group_member_user_group_id` ON `busi_group_member` (`user_group_id` asc); insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, 'rw'); -- for dashboard new version CREATE TABLE `board` ( `id` integer primary key autoincrement, `group_id` bigint not null default 0, `name` varchar(191) not null, `ident` varchar(200) not null default '', `tags` varchar(255) not null, `public` tinyint(1) not null default 0, `built_in` tinyint(1) not null default 0, `hide` tinyint(1) not null default 0, `create_at` bigint not null default 0, 
`create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', `note` varchar(1024) not null default '', `public_cate` bigint not null default 0 ); CREATE UNIQUE INDEX idx_board_group_id_name ON `board` (group_id, name); CREATE INDEX `idx_board_ident` ON `board` (`ident` asc); -- for dashboard new version CREATE TABLE `board_payload` ( `id` bigint unsigned not null unique, `payload` mediumtext not null ); CREATE TABLE `chart` ( `id` integer primary key autoincrement, `group_id` integer not null, `configs` text, `weight` integer not null default 0 ); CREATE INDEX idx_chart_group_id ON `chart` (group_id); CREATE TABLE `chart_share` ( `id` integer primary key autoincrement, `cluster` varchar(128) not null, `datasource_id` bigint unsigned not null default 0, `configs` text, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '' ); CREATE INDEX `idx_chart_share_create_at` ON `chart_share` (`create_at` asc); CREATE TABLE `alert_rule` ( `id` integer primary key autoincrement, `group_id` bigint not null default 0, `cate` varchar(128) not null, `datasource_ids` varchar(255) not null default '', `cluster` varchar(128) not null, `name` varchar(255) not null, `note` varchar(1024) not null default '', `prod` varchar(255) not null default '', `algorithm` varchar(255) not null default '', `algo_params` varchar(255), `delay` int not null default 0, `severity` tinyint(1) not null, `disabled` tinyint(1) not null, `prom_for_duration` int not null, `rule_config` text not null, `prom_ql` text not null, `prom_eval_interval` int not null, `enable_stime` varchar(255) not null default '00:00', `enable_etime` varchar(255) not null default '23:59', `enable_days_of_week` varchar(255) not null default '', `enable_in_bg` tinyint(1) not null default 0, `notify_recovered` tinyint(1) not null, `notify_channels` varchar(255) not null default '', `notify_groups` varchar(255) not null default '', 
`notify_repeat_step` int not null default 0, `notify_max_number` int not null default 0, `recover_duration` int not null default 0 , `callbacks` varchar(4096) not null default '', `runbook_url` varchar(4096), `append_tags` varchar(255) not null default '', `annotations` text not null, `extra_config` text not null, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '', `cron_pattern` varchar(64), `time_zone` varchar(64) not null default '', `datasource_queries` text ); CREATE INDEX `idx_alert_rule_group_id` ON `alert_rule` (`group_id` asc); CREATE INDEX `idx_alert_rule_update_at` ON `alert_rule` (`update_at` asc); CREATE TABLE `alert_mute` ( `id` integer primary key autoincrement, `group_id` bigint not null default 0, `prod` varchar(255) not null default '', `note` varchar(1024) not null default '', `cate` varchar(128) not null, `cluster` varchar(128) not null, `datasource_ids` varchar(255) not null default '', `tags` varchar(4096) default '[]', `cause` varchar(255) not null default '', `btime` bigint not null default 0, `etime` bigint not null default 0, `disabled` tinyint(1) not null default 0, `mute_time_type` tinyint(1) not null default 0, `periodic_mutes` varchar(4096) not null default '', `severities` varchar(32) not null default '', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE INDEX `idx_alert_mute_create_at` ON `alert_mute` (`create_at` asc); CREATE INDEX `idx_alert_mute_group_id` ON `alert_mute` (`group_id` asc); CREATE TABLE `alert_subscribe` ( `id` integer primary key autoincrement, `name` varchar(255) not null default '', `disabled` tinyint(1) not null default 0, `group_id` bigint not null default 0, `prod` varchar(255) not null default '', `cate` varchar(128) not null, `datasource_ids` varchar(255) not null 
default '', `cluster` varchar(128) not null, `rule_id` bigint not null default 0, `severities` varchar(32) not null default '', `tags` varchar(4096) not null default '', `redefine_severity` tinyint(1) default 0, `new_severity` tinyint(1) not null, `redefine_channels` tinyint(1) default 0, `new_channels` varchar(255) not null default '', `user_group_ids` varchar(250) not null, `busi_groups` VARCHAR(4096) NOT NULL DEFAULT '[]', `note` VARCHAR(1024) DEFAULT '', `rule_ids` VARCHAR(1024) DEFAULT '', `webhooks` text not null, `extra_config` text not null, `redefine_webhooks` tinyint(1) default 0, `for_duration` bigint not null default 0, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE INDEX `idx_alert_subscribe_update_at` ON `alert_subscribe` (`update_at` asc); CREATE INDEX `idx_alert_subscribe_group_id` ON `alert_subscribe` (`group_id` asc); CREATE TABLE `target` ( `id` integer primary key autoincrement, `group_id` bigint not null default 0, `ident` varchar(191) not null unique, `note` varchar(255) not null default '', `tags` varchar(512) not null default '', `host_ip` varchar(15) default '', `agent_version` varchar(255) default '', `host_tags` text, `engine_name` varchar(255) default '', `os` varchar(31) default '', `update_at` bigint not null default 0 ); CREATE INDEX `idx_target_group_id` ON `target` (`group_id` asc); CREATE UNIQUE INDEX idx_target_ident ON `target` (ident); CREATE INDEX idx_host_ip ON `target` (host_ip); CREATE INDEX idx_agent_version ON `target` (agent_version); CREATE INDEX idx_engine_name ON `target` (engine_name); CREATE INDEX idx_os ON `target` (os); CREATE TABLE `metric_view` ( `id` integer primary key autoincrement, `name` varchar(191) not null default '', `cate` tinyint(1) not null, `configs` varchar(8192) not null default '', `create_at` bigint not null default 0, `create_by` bigint not null default 0, `update_at` 
bigint not null default 0 ); CREATE INDEX `idx_metric_view_create_by` ON `metric_view` (`create_by` asc); insert into metric_view(name, cate, configs) values('Host View', 0, '{"filters":[{"oper":"=","label":"__name__","value":"cpu_usage_idle"}],"dynamicLabels":[],"dimensionLabels":[{"label":"ident","value":""}]}'); CREATE TABLE `recording_rule` ( `id` integer primary key autoincrement, `group_id` bigint not null default '0', `datasource_ids` varchar(255) not null default '', `cluster` varchar(128) not null, `name` varchar(255) not null, `note` varchar(255) not null, `disabled` tinyint(1) not null default 0, `prom_ql` varchar(8192) not null, `prom_eval_interval` int not null, `cron_pattern` varchar(255) default '', `append_tags` varchar(255) default '', `query_configs` text not null, `create_at` bigint default '0', `create_by` varchar(64) default '', `update_at` bigint default '0', `update_by` varchar(64) default '', `datasource_queries` text ); CREATE INDEX `idx_recording_rule_group_id` ON `recording_rule` (`group_id` asc); CREATE INDEX `idx_recording_rule_update_at` ON `recording_rule` (`update_at` asc); CREATE TABLE `alert_aggr_view` ( `id` integer primary key autoincrement, `name` varchar(191) not null default '', `rule` varchar(2048) not null default '', `cate` tinyint(1) not null, `create_at` bigint not null default 0, `create_by` bigint not null default 0, `update_at` bigint not null default 0 ); CREATE INDEX `idx_alert_aggr_view_create_by` ON `alert_aggr_view` (`create_by` asc); insert into alert_aggr_view(name, rule, cate) values('By BusiGroup, Severity', 'field:group_name::field:severity', 0); insert into alert_aggr_view(name, rule, cate) values('By RuleName', 'field:rule_name', 0); CREATE TABLE `alert_cur_event` ( `id` integer primary key autoincrement, `cate` varchar(128) not null, `datasource_id` bigint not null default 0, `cluster` varchar(128) not null, `group_id` bigint unsigned not null, `group_name` varchar(255) not null default '', `hash` 
varchar(64) not null, `rule_id` bigint unsigned not null, `rule_name` varchar(255) not null, `rule_note` varchar(2048) not null default 'alert rule note', `rule_prod` varchar(255) not null default '', `rule_algo` varchar(255) not null default '', `severity` tinyint(1) not null, `prom_for_duration` int not null, `prom_ql` varchar(8192) not null, `prom_eval_interval` int not null, `callbacks` varchar(255) not null default '', `runbook_url` varchar(255), `notify_recovered` tinyint(1) not null, `notify_channels` varchar(255) not null default '', `notify_groups` varchar(255) not null default '', `notify_repeat_next` bigint not null default 0, `notify_cur_number` int not null default 0, `target_ident` varchar(191) not null default '', `target_note` varchar(191) not null default '', `first_trigger_time` bigint, `trigger_time` bigint not null, `trigger_value` varchar(2048) not null, `annotations` text not null, `rule_config` text not null, `tags` varchar(1024) not null default '' ); CREATE INDEX `idx_alert_cur_event_hash` ON `alert_cur_event` (`hash` asc); CREATE INDEX `idx_alert_cur_event_rule_id` ON `alert_cur_event` (`rule_id` asc); CREATE INDEX `idx_alert_cur_event_trigger_time_group_id` ON `alert_cur_event` (`trigger_time`, `group_id` asc); CREATE INDEX `idx_alert_cur_event_notify_repeat_next` ON `alert_cur_event` (`notify_repeat_next` asc); CREATE TABLE `alert_his_event` ( `id` integer primary key autoincrement, `is_recovered` tinyint(1) not null, `cate` varchar(128) not null, `datasource_id` bigint not null default 0, `cluster` varchar(128) not null, `group_id` bigint unsigned not null, `group_name` varchar(255) not null default '', `hash` varchar(64) not null, `rule_id` bigint unsigned not null, `rule_name` varchar(255) not null, `rule_note` varchar(2048) not null default 'alert rule note', `rule_prod` varchar(255) not null default '', `rule_algo` varchar(255) not null default '', `severity` tinyint(1) not null, `prom_for_duration` int not null, `prom_ql` 
varchar(8192) not null, `prom_eval_interval` int not null, `callbacks` varchar(255) not null default '', `runbook_url` varchar(255), `notify_recovered` tinyint(1) not null, `notify_channels` varchar(255) not null default '', `notify_groups` varchar(255) not null default '', `notify_cur_number` int not null default 0, `target_ident` varchar(191) not null default '', `target_note` varchar(191) not null default '', `first_trigger_time` bigint, `trigger_time` bigint not null, `trigger_value` varchar(2048) not null, `recover_time` bigint not null default 0, `last_eval_time` bigint not null default 0, `original_tags` varchar(8192), `tags` varchar(1024) not null default '', `annotations` text not null, `rule_config` text not null ); CREATE INDEX `idx_alert_his_event_last_eval_time` ON `alert_his_event` (`last_eval_time` asc); CREATE INDEX `idx_alert_his_event_hash` ON `alert_his_event` (`hash` asc); CREATE INDEX `idx_alert_his_event_rule_id` ON `alert_his_event` (`rule_id` asc); CREATE INDEX `idx_alert_his_event_trigger_time_group_id` ON `alert_his_event` (`trigger_time`, `group_id` asc); CREATE TABLE `board_busigroup` ( `busi_group_id` bigint(20) NOT NULL DEFAULT '0', `board_id` bigint(20) NOT NULL DEFAULT '0', primary key (`busi_group_id`, `board_id`) ); CREATE TABLE `builtin_components` ( `id` integer primary key autoincrement, `ident` varchar(191) not null, `logo` varchar(191) not null, `readme` text not null, `created_at` bigint(20) not null default 0, `created_by` varchar(191) not null default '', `updated_at` bigint(20) not null default 0, `updated_by` varchar(191) not null default '' ); CREATE INDEX `idx_builtin_components_ident` ON `builtin_components` (`ident` asc); CREATE TABLE `builtin_payloads` ( `id` integer primary key autoincrement, `component_id` integer not null default 0, `uuid` integer not null, `type` varchar(191) not null, `component` varchar(191) not null, `cate` varchar(191) not null, `name` varchar(191) not null, `tags` varchar(191) not null 
default '', `content` longtext not null, `note` varchar(1024) not null default '', `created_at` bigint(20) not null default 0, `created_by` varchar(191) not null default '', `updated_at` bigint(20) not null default 0, `updated_by` varchar(191) not null default '' ); CREATE INDEX `idx_builtin_payloads_component` ON `builtin_payloads` (`component` asc); CREATE INDEX `idx_builtin_payloads_name` ON `builtin_payloads` (`name` asc); CREATE INDEX `idx_builtin_payloads_cate` ON `builtin_payloads` (`cate` asc); CREATE INDEX `idx_builtin_payloads_type` ON `builtin_payloads` (`type` asc); CREATE INDEX idx_uuid ON `builtin_payloads` (uuid); CREATE TABLE `notification_record` ( `id` integer primary key autoincrement, `event_id` integer not null, `sub_id` integer, `channel` varchar(255) not null, `status` integer, `target` varchar(1024) not null, `details` varchar(2048) default '', `created_at` integer not null ); CREATE INDEX idx_evt ON notification_record (event_id); CREATE TABLE `task_tpl` ( `id` integer primary key autoincrement, `group_id` int unsigned not null, `title` varchar(255) not null default '', `account` varchar(64) not null, `batch` int unsigned not null default 0, `tolerance` int unsigned not null default 0, `timeout` int unsigned not null default 0, `pause` varchar(255) not null default '', `script` text not null, `args` varchar(512) not null default '', `tags` varchar(255) not null default '', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE INDEX `idx_task_tpl_group_id` ON `task_tpl` (`group_id` asc); CREATE TABLE `task_tpl_host` ( `ii` integer primary key autoincrement, `id` int unsigned not null, `host` varchar(128) not null ); CREATE INDEX `idx_task_tpl_host_id_host` ON `task_tpl_host` (`id`, `host` asc); CREATE TABLE `task_record` ( `id` integer primary key autoincrement, `event_id` bigint not null default 0, `group_id` bigint not 
null, `ibex_address` varchar(128) not null, `ibex_auth_user` varchar(128) not null default '', `ibex_auth_pass` varchar(128) not null default '', `title` varchar(255) not null default '', `account` varchar(64) not null, `batch` int unsigned not null default 0, `tolerance` int unsigned not null default 0, `timeout` int unsigned not null default 0, `pause` varchar(255) not null default '', `script` text not null, `args` varchar(512) not null default '', `create_at` bigint not null default 0, `create_by` varchar(64) not null default '' ); CREATE INDEX `idx_task_record_create_at_group_id` ON `task_record` (`create_at`, `group_id` asc); CREATE INDEX `idx_task_record_create_by` ON `task_record` (`create_by` asc); CREATE INDEX `idx_task_record_event_id` ON `task_record` (`event_id` asc); CREATE TABLE `alerting_engines` ( `id` integer primary key autoincrement, `instance` varchar(128) not null default '', `datasource_id` bigint not null default 0, `engine_cluster` varchar(128) not null default '', `clock` bigint not null ); CREATE TABLE `datasource` ( `id` integer primary key autoincrement, `name` varchar(191) not null default '' unique, `description` varchar(255) not null default '', `category` varchar(255) not null default '', `plugin_id` int unsigned not null default 0, `plugin_type` varchar(255) not null default '', `plugin_type_name` varchar(255) not null default '', `cluster_name` varchar(255) not null default '', `settings` text not null, `status` varchar(255) not null default '', `http` varchar(4096) not null default '', `auth` varchar(8192) not null default '', `is_default` tinyint not null default 0, `weight` int not null default 0, `created_at` bigint not null default 0, `created_by` varchar(64) not null default '', `updated_at` bigint not null default 0, `updated_by` varchar(64) not null default '' ); CREATE UNIQUE INDEX idx_datasource_name ON datasource (name); CREATE TABLE `builtin_cate` ( `id` integer primary key autoincrement, `name` varchar(191) not null, 
/* Built-in metric definitions.
   The last column is `uuid` with type integer; the closing backtick
   previously sat after the type keyword, which made SQLite create a column
   literally named "uuid integer" with no declared type. */
CREATE TABLE `builtin_metrics` (
    `id` integer primary key autoincrement,
    `collector` varchar(191) NOT NULL,
    `typ` varchar(191) NOT NULL,
    `name` varchar(191) NOT NULL,
    `unit` varchar(191) NOT NULL,
    `lang` varchar(191) NOT NULL DEFAULT '',
    `note` varchar(4096) NOT NULL,
    `expression` varchar(4096) NOT NULL,
    `expression_type` varchar(32) NOT NULL DEFAULT 'promql',
    `metric_type` varchar(191) NOT NULL DEFAULT '',
    `extra_fields` text,
    `created_at` bigint NOT NULL DEFAULT 0,
    `created_by` varchar(191) NOT NULL DEFAULT '',
    `updated_at` bigint NOT NULL DEFAULT 0,
    `updated_by` varchar(191) NOT NULL DEFAULT '',
    `uuid` integer not null default 0
);
(collector); CREATE INDEX idx_typ ON builtin_metrics (typ); CREATE INDEX idx_builtinmetric_name ON builtin_metrics (name); CREATE INDEX idx_lang ON builtin_metrics (lang); CREATE TABLE `metric_filter` ( `id` integer primary key autoincrement, `name` varchar(191) NOT NULL, `configs` varchar(4096) NOT NULL, `groups_perm` text, `create_at` bigint NOT NULL DEFAULT '0', `create_by` varchar(191) NOT NULL DEFAULT '', `update_at` bigint NOT NULL DEFAULT '0', `update_by` varchar(191) NOT NULL DEFAULT '' ); CREATE INDEX `idx_metric_filter_name` ON `metric_filter` (`name` asc); CREATE TABLE `target_busi_group` ( `id` integer primary key autoincrement, `target_ident` varchar(191) not null, `group_id` integer not null, `update_at` integer not null ); CREATE UNIQUE INDEX idx_target_busi_group ON target_busi_group (target_ident, group_id); CREATE TABLE `dash_annotation` ( `id` integer primary key autoincrement, `dashboard_id` bigint not null, `panel_id` varchar(191) not null, `tags` text, `description` text, `config` text, `time_start` bigint not null default 0, `time_end` bigint not null default 0, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, `update_by` varchar(64) not null default '' ); CREATE TABLE `task_meta` ( `id` integer primary key autoincrement, `title` varchar(255) not null default '', `account` varchar(64) not null, `batch` int unsigned not null default 0, `tolerance` int unsigned not null default 0, `timeout` int unsigned not null default 0, `pause` varchar(255) not null default '', `script` text not null, `args` varchar(512) not null default '', `stdin` varchar(1024) not null default '', `creator` varchar(64) not null default '', `created` timestamp not null default CURRENT_TIMESTAMP ); CREATE INDEX `idx_task_meta_creator` ON `task_meta` (`creator` asc); CREATE INDEX `idx_task_meta_created` ON `task_meta` (`created` asc); /* start|cancel|kill|pause */ CREATE TABLE `task_action` ( `id` 
integer primary key autoincrement, `action` varchar(32) not null, `clock` bigint not null default 0 ); CREATE TABLE `task_scheduler` ( `id` bigint unsigned not null, `scheduler` varchar(128) not null default '' ); CREATE INDEX `idx_task_scheduler_id_scheduler` ON `task_scheduler` (`id`, `scheduler` asc); CREATE TABLE `task_scheduler_health` ( `scheduler` varchar(128) not null unique, `clock` bigint not null ); CREATE INDEX `idx_task_scheduler_health_clock` ON `task_scheduler_health` (`clock` asc); CREATE TABLE `task_host_doing` ( `id` bigint unsigned not null, `host` varchar(128) not null, `clock` bigint not null default 0, `action` varchar(16) not null ); CREATE INDEX `idx_task_host_doing_id` ON `task_host_doing` (`id` asc); CREATE INDEX `idx_task_host_doing_host` ON `task_host_doing` (`host` asc); CREATE TABLE task_host_0 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_1 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_2 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_3 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_4 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_5 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) 
not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_6 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_7 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_8 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_9 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_10 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_11 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_12 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_13 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_14 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` 
text, unique (`id`, `host`) ); CREATE TABLE task_host_15 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_16 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_17 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_18 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_19 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_20 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_21 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_22 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_23 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_24 ( `ii` 
integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_25 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_26 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_27 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_28 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_29 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_30 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_31 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_32 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_33 ( `ii` integer primary key autoincrement, `id` bigint unsigned not 
null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_34 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_35 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_36 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_37 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_38 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_39 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_40 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_41 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_42 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not 
null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_43 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_44 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_45 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_46 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_47 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_48 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_49 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_50 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_51 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); 
CREATE TABLE task_host_52 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_53 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_54 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_55 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_56 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_57 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_58 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_59 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_60 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_61 ( `ii` integer primary key 
autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_62 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_63 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_64 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_65 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_66 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_67 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_68 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_69 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_70 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` 
varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_71 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_72 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_73 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_74 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_75 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_76 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_77 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_78 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_79 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` 
text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_80 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_81 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_82 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_83 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_84 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_85 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_86 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_87 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_88 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE 
task_host_89 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_90 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_91 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_92 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_93 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_94 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_95 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_96 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_97 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_98 ( `ii` integer primary key autoincrement, `id` 
bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); CREATE TABLE task_host_99 ( `ii` integer primary key autoincrement, `id` bigint unsigned not null, `host` varchar(128) not null, `status` varchar(32) not null, `stdout` text, `stderr` text, unique (`id`, `host`) ); ================================================ FILE: dscache/cache.go ================================================ package dscache import ( "sync" "github.com/ccfos/nightingale/v6/datasource" "github.com/toolkits/pkg/logger" ) type Cache struct { datas map[string]map[int64]datasource.Datasource mutex *sync.RWMutex } var DsCache = Cache{ datas: make(map[string]map[int64]datasource.Datasource), mutex: new(sync.RWMutex), } func (cs *Cache) Put(cate string, dsId int64, ds datasource.Datasource) { cs.mutex.Lock() if _, found := cs.datas[cate]; !found { cs.datas[cate] = make(map[int64]datasource.Datasource) } if _, found := cs.datas[cate][dsId]; found { if cs.datas[cate][dsId].Equal(ds) { cs.mutex.Unlock() return } } cs.mutex.Unlock() // InitClient() 在用户配置错误或远端不可用时, 会非常耗时, mutex被长期持有, 导致Get()会超时 err := ds.InitClient() if err != nil { logger.Errorf("init plugin:%s %d %+v client fail: %v", cate, dsId, ds, err) return } logger.Debugf("init plugin:%s %d %+v client success", cate, dsId, ds) cs.mutex.Lock() cs.datas[cate][dsId] = ds cs.mutex.Unlock() } func (cs *Cache) Get(cate string, dsId int64) (datasource.Datasource, bool) { cs.mutex.RLock() defer cs.mutex.RUnlock() if _, found := cs.datas[cate]; !found { return nil, false } if _, found := cs.datas[cate][dsId]; !found { return nil, false } return cs.datas[cate][dsId], true } func (cs *Cache) Delete(cate string, dsId int64) { cs.mutex.Lock() defer cs.mutex.Unlock() if _, found := cs.datas[cate]; !found { return } delete(cs.datas[cate], dsId) logger.Debugf("delete plugin:%s %d from cache", cate, dsId) } // GetAllIds 返回缓存中所有数据源的 ID,按类型分组 func (cs *Cache) GetAllIds() 
map[string][]int64 { cs.mutex.RLock() defer cs.mutex.RUnlock() result := make(map[string][]int64) for cate, dsMap := range cs.datas { ids := make([]int64, 0, len(dsMap)) for dsId := range dsMap { ids = append(ids, dsId) } result[cate] = ids } return result } ================================================ FILE: dscache/sync.go ================================================ package dscache import ( "context" "encoding/base64" "strings" "sync/atomic" "time" "github.com/ccfos/nightingale/v6/datasource" _ "github.com/ccfos/nightingale/v6/datasource/ck" _ "github.com/ccfos/nightingale/v6/datasource/doris" "github.com/ccfos/nightingale/v6/datasource/es" _ "github.com/ccfos/nightingale/v6/datasource/mysql" _ "github.com/ccfos/nightingale/v6/datasource/opensearch" _ "github.com/ccfos/nightingale/v6/datasource/postgresql" _ "github.com/ccfos/nightingale/v6/datasource/victorialogs" "github.com/ccfos/nightingale/v6/dskit/tdengine" "github.com/ccfos/nightingale/v6/models" "github.com/ccfos/nightingale/v6/pkg/ctx" "github.com/ccfos/nightingale/v6/pkg/poster" "github.com/toolkits/pkg/logger" ) var FromAPIHook func() var DatasourceProcessHook func(items []datasource.DatasourceInfo) []datasource.DatasourceInfo func Init(ctx *ctx.Context, fromAPI bool) { if !ctx.IsCenter { // 从 center 同步密钥 var rsaConfig = new(models.RsaConfig) c, err := poster.GetByUrls[*models.RsaConfig](ctx, "/v1/n9e/datasource-rsa-config") if err != nil || c == nil { logger.Fatalf("failed to get datasource rsa-config, error: %v", err) } rsaConfig = c if c.OpenRSA { logger.Infof("datasource rsa is open in n9e-plus") rsaConfig.PrivateKeyBytes, err = base64.StdEncoding.DecodeString(c.RSAPrivateKey) if err != nil { logger.Fatalf("failed to decode rsa-config, error: %v", err) } } models.SetRsaConfig(rsaConfig) } go getDatasourcesFromDBLoop(ctx, fromAPI) } type ListInput struct { Page int `json:"p"` Limit int `json:"limit"` Category string `json:"category"` PluginType string `json:"plugin_type"` // prometheus 
Status string `json:"status"` } type DSReply struct { RequestID string `json:"request_id"` Data struct { Items []datasource.DatasourceInfo `json:"items"` } `json:"data"` } type DSReplyEncrypt struct { RequestID string `json:"request_id"` Data string `json:"data"` } var PromDefaultDatasourceId int64 func getDatasourcesFromDBLoop(ctx *ctx.Context, fromAPI bool) { for { if !fromAPI { foundDefaultDatasource := false items, err := models.GetDatasources(ctx) if err != nil { logger.Errorf("get datasource from database fail: %v", err) //stat.CounterExternalErrorTotal.WithLabelValues("db", "get_cluster").Inc() time.Sleep(time.Second * 2) continue } var dss []datasource.DatasourceInfo for _, item := range items { if item.PluginType == "prometheus" && item.IsDefault { atomic.StoreInt64(&PromDefaultDatasourceId, item.Id) foundDefaultDatasource = true } // logger.Debugf("get datasource: %+v", item) ds := datasource.DatasourceInfo{ Id: item.Id, Name: item.Name, Description: item.Description, Category: item.Category, PluginId: item.PluginId, Type: item.PluginType, PluginTypeName: item.PluginTypeName, Settings: item.SettingsJson, HTTPJson: item.HTTPJson, AuthJson: item.AuthJson, Status: item.Status, IsDefault: item.IsDefault, Weight: item.Weight, } if item.PluginType == "elasticsearch" { esN9eToDatasourceInfo(&ds, item) } else if item.PluginType == "tdengine" { tdN9eToDatasourceInfo(&ds, item) } else { ds.Settings = make(map[string]interface{}) for k, v := range item.SettingsJson { ds.Settings[k] = v } } dss = append(dss, ds) } if !foundDefaultDatasource && atomic.LoadInt64(&PromDefaultDatasourceId) != 0 { logger.Debugf("no default datasource found") atomic.StoreInt64(&PromDefaultDatasourceId, 0) } if DatasourceProcessHook != nil { dss = DatasourceProcessHook(dss) } PutDatasources(dss) } else { FromAPIHook() } time.Sleep(time.Second * 2) } } func tdN9eToDatasourceInfo(ds *datasource.DatasourceInfo, item models.Datasource) { ds.Settings = make(map[string]interface{}) 
ds.Settings["tdengine.cluster_name"] = item.Name ds.Settings["tdengine.addr"] = item.HTTPJson.Url ds.Settings["tdengine.timeout"] = item.HTTPJson.Timeout ds.Settings["tdengine.dial_timeout"] = item.HTTPJson.DialTimeout ds.Settings["tdengine.max_idle_conns_per_host"] = item.HTTPJson.MaxIdleConnsPerHost ds.Settings["tdengine.headers"] = item.HTTPJson.Headers ds.Settings["tdengine.basic"] = tdengine.TDengineBasicAuth{ User: item.AuthJson.BasicAuthUser, Password: item.AuthJson.BasicAuthPassword, } } func esN9eToDatasourceInfo(ds *datasource.DatasourceInfo, item models.Datasource) { ds.Settings = make(map[string]interface{}) ds.Settings["es.nodes"] = []string{item.HTTPJson.Url} if len(item.HTTPJson.Urls) > 0 { ds.Settings["es.nodes"] = item.HTTPJson.Urls } ds.Settings["es.timeout"] = item.HTTPJson.Timeout ds.Settings["es.basic"] = es.BasicAuth{ Username: item.AuthJson.BasicAuthUser, Password: item.AuthJson.BasicAuthPassword, } ds.Settings["es.tls"] = es.TLS{ SkipTlsVerify: item.HTTPJson.TLS.SkipTlsVerify, } ds.Settings["es.version"] = item.SettingsJson["version"] ds.Settings["es.headers"] = item.HTTPJson.Headers ds.Settings["es.min_interval"] = item.SettingsJson["min_interval"] ds.Settings["es.max_shard"] = item.SettingsJson["max_shard"] ds.Settings["es.enable_write"] = item.SettingsJson["enable_write"] } func PutDatasources(items []datasource.DatasourceInfo) { // 记录当前有效的数据源 ID,按类型分组 validIds := make(map[string]map[int64]struct{}) ids := make([]int64, 0) for _, item := range items { if item.Type == "prometheus" { continue } if item.Type == "loki" { continue } if item.Name == "" { logger.Warningf("cluster name is empty, ignore %+v", item) continue } typ := strings.ReplaceAll(item.Type, ".logging", "") ds, err := datasource.GetDatasourceByType(typ, item.Settings) if err != nil { logger.Debugf("get plugin:%+v fail: %v", item, err) continue } err = ds.Validate(context.Background()) if err != nil { logger.Warningf("get plugin:%+v fail: %v", item, err) continue } ids = 
import (
	"context"
	"crypto/tls"
	"database/sql"
	"errors"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/ccfos/nightingale/v6/dskit/sqlbase"
	"github.com/ccfos/nightingale/v6/dskit/types"

	"github.com/ClickHouse/clickhouse-go/v2"
	"github.com/mitchellh/mapstructure"
	"github.com/toolkits/pkg/logger"
	ckDriver "gorm.io/driver/clickhouse"
	"gorm.io/gorm"
)
`json:"ck.max_open_conns" mapstructure:"ck.max_open_conns"` // 最大打开连接数 ConnMaxLifetime int `json:"ck.conn_max_lifetime" mapstructure:"ck.conn_max_lifetime"` // 连接最大生命周期(秒) Client *gorm.DB `json:"-"` ClientByHTTP *sql.DB `json:"-"` } func (c *Clickhouse) InitCli() error { if c.MaxQueryRows == 0 { c.MaxQueryRows = DefaultLimit } if len(c.Nodes) == 0 { return fmt.Errorf("not found ck shard, please check datasource config") } // 前端只允许 host:port,直接使用第一个节点 addr := c.Nodes[0] prot := strings.ToLower(strings.TrimSpace(c.Protocol)) // 如果用户显式指定 protocol,只允许 http 或 native if prot != "" { if prot != "http" && prot != "native" { return fmt.Errorf("unsupported clickhouse protocol: %s, only `http`, `https` or `native` allowed", c.Protocol) } // HTTP(S) 路径(使用 clickhouse-go HTTP client) if prot == "http" { opts := &clickhouse.Options{ Addr: []string{addr}, Auth: clickhouse.Auth{Username: c.User, Password: c.Password}, Settings: clickhouse.Settings{"max_execution_time": 60}, DialTimeout: 10 * time.Second, Protocol: clickhouse.HTTP, } // 仅当显式指定 https 时才启用 TLS 并使用 SkipSSL 控制 InsecureSkipVerify if c.SecureConnection { opts.TLS = &tls.Config{InsecureSkipVerify: c.SkipSSLVerify} } ckconn := clickhouse.OpenDB(opts) if ckconn == nil { return errors.New("db conn failed") } // 应用连接池配置到 HTTP sql.DB if c.MaxIdleConns > 0 { ckconn.SetMaxIdleConns(c.MaxIdleConns) } if c.MaxOpenConns > 0 { ckconn.SetMaxOpenConns(c.MaxOpenConns) } if c.ConnMaxLifetime > 0 { ckconn.SetConnMaxLifetime(time.Duration(c.ConnMaxLifetime) * time.Second) } c.ClientByHTTP = ckconn return nil } // native 路径(使用 gorm + native driver) dsn := fmt.Sprintf(ckDataSource, c.User, c.Password, addr) // 如果启用了 SecureConnection,为 DSN 添加 TLS 参数;SkipSSLVerify 控制是否跳过证书校验 if c.SecureConnection { dsn = dsn + "&secure=true" if c.SkipSSLVerify { dsn = dsn + "&skip_verify=true" } } db, err := gorm.Open( ckDriver.New( ckDriver.Config{ DSN: dsn, DisableDatetimePrecision: true, DontSupportRenameColumn: true, SkipInitializeWithVersion: false, }), ) 
if err != nil { return err } // 应用连接池配置到 gorm 底层 *sql.DB if sqlDB, derr := db.DB(); derr == nil { if c.MaxIdleConns > 0 { sqlDB.SetMaxIdleConns(c.MaxIdleConns) } if c.MaxOpenConns > 0 { sqlDB.SetMaxOpenConns(c.MaxOpenConns) } if c.ConnMaxLifetime > 0 { sqlDB.SetConnMaxLifetime(time.Duration(c.ConnMaxLifetime) * time.Second) } } else { logger.Debugf("clickhouse: get native sql DB failed: %v", derr) } c.Client = db return nil } opts := &clickhouse.Options{ Addr: []string{addr}, Auth: clickhouse.Auth{Username: c.User, Password: c.Password}, Settings: clickhouse.Settings{"max_execution_time": 60}, DialTimeout: 10 * time.Second, Protocol: clickhouse.HTTP, } ckconn := clickhouse.OpenDB(opts) if ckconn != nil { // 做一次 Ping 校验,避免把 native 端口误当作 HTTP 使用 if err := ckconn.Ping(); err == nil { if c.MaxIdleConns > 0 { ckconn.SetMaxIdleConns(c.MaxIdleConns) } if c.MaxOpenConns > 0 { ckconn.SetMaxOpenConns(c.MaxOpenConns) } if c.ConnMaxLifetime > 0 { ckconn.SetConnMaxLifetime(time.Duration(c.ConnMaxLifetime) * time.Second) } c.ClientByHTTP = ckconn return nil } else { logger.Debugf("clickhouse http ping failed for %s, fallback to native: %v", addr, err) _ = ckconn.Close() } } // 作为最后回退,尝试 native 连接 host := strings.TrimPrefix(strings.TrimPrefix(addr, "http://"), "https://") dsn := fmt.Sprintf(ckDataSource, c.User, c.Password, host) // 如果启用了 SecureConnection,为 DSN 添加 TLS 参数;SkipSSLVerify 控制是否跳过证书校验 if c.SecureConnection { dsn = dsn + "&secure=true" if c.SkipSSLVerify { dsn = dsn + "&skip_verify=true" } } db, err := gorm.Open( ckDriver.New( ckDriver.Config{ DSN: dsn, DisableDatetimePrecision: true, DontSupportRenameColumn: true, SkipInitializeWithVersion: false, }), ) if err != nil { return err } if sqlDB, derr := db.DB(); derr == nil { if c.MaxIdleConns > 0 { sqlDB.SetMaxIdleConns(c.MaxIdleConns) } if c.MaxOpenConns > 0 { sqlDB.SetMaxOpenConns(c.MaxOpenConns) } if c.ConnMaxLifetime > 0 { sqlDB.SetConnMaxLifetime(time.Duration(c.ConnMaxLifetime) * time.Second) } } c.Client = db 
return nil } const ( ShowDatabases = "SHOW DATABASES" ShowTables = "SELECT name FROM system.tables WHERE database = '%s'" DescTable = "SELECT name,type FROM system.columns WHERE database='%s' AND table = '%s';" ) func (c *Clickhouse) QueryRows(ctx context.Context, query string) (*sql.Rows, error) { var ( rows *sql.Rows err error ) if c.ClientByHTTP != nil { rows, err = c.ClientByHTTP.Query(query) if err != nil { return nil, err } } else if c.Client != nil { rows, err = c.Client.Raw(query).Rows() if err != nil { return nil, err } } else { return nil, fmt.Errorf("clickhouse client is nil") } return rows, nil } // ShowDatabases lists all databases in Clickhouse func (c *Clickhouse) ShowDatabases(ctx context.Context) ([]string, error) { res := make([]string, 0) rows, err := c.QueryRows(ctx, ShowDatabases) if err != nil { return nil, err } for rows.Next() { var r string if err := rows.Scan(&r); err != nil { return nil, err } res = append(res, r) } return res, nil } // ShowTables lists all tables in a given database func (c *Clickhouse) ShowTables(ctx context.Context, database string) ([]string, error) { res := make([]string, 0) showTables := fmt.Sprintf(ShowTables, database) rows, err := c.QueryRows(ctx, showTables) if err != nil { return nil, err } for rows.Next() { var r string if err := rows.Scan(&r); err != nil { return nil, err } res = append(res, r) } return res, nil } // DescribeTable describes the schema of a specified table in Clickhouse func (c *Clickhouse) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) { var ( ret []*types.ColumnProperty ) ckQueryParam := new(QueryParam) if err := mapstructure.Decode(query, ckQueryParam); err != nil { return nil, err } descTable := fmt.Sprintf(DescTable, ckQueryParam.Database, ckQueryParam.Table) rows, err := c.QueryRows(ctx, descTable) if err != nil { return nil, err } for rows.Next() { var column types.ColumnProperty if err := rows.Scan(&column.Field, &column.Type); err != nil { 
return nil, err } ret = append(ret, &column) } return ret, nil } func (c *Clickhouse) ExecQueryBySqlDB(ctx context.Context, sql string) ([]map[string]interface{}, error) { rows, err := c.QueryRows(ctx, sql) if err != nil { return nil, err } defer rows.Close() columns, err := rows.Columns() if err != nil { return nil, err } var results []map[string]interface{} for rows.Next() { columnValues := make([]interface{}, len(columns)) columnPointers := make([]interface{}, len(columns)) for i := range columnValues { columnPointers[i] = &columnValues[i] } if err := rows.Scan(columnPointers...); err != nil { continue } rowMap := make(map[string]interface{}) for i, colName := range columns { val := columnValues[i] bytes, ok := val.([]byte) if ok { rowMap[colName] = string(bytes) } else { rowMap[colName] = val } } results = append(results, rowMap) } return results, nil } func (c *Clickhouse) Query(ctx context.Context, query interface{}) ([]map[string]interface{}, error) { ckQuery := new(QueryParam) if err := mapstructure.Decode(query, ckQuery); err != nil { return nil, err } // 校验SQL的合法性, 过滤掉 write请求 sqlItem := strings.Split(strings.ToUpper(ckQuery.Sql), " ") for _, item := range sqlItem { if _, ok := ckBannedOp[item]; ok { return nil, fmt.Errorf("operation %s is forbid, only read db, please check your sql", item) } } // 检查匹配数据长度,防止数据量过大 err := c.CheckMaxQueryRows(ctx, ckQuery.Sql) if err != nil { return nil, err } dbRows := make([]map[string]interface{}, 0) if c.ClientByHTTP != nil { dbRows, err = c.ExecQueryBySqlDB(ctx, ckQuery.Sql) } else { err = c.Client.Raw(ckQuery.Sql).Find(&dbRows).Error } if err != nil { return nil, fmt.Errorf("fetch data failed, sql is %s, err is %s", ckQuery.Sql, err.Error()) } return dbRows, nil } func (c *Clickhouse) CheckMaxQueryRows(ctx context.Context, sql string) error { subSql := strings.ReplaceAll(sql, ";", "") subSql = fmt.Sprintf("SELECT COUNT(*) as count FROM (%s) AS subquery;", subSql) dbRows, err := c.ExecQueryBySqlDB(ctx, subSql) if err 
// Test_Timeseries is an integration test: it initialises a Clickhouse client
// against 127.0.0.1:8123 with hard-coded credentials, runs QueryTimeseries
// over the last 300 seconds and prints the JSON-encoded result. It fails
// whenever no server is reachable at that address.
//
// NOTE(review): Keys only sets LabelKey, while QueryTimeseries returns
// "valueKey is required" when Keys.ValueKey is empty, so this test looks like
// it cannot pass as written. Confirm and set ValueKey to the value column.
func Test_Timeseries(t *testing.T) {
	ck := &Clickhouse{
		Nodes:    []string{"127.0.0.1:8123"},
		User:     "default",
		Password: "123456",
	}

	// Protocol is left empty, so InitCli probes HTTP first and falls back to
	// the native protocol.
	err := ck.InitCli()
	if err != nil {
		t.Fatal(err)
	}

	data, err := ck.QueryTimeseries(context.TODO(), &QueryParam{
		Sql:        `select * from default.student limit 20`,
		From:       time.Now().Unix() - 300,
		To:         time.Now().Unix(),
		TimeField:  "created_at",
		TimeFormat: "datetime",
		Keys: types.Keys{
			LabelKey: "age",
		},
	})
	if err != nil {
		t.Fatal(err)
	}
	bs, err := json.Marshal(data)
	if err != nil {
		t.Fatal(err)
	}
	// The result is only printed; nothing is asserted about its content.
	fmt.Println(string(bs))
}
mapstructure:"keys"` Database string `json:"database" mapstructure:"database"` Table string `json:"table" mapstructure:"table"` } var ( ckBannedOp = map[string]struct{}{ "CREATE": {}, "INSERT": {}, "ALTER": {}, "REVOKE": {}, "DROP": {}, "RENAME": {}, "ATTACH": {}, "DETACH": {}, "OPTIMIZE": {}, "TRUNCATE": {}, "SET": {}, } ) func (c *Clickhouse) QueryTimeseries(ctx context.Context, query *QueryParam) ([]types.MetricValues, error) { if query.Keys.ValueKey == "" { return nil, fmt.Errorf("valueKey is required") } rows, err := c.Query(ctx, query) if err != nil { return nil, err } // 构造成时续数据 return sqlbase.FormatMetricValues(query.Keys, rows, true), nil } ================================================ FILE: dskit/doris/doris.go ================================================ package doris import ( "context" "database/sql" "encoding/json" "errors" "fmt" "reflect" "strings" "time" "unicode" "github.com/ccfos/nightingale/v6/dskit/pool" "github.com/ccfos/nightingale/v6/dskit/types" _ "github.com/go-sql-driver/mysql" // MySQL driver "github.com/mitchellh/mapstructure" ) const ( ShowIndexFieldIndexType = "index_type" ShowIndexFieldColumnName = "column_name" ShowIndexKeyName = "key_name" SQLShowIndex = "SHOW INDEX FROM " ) // Doris struct to hold connection details and the connection object type Doris struct { Addr string `json:"doris.addr" mapstructure:"doris.addr"` // fe mysql endpoint FeAddr string `json:"doris.fe_addr" mapstructure:"doris.fe_addr"` // fe http endpoint User string `json:"doris.user" mapstructure:"doris.user"` // Password string `json:"doris.password" mapstructure:"doris.password"` // Timeout int `json:"doris.timeout" mapstructure:"doris.timeout"` // ms MaxIdleConns int `json:"doris.max_idle_conns" mapstructure:"doris.max_idle_conns"` MaxOpenConns int `json:"doris.max_open_conns" mapstructure:"doris.max_open_conns"` ConnMaxLifetime int `json:"doris.conn_max_lifetime" mapstructure:"doris.conn_max_lifetime"` MaxQueryRows int `json:"doris.max_query_rows" 
mapstructure:"doris.max_query_rows"` ClusterName string `json:"doris.cluster_name" mapstructure:"doris.cluster_name"` EnableWrite bool `json:"doris.enable_write" mapstructure:"doris.enable_write"` // 写用户,用来区分读写用户,减少数据源 UserWrite string `json:"doris.user_write" mapstructure:"doris.user_write"` PasswordWrite string `json:"doris.password_write" mapstructure:"doris.password_write"` } // NewDorisWithSettings initializes a new Doris instance with the given settings func NewDorisWithSettings(ctx context.Context, settings interface{}) (*Doris, error) { newest := new(Doris) settingsMap := map[string]interface{}{} if reflect.TypeOf(settings).Kind() == reflect.String { if err := json.Unmarshal([]byte(settings.(string)), &settingsMap); err != nil { return nil, err } } else { var assert bool settingsMap, assert = settings.(map[string]interface{}) if !assert { return nil, errors.New("settings type invalid") } } if err := mapstructure.Decode(settingsMap, newest); err != nil { return nil, err } return newest, nil } // NewConn establishes a new connection to Doris func (d *Doris) NewConn(ctx context.Context, database string) (*sql.DB, error) { if len(d.Addr) == 0 { return nil, errors.New("empty fe-node addr") } // Set default values similar to postgres implementation if d.Timeout == 0 { d.Timeout = 60000 } if d.MaxIdleConns == 0 { d.MaxIdleConns = 10 } if d.MaxOpenConns == 0 { d.MaxOpenConns = 100 } if d.ConnMaxLifetime == 0 { d.ConnMaxLifetime = 14400 } if d.MaxQueryRows == 0 { d.MaxQueryRows = 500 } var keys []string keys = append(keys, d.Addr) keys = append(keys, d.User, d.Password) if len(database) > 0 { keys = append(keys, database) } cachedKey := strings.Join(keys, ":") // cache conn with database conn, ok := pool.PoolClient.Load(cachedKey) if ok { return conn.(*sql.DB), nil } var db *sql.DB var err error defer func() { if db != nil && err == nil { pool.PoolClient.Store(cachedKey, db) } }() // Simplified connection logic for Doris using MySQL driver dsn := 
fmt.Sprintf("%s:%s@tcp(%s)/%s?charset=utf8", d.User, d.Password, d.Addr, database) db, err = sql.Open("mysql", dsn) if err != nil { return nil, err } // Set connection pool configuration db.SetMaxIdleConns(d.MaxIdleConns) db.SetMaxOpenConns(d.MaxOpenConns) db.SetConnMaxLifetime(time.Duration(d.ConnMaxLifetime) * time.Second) return db, nil } // NewWriteConn establishes a new connection to Doris for write operations // When EnableWrite is true and UserWrite is configured, it uses the write user credentials // Otherwise, it reuses the read connection from NewConn func (d *Doris) NewWriteConn(ctx context.Context, database string) (*sql.DB, error) { // If write user is not configured, reuse the read connection if !d.EnableWrite || len(d.UserWrite) == 0 { return d.NewConn(ctx, database) } if len(d.Addr) == 0 { return nil, errors.New("empty fe-node addr") } // Set default values similar to postgres implementation if d.Timeout == 0 { d.Timeout = 60000 } if d.MaxIdleConns == 0 { d.MaxIdleConns = 10 } if d.MaxOpenConns == 0 { d.MaxOpenConns = 100 } if d.ConnMaxLifetime == 0 { d.ConnMaxLifetime = 14400 } if d.MaxQueryRows == 0 { d.MaxQueryRows = 500 } // Use write user credentials user := d.UserWrite password := d.PasswordWrite var keys []string keys = append(keys, d.Addr) keys = append(keys, user, password) if len(database) > 0 { keys = append(keys, database) } cachedKey := strings.Join(keys, ":") // cache conn with database conn, ok := pool.PoolClient.Load(cachedKey) if ok { return conn.(*sql.DB), nil } var db *sql.DB var err error defer func() { if db != nil && err == nil { pool.PoolClient.Store(cachedKey, db) } }() // Simplified connection logic for Doris using MySQL driver dsn := fmt.Sprintf("%s:%s@tcp(%s)/%s?charset=utf8", user, password, d.Addr, database) db, err = sql.Open("mysql", dsn) if err != nil { return nil, err } // Set connection pool configuration for write connections // Use more conservative values since write operations are typically less frequent 
writeMaxIdleConns := max(d.MaxIdleConns/5, 2) writeMaxOpenConns := max(d.MaxOpenConns/10, 5) db.SetMaxIdleConns(writeMaxIdleConns) db.SetMaxOpenConns(writeMaxOpenConns) db.SetConnMaxLifetime(time.Duration(d.ConnMaxLifetime) * time.Second) return db, nil } // createTimeoutContext creates a context with timeout based on Doris configuration func (d *Doris) createTimeoutContext(ctx context.Context) (context.Context, context.CancelFunc) { timeout := d.Timeout if timeout == 0 { timeout = 60000 } return context.WithTimeout(ctx, time.Duration(timeout)*time.Millisecond) } // ShowDatabases lists all databases in Doris func (d *Doris) ShowDatabases(ctx context.Context) ([]string, error) { timeoutCtx, cancel := d.createTimeoutContext(ctx) defer cancel() db, err := d.NewConn(timeoutCtx, "") if err != nil { return []string{}, err } rows, err := db.QueryContext(timeoutCtx, "SHOW DATABASES") if err != nil { return nil, err } defer rows.Close() databases := make([]string, 0) for rows.Next() { var dbName string if err := rows.Scan(&dbName); err != nil { continue } databases = append(databases, dbName) } return databases, nil } // ShowResources lists all resources with type resourceType in Doris func (d *Doris) ShowResources(ctx context.Context, resourceType string) ([]string, error) { timeoutCtx, cancel := d.createTimeoutContext(ctx) defer cancel() db, err := d.NewConn(timeoutCtx, "") if err != nil { return []string{}, err } // 使用 SHOW RESOURCES 命令 query := fmt.Sprintf("SHOW RESOURCES WHERE RESOURCETYPE = '%s'", resourceType) rows, err := db.QueryContext(timeoutCtx, query) if err != nil { return nil, fmt.Errorf("failed to execute query: %w", err) } defer rows.Close() distinctName := make(map[string]struct{}) // 获取列信息 columns, err := rows.Columns() if err != nil { return nil, fmt.Errorf("failed to get columns: %w", err) } // 准备接收数据的变量 values := make([]interface{}, len(columns)) valuePtrs := make([]interface{}, len(columns)) for i := range values { valuePtrs[i] = &values[i] } // 遍历结果集 
for rows.Next() { err := rows.Scan(valuePtrs...) if err != nil { return nil, fmt.Errorf("error scanning row: %w", err) } // 提取资源名称并添加到 map 中(自动去重) if name, ok := values[0].([]byte); ok { distinctName[string(name)] = struct{}{} } else if nameStr, ok := values[0].(string); ok { distinctName[nameStr] = struct{}{} } } if err := rows.Err(); err != nil { return nil, fmt.Errorf("error iterating rows: %w", err) } // 将 map 转换为切片 resources := make([]string, 0) for name := range distinctName { resources = append(resources, name) } return resources, nil } // ShowTables lists all tables in a given database func (d *Doris) ShowTables(ctx context.Context, database string) ([]string, error) { timeoutCtx, cancel := d.createTimeoutContext(ctx) defer cancel() db, err := d.NewConn(timeoutCtx, database) if err != nil { return nil, err } query := fmt.Sprintf("SHOW TABLES IN %s", database) rows, err := db.QueryContext(timeoutCtx, query) if err != nil { return nil, err } defer rows.Close() tables := make([]string, 0) for rows.Next() { var tableName string if err := rows.Scan(&tableName); err != nil { continue } tables = append(tables, tableName) } return tables, nil } // DescTable describes the schema of a specified table in Doris func (d *Doris) DescTable(ctx context.Context, database, table string) ([]*types.ColumnProperty, error) { timeoutCtx, cancel := d.createTimeoutContext(ctx) defer cancel() db, err := d.NewConn(timeoutCtx, database) if err != nil { return nil, err } query := fmt.Sprintf("DESCRIBE %s.%s", database, table) rows, err := db.QueryContext(timeoutCtx, query) if err != nil { return nil, err } defer rows.Close() // 日志报表中需要把 .type 转化成内部类型 // TODO: 是否有复合类型, Array/JSON/Tuple/Nested, 是否有更多的类型 convertDorisType := func(origin string) (string, bool) { lower := strings.ToLower(origin) switch lower { case "double": return types.LogExtractValueTypeFloat, true case "datetime", "date": return types.LogExtractValueTypeDate, false case "text": return types.LogExtractValueTypeText, true 
default: if strings.Contains(lower, "int") { return types.LogExtractValueTypeLong, true } // 日期类型统一按照.date处理 if strings.HasPrefix(lower, "date") { return types.LogExtractValueTypeDate, false } if strings.HasPrefix(lower, "varchar") || strings.HasPrefix(lower, "char") { return types.LogExtractValueTypeText, true } if strings.HasPrefix(lower, "decimal") { return types.LogExtractValueTypeFloat, true } } return origin, false } var columns []*types.ColumnProperty for rows.Next() { var ( field string typ string null string key string defaultValue sql.NullString extra string ) if err := rows.Scan(&field, &typ, &null, &key, &defaultValue, &extra); err != nil { continue } type2, indexable := convertDorisType(typ) columns = append(columns, &types.ColumnProperty{ Field: field, Type: typ, // You might want to convert MySQL types to your custom types Type2: type2, Indexable: indexable, }) } return columns, nil } type TableIndexInfo struct { ColumnName string `json:"column_name"` IndexName string `json:"index_name"` IndexType string `json:"index_type"` } // ShowIndexes 查询表的所有索引信息 func (d *Doris) ShowIndexes(ctx context.Context, database, table string) ([]TableIndexInfo, error) { if database == "" || table == "" { return nil, fmt.Errorf("database and table names cannot be empty") } tCtx, cancel := d.createTimeoutContext(ctx) defer cancel() db, err := d.NewConn(tCtx, database) if err != nil { return nil, err } querySQL := fmt.Sprintf("%s `%s`.`%s`", SQLShowIndex, database, table) rows, err := db.QueryContext(tCtx, querySQL) if err != nil { return nil, fmt.Errorf("failed to query indexes: %w", err) } defer rows.Close() columns, err := rows.Columns() if err != nil { return nil, fmt.Errorf("failed to get columns: %w", err) } count := len(columns) // 预映射列索引 colIdx := map[string]int{ ShowIndexKeyName: -1, ShowIndexFieldColumnName: -1, ShowIndexFieldIndexType: -1, } for i, col := range columns { lCol := strings.ToLower(col) if lCol == ShowIndexKeyName || lCol == ShowIndexFieldColumnName 
|| lCol == ShowIndexFieldIndexType { colIdx[lCol] = i } } var result []TableIndexInfo for rows.Next() { // 使用 sql.RawBytes 可以接受任何类型并转为 string,避免复杂的类型断言 scanArgs := make([]interface{}, count) values := make([]sql.RawBytes, count) for i := range values { scanArgs[i] = &values[i] } if err = rows.Scan(scanArgs...); err != nil { return nil, err } info := TableIndexInfo{} if i := colIdx[ShowIndexFieldColumnName]; i != -1 && i < count { info.ColumnName = string(values[i]) } if i := colIdx[ShowIndexKeyName]; i != -1 && i < count { info.IndexName = string(values[i]) } if i := colIdx[ShowIndexFieldIndexType]; i != -1 && i < count { info.IndexType = string(values[i]) } if info.ColumnName != "" { result = append(result, info) } } if err = rows.Err(); err != nil { return nil, fmt.Errorf("error iterating rows: %w", err) } return result, nil } // SelectRows selects rows from a specified table in Doris based on a given query with MaxQueryRows check func (d *Doris) SelectRows(ctx context.Context, database, table, query string) ([]map[string]interface{}, error) { sql := fmt.Sprintf("SELECT * FROM %s.%s", database, table) if query != "" { sql += " " + query } // 检查查询结果行数 err := d.CheckMaxQueryRows(ctx, database, sql) if err != nil { return nil, err } return d.ExecQuery(ctx, database, sql) } // ExecQuery executes a given SQL query in Doris and returns the results func (d *Doris) ExecQuery(ctx context.Context, database string, sql string) ([]map[string]interface{}, error) { timeoutCtx, cancel := d.createTimeoutContext(ctx) defer cancel() db, err := d.NewConn(timeoutCtx, database) if err != nil { return nil, err } rows, err := db.QueryContext(timeoutCtx, sql) if err != nil { return nil, err } defer rows.Close() columns, err := rows.Columns() if err != nil { return nil, err } var results []map[string]interface{} for rows.Next() { columnValues := make([]interface{}, len(columns)) columnPointers := make([]interface{}, len(columns)) for i := range columnValues { columnPointers[i] = 
// SplitSQLStatements splits a batch of SQL text into individual statements on
// every ';' that is not inside a quoted region or a comment.
//
// Quoted regions are single-quoted and double-quoted strings (backslash
// escapes honoured) and backtick-quoted identifiers (no escapes). Comments
// are "--" to end of line and "/* ... */". Each returned statement is
// trimmed, keeps its terminating ';', and absorbs the first "--" comment that
// follows the ';' after optional whitespace. A trailing statement without a
// ';' is returned as well; text that is empty after trimming is dropped.
func SplitSQLStatements(sqlBatch string) []string {
	var statements []string
	var currentStatement strings.Builder

	// Scanner state.
	var (
		quote              byte // active quote character (', " or `); 0 when outside quotes
		inComment          bool // inside a single-line comment
		inMultilineComment bool // inside a /* ... */ comment
		escaped            bool // previous character was an unconsumed backslash
	)

	for i := 0; i < len(sqlBatch); i++ {
		char := sqlBatch[i]
		currentStatement.WriteByte(char)
		inQuote := quote != 0

		// Backslash escapes apply inside strings, not backtick identifiers.
		if inQuote && quote != '`' && char == '\\' {
			escaped = !escaped
			continue
		}

		// Quote characters open a region when outside one and close it only
		// when they match the opener, so an apostrophe inside "..." or a
		// double quote inside '...' is plain content.
		if (char == '\'' || char == '"' || char == '`') && !inComment && !inMultilineComment {
			if !escaped {
				if !inQuote {
					quote = char
				} else if quote == char {
					quote = 0
				}
			}
			escaped = false
			continue
		}

		// Start of a single-line comment.
		if !inQuote && !inMultilineComment && !inComment && char == '-' && i+1 < len(sqlBatch) && sqlBatch[i+1] == '-' {
			inComment = true
			currentStatement.WriteByte(sqlBatch[i+1]) // second '-'
			i++
			continue
		}

		// Start of a multi-line comment.
		if !inQuote && !inComment && char == '/' && i+1 < len(sqlBatch) && sqlBatch[i+1] == '*' {
			inMultilineComment = true
			currentStatement.WriteByte(sqlBatch[i+1]) // '*'
			i++
			continue
		}

		// End of a multi-line comment.
		if inMultilineComment && char == '*' && i+1 < len(sqlBatch) && sqlBatch[i+1] == '/' {
			inMultilineComment = false
			currentStatement.WriteByte(sqlBatch[i+1]) // '/'
			i++
			continue
		}

		// A line break ends a single-line comment.
		if inComment && (char == '\n' || char == '\r') {
			inComment = false
		}

		// Statement boundary.
		if char == ';' && !inQuote && !inMultilineComment && !inComment {
			// Absorb whitespace and the first "--" comment after the ';'.
			for j := i + 1; j < len(sqlBatch); j++ {
				nextChar := sqlBatch[j]

				if nextChar == '-' && j+1 < len(sqlBatch) && sqlBatch[j+1] == '-' {
					currentStatement.WriteByte(nextChar)      // first '-'
					currentStatement.WriteByte(sqlBatch[j+1]) // second '-'
					j++

					// Copy the comment up to and including the line break.
					for k := j + 1; k < len(sqlBatch); k++ {
						commentChar := sqlBatch[k]
						currentStatement.WriteByte(commentChar)
						j = k

						if commentChar == '\n' || commentChar == '\r' {
							break
						}
					}

					i = j
					break
				} else if !isWhitespace(nextChar) {
					// Next statement starts here; stop collecting.
					break
				} else {
					currentStatement.WriteByte(nextChar)
					i = j
				}
			}

			statements = append(statements, strings.TrimSpace(currentStatement.String()))
			currentStatement.Reset()
			continue
		}

		escaped = false
	}

	// A final statement may lack its terminating semicolon.
	lastStatement := strings.TrimSpace(currentStatement.String())
	if lastStatement != "" {
		statements = append(statements, lastStatement)
	}

	return statements
}

// isWhitespace reports whether the byte c, interpreted as a single code
// point, is a whitespace character according to unicode.IsSpace.
func isWhitespace(c byte) bool {
	return unicode.IsSpace(rune(c))
}
"__ts__" ) // TODO: 待测试, MAP/ARRAY/STRUCT/JSON 等类型能否处理 func (d *Doris) QueryLogs(ctx context.Context, query *QueryParam) ([]map[string]interface{}, error) { // 等同于 Query() return d.Query(ctx, query) } // 本质是查询时序数据, 取第一组, SQL由上层封装, 不再做复杂的解析和截断 func (d *Doris) QueryHistogram(ctx context.Context, query *QueryParam) ([][]float64, error) { values, err := d.QueryTimeseries(ctx, query) if err != nil { return [][]float64{}, nil } if len(values) > 0 && len(values[0].Values) > 0 { items := values[0].Values sort.Slice(items, func(i, j int) bool { if len(items[i]) > 0 && len(items[j]) > 0 { return items[i][0] < items[j][0] } return false }) return items, nil } return [][]float64{}, nil } ================================================ FILE: dskit/doris/sql_analyzer.go ================================================ package doris import ( "regexp" "strings" "github.com/pingcap/tidb/pkg/parser" "github.com/pingcap/tidb/pkg/parser/ast" _ "github.com/pingcap/tidb/pkg/parser/test_driver" // required for parser ) // mapAccessPattern matches Doris map/array access syntax like `col['key']` or col["key"] var mapAccessPattern = regexp.MustCompile(`\[['"]\w+['"]\]`) // castStringPattern matches Doris CAST(... 
AS STRING) syntax var castStringPattern = regexp.MustCompile(`(?i)\bAS\s+STRING\b`) // macro patterns var timeGroupPattern = regexp.MustCompile(`\$__timeGroup\([^)]+\)`) var timeFilterPattern = regexp.MustCompile(`\$__timeFilter\([^)]+\)`) var intervalPattern = regexp.MustCompile(`\$__interval`) // SQLAnalyzeResult holds the analysis result of a SQL statement type SQLAnalyzeResult struct { IsSelectLike bool // whether the statement is a SELECT-like query HasTopAgg bool // whether the top-level query has aggregate functions LimitConst *int64 // top-level LIMIT constant value (nil if no LIMIT or non-constant) } // AnalyzeSQL analyzes a SQL statement and extracts top-level features func AnalyzeSQL(sql string) (*SQLAnalyzeResult, error) { // Preprocess SQL to remove Doris-specific syntax that TiDB parser doesn't support preprocessedSQL := preprocessDorisSQL(sql) p := parser.New() stmtNodes, _, err := p.Parse(preprocessedSQL, "", "") if err != nil { return nil, err } if len(stmtNodes) == 0 { return &SQLAnalyzeResult{}, nil } result := &SQLAnalyzeResult{} stmt := stmtNodes[0] switch s := stmt.(type) { case *ast.SelectStmt: result.IsSelectLike = true analyzeSelectStmt(s, result) case *ast.SetOprStmt: // UNION / INTERSECT / EXCEPT result.IsSelectLike = true analyzeSetOprStmt(s, result) default: result.IsSelectLike = false } return result, nil } // analyzeSelectStmt analyzes a SELECT statement func analyzeSelectStmt(sel *ast.SelectStmt, result *SQLAnalyzeResult) { // Check if top-level SELECT has aggregate functions if sel.Fields != nil { for _, field := range sel.Fields.Fields { if field.Expr != nil && hasAggregateFunc(field.Expr) { result.HasTopAgg = true break } } } // Check if any CTE has aggregate functions if !result.HasTopAgg && sel.With != nil { for _, cte := range sel.With.CTEs { if selectHasAggregate(cte.Query) { result.HasTopAgg = true break } } } // Extract top-level LIMIT if sel.Limit != nil && sel.Limit.Count != nil { if val, ok := 
extractConstValue(sel.Limit.Count); ok { result.LimitConst = &val } } } // selectHasAggregate checks if a node (SELECT, UNION, or SubqueryExpr) has aggregate functions func selectHasAggregate(node ast.Node) bool { switch n := node.(type) { case *ast.SelectStmt: if n.Fields != nil { for _, field := range n.Fields.Fields { if field.Expr != nil && hasAggregateFunc(field.Expr) { return true } } } case *ast.SetOprStmt: // For UNION, check all branches if n.SelectList != nil { for _, sel := range n.SelectList.Selects { if selectHasAggregate(sel) { return true } } } case *ast.SubqueryExpr: // CTE query is wrapped in SubqueryExpr if n.Query != nil { return selectHasAggregate(n.Query) } } return false } // analyzeSetOprStmt analyzes UNION/INTERSECT/EXCEPT statements func analyzeSetOprStmt(setOpr *ast.SetOprStmt, result *SQLAnalyzeResult) { // UNION's LIMIT is at the outermost level if setOpr.Limit != nil && setOpr.Limit.Count != nil { if val, ok := extractConstValue(setOpr.Limit.Count); ok { result.LimitConst = &val } } // Check if all branches are aggregates (conservative: if any is non-aggregate, don't skip) if setOpr.SelectList == nil || len(setOpr.SelectList.Selects) == 0 { return } allAgg := true for _, sel := range setOpr.SelectList.Selects { if selectStmt, ok := sel.(*ast.SelectStmt); ok { if selectStmt.Fields != nil { hasAgg := false for _, field := range selectStmt.Fields.Fields { if field.Expr != nil && hasAggregateFunc(field.Expr) { hasAgg = true break } } if !hasAgg { allAgg = false break } } } } result.HasTopAgg = allAgg } // hasAggregateFunc checks if an expression contains aggregate functions (without entering subqueries) func hasAggregateFunc(expr ast.ExprNode) bool { checker := &aggregateChecker{} expr.Accept(checker) return checker.found } // aggregateChecker implements ast.Visitor to find aggregate functions type aggregateChecker struct { found bool } func (c *aggregateChecker) Enter(n ast.Node) (ast.Node, bool) { if c.found { return n, true // stop 
// isDorisAggregateFunc reports whether funcName is one of the function names
// this analyzer treats as an aggregate/statistic function.
//
// The comparison is exact and case-sensitive, so funcName must already be
// upper-cased by the caller. The lookup is a switch over constants, so a call
// performs no allocation.
//
// NOTE(review): some listed names (e.g. HLL_HASH, BITMAP_COUNT and the
// window-function group) look like scalar or window functions rather than
// row-collapsing aggregates — confirm that skipping the row-count check for
// them is intended before changing the list.
func isDorisAggregateFunc(funcName string) bool {
	switch funcName {
	// Standard aggregates (in case the parser does not classify them itself).
	case "COUNT", "SUM", "AVG", "MIN", "MAX", "ANY", "ANY_VALUE":
		return true

	// HLL related.
	case "HLL_UNION_AGG", "HLL_RAW_AGG", "HLL_CARDINALITY", "HLL_UNION", "HLL_HASH":
		return true

	// Bitmap related.
	case "BITMAP_UNION", "BITMAP_UNION_COUNT", "BITMAP_INTERSECT", "BITMAP_COUNT",
		"BITMAP_AND_COUNT", "BITMAP_OR_COUNT", "BITMAP_XOR_COUNT", "BITMAP_AND_NOT_COUNT":
		return true

	// Other aggregates.
	case "PERCENTILE", "PERCENTILE_APPROX", "APPROX_COUNT_DISTINCT", "NDV",
		"COLLECT_LIST", "COLLECT_SET", "GROUP_CONCAT",
		"GROUP_BIT_AND", "GROUP_BIT_OR", "GROUP_BIT_XOR",
		"GROUPING", "GROUPING_ID":
		return true

	// Statistical functions.
	case "STDDEV", "STDDEV_POP", "STDDEV_SAMP", "STD",
		"VARIANCE", "VAR_POP", "VAR_SAMP",
		"COVAR_POP", "COVAR_SAMP", "CORR":
		return true

	// Window-function names.
	case "FIRST_VALUE", "LAST_VALUE", "LAG", "LEAD", "ROW_NUMBER",
		"RANK", "DENSE_RANK", "NTILE", "CUME_DIST", "PERCENT_RANK":
		return true
	}
	return false
}
extractConstValue(expr ast.ExprNode) (int64, bool) { switch v := expr.(type) { case ast.ValueExpr: switch val := v.GetValue().(type) { case int64: return val, true case uint64: return int64(val), true case float64: return int64(val), true case int: return int64(val), true } } return 0, false } // preprocessDorisSQL removes Doris-specific syntax that TiDB parser doesn't support func preprocessDorisSQL(sql string) string { // Remove map/array access syntax like ['key'] or ["key"] // This is used in Doris for accessing map/variant/json fields sql = mapAccessPattern.ReplaceAllString(sql, "") // Replace Doris CAST(... AS STRING) with CAST(... AS CHAR) sql = castStringPattern.ReplaceAllString(sql, "AS CHAR") // Replace macros with valid SQL equivalents sql = timeGroupPattern.ReplaceAllString(sql, "ts") sql = timeFilterPattern.ReplaceAllString(sql, "1=1") sql = intervalPattern.ReplaceAllString(sql, "60") return sql } // NeedsRowCountCheck determines if a SQL query needs row count checking // Returns: needsCheck bool, directReject bool, rejectReason string func NeedsRowCountCheck(sql string, maxQueryRows int) (bool, bool, string) { result, err := AnalyzeSQL(sql) if err != nil { // Parse failed, fall back to probe check return true, false, "" } if !result.IsSelectLike { // Not a SELECT query, skip check return false, false, "" } // Rule 1: Top-level has aggregate functions -> skip check if result.HasTopAgg { return false, false, "" } // Rule 2: Top-level LIMIT <= maxRows -> skip check if result.LimitConst != nil && *result.LimitConst <= int64(maxQueryRows) { return false, false, "" } // Otherwise, needs probe check (including LIMIT > maxRows, since actual result may be smaller) return true, false, "" } ================================================ FILE: dskit/doris/sql_analyzer_test.go ================================================ package doris import ( "testing" ) func TestAnalyzeSQL_AggregateQueries(t *testing.T) { tests := []struct { name string sql string 
wantHasAgg bool wantIsSelect bool }{ // Standard aggregate functions - should skip check { name: "COUNT(*)", sql: "SELECT COUNT(*) AS `cnt`, FLOOR(UNIX_TIMESTAMP(event_date) DIV 10) * 10 AS `time`, CAST(`labels`['event'] AS STRING) AS `labels.event` FROM `db_insight_doris`.`ewall_event` WHERE `event_date` BETWEEN FROM_UNIXTIME(1768965669) AND FROM_UNIXTIME(1768965969) GROUP BY `time`, `labels.event` ORDER BY `time` ASC", wantHasAgg: true, wantIsSelect: true, }, { name: "COUNT with column", sql: "SELECT COUNT(id) FROM users", wantHasAgg: true, wantIsSelect: true, }, { name: "SUM function", sql: "SELECT SUM(amount) FROM orders", wantHasAgg: true, wantIsSelect: true, }, { name: "AVG function", sql: "SELECT AVG(price) FROM products", wantHasAgg: true, wantIsSelect: true, }, { name: "MIN function", sql: "SELECT MIN(created_at) FROM logs", wantHasAgg: true, wantIsSelect: true, }, { name: "MAX function", sql: "SELECT MAX(score) FROM results", wantHasAgg: true, wantIsSelect: true, }, { name: "Multiple aggregates", sql: "SELECT COUNT(*), SUM(amount), AVG(price) FROM orders", wantHasAgg: true, wantIsSelect: true, }, { name: "Aggregate with GROUP BY", sql: "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id", wantHasAgg: true, wantIsSelect: true, }, { name: "Aggregate with WHERE and GROUP BY", sql: "SELECT category, SUM(sales) FROM products WHERE status = 'active' GROUP BY category", wantHasAgg: true, wantIsSelect: true, }, { name: "Aggregate with HAVING", sql: "SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id HAVING cnt > 10", wantHasAgg: true, wantIsSelect: true, }, // macro queries with aggregates { name: "COUNT with timeGroup", sql: "SELECT COUNT(*) AS `cnt`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE (`service_name` = 'demo-logic-server') AND $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC", wantHasAgg: true, wantIsSelect: true, }, { name: "CTE with ratio calculation", sql: "WITH `time_totals` AS (SELECT 
$__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `total_count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time`), `time_counts` AS (SELECT ANY_VALUE(`service_name`) AS `service_name`, $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `count` FROM `apm`.`traces_span` WHERE (`service_name` = 'demo-logic-server') AND $__timeFilter(`timestamp`) GROUP BY `time`) SELECT tc.`service_name`, tc.`time`, ROUND(tc.`count` * 100.0 / tt.`total_count`, 2) AS `ratio` FROM `time_counts` tc JOIN `time_totals` tt ON tc.`time` = tt.`time` ORDER BY tc.`time` ASC", wantHasAgg: true, // CTE has aggregate functions wantIsSelect: true, }, { name: "CTE with top values and ratio", sql: "WITH `top_values` AS (SELECT `service_name` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `service_name` ORDER BY COUNT(*) DESC LIMIT 5), `time_totals` AS (SELECT $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `total_count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time`), `time_counts` AS (SELECT `service_name`, $__timeGroup(timestamp,$__interval) AS `time`, COUNT(*) AS `count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) AND `service_name` IN (SELECT `service_name` FROM `top_values`) GROUP BY `service_name`, `time`) SELECT tc.`service_name`, tc.`time`, ROUND(tc.`count` * 100.0 / tt.`total_count`, 2) AS `ratio` FROM `time_counts` tc JOIN `time_totals` tt ON tc.`time` = tt.`time` ORDER BY tc.`time` ASC", wantHasAgg: true, // CTE has aggregate functions wantIsSelect: true, }, { name: "PERCENTILE_APPROX with timeGroup", sql: "SELECT PERCENTILE_APPROX(`duration`, 0.95) AS `p95`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC", wantHasAgg: true, wantIsSelect: true, }, { name: "COUNT DISTINCT with timeGroup", sql: "SELECT COUNT(DISTINCT `duration`) AS `unique_count`, $__timeGroup(timestamp,$__interval) AS 
`time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC", wantHasAgg: true, wantIsSelect: true, }, { name: "CASE WHEN with COUNT and ROUND", sql: "SELECT ROUND(COUNT(CASE WHEN `duration` IS NOT NULL THEN 1 END) * 100.0 / COUNT(*), 2) AS `exist_ratio`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC", wantHasAgg: true, wantIsSelect: true, }, { name: "AVG with timeGroup", sql: "SELECT AVG(`duration`) AS `avg`, $__timeGroup(timestamp,$__interval) AS `time` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`) GROUP BY `time` ORDER BY `time` ASC", wantHasAgg: true, wantIsSelect: true, }, { name: "Simple COUNT with timeFilter", sql: "SELECT COUNT(*) AS `cnt` FROM `apm`.`traces_span` WHERE (`span_name` = 'GET /backend/detail') AND $__timeFilter(`timestamp`)", wantHasAgg: true, wantIsSelect: true, }, { name: "CTE with CROSS JOIN ratio", sql: "WITH `total` AS (SELECT COUNT(*) AS `total_count` FROM `apm`.`traces_span` WHERE $__timeFilter(`timestamp`)), `value_counts` AS (SELECT ANY_VALUE(`span_kind`) AS `span_kind`, COUNT(*) AS `count` FROM `apm`.`traces_span` WHERE (`span_kind` = 'SPAN_KIND_SERVER') AND $__timeFilter(`timestamp`)) SELECT vc.`span_kind`, vc.`count` AS `count`, ROUND(vc.`count` * 100.0 / t.`total_count`, 2) AS `ratio` FROM `value_counts` vc CROSS JOIN `total` t ORDER BY vc.`count` DESC;", wantHasAgg: true, // CTE has aggregate functions wantIsSelect: true, }, // Non-aggregate queries - should not skip check { name: "Simple SELECT *", sql: "SELECT * FROM users", wantHasAgg: false, wantIsSelect: true, }, { name: "SELECT with columns", sql: "SELECT id, name, email FROM users", wantHasAgg: false, wantIsSelect: true, }, { name: "SELECT with WHERE", sql: "SELECT * FROM users WHERE status = 'active'", wantHasAgg: false, wantIsSelect: true, }, { name: "SELECT with JOIN", sql: "SELECT u.name, o.amount FROM users u JOIN orders o 
ON u.id = o.user_id", wantHasAgg: false, wantIsSelect: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result, err := AnalyzeSQL(tt.sql) if err != nil { t.Fatalf("AnalyzeSQL() error = %v", err) } if result.HasTopAgg != tt.wantHasAgg { t.Errorf("name: %s, HasTopAgg = %v, want %v", tt.name, result.HasTopAgg, tt.wantHasAgg) } if result.IsSelectLike != tt.wantIsSelect { t.Errorf("IsSelectLike = %v, want %v", result.IsSelectLike, tt.wantIsSelect) } }) } } func TestAnalyzeSQL_SubqueryWithAggregate(t *testing.T) { // Aggregate in subquery should NOT skip check for main query tests := []struct { name string sql string wantHasAgg bool }{ { name: "Aggregate in subquery only", sql: "SELECT * FROM (SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id) t", wantHasAgg: false, // top-level has no aggregate }, { name: "Aggregate in WHERE subquery", sql: "SELECT * FROM users WHERE id IN (SELECT user_id FROM orders GROUP BY user_id HAVING COUNT(*) > 5)", wantHasAgg: false, // top-level has no aggregate }, { name: "Both top-level and subquery aggregates", sql: "SELECT COUNT(*) FROM (SELECT user_id FROM orders GROUP BY user_id) t", wantHasAgg: true, // top-level has aggregate }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result, err := AnalyzeSQL(tt.sql) if err != nil { t.Fatalf("AnalyzeSQL() error = %v", err) } if result.HasTopAgg != tt.wantHasAgg { t.Errorf("HasTopAgg = %v, want %v", result.HasTopAgg, tt.wantHasAgg) } }) } } func TestAnalyzeSQL_LimitQueries(t *testing.T) { tests := []struct { name string sql string wantLimit *int64 wantIsSelect bool }{ { name: "LIMIT 10", sql: "SELECT * FROM users LIMIT 10", wantLimit: ptr(int64(10)), wantIsSelect: true, }, { name: "LIMIT 100", sql: "SELECT * FROM users LIMIT 100", wantLimit: ptr(int64(100)), wantIsSelect: true, }, { name: "LIMIT 1000", sql: "SELECT * FROM users LIMIT 1000", wantLimit: ptr(int64(1000)), wantIsSelect: true, }, { name: "LIMIT with OFFSET", sql: "SELECT * FROM 
users LIMIT 50 OFFSET 100", wantLimit: ptr(int64(50)), wantIsSelect: true, }, { name: "No LIMIT", sql: "SELECT * FROM users", wantLimit: nil, wantIsSelect: true, }, { name: "LIMIT 0", sql: "SELECT * FROM users LIMIT 0", wantLimit: ptr(int64(0)), wantIsSelect: true, }, { name: "LIMIT 1", sql: "SELECT * FROM users LIMIT 1", wantLimit: ptr(int64(1)), wantIsSelect: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result, err := AnalyzeSQL(tt.sql) if err != nil { t.Fatalf("AnalyzeSQL() error = %v", err) } if result.IsSelectLike != tt.wantIsSelect { t.Errorf("IsSelectLike = %v, want %v", result.IsSelectLike, tt.wantIsSelect) } if tt.wantLimit == nil { if result.LimitConst != nil { t.Errorf("LimitConst = %v, want nil", *result.LimitConst) } } else { if result.LimitConst == nil { t.Errorf("LimitConst = nil, want %v", *tt.wantLimit) } else if *result.LimitConst != *tt.wantLimit { t.Errorf("LimitConst = %v, want %v", *result.LimitConst, *tt.wantLimit) } } }) } } func TestAnalyzeSQL_UnionQueries(t *testing.T) { tests := []struct { name string sql string wantHasAgg bool wantLimit *int64 }{ { name: "UNION without aggregate", sql: "SELECT id, name FROM users UNION SELECT id, name FROM admins", wantHasAgg: false, wantLimit: nil, }, { name: "UNION ALL without aggregate", sql: "SELECT * FROM users UNION ALL SELECT * FROM admins", wantHasAgg: false, wantLimit: nil, }, { name: "UNION with aggregate in all branches", sql: "SELECT COUNT(*) FROM users UNION SELECT COUNT(*) FROM admins", wantHasAgg: true, wantLimit: nil, }, { name: "UNION with aggregate in one branch only", sql: "SELECT COUNT(*) FROM users UNION SELECT id FROM admins", wantHasAgg: false, // not all branches have aggregate wantLimit: nil, }, { name: "UNION with outer LIMIT", sql: "SELECT * FROM users UNION SELECT * FROM admins LIMIT 100", wantHasAgg: false, wantLimit: ptr(int64(100)), }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result, err := AnalyzeSQL(tt.sql) if err != 
nil { t.Fatalf("AnalyzeSQL() error = %v", err) } if result.HasTopAgg != tt.wantHasAgg { t.Errorf("HasTopAgg = %v, want %v", result.HasTopAgg, tt.wantHasAgg) } if tt.wantLimit == nil { if result.LimitConst != nil { t.Errorf("LimitConst = %v, want nil", *result.LimitConst) } } else { if result.LimitConst == nil { t.Errorf("LimitConst = nil, want %v", *tt.wantLimit) } else if *result.LimitConst != *tt.wantLimit { t.Errorf("LimitConst = %v, want %v", *result.LimitConst, *tt.wantLimit) } } }) } } func TestAnalyzeSQL_NonSelectStatements(t *testing.T) { tests := []struct { name string sql string wantIsSelect bool }{ { name: "SHOW DATABASES", sql: "SHOW DATABASES", wantIsSelect: false, }, { name: "SHOW TABLES", sql: "SHOW TABLES", wantIsSelect: false, }, { name: "DESCRIBE table", sql: "DESCRIBE users", wantIsSelect: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result, err := AnalyzeSQL(tt.sql) if err != nil { // Some statements may not be parseable, which is fine return } if result.IsSelectLike != tt.wantIsSelect { t.Errorf("IsSelectLike = %v, want %v", result.IsSelectLike, tt.wantIsSelect) } }) } } func TestNeedsRowCountCheck(t *testing.T) { maxRows := 500 tests := []struct { name string sql string wantNeedCheck bool wantReject bool }{ // Should skip check (needsCheck = false) { name: "Aggregate COUNT(*)", sql: "SELECT COUNT(*) FROM users", wantNeedCheck: false, wantReject: false, }, { name: "Aggregate SUM", sql: "SELECT SUM(amount) FROM orders", wantNeedCheck: false, wantReject: false, }, { name: "Aggregate with GROUP BY", sql: "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id", wantNeedCheck: false, wantReject: false, }, { name: "LIMIT equal to max", sql: "SELECT * FROM users LIMIT 500", wantNeedCheck: false, wantReject: false, }, { name: "LIMIT less than max", sql: "SELECT * FROM users LIMIT 100", wantNeedCheck: false, wantReject: false, }, { name: "LIMIT 1", sql: "SELECT * FROM users LIMIT 1", wantNeedCheck: false, wantReject: false, 
}, // LIMIT > maxRows still needs probe check (actual result might be smaller) { name: "LIMIT exceeds max", sql: "SELECT * FROM users LIMIT 1000", wantNeedCheck: true, wantReject: false, }, { name: "LIMIT much larger than max", sql: "SELECT * FROM users LIMIT 10000", wantNeedCheck: true, wantReject: false, }, // Should execute probe check (needsCheck = true) { name: "No LIMIT no aggregate", sql: "SELECT * FROM users", wantNeedCheck: true, wantReject: false, }, { name: "SELECT with WHERE no LIMIT", sql: "SELECT * FROM users WHERE status = 'active'", wantNeedCheck: true, wantReject: false, }, { name: "SELECT with JOIN no LIMIT", sql: "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id", wantNeedCheck: true, wantReject: false, }, { name: "Aggregate in subquery only", sql: "SELECT * FROM (SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id) t", wantNeedCheck: true, wantReject: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { needsCheck, directReject, _ := NeedsRowCountCheck(tt.sql, maxRows) if needsCheck != tt.wantNeedCheck { t.Errorf("needsCheck = %v, want %v", needsCheck, tt.wantNeedCheck) } if directReject != tt.wantReject { t.Errorf("directReject = %v, want %v", directReject, tt.wantReject) } }) } } func TestNeedsRowCountCheck_DorisSpecificFunctions(t *testing.T) { maxRows := 500 tests := []struct { name string sql string wantNeedCheck bool }{ // Doris HLL functions { name: "HLL_UNION_AGG", sql: "SELECT HLL_UNION_AGG(hll_col) FROM user_stats", wantNeedCheck: false, }, { name: "HLL_CARDINALITY", sql: "SELECT HLL_CARDINALITY(hll_col) FROM user_stats", wantNeedCheck: false, }, // Doris Bitmap functions { name: "BITMAP_UNION_COUNT", sql: "SELECT BITMAP_UNION_COUNT(bitmap_col) FROM user_tags", wantNeedCheck: false, }, { name: "BITMAP_UNION", sql: "SELECT BITMAP_UNION(bitmap_col) FROM user_tags GROUP BY category", wantNeedCheck: false, }, // Other Doris aggregate functions { name: "APPROX_COUNT_DISTINCT", sql: "SELECT 
APPROX_COUNT_DISTINCT(user_id) FROM events", wantNeedCheck: false, }, { name: "GROUP_CONCAT", sql: "SELECT GROUP_CONCAT(name) FROM users GROUP BY department", wantNeedCheck: false, }, { name: "PERCENTILE_APPROX", sql: "SELECT PERCENTILE_APPROX(latency, 0.99) FROM requests", wantNeedCheck: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { needsCheck, _, _ := NeedsRowCountCheck(tt.sql, maxRows) if needsCheck != tt.wantNeedCheck { t.Errorf("needsCheck = %v, want %v (should skip check for Doris aggregate functions)", needsCheck, tt.wantNeedCheck) } }) } } func TestNeedsRowCountCheck_ComplexQueries(t *testing.T) { maxRows := 500 tests := []struct { name string sql string wantNeedCheck bool wantReject bool }{ { name: "CTE with aggregate", sql: "WITH user_counts AS (SELECT user_id, COUNT(*) as cnt FROM orders GROUP BY user_id) SELECT * FROM user_counts", wantNeedCheck: false, // CTE has aggregate, skip check wantReject: false, }, { name: "Complex JOIN with aggregate", sql: "SELECT u.department, COUNT(*) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.department", wantNeedCheck: false, // has aggregate wantReject: false, }, { name: "Nested subquery", sql: "SELECT * FROM users WHERE id IN (SELECT user_id FROM orders WHERE amount > 100)", wantNeedCheck: true, wantReject: false, }, { name: "DISTINCT query", sql: "SELECT DISTINCT category FROM products", wantNeedCheck: true, // DISTINCT is not aggregate wantReject: false, }, { name: "ORDER BY with LIMIT", sql: "SELECT * FROM users ORDER BY created_at DESC LIMIT 100", wantNeedCheck: false, // has valid LIMIT wantReject: false, }, { name: "Multiple aggregates in single query", sql: "SELECT COUNT(*), SUM(amount), AVG(amount), MIN(amount), MAX(amount) FROM orders", wantNeedCheck: false, wantReject: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { needsCheck, directReject, _ := NeedsRowCountCheck(tt.sql, maxRows) if needsCheck != tt.wantNeedCheck { t.Errorf("needsCheck = 
%v, want %v", needsCheck, tt.wantNeedCheck) } if directReject != tt.wantReject { t.Errorf("directReject = %v, want %v", directReject, tt.wantReject) } }) } } func TestNeedsRowCountCheck_EdgeCases(t *testing.T) { maxRows := 500 tests := []struct { name string sql string wantNeedCheck bool wantReject bool }{ { name: "Empty-ish LIMIT 0", sql: "SELECT * FROM users LIMIT 0", wantNeedCheck: false, wantReject: false, }, { name: "LIMIT at boundary", sql: "SELECT * FROM users LIMIT 501", wantNeedCheck: true, // 501 > 500, needs probe check wantReject: false, }, { name: "SELECT with trailing semicolon", sql: "SELECT * FROM users;", wantNeedCheck: true, wantReject: false, }, { name: "SELECT with extra whitespace", sql: " SELECT * FROM users ", wantNeedCheck: true, wantReject: false, }, { name: "Lowercase keywords", sql: "select count(*) from users", wantNeedCheck: false, wantReject: false, }, { name: "Mixed case keywords", sql: "Select Count(*) From users", wantNeedCheck: false, wantReject: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { needsCheck, directReject, _ := NeedsRowCountCheck(tt.sql, maxRows) if needsCheck != tt.wantNeedCheck { t.Errorf("needsCheck = %v, want %v", needsCheck, tt.wantNeedCheck) } if directReject != tt.wantReject { t.Errorf("directReject = %v, want %v", directReject, tt.wantReject) } }) } } func TestNeedsRowCountCheck_DifferentMaxRows(t *testing.T) { tests := []struct { name string sql string maxRows int wantNeedCheck bool wantReject bool }{ { name: "LIMIT 100 with maxRows 50", sql: "SELECT * FROM users LIMIT 100", maxRows: 50, wantNeedCheck: true, // LIMIT > maxRows, needs probe check wantReject: false, }, { name: "LIMIT 100 with maxRows 100", sql: "SELECT * FROM users LIMIT 100", maxRows: 100, wantNeedCheck: false, wantReject: false, }, { name: "LIMIT 100 with maxRows 200", sql: "SELECT * FROM users LIMIT 100", maxRows: 200, wantNeedCheck: false, wantReject: false, }, { name: "No LIMIT with maxRows 1000", sql: "SELECT * 
FROM users", maxRows: 1000, wantNeedCheck: true, wantReject: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { needsCheck, directReject, _ := NeedsRowCountCheck(tt.sql, tt.maxRows) if needsCheck != tt.wantNeedCheck { t.Errorf("needsCheck = %v, want %v", needsCheck, tt.wantNeedCheck) } if directReject != tt.wantReject { t.Errorf("directReject = %v, want %v", directReject, tt.wantReject) } }) } } // TestSummary_SkipProbeCheck prints a summary of which SQL patterns skip the probe check func TestSummary_SkipProbeCheck(t *testing.T) { maxRows := 500 skipCheckCases := []struct { category string sql string }{ // Aggregate functions {"Aggregate - COUNT(*)", "SELECT COUNT(*) FROM users"}, {"Aggregate - COUNT(col)", "SELECT COUNT(id) FROM users"}, {"Aggregate - SUM", "SELECT SUM(amount) FROM orders"}, {"Aggregate - AVG", "SELECT AVG(price) FROM products"}, {"Aggregate - MIN", "SELECT MIN(created_at) FROM logs"}, {"Aggregate - MAX", "SELECT MAX(score) FROM results"}, {"Aggregate - GROUP BY", "SELECT user_id, COUNT(*) FROM orders GROUP BY user_id"}, {"Aggregate - HAVING", "SELECT user_id, SUM(amount) FROM orders GROUP BY user_id HAVING SUM(amount) > 1000"}, // Doris specific aggregates {"Doris - HLL_UNION_AGG", "SELECT HLL_UNION_AGG(hll_col) FROM stats"}, {"Doris - BITMAP_UNION_COUNT", "SELECT BITMAP_UNION_COUNT(bitmap_col) FROM tags"}, {"Doris - APPROX_COUNT_DISTINCT", "SELECT APPROX_COUNT_DISTINCT(user_id) FROM events"}, {"Doris - GROUP_CONCAT", "SELECT GROUP_CONCAT(name) FROM users GROUP BY dept"}, // LIMIT <= maxRows {"LIMIT - Equal to max", "SELECT * FROM users LIMIT 500"}, {"LIMIT - Less than max", "SELECT * FROM users LIMIT 100"}, {"LIMIT - With OFFSET", "SELECT * FROM users LIMIT 100 OFFSET 50"}, {"LIMIT - Value 1", "SELECT * FROM users LIMIT 1"}, {"LIMIT - Value 0", "SELECT * FROM users LIMIT 0"}, } t.Log("=== SQL patterns that SKIP probe check (no extra query needed) ===") for _, tc := range skipCheckCases { needsCheck, _, _ := 
NeedsRowCountCheck(tc.sql, maxRows) status := "✓ SKIP" if needsCheck { status = "✗ NEEDS CHECK (unexpected)" } t.Logf(" %s: %s\n SQL: %s", status, tc.category, tc.sql) } needsCheckCases := []struct { category string sql string }{ {"No LIMIT - Simple SELECT", "SELECT * FROM users"}, {"No LIMIT - With WHERE", "SELECT * FROM users WHERE status = 'active'"}, {"No LIMIT - With JOIN", "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id"}, {"No LIMIT - Subquery with agg", "SELECT * FROM (SELECT user_id, COUNT(*) FROM orders GROUP BY user_id) t"}, {"No LIMIT - DISTINCT", "SELECT DISTINCT category FROM products"}, {"LIMIT > max (actual may be smaller)", "SELECT * FROM users LIMIT 1000"}, {"LIMIT >> max", "SELECT * FROM users LIMIT 10000"}, } t.Log("\n=== SQL patterns that NEED probe check ===") for _, tc := range needsCheckCases { needsCheck, _, _ := NeedsRowCountCheck(tc.sql, maxRows) status := "✓ NEEDS CHECK" if !needsCheck { status = "✗ SKIP (unexpected)" } t.Logf(" %s: %s\n SQL: %s", status, tc.category, tc.sql) } } // ptr is a helper function to create a pointer to int64 func ptr(v int64) *int64 { return &v } ================================================ FILE: dskit/doris/template.md ================================================ ## SQL变量 | 字段名 | 含义 | 使用场景 | | ---- | ---- | ---- | |database|数据库|无| |table|表名|| |time_field|时间戳的字段|| |query|查询条件|日志原文| |from|开始时间|| |to|结束时间|| |aggregation|聚合算法|时序图| |field|聚合的字段|时序图| |limit|分页参数|日志原文| |offset|分页参数|日志原文| |interval|直方图的时间粒度|直方图| ## 日志原文 ### 直方图 ``` # 如何计算interval的值 max := 60 // 最多60个柱子 interval := ($to-$from) / max interval = interval - interval%10 if interval <= 0 { interval = 60 } ``` ``` SELECT count() as cnt, FLOOR(UNIX_TIMESTAMP($time_field) / $interval) * $interval AS __ts__ FROM $table WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to) GROUP BY __ts__; ``` ``` { "database":"$database", "sql":"$sql", "keys:": { "valueKey":"cnt", "timeKey":"__ts__" } } ``` ### 日志原文 ``` SELECT * from 
$table WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to) ORDER by $time_filed LIMIT $limit OFFSET $offset; ``` ``` { "database":"$database", "sql":"$sql" } ``` ## 时序图 ### 日志行数 ``` SELECT COUNT() AS cnt, DATE_FORMAT(date, '%Y-%m-%d %H:%i:00') AS __ts__ FROM nginx_access_log WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to) GROUP BY __ts__ ``` ``` { "database":"$database", "sql":"$sql", "keys:": { "valueKey":"cnt", "timeKey":"__ts__" } } ``` ### max/min/avg/sum ``` SELECT $aggregation($field) AS series, DATE_FORMAT(date, '%Y-%m-%d %H:%i:00') AS __ts__ FROM nginx_access_log WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to) GROUP BY __ts__ ``` ``` { "database":"$database", "sql":"$sql", "keys:": { "valueKey":"series", "timeKey":"__ts__" } } ``` ### 分位值 ``` SELECT percentile($field, 0.95) AS series, DATE_FORMAT(date, '%Y-%m-%d %H:%i:00') AS __ts__ FROM nginx_access_log WHERE $time_field BETWEEN FROM_UNIXTIME($from) AND FROM_UNIXTIME($to) GROUP BY __ts__ ``` ``` { "database":"$database", "sql":"$sql", "keys:": { "valueKey":"series", "timeKey":"__ts__" } } ``` ================================================ FILE: dskit/doris/timeseries.go ================================================ package doris import ( "context" "fmt" "strings" "github.com/ccfos/nightingale/v6/dskit/sqlbase" "github.com/ccfos/nightingale/v6/dskit/types" ) const ( TimeFieldFormatEpochMilli = "epoch_millis" TimeFieldFormatEpochSecond = "epoch_second" TimeFieldFormatDateTime = "datetime" ) // 不再拼接SQL, 完全信赖用户的输入 type QueryParam struct { Database string `json:"database"` Sql string `json:"sql"` Keys types.Keys `json:"keys" mapstructure:"keys"` } var ( DorisBannedOp = map[string]struct{}{ "CREATE": {}, "INSERT": {}, "ALTER": {}, "REVOKE": {}, "DROP": {}, "RENAME": {}, "ATTACH": {}, "DETACH": {}, "OPTIMIZE": {}, "TRUNCATE": {}, "SET": {}, } ) // Query executes a given SQL query in Doris and returns the results with MaxQueryRows check func (d 
*Doris) Query(ctx context.Context, query *QueryParam) ([]map[string]interface{}, error) { // 校验SQL的合法性, 过滤掉 write请求 sqlItem := strings.Split(strings.ToUpper(query.Sql), " ") for _, item := range sqlItem { if _, ok := DorisBannedOp[item]; ok { return nil, fmt.Errorf("operation %s is forbid, only read db, please check your sql", item) } } // 检查查询结果行数 err := d.CheckMaxQueryRows(ctx, query.Database, query.Sql) if err != nil { return nil, err } rows, err := d.ExecQuery(ctx, query.Database, query.Sql) if err != nil { return nil, err } return rows, nil } // QueryTimeseries executes a time series data query using the given parameters with MaxQueryRows check func (d *Doris) QueryTimeseries(ctx context.Context, query *QueryParam) ([]types.MetricValues, error) { // 使用 Query 方法执行查询,Query方法内部已包含MaxQueryRows检查 rows, err := d.Query(ctx, query) if err != nil { return nil, err } return sqlbase.FormatMetricValues(query.Keys, rows), nil } // CheckMaxQueryRows checks if the query result exceeds the maximum allowed rows // It uses SQL analysis to skip unnecessary checks for aggregate queries or queries with LIMIT <= maxRows // For queries that need checking, it uses probe approach (LIMIT maxRows+1) instead of COUNT(*) for better performance func (d *Doris) CheckMaxQueryRows(ctx context.Context, database, sql string) error { maxQueryRows := d.MaxQueryRows if maxQueryRows == 0 { maxQueryRows = 500 } cleanedSQL := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(sql), ";")) // Step 1: Analyze SQL to determine if check is needed needsCheck, _, _ := NeedsRowCountCheck(cleanedSQL, maxQueryRows) if !needsCheck { return nil } // Step 2: Execute probe query (more efficient than COUNT(*)) return d.probeRowCount(ctx, database, cleanedSQL, maxQueryRows) } // probeRowCount uses threshold probing to check row count // It reads at most maxRows+1 rows, which is O(maxRows) instead of O(totalRows) for COUNT(*) // Doris optimizes LIMIT queries by stopping scan early once limit is reached func (d 
// Shard holds the connection settings of one MySQL instance. The same dotted
// key names are used for JSON and for mapstructure decoding.
type Shard struct {
	// Addr is the server address placed inside tcp(...) in the DSN; NewConn
	// rejects an empty value.
	Addr string `json:"mysql.addr" mapstructure:"mysql.addr"`
	// DB is the configured database name. NewConn builds the DSN from its
	// database argument, not from this field.
	DB string `json:"mysql.db" mapstructure:"mysql.db"`
	// User and Password are the credentials written into the DSN. They are
	// also part of the key under which NewConn caches the connection.
	User     string `json:"mysql.user" mapstructure:"mysql.user"`
	Password string `json:"mysql.password" mapstructure:"mysql.password"`
	// Timeout is decoded from the settings but is not passed to the driver by
	// NewConn. Presumably seconds — TODO confirm.
	Timeout int `json:"mysql.timeout" mapstructure:"mysql.timeout"`
	// MaxIdleConns is handed to sqlbase.NewDB; NewConn uses 10 when it is zero.
	MaxIdleConns int `json:"mysql.max_idle_conns" mapstructure:"mysql.max_idle_conns"`
	// MaxOpenConns is handed to sqlbase.NewDB; NewConn uses 100 when it is zero.
	MaxOpenConns int `json:"mysql.max_open_conns" mapstructure:"mysql.max_open_conns"`
	// ConnMaxLifetime is in seconds (NewConn multiplies it by time.Second);
	// NewConn uses 300 when it is zero.
	ConnMaxLifetime int `json:"mysql.conn_max_lifetime" mapstructure:"mysql.conn_max_lifetime"`
	// MaxQueryRows is decoded from the settings but is not used by NewConn
	// when opening the connection.
	MaxQueryRows int `json:"mysql.max_query_rows" mapstructure:"mysql.max_query_rows"`
}
settings.(type) { case string: if err := json.Unmarshal([]byte(s), &settingsMap); err != nil { return nil, err } case map[string]interface{}: settingsMap = s default: return nil, errors.New("unsupported settings type") } if err := mapstructure.Decode(settingsMap, newest); err != nil { return nil, err } return newest, nil } // NewConn establishes a new connection to MySQL func (m *MySQL) NewConn(ctx context.Context, database string) (*gorm.DB, error) { if len(m.Shards) == 0 { return nil, errors.New("empty pgsql shards") } shard := m.Shards[0] if shard.Timeout == 0 { shard.Timeout = 300 } if shard.MaxIdleConns == 0 { shard.MaxIdleConns = 10 } if shard.MaxOpenConns == 0 { shard.MaxOpenConns = 100 } if shard.ConnMaxLifetime == 0 { shard.ConnMaxLifetime = 300 } if shard.MaxQueryRows == 0 { shard.MaxQueryRows = 100 } if len(shard.Addr) == 0 { return nil, errors.New("empty addr") } if len(shard.Addr) == 0 { return nil, errors.New("empty addr") } var keys []string var err error keys = append(keys, shard.Addr) keys = append(keys, shard.Password, shard.User) if len(database) > 0 { keys = append(keys, database) } cachedKey := strings.Join(keys, ":") // cache conn with database conn, ok := pool.PoolClient.Load(cachedKey) if ok { return conn.(*gorm.DB), nil } var db *gorm.DB defer func() { if db != nil && err == nil { pool.PoolClient.Store(cachedKey, db) } }() dsn := fmt.Sprintf("%s:%s@tcp(%s)/%s?charset=utf8&parseTime=True", shard.User, shard.Password, shard.Addr, database) db, err = sqlbase.NewDB( ctx, mysql.Open(dsn), shard.MaxIdleConns, shard.MaxOpenConns, time.Duration(shard.ConnMaxLifetime)*time.Second, ) return db, err } func (m *MySQL) ShowDatabases(ctx context.Context) ([]string, error) { db, err := m.NewConn(ctx, "") if err != nil { return nil, err } return sqlbase.ShowDatabases(ctx, db, "SHOW DATABASES") } func (m *MySQL) ShowTables(ctx context.Context, database string) ([]string, error) { db, err := m.NewConn(ctx, database) if err != nil { return nil, err } return 
sqlbase.ShowTables(ctx, db, "SHOW TABLES") } func (m *MySQL) DescTable(ctx context.Context, database, table string) ([]*types.ColumnProperty, error) { db, err := m.NewConn(ctx, database) if err != nil { return nil, err } query := fmt.Sprintf("DESCRIBE %s", table) return sqlbase.DescTable(ctx, db, query) } func (m *MySQL) SelectRows(ctx context.Context, database, table, query string) ([]map[string]interface{}, error) { db, err := m.NewConn(ctx, database) if err != nil { return nil, err } return sqlbase.SelectRows(ctx, db, table, query) } func (m *MySQL) ExecQuery(ctx context.Context, database string, sql string) ([]map[string]interface{}, error) { db, err := m.NewConn(ctx, database) if err != nil { return nil, err } return sqlbase.ExecQuery(ctx, db, sql) } ================================================ FILE: dskit/mysql/mysql_test.go ================================================ // @Author: Ciusyan 5/11/24 package mysql import ( "context" "testing" "github.com/stretchr/testify/require" ) func TestNewMySQLWithSettings(t *testing.T) { tests := []struct { name string settings interface{} wantErr bool }{ { name: "valid string settings", settings: `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}`, wantErr: false, }, { name: "invalid settings type", settings: 12345, wantErr: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got, err := NewMySQLWithSettings(context.Background(), tt.settings) if (err != nil) != tt.wantErr { t.Errorf("NewMySQLWithSettings() error = %v, wantErr %v", err, tt.wantErr) } t.Log(got) }) } } func TestNewConn(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) tests := []struct { name string 
database string wantErr bool }{ { name: "valid connection", database: "db1", wantErr: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { _, err := mysql.NewConn(ctx, tt.database) if (err != nil) != tt.wantErr { t.Errorf("NewConn() error = %v, wantErr %v", err, tt.wantErr) return } }) } } func TestShowDatabases(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) databases, err := mysql.ShowDatabases(ctx) require.NoError(t, err) t.Log(databases) } func TestShowTables(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) tables, err := mysql.ShowTables(ctx, "db1") require.NoError(t, err) t.Log(tables) } func TestDescTable(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) descTable, err := mysql.DescTable(ctx, "db1", "students") require.NoError(t, err) for _, desc := range descTable { t.Logf("%+v", *desc) } } func TestExecQuery(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) rows, err := mysql.ExecQuery(ctx, "db1", "SELECT * FROM students WHERE id = 10008") require.NoError(t, err) for _, row := range rows { t.Log(row) } } func 
TestSelectRows(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) rows, err := mysql.SelectRows(ctx, "db1", "students", "id > 10008") require.NoError(t, err) for _, row := range rows { t.Log(row) } } ================================================ FILE: dskit/mysql/timeseries.go ================================================ package mysql import ( "context" "fmt" "strings" "github.com/ccfos/nightingale/v6/dskit/sqlbase" "github.com/ccfos/nightingale/v6/dskit/types" "gorm.io/gorm" ) // Query executes a given SQL query in MySQL and returns the results func (m *MySQL) Query(ctx context.Context, query *sqlbase.QueryParam) ([]map[string]interface{}, error) { db, err := m.NewConn(ctx, "") if err != nil { return nil, err } err = m.CheckMaxQueryRows(db, ctx, query) if err != nil { return nil, err } return sqlbase.Query(ctx, db, query) } // QueryTimeseries executes a time series data query using the given parameters func (m *MySQL) QueryTimeseries(ctx context.Context, query *sqlbase.QueryParam) ([]types.MetricValues, error) { db, err := m.NewConn(ctx, "") if err != nil { return nil, err } err = m.CheckMaxQueryRows(db, ctx, query) if err != nil { return nil, err } return sqlbase.QueryTimeseries(ctx, db, query) } func (m *MySQL) CheckMaxQueryRows(db *gorm.DB, ctx context.Context, query *sqlbase.QueryParam) error { sql := strings.ReplaceAll(query.Sql, ";", "") checkQuery := &sqlbase.QueryParam{ Sql: fmt.Sprintf("SELECT COUNT(*) as count FROM (%s) AS subquery;", sql), } res, err := sqlbase.Query(ctx, db, checkQuery) if err != nil { return err } if len(res) > 0 { if count, exists := res[0]["count"]; exists { v, err := sqlbase.ParseFloat64Value(count) if err != nil { return err } maxQueryRows := m.Shards[0].MaxQueryRows if maxQueryRows == 0 { 
maxQueryRows = 500 } if v > float64(maxQueryRows) { return fmt.Errorf("query result rows count %d exceeds the maximum limit %d", int(v), maxQueryRows) } } } return nil } ================================================ FILE: dskit/mysql/timeseries_test.go ================================================ // @Author: Ciusyan 5/11/24 package mysql import ( "context" "testing" "github.com/ccfos/nightingale/v6/dskit/sqlbase" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/stretchr/testify/require" ) func TestQuery(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) param := &sqlbase.QueryParam{ Sql: "SELECT * FROM students WHERE id > 10900", Keys: types.Keys{ ValueKey: "", LabelKey: "", TimeKey: "", TimeFormat: "", }, } rows, err := mysql.Query(ctx, param) require.NoError(t, err) for _, row := range rows { t.Log(row) } } func TestQueryTimeseries(t *testing.T) { ctx := context.Background() settings := `{"mysql.addr":"localhost:3306","mysql.user":"root","mysql.password":"root","mysql.maxIdleConns":5,"mysql.maxOpenConns":10,"mysql.connMaxLifetime":30}` mysql, err := NewMySQLWithSettings(ctx, settings) require.NoError(t, err) // Prepare a test query parameter param := &sqlbase.QueryParam{ Sql: "SELECT id, grade, student_name, a_grade, update_time FROM students WHERE grade > 20000", // Modify SQL query to select specific columns Keys: types.Keys{ ValueKey: "grade a_grade", // Set the value key to the column name containing the metric value LabelKey: "id student_name", // Set the label key to the column name containing the metric label TimeKey: "update_time", // Set the time key to the column name containing the timestamp TimeFormat: "2006-01-02 15:04:05 +0000 UTC", // Provide the time format according to the timestamp column's format }, } 
// maxPooledBufferCap is the largest buffer capacity that is kept for reuse.
// A buffer that grew beyond it is dropped on Put so that one unusually large
// payload does not pin that much memory in the pool.
const maxPooledBufferCap = 64 << 10

// bytesPool recycles *bytes.Buffer values between callers.
var bytesPool = sync.Pool{
	New: func() interface{} {
		return new(bytes.Buffer)
	},
}

// PoolGetBytesBuffer returns an empty buffer, reusing a pooled one when
// available. Hand it back with PoolPutBytesBuffer once done.
func PoolGetBytesBuffer() *bytes.Buffer {
	buf := bytesPool.Get().(*bytes.Buffer)
	buf.Reset()
	return buf
}

// PoolPutBytesBuffer returns buf to the pool. A nil buffer is ignored, and a
// buffer whose capacity exceeds maxPooledBufferCap is left to the garbage
// collector instead of being retained.
func PoolPutBytesBuffer(buf *bytes.Buffer) {
	if buf == nil || buf.Cap() > maxPooledBufferCap {
		return
	}
	bytesPool.Put(buf)
}
mapstructure:"pgsql.conn_max_lifetime"` MaxQueryRows int `json:"pgsql.max_query_rows" mapstructure:"pgsql.max_query_rows"` } // NewPostgreSQLWithSettings initializes a new PostgreSQL instance with the given settings func NewPostgreSQLWithSettings(ctx context.Context, settings interface{}) (*PostgreSQL, error) { newest := new(PostgreSQL) settingsMap := map[string]interface{}{} switch s := settings.(type) { case string: if err := json.Unmarshal([]byte(s), &settingsMap); err != nil { return nil, err } case map[string]interface{}: settingsMap = s case *PostgreSQL: return s, nil case PostgreSQL: return &s, nil case Shard: newest.Shard = s return newest, nil case *Shard: newest.Shard = *s return newest, nil default: return nil, errors.New("unsupported settings type") } if err := mapstructure.Decode(settingsMap, newest); err != nil { return nil, err } return newest, nil } // NewConn establishes a new connection to PostgreSQL func (p *PostgreSQL) NewConn(ctx context.Context, database string) (*gorm.DB, error) { if len(p.DB) == 0 && len(database) == 0 { return nil, errors.New("empty pgsql database") // 兼容阿里实时数仓Holgres, 连接时必须指定db名字 } if p.Shard.Timeout == 0 { p.Shard.Timeout = 60 } if p.Shard.MaxIdleConns == 0 { p.Shard.MaxIdleConns = 10 } if p.Shard.MaxOpenConns == 0 { p.Shard.MaxOpenConns = 100 } if p.Shard.ConnMaxLifetime == 0 { p.Shard.ConnMaxLifetime = 14400 } if len(p.Shard.Addr) == 0 { return nil, errors.New("empty fe-node addr") } var keys []string var err error keys = append(keys, p.Shard.Addr) keys = append(keys, p.Shard.Password, p.Shard.User) if len(database) > 0 { keys = append(keys, database) } cachedKey := strings.Join(keys, ":") // cache conn with database conn, ok := pool.PoolClient.Load(cachedKey) if ok { return conn.(*gorm.DB), nil } var db *gorm.DB defer func() { if db != nil && err == nil { pool.PoolClient.Store(cachedKey, db) } }() // Simplified connection logic for PostgreSQL dsn := 
fmt.Sprintf("postgres://%s:%s@%s/%s?sslmode=disable&TimeZone=Asia/Shanghai", url.QueryEscape(p.Shard.User), url.QueryEscape(p.Shard.Password), p.Shard.Addr, database) db, err = sqlbase.NewDB( ctx, postgres.Open(dsn), p.Shard.MaxIdleConns, p.Shard.MaxOpenConns, time.Duration(p.Shard.ConnMaxLifetime)*time.Second, ) if err != nil { if db != nil { sqlDB, _ := db.DB() if sqlDB != nil { sqlDB.Close() } } return nil, err } return db, nil } // ShowDatabases lists all databases in PostgreSQL func (p *PostgreSQL) ShowDatabases(ctx context.Context, searchKeyword string) ([]string, error) { db, err := p.NewConn(ctx, "postgres") if err != nil { return nil, err } sql := fmt.Sprintf("SELECT datname FROM pg_database WHERE datistemplate = false AND datname LIKE %s", "'%"+searchKeyword+"%'") return sqlbase.ShowDatabases(ctx, db, sql) } // ShowTables lists all tables in a given database func (p *PostgreSQL) ShowTables(ctx context.Context, searchKeyword string) (map[string][]string, error) { db, err := p.NewConn(ctx, p.DB) if err != nil { return nil, err } sql := fmt.Sprintf("SELECT schemaname, tablename FROM pg_tables WHERE schemaname !='information_schema' and schemaname !='pg_catalog' and tablename LIKE %s", "'%"+searchKeyword+"%'") rets, err := sqlbase.ExecQuery(ctx, db, sql) if err != nil { return nil, err } tabs := make(map[string][]string, 3) for _, row := range rets { if val, ok := row["schemaname"].(string); ok { tabs[val] = append(tabs[val], row["tablename"].(string)) } } return tabs, nil } // DescTable describes the schema of a specified table in PostgreSQL // scheme default: public if not specified func (p *PostgreSQL) DescTable(ctx context.Context, scheme, table string) ([]*types.ColumnProperty, error) { db, err := p.NewConn(ctx, p.DB) if err != nil { return nil, err } if scheme == "" { scheme = "public" } query := fmt.Sprintf("SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = '%s' AND table_schema = '%s'", 
table, scheme) return sqlbase.DescTable(ctx, db, query) } // SelectRows selects rows from a specified table in PostgreSQL based on a given query func (p *PostgreSQL) SelectRows(ctx context.Context, table, where string) ([]map[string]interface{}, error) { db, err := p.NewConn(ctx, p.DB) if err != nil { return nil, err } return sqlbase.SelectRows(ctx, db, table, where) } // ExecQuery executes a SQL query in PostgreSQL func (p *PostgreSQL) ExecQuery(ctx context.Context, sql string) ([]map[string]interface{}, error) { db, err := p.NewConn(ctx, p.DB) if err != nil { return nil, err } return sqlbase.ExecQuery(ctx, db, sql) } ================================================ FILE: dskit/postgres/timeseries.go ================================================ package postgres import ( "context" "fmt" "strings" "github.com/ccfos/nightingale/v6/dskit/sqlbase" "github.com/ccfos/nightingale/v6/dskit/types" "gorm.io/gorm" ) // Query executes a given SQL query in PostgreSQL and returns the results func (p *PostgreSQL) Query(ctx context.Context, query *sqlbase.QueryParam) ([]map[string]interface{}, error) { db, err := p.NewConn(ctx, p.Shard.DB) if err != nil { return nil, err } err = p.CheckMaxQueryRows(db, ctx, query) if err != nil { return nil, err } return sqlbase.Query(ctx, db, query) } // QueryTimeseries executes a time series data query using the given parameters func (p *PostgreSQL) QueryTimeseries(ctx context.Context, query *sqlbase.QueryParam) ([]types.MetricValues, error) { db, err := p.NewConn(ctx, p.Shard.DB) if err != nil { return nil, err } err = p.CheckMaxQueryRows(db, ctx, query) if err != nil { return nil, err } return sqlbase.QueryTimeseries(ctx, db, query, true) } func (p *PostgreSQL) CheckMaxQueryRows(db *gorm.DB, ctx context.Context, query *sqlbase.QueryParam) error { sql := strings.ReplaceAll(query.Sql, ";", "") checkQuery := &sqlbase.QueryParam{ Sql: fmt.Sprintf("SELECT COUNT(*) as count FROM (%s) AS subquery;", sql), } res, err := sqlbase.Query(ctx, db, 
checkQuery) if err != nil { return err } if len(res) > 0 { if count, exists := res[0]["count"]; exists { v, err := sqlbase.ParseFloat64Value(count) if err != nil { return err } maxQueryRows := p.Shard.MaxQueryRows if maxQueryRows == 0 { maxQueryRows = 500 } if v > float64(maxQueryRows) { return fmt.Errorf("query result rows count %d exceeds the maximum limit %d", int(v), maxQueryRows) } } } return nil } ================================================ FILE: dskit/sqlbase/base.go ================================================ // @Author: Ciusyan 5/19/24 package sqlbase import ( "context" "database/sql" "fmt" "strings" "time" "gorm.io/gorm" "github.com/ccfos/nightingale/v6/dskit/types" ) // NewDB creates a new Gorm DB instance based on the provided gorm.Dialector and configures the connection pool func NewDB(ctx context.Context, dialector gorm.Dialector, maxIdleConns, maxOpenConns int, connMaxLifetime time.Duration) (*gorm.DB, error) { // Create a new Gorm DB instance db, err := gorm.Open(dialector, &gorm.Config{}) if err != nil { return db, err } // Configure the connection pool sqlDB, err := db.DB() if err != nil { return nil, err } sqlDB.SetMaxIdleConns(maxIdleConns) sqlDB.SetMaxOpenConns(maxOpenConns) sqlDB.SetConnMaxLifetime(connMaxLifetime) return db.WithContext(ctx), sqlDB.Ping() } func CloseDB(db *gorm.DB) error { if db != nil { sqlDb, err := db.DB() if err != nil { return err } return sqlDb.Close() } return nil } // ShowTables retrieves a list of all tables in the specified database func ShowTables(ctx context.Context, db *gorm.DB, query string) ([]string, error) { tables := make([]string, 0) rows, err := db.WithContext(ctx).Raw(query).Rows() if err != nil { return nil, err } defer rows.Close() for rows.Next() { var table string if err := rows.Scan(&table); err != nil { return nil, err } tables = append(tables, table) } return tables, nil } // ShowDatabases retrieves a list of all databases in the connected database server func ShowDatabases(ctx 
context.Context, db *gorm.DB, query string) ([]string, error) { var databases []string rows, err := db.WithContext(ctx).Raw(query).Rows() if err != nil { return nil, err } defer rows.Close() for rows.Next() { var database string if err := rows.Scan(&database); err != nil { return nil, err } databases = append(databases, database) } return databases, nil } // DescTable describes the schema of a specified table in MySQL or PostgreSQL func DescTable(ctx context.Context, db *gorm.DB, query string) ([]*types.ColumnProperty, error) { rows, err := db.WithContext(ctx).Raw(query).Rows() if err != nil { return nil, err } defer rows.Close() var columns []*types.ColumnProperty for rows.Next() { var ( field string typ string null string key sql.NullString defaultValue sql.NullString extra sql.NullString ) switch db.Dialector.Name() { case "mysql": if err := rows.Scan(&field, &typ, &null, &key, &defaultValue, &extra); err != nil { continue } case "postgres", "sqlserver": if err := rows.Scan(&field, &typ, &null, &defaultValue); err != nil { continue } case "oracle": if err := rows.Scan(&field, &typ, &null); err != nil { continue } } // Convert the database-specific type to internal type type2, indexable := ConvertDBType(db.Dialector.Name(), typ) columns = append(columns, &types.ColumnProperty{ Field: field, Type: typ, Type2: type2, Indexable: indexable, }) } return columns, nil } // ExecQuery executes the specified query and returns the result rows func ExecQuery(ctx context.Context, db *gorm.DB, sql string) ([]map[string]interface{}, error) { rows, err := db.WithContext(ctx).Raw(sql).Rows() if err != nil { return nil, err } defer rows.Close() columns, err := rows.Columns() if err != nil { return nil, err } var results []map[string]interface{} for rows.Next() { columnValues := make([]interface{}, len(columns)) columnPointers := make([]interface{}, len(columns)) for i := range columnValues { columnPointers[i] = &columnValues[i] } if err := rows.Scan(columnPointers...); err != nil 
{ continue } rowMap := make(map[string]interface{}) for i, colName := range columns { val := columnValues[i] bytes, ok := val.([]byte) if ok { rowMap[colName] = string(bytes) } else { rowMap[colName] = val } } results = append(results, rowMap) } return results, nil } // SelectRows selects rows from a specified table based on a given query func SelectRows(ctx context.Context, db *gorm.DB, table, query string) ([]map[string]interface{}, error) { sql := fmt.Sprintf("SELECT * FROM %s", table) if query != "" { sql += " WHERE " + query } return ExecQuery(ctx, db, sql) } // convertDBType converts MySQL or PostgreSQL data types to custom internal types and determines if they are indexable func ConvertDBType(dialect, dbType string) (string, bool) { typ := strings.ToLower(dbType) // Common type conversions switch { case strings.HasPrefix(typ, "int"), strings.HasPrefix(typ, "tinyint"), strings.HasPrefix(typ, "smallint"), strings.HasPrefix(typ, "mediumint"), strings.HasPrefix(typ, "bigint"), strings.HasPrefix(typ, "serial"), strings.HasPrefix(typ, "bigserial"): return types.LogExtractValueTypeLong, true case strings.HasPrefix(typ, "varchar"), strings.HasPrefix(typ, "text"), strings.HasPrefix(typ, "char"), strings.HasPrefix(typ, "tinytext"), strings.HasPrefix(typ, "mediumtext"), strings.HasPrefix(typ, "longtext"), strings.HasPrefix(typ, "character varying"), strings.HasPrefix(typ, "nvarchar"), strings.HasPrefix(typ, "nchar"), strings.HasPrefix(typ, "bpchar"): return types.LogExtractValueTypeText, true case strings.HasPrefix(typ, "float"), strings.HasPrefix(typ, "double"), strings.HasPrefix(typ, "decimal"), strings.HasPrefix(typ, "numeric"), strings.HasPrefix(typ, "real"), strings.HasPrefix(typ, "double precision"): return types.LogExtractValueTypeFloat, true case strings.HasPrefix(typ, "date"), strings.HasPrefix(typ, "datetime"), strings.HasPrefix(typ, "timestamp"), strings.HasPrefix(typ, "timestamptz"), strings.HasPrefix(typ, "time"), strings.HasPrefix(typ, "smalldatetime"): 
return types.LogExtractValueTypeDate, false case strings.HasPrefix(typ, "boolean"), strings.HasPrefix(typ, "bit"), strings.HasPrefix(typ, "bool"): return types.LogExtractValueTypeBool, false } // Specific type conversions for MySQL if dialect == "mysql" { switch { default: return typ, false } } // Specific type conversions for PostgreSQL if dialect == "postgres" { switch { default: return typ, false } } if dialect == "oracle" { switch { default: return typ, false } } // Can continue to add specific 'dialect' type ... return typ, false } ================================================ FILE: dskit/sqlbase/timeseries.go ================================================ // @Author: Ciusyan 5/20/24 package sqlbase import ( "context" "crypto/md5" "encoding/json" "fmt" "math" "reflect" "sort" "strconv" "strings" "time" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/prometheus/common/model" "gorm.io/gorm" ) type QueryParam struct { Sql string `json:"sql"` Keys types.Keys `json:"keys" mapstructure:"keys"` } var ( BannedOp = map[string]struct{}{ "CREATE": {}, "INSERT": {}, "UPDATE": {}, "DELETE": {}, "ALTER": {}, "REVOKE": {}, "DROP": {}, "RENAME": {}, "TRUNCATE": {}, "SET": {}, } ) // Query executes a given SQL query and returns the results func Query(ctx context.Context, db *gorm.DB, query *QueryParam) ([]map[string]interface{}, error) { // Validate SQL to prevent write operations if needed sqlItem := strings.Split(strings.ToUpper(query.Sql), " ") for _, item := range sqlItem { if _, ok := BannedOp[item]; ok { return nil, fmt.Errorf("operation %s is forbidden, only read operations are allowed, please check your SQL", item) } } return ExecQuery(ctx, db, query.Sql) } // QueryTimeseries executes a time series data query using the given parameters func QueryTimeseries(ctx context.Context, db *gorm.DB, query *QueryParam, ignoreDefault ...bool) ([]types.MetricValues, error) { rows, err := Query(ctx, db, query) if err != nil { return nil, err } return 
FormatMetricValues(query.Keys, rows, ignoreDefault...), nil } func FormatMetricValues(keys types.Keys, rows []map[string]interface{}, ignoreDefault ...bool) []types.MetricValues { ignore := false if len(ignoreDefault) > 0 { ignore = ignoreDefault[0] } keyMap := make(map[string]string) for _, valueMetric := range strings.Split(keys.ValueKey, " ") { keyMap[valueMetric] = "value" } for _, labelMetric := range strings.Split(keys.LabelKey, " ") { keyMap[labelMetric] = "label" } if keys.TimeKey == "" { // 默认支持 __time__ 和 time 作为时间字段 // 用户可以使用 as __time__ 来避免与表中已有的 time 字段冲突 keyMap["__time__"] = "time" keyMap["time"] = "time" } else { keyMap[keys.TimeKey] = "time" } var dataResps []types.MetricValues dataMap := make(map[string]*types.MetricValues) for _, row := range rows { labels := make(map[string]string) metricValue := make(map[string]float64) metricTs := make(map[string]float64) // Process each column based on its designated role (value, label, time) for k, v := range row { switch keyMap[k] { case "value": val, err := ParseFloat64Value(v) if err != nil { continue } metricValue[k] = val case "label": labels[k] = fmt.Sprintf("%v", v) case "time": ts, err := ParseTime(v, keys.TimeFormat) if err != nil { continue } metricTs[k] = float64(ts.Unix()) default: // Default to labels for any unrecognized columns if !ignore && keys.LabelKey == "" { // 只有当 labelKey 为空时,才将剩余的列作为 label labels[k] = fmt.Sprintf("%v", v) } } } // Compile and store the metric values for metricName, value := range metricValue { // NaN 无法执行json.Marshal(), 接口会报错 if math.IsNaN(value) { continue } metrics := make(model.Metric) var labelsStr []string for k1, v1 := range labels { metrics[model.LabelName(k1)] = model.LabelValue(v1) labelsStr = append(labelsStr, fmt.Sprintf("%s=%s", k1, v1)) } metrics["__name__"] = model.LabelValue(metricName) labelsStr = append(labelsStr, fmt.Sprintf("__name__=%s", metricName)) // Hash the labels to use as a key sort.Strings(labelsStr) labelsStrHash := fmt.Sprintf("%x", 
// ParseFloat64Value converts val to float64.
//
// Supported inputs are json.Number, every float, signed and unsigned integer
// kind, strings and []byte holding a decimal number, and non-nil pointers to
// any of these. Anything else, including nil and nil pointers, yields an
// error.
func ParseFloat64Value(val interface{}) (float64, error) {
	// json.Number is a string type; it is matched by name to keep the intent
	// explicit instead of relying on the generic string branch.
	if num, ok := val.(json.Number); ok {
		return num.Float64()
	}

	v := reflect.ValueOf(val)
	switch v.Kind() {
	case reflect.Float32, reflect.Float64:
		return v.Float(), nil
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return float64(v.Int()), nil
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return float64(v.Uint()), nil
	case reflect.String:
		return strconv.ParseFloat(v.String(), 64)
	case reflect.Slice:
		if v.Type().Elem().Kind() == reflect.Uint8 {
			return strconv.ParseFloat(string(v.Bytes()), 64)
		}
	case reflect.Ptr:
		if !v.IsNil() {
			return ParseFloat64Value(v.Elem().Interface())
		}
	}

	return 0, fmt.Errorf("cannot convert type %T to float64", val)
}

// ParseTime converts val to a time.Time.
//
// time.Time values are returned unchanged. Strings and []byte are parsed with
// format, or with a set of default layouts when format is empty. Integers and
// floats of any width are read as Unix seconds (fractions are dropped).
// Non-nil pointers are dereferenced. Anything else yields an error.
func ParseTime(val interface{}, format string) (time.Time, error) {
	if t, ok := val.(time.Time); ok {
		return t, nil
	}

	v := reflect.ValueOf(val)
	switch v.Kind() {
	case reflect.String:
		return parseTimeFromString(v.String(), format)
	case reflect.Slice:
		if v.Type().Elem().Kind() == reflect.Uint8 {
			return parseTimeFromString(string(v.Bytes()), format)
		}
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return time.Unix(v.Int(), 0), nil
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return time.Unix(int64(v.Uint()), 0), nil
	case reflect.Float32, reflect.Float64:
		return time.Unix(int64(v.Float()), 0), nil
	case reflect.Ptr:
		if !v.IsNil() {
			return ParseTime(v.Elem().Interface(), format)
		}
	}

	return time.Time{}, fmt.Errorf("invalid time value type: %v", val)
}

// defaultTimeLayouts are tried in order when no explicit format is given.
var defaultTimeLayouts = [...]string{
	time.RFC3339,
	time.DateTime,
	"2006-01-02 15:04:05.000000",
	time.RFC3339Nano,
}

// parseTimeFromString parses str with format when one is given. Otherwise it
// tries defaultTimeLayouts and finally reads str as Unix seconds, first as an
// integer and then as a float.
func parseTimeFromString(str, format string) (time.Time, error) {
	if format != "" {
		parsed, err := time.Parse(format, str)
		if err != nil {
			return time.Time{}, fmt.Errorf("failed to parse time '%s' with format '%s': %w", str, format, err)
		}
		return parsed, nil
	}

	for _, layout := range defaultTimeLayouts {
		if parsed, err := time.Parse(layout, str); err == nil {
			return parsed, nil
		}
	}

	if seconds, err := strconv.ParseInt(str, 10, 64); err == nil {
		return time.Unix(seconds, 0), nil
	}
	if seconds, err := strconv.ParseFloat(str, 64); err == nil {
		return time.Unix(int64(seconds), 0), nil
	}

	return time.Time{}, fmt.Errorf("failed to parse time '%s'", str)
}
================================================ // @Author: Ciusyan 5/17/24 package sqlbase import ( "encoding/json" "testing" "time" "github.com/ccfos/nightingale/v6/dskit/types" ) func TestFormatMetricValues(t *testing.T) { tests := []struct { name string keys types.Keys rows []map[string]interface{} want []types.MetricValues }{ { name: "cases1", keys: types.Keys{ ValueKey: "grade a_grade", LabelKey: "id student_name", TimeKey: "update_time", TimeFormat: "2006-01-02 15:04:05", }, rows: []map[string]interface{}{ { "id": "10007", "grade": 20003, "student_name": "邵子韬", "a_grade": 69, "update_time": "2024-05-14 10:00:00", }, { "id": "10007", "grade": 20003, "student_name": "邵子韬", "a_grade": 69, "update_time": "2024-05-14 10:05:00", }, { "id": "10007", "grade": 20003, "student_name": "邵子韬", "a_grade": 69, "update_time": "2024-05-14 10:10:00", }, { "id": "10008", "grade": 20004, "student_name": "Ciusyan", "a_grade": 100, "update_time": "2024-05-14 12:00:00", }, }, }, { name: "test __time__ priority over time", keys: types.Keys{ ValueKey: "value", LabelKey: "host", }, rows: []map[string]interface{}{ { "host": "server1", "value": 100, "time": int64(1715642100), // 这个应该被忽略 "__time__": int64(1715642135), // 这个应该被使用 }, }, }, { name: "test fallback to time when __time__ not exists", keys: types.Keys{ ValueKey: "value", LabelKey: "host", }, rows: []map[string]interface{}{ { "host": "server2", "value": 200, "time": int64(1715642200), // 应该使用这个 }, }, }, { name: "test __time__ alone without time field", keys: types.Keys{ ValueKey: "value", LabelKey: "host", }, rows: []map[string]interface{}{ { "host": "server3", "value": 300, "__time__": int64(1715642300), // 应该使用这个 }, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got := FormatMetricValues(tt.keys, tt.rows) for _, g := range got { t.Log(g) } }) } } func TestParseFloat64Value(t *testing.T) { ptr := func(val float64) *float64 { return &val } tests := []struct { name string input interface{} want float64 
wantErr bool }{ {"float64", 1.23, 1.23, false}, {"float32", float32(1.23), float64(float32(1.23)), false}, {"int", 123, 123, false}, {"int64", int64(123), 123, false}, {"uint", uint(123), 123, false}, {"uint64", uint64(123), 123, false}, {"string", "1.23", 1.23, false}, {"[]byte", []byte("1.23"), 1.23, false}, {"json.Number", json.Number("1.23"), 1.23, false}, {"interface", interface{}(1.23), 1.23, false}, {"pointer", ptr(1.23), 1.23, false}, {"invalid string", "abc", 0, true}, {"invalid type", struct{}{}, 0, true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got, err := ParseFloat64Value(tt.input) if (err != nil) != tt.wantErr { t.Errorf("parseFloat64Value() error = %v, wantErr %v", err, tt.wantErr) return } if got != tt.want { t.Errorf("parseFloat64Value() = %v, want %v", got, tt.want) } }) } } func TestParseTime(t *testing.T) { ptrTime := func(t time.Time) *time.Time { return &t } tests := []struct { name string input interface{} format string want time.Time wantErr bool }{ {"RFC3339", "2024-05-14T12:34:56Z", "", time.Date(2024, 5, 14, 12, 34, 56, 0, time.UTC), false}, {"RFC3339Nano", "2024-05-14T12:34:56.789Z", "", time.Date(2024, 5, 14, 12, 34, 56, 789000000, time.UTC), false}, {"Unix timestamp int", int64(1715642135), "", time.Unix(1715642135, 0), false}, {"Unix timestamp float64", 1715642135.0, "", time.Unix(int64(1715642135), 0), false}, {"custom format", "14/05/2024", "02/01/2006", time.Date(2024, 5, 14, 0, 0, 0, 0, time.UTC), false}, {"slice", []byte("2024-05-14T12:34:56Z"), "", time.Date(2024, 5, 14, 12, 34, 56, 0, time.UTC), false}, {"interface", interface{}("2024-05-14T12:34:56Z"), "", time.Date(2024, 5, 14, 12, 34, 56, 0, time.UTC), false}, {"pointer", ptrTime(time.Date(2024, 5, 14, 12, 34, 56, 0, time.UTC)), "", time.Date(2024, 5, 14, 12, 34, 56, 0, time.UTC), false}, {"invalid format", "14-05-2024", "02/01/2006", time.Time{}, true}, {"invalid type", struct{}{}, "", time.Time{}, true}, } for _, tt := range tests { t.Run(tt.name, 
func(t *testing.T) { got, err := ParseTime(tt.input, tt.format) if (err != nil) != tt.wantErr { t.Errorf("ParseTime() error = %v, wantErr %v", err, tt.wantErr) return } if !got.Equal(tt.want) { t.Errorf("ParseTime() = %v, want %v", got, tt.want) } }) } } ================================================ FILE: dskit/tdengine/tdengine.go ================================================ package tdengine import ( "context" "encoding/base64" "encoding/json" "fmt" "net" "net/http" "strings" "time" "github.com/ccfos/nightingale/v6/dskit/types" "github.com/ccfos/nightingale/v6/pkg/tlsx" "github.com/toolkits/pkg/logger" ) type Tdengine struct { Addr string `json:"tdengine.addr" mapstructure:"tdengine.addr"` Basic *TDengineBasicAuth `json:"tdengine.basic" mapstructure:"tdengine.basic"` Token string `json:"tdengine.token" mapstructure:"tdengine.token"` Timeout int64 `json:"tdengine.timeout" mapstructure:"tdengine.timeout"` DialTimeout int64 `json:"tdengine.dial_timeout" mapstructure:"tdengine.dial_timeout"` MaxIdleConnsPerHost int `json:"tdengine.max_idle_conns_per_host" mapstructure:"tdengine.max_idle_conns_per_host"` Headers map[string]string `json:"tdengine.headers" mapstructure:"tdengine.headers"` SkipTlsVerify bool `json:"tdengine.skip_tls_verify" mapstructure:"tdengine.skip_tls_verify"` tlsx.ClientConfig header map[string][]string `json:"-"` client *http.Client `json:"-"` } type TDengineBasicAuth struct { User string `json:"tdengine.user" mapstructure:"tdengine.user"` Password string `json:"tdengine.password" mapstructure:"tdengine.password"` IsEncrypt bool `json:"tdengine.is_encrypt" mapstructure:"tdengine.is_encrypt"` } type APIResponse struct { Code int `json:"code"` ColumnMeta [][]interface{} `json:"column_meta"` Data [][]interface{} `json:"data"` Rows int `json:"rows"` } type QueryParam struct { Database string `json:"database"` Table string `json:"table"` } func (tc *Tdengine) InitCli() { tc.client = &http.Client{ Transport: &http.Transport{ Proxy: 
http.ProxyFromEnvironment, DialContext: (&net.Dialer{ Timeout: 30 * time.Second, KeepAlive: 30 * time.Second, }).DialContext, IdleConnTimeout: 90 * time.Second, TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, DisableCompression: true, }, } tc.header = map[string][]string{ "Connection": {"keep-alive"}, } for k, v := range tc.Headers { kv := strings.Split(v, ":") if len(kv) != 2 { continue } tc.header[k] = []string{v} } if tc.Basic != nil { basic := base64.StdEncoding.EncodeToString([]byte(tc.Basic.User + ":" + tc.Basic.Password)) tc.header["Authorization"] = []string{fmt.Sprintf("Basic %s", basic)} } } func (tc *Tdengine) QueryTable(query string) (APIResponse, error) { var apiResp APIResponse req, err := http.NewRequest("POST", tc.Addr+"/rest/sql", strings.NewReader(query)) if err != nil { return apiResp, err } for k, v := range tc.header { req.Header[k] = v } req.Header.Set("Content-Type", "application/x-www-form-urlencoded") resp, err := tc.client.Do(req) if err != nil { return apiResp, err } defer resp.Body.Close() // 限制响应体大小为10MB maxSize := int64(10 * 1024 * 1024) // 10MB limitedReader := http.MaxBytesReader(nil, resp.Body, maxSize) if resp.StatusCode != http.StatusOK { return apiResp, fmt.Errorf("HTTP error, status: %s", resp.Status) } err = json.NewDecoder(limitedReader).Decode(&apiResp) if err != nil { if strings.Contains(err.Error(), "http: request body too large") { return apiResp, fmt.Errorf("response body exceeds 10MB limit") } return apiResp, err } return apiResp, nil } func (tc *Tdengine) ShowDatabases(context.Context) ([]string, error) { databases := make([]string, 0) data, err := tc.QueryTable("show databases") if err != nil { return databases, err } for _, row := range data.Data { databases = append(databases, row[0].(string)) } return databases, nil } func (tc *Tdengine) ShowTables(ctx context.Context, database string) ([]string, error) { tables := make([]string, 0) sql := fmt.Sprintf("show %s", database) data, err := 
tc.QueryTable(sql) if err != nil { return tables, err } for _, row := range data.Data { tables = append(tables, row[0].(string)) } return tables, nil } func (tc *Tdengine) DescribeTable(ctx context.Context, query interface{}) ([]*types.ColumnProperty, error) { var columns []*types.ColumnProperty queryMap, ok := query.(map[string]string) if !ok { return nil, fmt.Errorf("invalid query") } sql := fmt.Sprintf("select * from %s.%s limit 1", queryMap["database"], queryMap["table"]) data, err := tc.QueryTable(sql) if err != nil { return columns, err } for _, row := range data.ColumnMeta { var colType string switch t := row[1].(type) { case float64: // v2版本数字类型映射 switch int(t) { case 1: colType = "BOOL" case 2: colType = "TINYINT" case 3: colType = "SMALLINT" case 4: colType = "INT" case 5: colType = "BIGINT" case 6: colType = "FLOAT" case 7: colType = "DOUBLE" case 8: colType = "BINARY" case 9: colType = "TIMESTAMP" case 10: colType = "NCHAR" default: colType = "UNKNOWN" } case string: // v3版本直接使用字符串类型 colType = t default: logger.Warningf("unexpected column type format: %v", row[1]) colType = "UNKNOWN" } column := &types.ColumnProperty{ Field: row[0].(string), Type: colType, } columns = append(columns, column) } return columns, nil } ================================================ FILE: dskit/types/timeseries.go ================================================ package types import ( "bytes" "fmt" "strconv" "github.com/prometheus/common/model" ) // 时序数据 type MetricValues struct { Metric model.Metric `json:"metric"` Values [][]float64 `json:"values"` } type HistogramValues struct { Total int64 `json:"total"` Values [][]float64 `json:"values"` } // 瞬时值 type AggregateValues struct { Labels map[string]string `json:"labels"` Values map[string]float64 `json:"values"` } // string func (m *MetricValues) String() string { var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Metric: %+v ", m.Metric)) buf.WriteString("Values: ") for _, v := range m.Values { buf.WriteString(" [") for i, 
// Keys names the columns of a query result row that hold values, labels and
// the timestamp when rows are converted into time series.
type Keys struct {
	ValueKey   string `json:"valueKey" mapstructure:"valueKey"`     // value column names; multiple names are separated by spaces
	LabelKey   string `json:"labelKey" mapstructure:"labelKey"`     // label column names; multiple names are separated by spaces
	TimeKey    string `json:"timeKey" mapstructure:"timeKey"`       // presumably the name of the timestamp column; verify against the row formatter
	TimeFormat string `json:"timeFormat" mapstructure:"timeFormat"` // not used anymore
	Offset     int    `json:"offset" mapstructure:"offset"`         // NOTE(review): meaning and unit are not visible here; confirm with the code that reads it
}
// PrometheusResponse is a response in Prometheus format: a status string, the
// data payload, and an optional error message (omitted from JSON when empty).
type PrometheusResponse struct {
	Status string         `json:"status"`
	Data   PrometheusData `json:"data"`
	Error  string         `json:"error,omitempty"`
}
} // HitsResult hits 查询响应 type HitsResult struct { Hits []struct { Total int64 `json:"total"` } } // InitHTTPClient 初始化 HTTP 客户端 func (vl *VictoriaLogs) InitHTTPClient() error { transport := &http.Transport{ MaxIdleConns: 100, MaxIdleConnsPerHost: 10, IdleConnTimeout: 90 * time.Second, TLSClientConfig: &tls.Config{ InsecureSkipVerify: vl.VictorialogsTls.SkipTlsVerify, }, } timeout := time.Duration(vl.Timeout) * time.Millisecond if timeout == 0 { timeout = 60 * time.Second } vl.HTTPClient = &http.Client{ Transport: transport, Timeout: timeout, } return nil } // Query 执行日志查询 // GET/POST /select/logsql/query?query=&start=&end=&limit= func (vl *VictoriaLogs) Query(ctx context.Context, query string, start, end int64, limit int) ([]LogEntry, error) { params := url.Values{} params.Set("query", query) if start > 0 { params.Set("start", strconv.FormatInt(start, 10)) } if end > 0 { params.Set("end", strconv.FormatInt(end, 10)) } if limit > 0 { params.Set("limit", strconv.Itoa(limit)) } else { params.Set("limit", strconv.Itoa(vl.MaxQueryRows)) // 默认 1000 条 } endpoint := fmt.Sprintf("%s/select/logsql/query", vl.VictorialogsAddr) resp, err := vl.doRequest(ctx, "POST", endpoint, params) if err != nil { return nil, err } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("read response body failed: %w", err) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("query failed: status=%d, body=%s", resp.StatusCode, string(body)) } // VictoriaLogs returns NDJSON format (one JSON object per line) var logs []LogEntry scanner := bufio.NewScanner(strings.NewReader(string(body))) for scanner.Scan() { line := scanner.Text() if line == "" { continue } var entry LogEntry if err := json.Unmarshal([]byte(line), &entry); err != nil { return nil, fmt.Errorf("decode log entry failed: %w, line=%s", err, line) } logs = append(logs, entry) } if err := scanner.Err(); err != nil { return nil, fmt.Errorf("scan response failed: %w", err) } return 
logs, nil } // StatsQuery 执行统计查询(单点时间) // POST /select/logsql/stats_query?query=&time=